basic parsing working

This commit is contained in:
2025-12-11 12:56:23 +01:00
parent b023d44934
commit e576f98cce
10 changed files with 411 additions and 183 deletions

65
utils/data_utils.py Normal file
View File

@@ -0,0 +1,65 @@
import pandas as pd
def create_sentiment_matrix(doc_df, column_prefix='VT - |CT - ', row_prefix='_V-|_C-'):
    """
    Build a context-by-tag matrix of summed sentiment scores.

    Parameters:
    - doc_df: DataFrame with at least the columns 'tag', '_context' and
      'sentiment'. NOTE(review): presumably already restricted to a single
      document by the caller; no document filtering happens here.
    - column_prefix: regular expression (passed to Series.str.contains)
      selecting the tags that become matrix columns.
    - row_prefix: regular expression (passed to Series.str.contains)
      selecting the '_context' values that become matrix rows.

    Returns:
    - DataFrame indexed by '_context' with one column per 'tag'; each cell is
      the sum of 'sentiment' for that pair, NaN where the pair never occurs.
    - An empty DataFrame (after printing a notice) when no tag matches
      column_prefix, or when none of the matching rows has a '_context'
      matching row_prefix.
    """
    # Keep only rows whose tag matches the sentiment prefixes (VT-/CT-).
    tag_mask = doc_df['tag'].str.contains(column_prefix, na=False)
    sentiment_rows = doc_df[tag_mask]
    if sentiment_rows.empty:
        print("No sentiment data found")
        return pd.DataFrame()

    # Keep only rows with a Voice/Character context. na=False already maps
    # missing contexts to False, so a separate notna() check is unnecessary.
    context_mask = sentiment_rows['_context'].str.contains(row_prefix, na=False)
    valid_rows = sentiment_rows[context_mask]
    if valid_rows.empty:
        print("No Voice/Character context found")
        return pd.DataFrame()

    # Sum sentiment per (context, tag) pair, then spread the tag level of the
    # resulting MultiIndex into columns to form the matrix.
    totals = valid_rows.groupby(['_context', 'tag'])['sentiment'].sum()
    return totals.unstack('tag')
def extract_theme(tag: str, theme_prefixes='VT - |CT - ') -> "str | None":
    """
    Extract the theme from a tag string.

    Parameters:
    - tag: str, the tag string (e.g., 'VT - Personal Experience'). Non-string
      values (None, or NaN coming from a DataFrame column) yield None.
    - theme_prefixes: str, '|'-separated prefixes to remove from the tag
      (e.g., 'VT - |CT - '). Each prefix is compared literally, not as a
      regular expression.

    Returns:
    - str, the extracted theme (e.g., 'Personal Experience'): the tag with its
      leading prefix removed and surrounding whitespace stripped
    - None if tag is not a string or starts with none of the prefixes
    """
    if not isinstance(tag, str):
        return None
    for prefix in theme_prefixes.split('|'):
        if tag.startswith(prefix):
            # Slice off only the leading prefix; str.replace would also delete
            # any later occurrence of the prefix inside the theme text.
            return tag[len(prefix):].strip()
    return None