65 lines
1.9 KiB
Python
65 lines
1.9 KiB
Python
from typing import Optional

import pandas as pd
|
|
|
|
|
|
def create_sentiment_matrix(doc_df, column_prefix='VT - |CT - ', row_prefix='_V-|_C-'):
    """
    Build a context-by-tag matrix of summed sentiment scores.

    Parameters:
    - doc_df: DataFrame that has at least the columns 'tag', '_context'
      and 'sentiment'.
    - column_prefix: pattern passed to ``Series.str.contains`` to select the
      rows whose 'tag' should become a matrix column. It is interpreted as a
      regular expression, so the default matches 'VT - ' or 'CT - '
      anywhere in the tag.
    - row_prefix: pattern passed to ``Series.str.contains`` to select the
      rows whose '_context' should become a matrix row (default matches
      '_V-' or '_C-').

    Returns:
    - DataFrame indexed by '_context' with one column per 'tag'; each cell
      is the sum of 'sentiment' for that pair, and pairs that never occur
      are NaN.
    - An empty DataFrame (after printing a short notice) when no row
      survives either filter.
    """
    # Keep only rows whose tag matches the column pattern; na=False treats
    # missing tags as non-matching.
    tag_matches = doc_df['tag'].str.contains(column_prefix, na=False)
    sentiment_rows = doc_df[tag_matches]
    if sentiment_rows.empty:
        print("No sentiment data found")
        return pd.DataFrame()

    # Of those, keep rows whose context matches the row pattern. na=False
    # already excludes missing contexts, so no separate notna() mask is needed.
    context_matches = sentiment_rows['_context'].str.contains(row_prefix, na=False)
    valid_rows = sentiment_rows[context_matches]
    if valid_rows.empty:
        print("No Voice/Character context found")
        return pd.DataFrame()

    # Sum sentiment per (context, tag) pair, then spread the tag level across
    # the columns. Pairs that never occur stay NaN, which is why the result is
    # not cast to int.
    totals = valid_rows.groupby(['_context', 'tag'])['sentiment'].sum()
    return totals.unstack('tag')
|
|
|
|
|
|
|
|
def extract_theme(tag: str, theme_prefixes='VT - |CT - ') -> Optional[str]:
    """
    Extract the theme from a tag string.

    Parameters:
    - tag: str, the tag string (e.g., 'VT - Personal Experience')
    - theme_prefixes: str, '|'-separated literal prefixes to strip from the
      start of the tag (e.g., 'VT - |CT - '). Each piece is compared with
      ``str.startswith``; it is not treated as a regular expression here.

    Returns:
    - str, the tag with the first matching prefix removed and surrounding
      whitespace stripped (e.g., 'Personal Experience')
    - None if the tag is not a string or starts with none of the prefixes
    """
    # Tags that are not strings (e.g. None or NaN) cannot carry a theme.
    if not isinstance(tag, str):
        return None

    for prefix in theme_prefixes.split('|'):
        if tag.startswith(prefix):
            # Slice off only the leading prefix; str.replace would also
            # delete any later occurrence of the same text inside the theme.
            return tag[len(prefix):].strip()
    return None
|
|
|