from typing import Optional

import pandas as pd


def create_sentiment_matrix(
    doc_df: pd.DataFrame,
    column_prefix: str = 'VT - |CT - ',
    row_prefix: str = '_V-|_C-',
) -> pd.DataFrame:
    """
    Create a sentiment matrix from a document's tagged rows.

    Rows are kept when their 'tag' matches ``column_prefix`` and their
    '_context' matches ``row_prefix``. The 'sentiment' values are then summed
    per (_context, tag) pair and pivoted so that each distinct '_context'
    becomes a row and each distinct 'tag' becomes a column.

    Parameters:
    - doc_df: DataFrame with at least the columns 'tag', '_context' and
      'sentiment'
    - column_prefix: regular expression selecting the tags that become matrix
      columns. It is applied with ``Series.str.contains``, so it matches
      anywhere in the tag, not only at the start.
    - row_prefix: regular expression selecting the '_context' values that
      become matrix rows (same matching rules as ``column_prefix``)

    Returns:
    - DataFrame indexed by '_context' with one column per tag, holding the
      summed sentiment. Combinations with no data are NaN.
    - An empty DataFrame (after printing a message) if no row matches
      ``column_prefix`` or no remaining row matches ``row_prefix``.
    """
    # Keep only rows whose tag matches the sentiment pattern; missing tags
    # are treated as non-matching.
    tag_mask = doc_df['tag'].str.contains(column_prefix, na=False)
    sentiment_rows = doc_df[tag_mask].copy()

    if sentiment_rows.empty:
        print("No sentiment data found")
        return pd.DataFrame()

    # Keep only rows with a matching context. na=False already excludes
    # missing contexts, so no separate notna() check is needed.
    context_mask = sentiment_rows['_context'].str.contains(row_prefix, na=False)
    valid_rows = sentiment_rows[context_mask].copy()

    if valid_rows.empty:
        print("No Voice/Character context found")
        return pd.DataFrame()

    # Sum sentiment for each (_context, tag) combination. Aggregating first
    # guarantees the pairs are unique, which pivot() requires.
    matrix_data = (
        valid_rows.groupby(['_context', 'tag'])['sentiment'].sum().reset_index()
    )

    # Missing combinations stay NaN, so the result is deliberately not cast
    # to int (the cast would fail on NaN).
    return matrix_data.pivot(index='_context', columns='tag', values='sentiment')


def extract_theme(tag: str, theme_prefixes: str = 'VT - |CT - ') -> Optional[str]:
    """
    Extract the theme from a tag string.

    Parameters:
    - tag: str, the tag string (e.g., 'VT - Personal Experience')
    - theme_prefixes: str, '|'-separated literal prefixes to strip from the
      tag (e.g., 'VT - |CT - '). They are tried in order and the first one
      the tag starts with wins.

    Returns:
    - str, the extracted theme with surrounding whitespace removed
      (e.g., 'Personal Experience')
    - None if the tag starts with none of the prefixes
    """
    for prefix in theme_prefixes.split('|'):
        if tag.startswith(prefix):
            # Slice off only the leading prefix. str.replace would also
            # delete later occurrences of the prefix inside the theme text.
            return tag[len(prefix):].strip()
    return None