import marimo

__generated_with = "0.18.3"
app = marimo.App(width="medium")


@app.cell
def _():
    import marimo as mo
    import pandas as pd
    from pathlib import Path

    # Directory the Taguette exports are read from, and directory this
    # notebook writes its intermediate results to.
    TAGUETTE_EXPORT_DIR = Path('./data/transcripts/taguette_results')
    WORKING_DIR = Path('./data/processing/03_sentiment_analysis')

    # exist_ok=True makes creation idempotent and avoids the
    # check-then-create race of a separate `exists()` test.
    for _directory in (WORKING_DIR, TAGUETTE_EXPORT_DIR):
        _directory.mkdir(parents=True, exist_ok=True)
    return WORKING_DIR, mo, pd


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    # Phase 1: Individual interview analysis

    - Create sentiment matrices for each interview (document)
    - Save the intermediate results to file in the `WORKING_DIR`
    """)
    return


@app.cell
def _(pd):
    def create_sentiment_matrix(df, document_name, column_prefix='VT - |CT - ', row_prefix='_V-|_C-'):
        """
        Create a sentiment matrix for a specific document.

        Rows of the result are the distinct `_context` values, columns are the
        distinct `tag` values, and each cell is the summed `sentiment` for that
        (context, tag) pair. Combinations with no data are filled with 0.

        Parameters:
        - df: DataFrame with columns ['document', 'tag', '_context', 'sentiment']
        - document_name: Name of the document to filter by
        - column_prefix: Pattern selecting which `tag` values become columns.
          It is passed to `Series.str.contains`, so it is treated as a regular
          expression and may match anywhere in the tag ('|' separates
          alternatives).
        - row_prefix: Pattern selecting which `_context` values become rows;
          matched the same way as `column_prefix`.

        Returns:
        - DataFrame representing the sentiment matrix (integer cells), or an
          empty DataFrame when the document has no matching tag or context
          rows (a message is printed in that case).

        Note: the summed scores are cast with `astype(int)`, so any fractional
        part of a sum is dropped.
        """
        # Restrict to the requested document
        doc_df = df[df['document'] == document_name]

        # Keep rows whose tag matches the theme pattern (VT-/CT-)
        sentiment_rows = doc_df[doc_df['tag'].str.contains(column_prefix, na=False)]
        if sentiment_rows.empty:
            print(f"No sentiment data found for document: {document_name}")
            return pd.DataFrame()

        # Keep rows with a Voice/Character context; na=False already maps
        # missing contexts to False, so no separate notna() mask is needed
        valid_rows = sentiment_rows[
            sentiment_rows['_context'].str.contains(row_prefix, na=False)
        ]
        if valid_rows.empty:
            print(f"No Voice/Character context found for document: {document_name}")
            return pd.DataFrame()

        # Sum sentiment per (Voice/Character, Theme) pair
        matrix_data = valid_rows.groupby(['_context', 'tag'])['sentiment'].sum().reset_index()

        # Pivot into the matrix, use 0 where a combination has no data, and
        # convert to integers for cleaner display
        matrix = (
            matrix_data
            .pivot(index='_context', columns='tag', values='sentiment')
            .fillna(0)
            .astype(int)
        )
        return matrix

    return (create_sentiment_matrix,)


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    ## Step 1.1: Voice Sample vs. Theme Sentiment Matrix

    For each interview (document), create a matrix where:
    - Rows represent the different Voices (based on '_V-' tags)
    - Columns represent the different VoiceThemes(based on 'VT -' tags)
    - Each cell contains the aggregated sentiment score (sum) for that Voice/Theme combination
    """)
    return


@app.cell
def _(WORKING_DIR, all_tags_df, create_sentiment_matrix):
    from datetime import datetime

    # NOTE(review): `all_tags_df` is not defined by any cell in this notebook;
    # a cell that loads the tagged data must provide it before this cell runs.

    # One timestamp per run so every file written by this run shares it.
    # NOTE(review): replaces `mo.utils.get_timestamp(short=True)`, which does
    # not appear to be part of marimo's public API - confirm.
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

    # Create matrices for each unique document
    documents = all_tags_df['document'].unique()
    matrices = {}

    for doc in documents:
        print(f"\n{'='*60}")
        print(f"Document: {doc}")
        print('='*60)

        matrix = create_sentiment_matrix(all_tags_df, doc, column_prefix='VT - ', row_prefix='_V-')
        if matrix.empty:
            # Nothing to store or save for this document
            print("No matrix data available")
            continue

        matrices[doc] = matrix
        print(matrix)

        # Save to CSV (only non-empty matrices reach this point)
        filename = WORKING_DIR / f"{doc.replace(' ', '_')}_voice_theme_matrix_{timestamp}.csv"
        matrix.to_csv(filename)
        print(f"Matrix saved to: {filename}")

    # Display the collected matrices as the cell output
    matrices
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    ## Step 1.2: Character Sample vs. Theme Sentiment Matrix

    For each interview (document), create a matrix where:
    - Rows represent the different Characters (based on '_C-' tags)
    - Columns represent the different CharacterThemes (based on 'CT -' tags)
    - Each cell contains the aggregated sentiment score (sum) for that Character/Theme combination
    """)
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    ## Step 1.3: Chase Brand Sentiment

    TODO: not sure we have enough supporting data for this yet
    """)
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    ## Step 1.x: Save Matrices to Files

    Save the matrices to CSV files in the WORKING_DIR for intermediate storage. Include a short timestamp in the filename so we can track runs.
    """)
    return


@app.cell
def _():
    # Save the matrices to CSV files in the WORKING_DIR for intermediate storage. Include a short timestamp in the filename so we can track runs.
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    # Phase 2: Overall Results

    Aggregate results of all the interviews into master matrices.
    """)
    return


if __name__ == "__main__":
    app.run()