restructure analysis

2025-12-09 21:05:07 +01:00
parent beddfee087
commit 514570062c
3 changed files with 413 additions and 211 deletions
--- a/03_Sentiment_Analysis.py
+++ b/03_Sentiment_Analysis.py
@@ -0,0 +1,180 @@
+import marimo
+
+# NOTE(review): presumably the marimo version that generated this file — confirm.
+__generated_with = "0.18.3"
+# Application object; every cell below registers itself on it via @app.cell.
+app = marimo.App(width="medium")
+
+
+@app.cell
+def _():
+    import marimo as mo
+    import pandas as pd
+    from pathlib import Path
+
+    TAGUETTE_EXPORT_DIR = Path('./data/transcripts/taguette_results')
+    WORKING_DIR = Path('./data/processing/03_sentiment_analysis')
+
+    if not WORKING_DIR.exists():
+        WORKING_DIR.mkdir(parents=True)
+    if not TAGUETTE_EXPORT_DIR.exists():
+        TAGUETTE_EXPORT_DIR.mkdir(parents=True)
+    return WORKING_DIR, mo, pd
+
+
+# Markdown cell: heading and goals for Phase 1 (per-interview analysis).
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""
+    # Phase 1: Individual interview analysis
+    - Create sentiment matrices for each interview (document)
+    - Save the intermediate results to file in the `WORKING_DIR`
+    """)
+    return
+
+
+@app.cell
+def _(pd):
+    import numpy as np
+
+    def create_sentiment_matrix(df, document_name, column_prefix='VT - |CT - ', row_prefix='_V-|_C-'):
+        """
+        Create a sentiment matrix for a specific document.
+
+        Parameters:
+        - df: DataFrame with columns ['document', 'tag', '_context', 'sentiment']
+        - document_name: Name of the document to filter by
+
+        Returns:
+        - DataFrame representing the sentiment matrix
+        """
+        # Filter for the specific document
+        doc_df = df[df['document'] == document_name].copy()
+
+        # Filter for rows where the tag matches the sentiment prefixes (VT-/CT-)
+        sentiment_rows = doc_df[
+            doc_df['tag'].str.contains(column_prefix, na=False)
+        ].copy()
+
+        if sentiment_rows.empty:
+            print(f"No sentiment data found for document: {document_name}")
+            return pd.DataFrame()
+
+        # Filter for rows with valid Voice/Character context
+        valid_rows = sentiment_rows[
+            sentiment_rows['_context'].notna() & 
+            (sentiment_rows['_context'].str.contains(row_prefix, na=False))
+        ].copy()
+
+        if valid_rows.empty:
+            print(f"No Voice/Character context found for document: {document_name}")
+            return pd.DataFrame()
+
+        # Create aggregation: group by Voice/Character (_context) and Theme (tag)
+        # Sum sentiment scores for each combination
+        matrix_data = valid_rows.groupby(['_context', 'tag'])['sentiment'].sum().reset_index()
+
+        # Pivot to create the matrix
+        matrix = matrix_data.pivot(index='_context', columns='tag', values='sentiment')
+
+        # Fill NaN with 0 (no sentiment data for that combination)
+        matrix = matrix.fillna(0)
+
+        # Convert to integers for cleaner display
+        matrix = matrix.astype(int)
+
+        return matrix
+
+    return (create_sentiment_matrix,)
+
+
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""
+    ## Step 1.1: Voice Sample vs. Theme Sentiment Matrix
+
+    For each interview (document), create a matrix where:
+    - Rows represent the different Voices (based on '_V-' tags)
+    - Columns represent the different VoiceThemes(based on 'VT -' tags)
+    - Each cell contains the aggregated sentiment score (sum) for that Voice/Theme combination
+    """)
+    return
+
+
+@app.cell
+def _(WORKING_DIR, all_tags_df, create_sentiment_matrix, mo):
+
+    # Create matrices for each unique document
+    documents = all_tags_df['document'].unique()
+    matrices = {}
+
+    for doc in documents:
+        print(f"\n{'='*60}")
+        print(f"Document: {doc}")
+        print('='*60)
+        matrix = create_sentiment_matrix(all_tags_df, doc, column_prefix='VT - ', row_prefix='_V-')
+        if not matrix.empty:
+            matrices[doc] = matrix
+            print(matrix)
+        else:
+            print("No matrix data available")
+
+        # Save to CSV
+        timestamp = mo.utils.get_timestamp(short=True)
+        filename = WORKING_DIR / f"{doc.replace(' ', '_')}_voice_theme_matrix_{timestamp}.csv"
+        matrix.to_csv(filename)
+        print(f"Matrix saved to: {filename}")
+
+    # Store matrices in a variable for further analysis
+    matrices
+    return
+
+
+# Markdown cell: describes the Character x CharacterTheme matrix of Step 1.2.
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""
+    ## Step 1.2: Character Sample vs. Theme Sentiment Matrix
+
+    For each interview (document), create a matrix where:
+    - Rows represent the different Characters (based on  '_C-' tags)
+    - Columns represent the different CharacterThemes (based on 'CT -' tags)
+    - Each cell contains the aggregated sentiment score (sum) for that Character/Theme combination
+    """)
+    return
+
+
+# Markdown cell: heading for Step 1.3, which is still a TODO.
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""
+    ## Step 1.3: Chase Brand Sentiment
+
+    TODO: not sure we have enough supporting data for this yet
+    """)
+    return
+
+
+# Markdown cell: describes the planned step that saves the matrices to files.
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""
+    ## Step 1.x: Save Matrices to Files
+
+    Save the matrices to CSV files in the WORKING_DIR for intermediate storage. Include a short timestamp in the filename so we can track runs.
+    """)
+    return
+
+
+@app.cell
+def _():
+    # TODO: placeholder — this cell contains no code yet. Intended purpose:
+    # Save the matrices to CSV files in the WORKING_DIR for intermediate storage. Include a short timestamp in the filename so we can track runs.
+    return
+
+
+# Markdown cell: heading for Phase 2 (aggregation across all interviews).
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""
+    # Phase 2: Overall Results
+
+    Aggregate results of all the interviews into master matrices.
+    """)
+    return
+
+
+# Run the app when this file is executed directly rather than imported.
+if __name__ == "__main__":
+    app.run()