From 7f951d9ee5858372eac8e2347a0e7520c2be68b6 Mon Sep 17 00:00:00 2001
From: Luigi Maiorano <luigi.maiorano@qumo.io>
Date: Tue, 9 Dec 2025 22:33:51 +0100
Subject: [PATCH] Aggregation step

---
 01_Taguette-Pre-Process.py  |   2 +-
 02_Taguette_Post-Process.py |   2 +
 03_Sentiment_Analysis.py    | 137 ++++++++++++++++++++----------------
 04_Sentiment_Aggregation.py |  86 ++++++++++++++++++++++
 4 files changed, 165 insertions(+), 62 deletions(-)
 create mode 100644 04_Sentiment_Aggregation.py

diff --git a/01_Taguette-Pre-Process.py b/01_Taguette-Pre-Process.py
index 6fcea98..db809cb 100644
--- a/01_Taguette-Pre-Process.py
+++ b/01_Taguette-Pre-Process.py
@@ -1,6 +1,6 @@
 import marimo
 
-__generated_with = "0.18.0"
+__generated_with = "0.18.3"
 app = marimo.App(width="medium")
 
 
diff --git a/02_Taguette_Post-Process.py b/02_Taguette_Post-Process.py
index 47c5d6c..704cae0 100644
--- a/02_Taguette_Post-Process.py
+++ b/02_Taguette_Post-Process.py
@@ -344,6 +344,8 @@ def _(WORKING_DIR, datetime, interview_select, recombined_df):
     timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
     filename = WORKING_DIR / f"{interview_select.value.split(' ')[0]}_sentiments_{timestamp}.csv"
     recombined_df.to_csv(filename, index=False)
+
+    print(f"✓ Saved processed data to '{filename}'")
     return
 
 
diff --git a/03_Sentiment_Analysis.py b/03_Sentiment_Analysis.py
index f8e058b..9427d62 100644
--- a/03_Sentiment_Analysis.py
+++ b/03_Sentiment_Analysis.py
@@ -10,14 +10,46 @@ def _():
     import pandas as pd
     from pathlib import Path
 
-    TAGUETTE_EXPORT_DIR = Path('./data/transcripts/taguette_results')
+    INPUT_DIR = Path("./data/processing/02_taguette_postprocess")
     WORKING_DIR = Path('./data/processing/03_sentiment_analysis')
 
     if not WORKING_DIR.exists():
         WORKING_DIR.mkdir(parents=True)
-    if not TAGUETTE_EXPORT_DIR.exists():
-        TAGUETTE_EXPORT_DIR.mkdir(parents=True)
-    return WORKING_DIR, mo, pd
+
+    return INPUT_DIR, Path, WORKING_DIR, mo, pd
+
+
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""
+    # Load Sentiment CSV
+    """)
+    return
+
+
+@app.cell
+def _(INPUT_DIR, mo):
+    # Path.glob() order is arbitrary, so sort to keep the dropdown entries stable.
+    csv_files = sorted(INPUT_DIR.glob("*.csv"))
+    file_options = {f.stem: str(f) for f in csv_files}  # shown label -> path string
+    sentiment_csv = mo.ui.dropdown(
+        options=file_options,
+        label="Select Sentiment CSV File",
+        full_width=True
+    )
+    sentiment_csv
+    return (sentiment_csv,)
+
+
+@app.cell
+def _(Path, mo, pd, sentiment_csv):
+    # No default is passed to the dropdown, so halt here until a file is chosen.
+    mo.stop(sentiment_csv.value is None, mo.md("Select a sentiment CSV file above."))
+    _name_parts = Path(sentiment_csv.value).stem.split('_')
+    doc, timestamp = _name_parts[0], _name_parts[-1]  # first / last "_"-separated token of the stem
+    sentiment_df = pd.read_csv(sentiment_csv.value)
+    sentiment_df
+    return doc, sentiment_df, timestamp
 
 
 @app.cell(hide_code=True)
@@ -31,10 +63,10 @@ def _(mo):
 
 
 @app.cell
-def _(pd):
+def _(document_name, pd):
     import numpy as np
 
-    def create_sentiment_matrix(df, document_name, column_prefix='VT - |CT - ', row_prefix='_V-|_C-'):
+    def create_sentiment_matrix(doc_df, column_prefix='VT - |CT - ', row_prefix='_V-|_C-'):
         """
         Create a sentiment matrix for a specific document.
 
@@ -45,8 +77,6 @@ def _(pd):
         Returns:
         - DataFrame representing the sentiment matrix
         """
-        # Filter for the specific document
-        doc_df = df[df['document'] == document_name].copy()
 
         # Filter for rows where the tag matches the sentiment prefixes (VT-/CT-)
         sentiment_rows = doc_df[
@@ -74,14 +104,10 @@ def _(pd):
         # Pivot to create the matrix
         matrix = matrix_data.pivot(index='_context', columns='tag', values='sentiment')
 
-        # Fill NaN with 0 (no sentiment data for that combination)
-        matrix = matrix.fillna(0)
-
-        # Convert to integers for cleaner display
-        matrix = matrix.astype(int)
+        # Cells with no sentiment stay NaN (no fillna), so the matrix is not cast
+        # to int here: astype(int) cannot represent NaN.
 
         return matrix
-
     return (create_sentiment_matrix,)
 
 
@@ -99,31 +125,28 @@ def _(mo):
 
 
 @app.cell
-def _(WORKING_DIR, all_tags_df, create_sentiment_matrix, mo):
+def _(create_sentiment_matrix, sentiment_df):
+    voice_matrix = create_sentiment_matrix(sentiment_df, column_prefix='VT - ', row_prefix='_V-')  # pass only the VT / _V- prefixes
+    voice_matrix  # displayed as the cell output
+    return (voice_matrix,)
 
-    # Create matrices for each unique document
-    documents = all_tags_df['document'].unique()
-    matrices = {}
 
-    for doc in documents:
-        print(f"\n{'='*60}")
-        print(f"Document: {doc}")
-        print('='*60)
-        matrix = create_sentiment_matrix(all_tags_df, doc, column_prefix='VT - ', row_prefix='_V-')
-        if not matrix.empty:
-            matrices[doc] = matrix
-            print(matrix)
-        else:
-            print("No matrix data available")
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""
+    SAVE TO CSV
+    """)
+    return
 
-        # Save to CSV
-        timestamp = mo.utils.get_timestamp(short=True)
-        filename = WORKING_DIR / f"{doc.replace(' ', '_')}_voice_theme_matrix_{timestamp}.csv"
-        matrix.to_csv(filename)
-        print(f"Matrix saved to: {filename}")
 
-    # Store matrices in a variable for further analysis
-    matrices
+@app.cell
+def _(WORKING_DIR, doc, timestamp, voice_matrix):
+    # Write the voice matrix to WORKING_DIR, named after the input file's doc and timestamp tokens.
+    voice_filename = WORKING_DIR / f"{doc}_voice_theme_matrix_{timestamp}.csv"
+
+    voice_matrix.to_csv(voice_filename)  # the index is written as well (to_csv default)
+
+    print(f"Saved to '{voice_filename}'")
     return
 
 
@@ -140,6 +163,24 @@ def _(mo):
     return
 
 
+@app.cell
+def _(create_sentiment_matrix, sentiment_df):
+    character_matrix = create_sentiment_matrix(sentiment_df, column_prefix='CT - ', row_prefix='_C-')  # pass only the CT / _C- prefixes
+    character_matrix  # displayed as the cell output
+    return (character_matrix,)
+
+
+@app.cell
+def _(WORKING_DIR, character_matrix, doc, timestamp):
+    # Write the character matrix to WORKING_DIR, named after the input file's doc and timestamp tokens.
+    character_filename = WORKING_DIR / f"{doc}_character_theme_matrix_{timestamp}.csv"
+
+    character_matrix.to_csv(character_filename)  # the index is written as well (to_csv default)
+
+    print(f"Saved to '{character_filename}'")
+    return
+
+
 @app.cell(hide_code=True)
 def _(mo):
     mo.md(r"""
@@ -150,31 +191,5 @@ def _(mo):
     return
 
 
-@app.cell(hide_code=True)
-def _(mo):
-    mo.md(r"""
-    ## Step 1.x: Save Matrices to Files
-
-    Save the matrices to CSV files in the WORKING_DIR for intermediate storage. Include a short timestamp in the filename so we can track runs.
-    """)
-    return
-
-
-@app.cell
-def _():
-    # Save the matrices to CSV files in the WORKING_DIR for intermediate storage. Include a short timestamp in the filename so we can track runs.
-    return
-
-
-@app.cell(hide_code=True)
-def _(mo):
-    mo.md(r"""
-    # Phase 2: Overall Results
-
-    Aggregate results of all the interviews into master matrices.
-    """)
-    return
-
-
 if __name__ == "__main__":
     app.run()
diff --git a/04_Sentiment_Aggregation.py b/04_Sentiment_Aggregation.py
new file mode 100644
index 0000000..3b99d8b
--- /dev/null
+++ b/04_Sentiment_Aggregation.py
@@ -0,0 +1,86 @@
+import marimo
+
+__generated_with = "0.18.3"
+app = marimo.App(width="medium")
+
+
+@app.cell
+def _():
+    import marimo as mo
+    import pandas as pd
+    from pathlib import Path
+
+    INPUT_DIR = Path("./data/processing/03_sentiment_analysis")
+    WORKING_DIR = Path('./data/processing/04_sentiment_aggregation')
+
+    # exist_ok=True makes this idempotent without a separate exists() check.
+    WORKING_DIR.mkdir(parents=True, exist_ok=True)
+    return INPUT_DIR, mo, pd
+
+
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""
+    # Voices
+    """)
+    return
+
+
+@app.cell
+def _(INPUT_DIR, mo):
+    voice_csv_files = sorted(INPUT_DIR.glob("*voice*.csv"))  # glob order is arbitrary; sort for a stable list
+    file_options = {f.stem: str(f) for f in voice_csv_files}  # shown label -> path string
+
+    voice_multiselect = mo.ui.multiselect(options=file_options, label="Select Voice CSV Files for Aggregation")
+    voice_multiselect
+    return (voice_multiselect,)
+
+
+@app.cell
+def _(mo, voice_multiselect):
+    mo.hstack([voice_multiselect, mo.md(f"Has value: {voice_multiselect.value}")])  # widget next to its current selection
+    return
+
+
+@app.cell
+def _(pd, voice_multiselect):
+    # Sum the selected voice matrices element-wise into one master matrix.
+    KEY_COL = "_context"
+
+    def _read_voice_csv(path: str) -> pd.DataFrame:
+        """Load one matrix keyed by KEY_COL; non-numeric cells become NaN."""
+        frame = pd.read_csv(path).set_index(KEY_COL)
+        return frame.apply(pd.to_numeric, errors="coerce")
+
+    def aggregate_voice_data(files: list[str]) -> pd.DataFrame:
+        """Return the element-wise sum of the matrices in *files* (empty frame if none)."""
+        if not files:
+            return pd.DataFrame()
+        master = _read_voice_csv(files[0])
+        for path in files[1:]:
+            # fill_value=0: a cell missing on one side counts as 0; missing on both stays NaN
+            master = master.add(_read_voice_csv(path), fill_value=0)
+        return master.reset_index()
+
+    master_df = aggregate_voice_data(voice_multiselect.value)
+    master_df
+    return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""
+    # Characters
+    """)
+    return
+
+
+@app.cell
+def _(INPUT_DIR):
+    char_csv_files = sorted(INPUT_DIR.glob("*character*.csv"))  # glob order is arbitrary; sort for stable output
+    char_csv_files
+    return
+
+
+if __name__ == "__main__":
+    app.run()