From 821fa01edbe69cae8dcfc9a800f1425068286227 Mon Sep 17 00:00:00 2001
From: Luigi Maiorano <luigi.maiorano@qumo.io>
Date: Tue, 9 Dec 2025 21:40:54 +0100
Subject: [PATCH] Save sentiments to intermediate CSV

---
 02_Taguette_Post-Process.py | 119 +++++++++++++++++++++---------------
 1 file changed, 69 insertions(+), 50 deletions(-)

diff --git a/02_Taguette_Post-Process.py b/02_Taguette_Post-Process.py
index ba418b2..47c5d6c 100644
--- a/02_Taguette_Post-Process.py
+++ b/02_Taguette_Post-Process.py
@@ -9,6 +9,7 @@ def _():
     import marimo as mo
     import pandas as pd
     from pathlib import Path
+    from datetime import datetime
 
     TAGUETTE_EXPORT_DIR = Path('./data/transcripts/taguette_results')
     WORKING_DIR = Path('./data/processing/02_taguette_postprocess')
@@ -17,7 +18,7 @@ def _():
         WORKING_DIR.mkdir(parents=True)
     if not TAGUETTE_EXPORT_DIR.exists():
         TAGUETTE_EXPORT_DIR.mkdir(parents=True)
-    return TAGUETTE_EXPORT_DIR, mo, pd
+    return TAGUETTE_EXPORT_DIR, WORKING_DIR, datetime, mo, pd
 
 
 @app.cell(hide_code=True)
@@ -61,19 +62,19 @@ def _(mo):
 @app.cell
 def _(all_tags_df, mo):
 
-    file_dropdown = mo.ui.dropdown(
+    interview_select = mo.ui.dropdown(
         options=all_tags_df['document'].unique().tolist(),
         label="Select Interview to Process",
         full_width=True
     )
-    file_dropdown
-    return (file_dropdown,)
+    interview_select
+    return (interview_select,)
 
 
 @app.cell
-def _(all_tags_df, file_dropdown):
-    # filter all_tags_df to only the document = file_dropdown.value
-    df = all_tags_df.loc[all_tags_df['document'] == file_dropdown.value].copy()
+def _(all_tags_df, interview_select):
+    # Keep only the rows of all_tags_df whose document equals interview_select.value
+    df = all_tags_df.loc[all_tags_df['document'] == interview_select.value].copy()
     return (df,)
 
 
@@ -139,7 +140,6 @@ def _(df):
             # Assign the context to all rows in this highlight
             df.loc[df['id'] == highlight_id, '_context'] = context_tag
 
-    del idx
     df
     return
 
@@ -188,7 +188,12 @@ def _(df, pd):
 
     expanded_df_raw = pd.DataFrame(expanded_rows).reset_index(drop=True)
 
-    manual_rows = expanded_df_raw[expanded_df_raw['manual_analysis']]
+
+    sentiment_df = expanded_df_raw.loc[
+        expanded_df_raw['tag'].str.startswith(('VT -', 'CT -'), na=False)
+    ].copy()
+
+    manual_rows = sentiment_df[sentiment_df['manual_analysis']]
     if not manual_rows.empty:
         print(
             f"⚠️  {len(manual_rows)} rows were created from multi-context splits. "
@@ -196,15 +201,14 @@ def _(df, pd):
         )
     else:
         print("✓ No multi-context rows found")
-    return (expanded_df_raw,)
+    return (sentiment_df,)
 
 
 @app.cell
-def _(expanded_df_raw, mo):
+def _(mo, sentiment_df):
     # Filter for rows that need review. Manual analysis and the tag starts with 'VT -' or 'CT -'
-    rows_to_edit = expanded_df_raw[
-        (expanded_df_raw['manual_analysis'])
-        & (expanded_df_raw['tag'].str.startswith(('VT -', 'CT -'), na=False))
+    rows_to_edit = sentiment_df[
+        (sentiment_df['manual_analysis'])
     ]
 
     # Create data editor for split rows
@@ -232,43 +236,22 @@ def _(mo, rows_to_edit, split_rows_editor):
 
 
 @app.cell
-def _(expanded_df_raw, mo, pd, split_rows_editor):
-    # Reconstruct the full dataframe using the editor's current value
-    # This will update whenever the user edits the table
+def _(mo, split_rows_editor):
+    # Capture the edited manual-analysis rows for validation
     mo.stop(split_rows_editor.value is None, mo.md("Submit your changes."))
-    _edited_rows = split_rows_editor.value
-    _static_rows = expanded_df_raw[~expanded_df_raw['manual_analysis']]
-    expanded_df2 = pd.concat([_static_rows, _edited_rows]).sort_index()
-    return (expanded_df2,)
+    reviewed_manual_rows = split_rows_editor.value
 
+    # Halt this cell (via mo.stop) unless every manual-analysis row has a sentiment of -1, 0, or 1
+    if not reviewed_manual_rows.empty:
+        valid_sentiments = {-1, 0, 1}
+        needs_review = reviewed_manual_rows[
+            reviewed_manual_rows['manual_analysis']
+            & ~reviewed_manual_rows['sentiment'].isin(valid_sentiments)
+        ]
+        mo.stop(not needs_review.empty, mo.md(f"{len(needs_review)} manual-analysis rows missing sentiment -1/0/1"))
 
-@app.cell
-def _(expanded_df2, pd):
-    # Verify no rows have multiple contexts
-    try:
-        has_comma = expanded_df2['_context'].apply(lambda x: ',' in str(x) if pd.notna(x) else False)
-        assert not has_comma.any(), "Some rows still have multiple contexts (comma-separated)"
-
-        # Verify that rows still marked for manual analysis have sentiment values
-        manual_sent_rows = expanded_df2[expanded_df2['manual_analysis']]
-        theme_rows = manual_sent_rows[manual_sent_rows['tag'].str.startswith(('VT -', 'CT -'), na=False)]
-        missing_sentiment = theme_rows[theme_rows['sentiment'].isna()]
-
-        assert missing_sentiment.empty, (
-            f"{len(missing_sentiment)} rows marked for manual analysis "
-            "have missing sentiment values"
-        )
-
-        print("\n✓ Verification passed: Manual-analysis rows are consistent")
-
-        expanded_df_final = expanded_df2
-
-        expanded_df_final
-
-    except AssertionError as e:
-        print(f"\n❌ Verification failed: {e}")
-        print("Please review the data before proceeding")
-    return
+    print("✓ Manual-analysis rows have valid sentiment values")
+    return (reviewed_manual_rows,)
 
 
 @app.cell(hide_code=True)
@@ -287,7 +270,7 @@ def _(mo):
 
 
 @app.cell
-def _(df):
+def _(sentiment_df):
     # TODO: Implement sentiment analysis and add 'sentiment' column
 
     # for now, create an empty sentiment column with randomized dummy values for testing
@@ -299,12 +282,31 @@ def _(df):
             return random.choice([-1, 0, 1])  # Random sentiment for testing
         return None
 
-    df['sentiment'] = df.apply(lambda row: dummy_sentiment_analysis(row['content'], row['tag']), axis=1)
+    # Only run on rows without manual_analysis
 
-    df
+    sentiment_df['sentiment'] = sentiment_df[~sentiment_df['manual_analysis']].apply(lambda row: dummy_sentiment_analysis(row['content'], row['tag']), axis=1)
+
+    sentiment_df[~sentiment_df['manual_analysis']]
     return
 
 
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""
+    ## Recombine
+    """)
+    return
+
+
+@app.cell
+def _(pd, reviewed_manual_rows, sentiment_df):
+    # Merge non-manual rows with the reviewed manual rows; the stable sort keeps rows that tie on _seq_id in concat order
+    _static_analysis_rows = sentiment_df[~sentiment_df['manual_analysis']]
+    recombined_df = pd.concat([_static_analysis_rows, reviewed_manual_rows]).sort_values(by='_seq_id', kind='stable').reset_index(drop=True)
+    recombined_df
+    return (recombined_df,)
+
+
 @app.cell(hide_code=True)
 def _(mo):
     mo.md(r"""
@@ -328,5 +330,22 @@ def _():
     return
 
 
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""
+    # Save to CSV
+    """)
+    return
+
+
+@app.cell
+def _(WORKING_DIR, datetime, interview_select, recombined_df):
+    # Write recombined_df to a timestamped CSV in WORKING_DIR; underscore-prefixed temporaries stay local to this cell
+    _timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
+    _filename = WORKING_DIR / f"{interview_select.value.split(' ')[0]}_sentiments_{_timestamp}.csv"
+    recombined_df.to_csv(_filename, index=False)
+    return
+
+
 if __name__ == "__main__":
     app.run()