From 821fa01edbe69cae8dcfc9a800f1425068286227 Mon Sep 17 00:00:00 2001 From: Luigi Maiorano Date: Tue, 9 Dec 2025 21:40:54 +0100 Subject: [PATCH] sentiments saving to intermediate csv --- 02_Taguette_Post-Process.py | 119 +++++++++++++++++++++--------------- 1 file changed, 69 insertions(+), 50 deletions(-) diff --git a/02_Taguette_Post-Process.py b/02_Taguette_Post-Process.py index ba418b2..47c5d6c 100644 --- a/02_Taguette_Post-Process.py +++ b/02_Taguette_Post-Process.py @@ -9,6 +9,7 @@ def _(): import marimo as mo import pandas as pd from pathlib import Path + from datetime import datetime TAGUETTE_EXPORT_DIR = Path('./data/transcripts/taguette_results') WORKING_DIR = Path('./data/processing/02_taguette_postprocess') @@ -17,7 +18,7 @@ def _(): WORKING_DIR.mkdir(parents=True) if not TAGUETTE_EXPORT_DIR.exists(): TAGUETTE_EXPORT_DIR.mkdir(parents=True) - return TAGUETTE_EXPORT_DIR, mo, pd + return TAGUETTE_EXPORT_DIR, WORKING_DIR, datetime, mo, pd @app.cell(hide_code=True) @@ -61,19 +62,19 @@ def _(mo): @app.cell def _(all_tags_df, mo): - file_dropdown = mo.ui.dropdown( + interview_select = mo.ui.dropdown( options=all_tags_df['document'].unique().tolist(), label="Select Interview to Process", full_width=True ) - file_dropdown - return (file_dropdown,) + interview_select + return (interview_select,) @app.cell -def _(all_tags_df, file_dropdown): +def _(all_tags_df, interview_select): # filter all_tags_df to only the document = file_dropdown.value - df = all_tags_df.loc[all_tags_df['document'] == file_dropdown.value].copy() + df = all_tags_df.loc[all_tags_df['document'] == interview_select.value].copy() return (df,) @@ -139,7 +140,6 @@ def _(df): # Assign the context to all rows in this highlight df.loc[df['id'] == highlight_id, '_context'] = context_tag - del idx df return @@ -188,7 +188,12 @@ def _(df, pd): expanded_df_raw = pd.DataFrame(expanded_rows).reset_index(drop=True) - manual_rows = expanded_df_raw[expanded_df_raw['manual_analysis']] + + sentiment_df = expanded_df_raw.loc[ + expanded_df_raw['tag'].str.startswith(('VT -', 'CT -'), na=False) + ].copy() + + manual_rows = sentiment_df[sentiment_df['manual_analysis']] if not manual_rows.empty: print( f"⚠️ {len(manual_rows)} rows were created from multi-context splits. " @@ -196,15 +201,14 @@ def _(df, pd): ) else: print("✓ No multi-context rows found") - return (expanded_df_raw,) + return (sentiment_df,) @app.cell -def _(expanded_df_raw, mo): +def _(mo, sentiment_df): # Filter for rows that need review. Manual analysis and the tag starts with 'VT -' or 'CT -' - rows_to_edit = expanded_df_raw[ - (expanded_df_raw['manual_analysis']) - & (expanded_df_raw['tag'].str.startswith(('VT -', 'CT -'), na=False)) + rows_to_edit = sentiment_df[ + (sentiment_df['manual_analysis']) ] # Create data editor for split rows @@ -232,43 +236,22 @@ def _(mo, rows_to_edit, split_rows_editor): @app.cell -def _(expanded_df_raw, mo, pd, split_rows_editor): - # Reconstruct the full dataframe using the editor's current value - # This will update whenever the user edits the table +def _(mo, split_rows_editor): + # Capture the edited manual-analysis rows for validation mo.stop(split_rows_editor.value is None, mo.md("Submit your changes.")) - _edited_rows = split_rows_editor.value - _static_rows = expanded_df_raw[~expanded_df_raw['manual_analysis']] - expanded_df2 = pd.concat([_static_rows, _edited_rows]).sort_index() - return (expanded_df2,) + reviewed_manual_rows = split_rows_editor.value + # Ensure all manual-analysis rows include a sentiment of -1, 0, or 1 + if not reviewed_manual_rows.empty: + valid_sentiments = {-1, 0, 1} + needs_review = reviewed_manual_rows[ + reviewed_manual_rows['manual_analysis'] + & ~reviewed_manual_rows['sentiment'].isin(valid_sentiments) + ] + assert needs_review.empty, f"{len(needs_review)} manual-analysis rows missing sentiment -1/0/1" -@app.cell -def _(expanded_df2, pd): - # Verify no rows have multiple contexts - try: - has_comma = expanded_df2['_context'].apply(lambda x: ',' in str(x) if pd.notna(x) else False) - assert not has_comma.any(), "Some rows still have multiple contexts (comma-separated)" - - # Verify that rows still marked for manual analysis have sentiment values - manual_sent_rows = expanded_df2[expanded_df2['manual_analysis']] - theme_rows = manual_sent_rows[manual_sent_rows['tag'].str.startswith(('VT -', 'CT -'), na=False)] - missing_sentiment = theme_rows[theme_rows['sentiment'].isna()] - - assert missing_sentiment.empty, ( - f"{len(missing_sentiment)} rows marked for manual analysis " - "have missing sentiment values" - ) - - print("\n✓ Verification passed: Manual-analysis rows are consistent") - - expanded_df_final = expanded_df2 - - expanded_df_final - - except AssertionError as e: - print(f"\n❌ Verification failed: {e}") - print("Please review the data before proceeding") - return + print("✓ Manual-analysis rows have valid sentiment values") + return (reviewed_manual_rows,) @app.cell(hide_code=True) @@ -287,7 +270,7 @@ def _(mo): @app.cell -def _(df): +def _(sentiment_df): # TODO: Implement sentiment analysis and add 'sentiment' column # for now, create an empty sentiment column with randomized dummy values for testing @@ -299,12 +282,31 @@ def _(df): return random.choice([-1, 0, 1]) # Random sentiment for testing return None - df['sentiment'] = df.apply(lambda row: dummy_sentiment_analysis(row['content'], row['tag']), axis=1) + # Only run on rows without manual_analysis - df + sentiment_df['sentiment'] = sentiment_df[~sentiment_df['manual_analysis']].apply(lambda row: dummy_sentiment_analysis(row['content'], row['tag']), axis=1) + + sentiment_df[~sentiment_df['manual_analysis']] return +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## Recombine + """) + return + + +@app.cell +def _(pd, reviewed_manual_rows, sentiment_df): + _static_analysis_rows = sentiment_df[~sentiment_df['manual_analysis']] + recombined_df = pd.concat([_static_analysis_rows, reviewed_manual_rows]).sort_values(by='_seq_id').reset_index(drop=True) + + recombined_df + return (recombined_df,) + + @app.cell(hide_code=True) def _(mo): mo.md(r""" @@ -328,5 +330,22 @@ def _(): return +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + # Save to CSV + """) + return + + +@app.cell +def _(WORKING_DIR, datetime, interview_select, recombined_df): + # Save to CSV in working dir + timestamp = datetime.now().strftime("%Y%m%d-%H%M%S") + filename = WORKING_DIR / f"{interview_select.value.split(' ')[0]}_sentiments_{timestamp}.csv" + recombined_df.to_csv(filename, index=False) + return + + if __name__ == "__main__": app.run()