From 7f951d9ee5858372eac8e2347a0e7520c2be68b6 Mon Sep 17 00:00:00 2001 From: Luigi Maiorano Date: Tue, 9 Dec 2025 22:33:51 +0100 Subject: [PATCH] Aggregation step --- 01_Taguette-Pre-Process.py | 2 +- 02_Taguette_Post-Process.py | 2 + 03_Sentiment_Analysis.py | 137 ++++++++++++++++++++---------------- 04_Sentiment_Aggregation.py | 86 ++++++++++++++++++++++ 4 files changed, 165 insertions(+), 62 deletions(-) create mode 100644 04_Sentiment_Aggregation.py diff --git a/01_Taguette-Pre-Process.py b/01_Taguette-Pre-Process.py index 6fcea98..db809cb 100644 --- a/01_Taguette-Pre-Process.py +++ b/01_Taguette-Pre-Process.py @@ -1,6 +1,6 @@ import marimo -__generated_with = "0.18.0" +__generated_with = "0.18.3" app = marimo.App(width="medium") diff --git a/02_Taguette_Post-Process.py b/02_Taguette_Post-Process.py index 47c5d6c..704cae0 100644 --- a/02_Taguette_Post-Process.py +++ b/02_Taguette_Post-Process.py @@ -344,6 +344,8 @@ def _(WORKING_DIR, datetime, interview_select, recombined_df): timestamp = datetime.now().strftime("%Y%m%d-%H%M%S") filename = WORKING_DIR / f"{interview_select.value.split(' ')[0]}_sentiments_{timestamp}.csv" recombined_df.to_csv(filename, index=False) + + print(f"✓ Saved processed data to '{filename}'") return diff --git a/03_Sentiment_Analysis.py b/03_Sentiment_Analysis.py index f8e058b..9427d62 100644 --- a/03_Sentiment_Analysis.py +++ b/03_Sentiment_Analysis.py @@ -10,14 +10,46 @@ def _(): import pandas as pd from pathlib import Path - TAGUETTE_EXPORT_DIR = Path('./data/transcripts/taguette_results') + INPUT_DIR = Path("./data/processing/02_taguette_postprocess") WORKING_DIR = Path('./data/processing/03_sentiment_analysis') if not WORKING_DIR.exists(): WORKING_DIR.mkdir(parents=True) - if not TAGUETTE_EXPORT_DIR.exists(): - TAGUETTE_EXPORT_DIR.mkdir(parents=True) - return WORKING_DIR, mo, pd + + return INPUT_DIR, Path, WORKING_DIR, mo, pd + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + # Load Sentiment CSV + """) + return + + +@app.cell +def _(INPUT_DIR, mo): + csv_files = list(INPUT_DIR.glob("*.csv")) + file_options = {f.stem: str(f) for f in csv_files} + + sentiment_csv = mo.ui.dropdown( + options=file_options, + label="Select Sentiment CSV File", + full_width=True + ) + sentiment_csv + return (sentiment_csv,) + + +@app.cell +def _(Path, pd, sentiment_csv): + input_csv_name = Path(sentiment_csv.value).stem + timestamp = input_csv_name.split('_')[-1] + doc = input_csv_name.split('_')[0] + + sentiment_df = pd.read_csv(sentiment_csv.value) + sentiment_df + return doc, sentiment_df, timestamp @app.cell(hide_code=True) @@ -31,10 +63,10 @@ def _(mo): @app.cell -def _(pd): +def _(document_name, pd): import numpy as np - def create_sentiment_matrix(df, document_name, column_prefix='VT - |CT - ', row_prefix='_V-|_C-'): + def create_sentiment_matrix(doc_df, column_prefix='VT - |CT - ', row_prefix='_V-|_C-'): """ Create a sentiment matrix for a specific document. @@ -45,8 +77,6 @@ def _(pd): Returns: - DataFrame representing the sentiment matrix """ - # Filter for the specific document - doc_df = df[df['document'] == document_name].copy() # Filter for rows where the tag matches the sentiment prefixes (VT-/CT-) sentiment_rows = doc_df[ @@ -74,14 +104,10 @@ def _(pd): # Pivot to create the matrix matrix = matrix_data.pivot(index='_context', columns='tag', values='sentiment') - # Fill NaN with 0 (no sentiment data for that combination) - matrix = matrix.fillna(0) - - # Convert to integers for cleaner display - matrix = matrix.astype(int) + # # Convert to integers for cleaner display + # matrix = matrix.astype(int) return matrix - return (create_sentiment_matrix,) @@ -99,31 +125,28 @@ def _(mo): @app.cell -def _(WORKING_DIR, all_tags_df, create_sentiment_matrix, mo): +def _(create_sentiment_matrix, sentiment_df): + voice_matrix = create_sentiment_matrix(sentiment_df, column_prefix='VT - ', row_prefix='_V-') + voice_matrix + return (voice_matrix,) - # Create matrices for each unique document - documents = all_tags_df['document'].unique() - matrices = {} - for doc in documents: - print(f"\n{'='*60}") - print(f"Document: {doc}") - print('='*60) - matrix = create_sentiment_matrix(all_tags_df, doc, column_prefix='VT - ', row_prefix='_V-') - if not matrix.empty: - matrices[doc] = matrix - print(matrix) - else: - print("No matrix data available") +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + SAVE TO CSV + """) + return - # Save to CSV - timestamp = mo.utils.get_timestamp(short=True) - filename = WORKING_DIR / f"{doc.replace(' ', '_')}_voice_theme_matrix_{timestamp}.csv" - matrix.to_csv(filename) - print(f"Matrix saved to: {filename}") - # Store matrices in a variable for further analysis - matrices +@app.cell +def _(WORKING_DIR, doc, timestamp, voice_matrix): + # Save to CSV + voice_filename = WORKING_DIR / f"{doc}_voice_theme_matrix_{timestamp}.csv" + + voice_matrix.to_csv(voice_filename) + + print(f"Saved to '{voice_filename}'") return @@ -140,6 +163,24 @@ def _(mo): return +@app.cell +def _(create_sentiment_matrix, sentiment_df): + character_matrix = create_sentiment_matrix(sentiment_df, column_prefix='CT - ', row_prefix='_C-') + character_matrix + return (character_matrix,) + + +@app.cell +def _(WORKING_DIR, character_matrix, doc, timestamp): + # Save to CSV + character_filename = WORKING_DIR / f"{doc}_character_theme_matrix_{timestamp}.csv" + + character_matrix.to_csv(character_filename) + + print(f"Saved to '{character_filename}'") + return + + @app.cell(hide_code=True) def _(mo): mo.md(r""" @@ -150,31 +191,5 @@ def _(mo): return -@app.cell(hide_code=True) -def _(mo): - mo.md(r""" - ## Step 1.x: Save Matrices to Files - - Save the matrices to CSV files in the WORKING_DIR for intermediate storage. Include a short timestamp in the filename so we can track runs. - """) - return - - -@app.cell -def _(): - # Save the matrices to CSV files in the WORKING_DIR for intermediate storage. Include a short timestamp in the filename so we can track runs. - return - - -@app.cell(hide_code=True) -def _(mo): - mo.md(r""" - # Phase 2: Overall Results - - Aggregate results of all the interviews into master matrices. - """) - return - - if __name__ == "__main__": app.run() diff --git a/04_Sentiment_Aggregation.py b/04_Sentiment_Aggregation.py new file mode 100644 index 0000000..3b99d8b --- /dev/null +++ b/04_Sentiment_Aggregation.py @@ -0,0 +1,86 @@ +import marimo + +__generated_with = "0.18.3" +app = marimo.App(width="medium") + + +@app.cell +def _(): + import marimo as mo + import pandas as pd + from pathlib import Path + + INPUT_DIR = Path("./data/processing/03_sentiment_analysis") + WORKING_DIR = Path('./data/processing/04_sentiment_aggregation') + + if not WORKING_DIR.exists(): + WORKING_DIR.mkdir(parents=True) + return INPUT_DIR, mo, pd + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + # Voices + """) + return + + +@app.cell +def _(INPUT_DIR, mo): + voice_csv_files = list(INPUT_DIR.glob("*voice*.csv")) + file_options = {f.stem: str(f) for f in voice_csv_files} + + voice_multiselect = mo.ui.multiselect(options=file_options, label="Select Voice CSV Files for Aggregation") + voice_multiselect + return (voice_multiselect,) + + +@app.cell +def _(mo, voice_multiselect): + mo.hstack([voice_multiselect, mo.md(f"Has value: {voice_multiselect.value}")]) + return + + +@app.cell +def _(pd, voice_multiselect): + # Load all voice CSV files and aggregate them so that each row-column pair is summed + KEY_COL = "_context" + + def _read_voice_csv(path: str) -> pd.DataFrame: + df = pd.read_csv(path).set_index(KEY_COL) + df = df.apply(pd.to_numeric, errors="coerce") + return df + + def aggregate_voice_data(files: list[str]) -> pd.DataFrame: + if not files: + return pd.DataFrame() + + master = _read_voice_csv(files[0]) + for path in files[1:]: + master = master.add(_read_voice_csv(path), fill_value=0) + + return master.reset_index() + + master_df = aggregate_voice_data(voice_multiselect.value) + master_df + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + # Characters + """) + return + + +@app.cell +def _(INPUT_DIR): + char_csv_files = list(INPUT_DIR.glob("*character*.csv")) + char_csv_files + return + + +if __name__ == "__main__": + app.run()