Aggregation step

This commit is contained in:
2025-12-09 22:33:51 +01:00
parent 821fa01edb
commit 7f951d9ee5
4 changed files with 165 additions and 62 deletions

View File

@@ -10,14 +10,46 @@ def _():
import pandas as pd
from pathlib import Path
TAGUETTE_EXPORT_DIR = Path('./data/transcripts/taguette_results')
INPUT_DIR = Path("./data/processing/02_taguette_postprocess")
WORKING_DIR = Path('./data/processing/03_sentiment_analysis')
if not WORKING_DIR.exists():
WORKING_DIR.mkdir(parents=True)
if not TAGUETTE_EXPORT_DIR.exists():
TAGUETTE_EXPORT_DIR.mkdir(parents=True)
return WORKING_DIR, mo, pd
return INPUT_DIR, Path, WORKING_DIR, mo, pd
@app.cell(hide_code=True)
def _(mo):
    # Markdown section heading introducing the CSV-loading cells.
    mo.md(r"""
# Load Sentiment CSV
""")
    return
@app.cell
def _(INPUT_DIR, mo):
    # Offer every CSV found directly inside INPUT_DIR as a dropdown choice.
    # Path.glob() makes no ordering promise, so the matches are sorted to keep
    # the option order stable between runs and machines.
    csv_files = sorted(INPUT_DIR.glob("*.csv"))
    # Keys (file stems) are the option names; values are the full path strings.
    file_options = {f.stem: str(f) for f in csv_files}

    sentiment_csv = mo.ui.dropdown(
        options=file_options,
        label="Select Sentiment CSV File",
        full_width=True
    )
    # Bare expression: rendered as this cell's output.
    sentiment_csv
    return (sentiment_csv,)
@app.cell
def _(Path, mo, pd, sentiment_csv):
    # The dropdown has no value until a file is picked; halt this cell (and
    # its dependents) with a hint instead of letting Path(None) raise a
    # TypeError.
    mo.stop(
        sentiment_csv.value is None,
        mo.md("Select a sentiment CSV file above to continue."),
    )

    # NOTE(review): assumes file names look like `<doc>_..._<timestamp>.csv`
    # with no underscore inside the document part - confirm against the step
    # that writes the files in INPUT_DIR.
    input_csv_name = Path(sentiment_csv.value).stem
    timestamp = input_csv_name.split('_')[-1]
    doc = input_csv_name.split('_')[0]

    sentiment_df = pd.read_csv(sentiment_csv.value)
    # Bare expression: rendered as this cell's output.
    sentiment_df
    return doc, sentiment_df, timestamp
@app.cell(hide_code=True)
@@ -31,10 +63,10 @@ def _(mo):
@app.cell
def _(pd):
def _(document_name, pd):
import numpy as np
def create_sentiment_matrix(df, document_name, column_prefix='VT - |CT - ', row_prefix='_V-|_C-'):
def create_sentiment_matrix(doc_df, column_prefix='VT - |CT - ', row_prefix='_V-|_C-'):
"""
Create a sentiment matrix for a specific document.
@@ -45,8 +77,6 @@ def _(pd):
Returns:
- DataFrame representing the sentiment matrix
"""
# Filter for the specific document
doc_df = df[df['document'] == document_name].copy()
# Filter for rows where the tag matches the sentiment prefixes (VT-/CT-)
sentiment_rows = doc_df[
@@ -74,14 +104,10 @@ def _(pd):
# Pivot to create the matrix
matrix = matrix_data.pivot(index='_context', columns='tag', values='sentiment')
# Fill NaN with 0 (no sentiment data for that combination)
matrix = matrix.fillna(0)
# Convert to integers for cleaner display
matrix = matrix.astype(int)
# # Convert to integers for cleaner display
# matrix = matrix.astype(int)
return matrix
return (create_sentiment_matrix,)
@@ -99,31 +125,28 @@ def _(mo):
@app.cell
def _(WORKING_DIR, all_tags_df, create_sentiment_matrix, mo):
def _(create_sentiment_matrix, sentiment_df):
voice_matrix = create_sentiment_matrix(sentiment_df, column_prefix='VT - ', row_prefix='_V-')
voice_matrix
return (voice_matrix,)
# Create matrices for each unique document
documents = all_tags_df['document'].unique()
matrices = {}
for doc in documents:
print(f"\n{'='*60}")
print(f"Document: {doc}")
print('='*60)
matrix = create_sentiment_matrix(all_tags_df, doc, column_prefix='VT - ', row_prefix='_V-')
if not matrix.empty:
matrices[doc] = matrix
print(matrix)
else:
print("No matrix data available")
@app.cell(hide_code=True)
def _(mo):
    # Markdown label introducing the save-to-CSV step.
    mo.md(r"""
SAVE TO CSV
""")
    return
# Save to CSV
timestamp = mo.utils.get_timestamp(short=True)
filename = WORKING_DIR / f"{doc.replace(' ', '_')}_voice_theme_matrix_{timestamp}.csv"
matrix.to_csv(filename)
print(f"Matrix saved to: {filename}")
# Store matrices in a variable for further analysis
matrices
@app.cell
def _(WORKING_DIR, doc, timestamp, voice_matrix):
    # Save to CSV
    # The output name is built from the `doc` and `timestamp` values passed
    # into this cell, and the file is written inside WORKING_DIR.
    voice_filename = WORKING_DIR / f"{doc}_voice_theme_matrix_{timestamp}.csv"
    voice_matrix.to_csv(voice_filename)
    print(f"Saved to '{voice_filename}'")
    return
@@ -140,6 +163,24 @@ def _(mo):
return
@app.cell
def _(create_sentiment_matrix, sentiment_df):
    # Build the matrix from sentiment_df using the 'CT - ' column prefix and
    # the '_C-' row prefix.
    character_matrix = create_sentiment_matrix(sentiment_df, column_prefix='CT - ', row_prefix='_C-')
    # Bare expression: rendered as this cell's output.
    character_matrix
    return (character_matrix,)
@app.cell
def _(WORKING_DIR, character_matrix, doc, timestamp):
    # Save to CSV
    # The output name is built from the `doc` and `timestamp` values passed
    # into this cell, and the file is written inside WORKING_DIR.
    character_filename = WORKING_DIR / f"{doc}_character_theme_matrix_{timestamp}.csv"
    character_matrix.to_csv(character_filename)
    print(f"Saved to '{character_filename}'")
    return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
@@ -150,31 +191,5 @@ def _(mo):
return
@app.cell(hide_code=True)
def _(mo):
    # Markdown text describing the step that saves matrices to WORKING_DIR.
    mo.md(r"""
## Step 1.x: Save Matrices to Files
Save the matrices to CSV files in the WORKING_DIR for intermediate storage. Include a short timestamp in the filename so we can track runs.
""")
    return
@app.cell
def _():
    # Placeholder cell: it contains no executable code, only the note below.
    # Save the matrices to CSV files in the WORKING_DIR for intermediate storage. Include a short timestamp in the filename so we can track runs.
    return
@app.cell(hide_code=True)
def _(mo):
    # Markdown heading and description for the aggregation phase.
    mo.md(r"""
# Phase 2: Overall Results
Aggregate results of all the interviews into master matrices.
""")
    return
# Call app.run() only when this file is executed directly as a script.
if __name__ == "__main__":
    app.run()