Aggregation step
This commit is contained in:
@@ -1,6 +1,6 @@
|
|||||||
import marimo
|
import marimo
|
||||||
|
|
||||||
__generated_with = "0.18.0"
|
__generated_with = "0.18.3"
|
||||||
app = marimo.App(width="medium")
|
app = marimo.App(width="medium")
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -344,6 +344,8 @@ def _(WORKING_DIR, datetime, interview_select, recombined_df):
|
|||||||
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
|
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
|
||||||
filename = WORKING_DIR / f"{interview_select.value.split(' ')[0]}_sentiments_{timestamp}.csv"
|
filename = WORKING_DIR / f"{interview_select.value.split(' ')[0]}_sentiments_{timestamp}.csv"
|
||||||
recombined_df.to_csv(filename, index=False)
|
recombined_df.to_csv(filename, index=False)
|
||||||
|
|
||||||
|
print(f"✓ Saved processed data to '{filename}'")
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -10,14 +10,46 @@ def _():
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
TAGUETTE_EXPORT_DIR = Path('./data/transcripts/taguette_results')
|
INPUT_DIR = Path("./data/processing/02_taguette_postprocess")
|
||||||
WORKING_DIR = Path('./data/processing/03_sentiment_analysis')
|
WORKING_DIR = Path('./data/processing/03_sentiment_analysis')
|
||||||
|
|
||||||
if not WORKING_DIR.exists():
|
if not WORKING_DIR.exists():
|
||||||
WORKING_DIR.mkdir(parents=True)
|
WORKING_DIR.mkdir(parents=True)
|
||||||
if not TAGUETTE_EXPORT_DIR.exists():
|
|
||||||
TAGUETTE_EXPORT_DIR.mkdir(parents=True)
|
return INPUT_DIR, Path, WORKING_DIR, mo, pd
|
||||||
return WORKING_DIR, mo, pd
|
|
||||||
|
|
||||||
|
@app.cell(hide_code=True)
|
||||||
|
def _(mo):
|
||||||
|
mo.md(r"""
|
||||||
|
# Load Sentiment CSV
|
||||||
|
""")
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _(INPUT_DIR, mo):
    # Offer every CSV found in INPUT_DIR for selection. Directory listings come
    # back in arbitrary order, so sort to keep the dropdown stable between runs.
    csv_files = sorted(INPUT_DIR.glob("*.csv"))
    # Dropdown label (file stem) -> value (path string) read by downstream cells.
    file_options = {f.stem: str(f) for f in csv_files}

    sentiment_csv = mo.ui.dropdown(
        options=file_options,
        label="Select Sentiment CSV File",
        full_width=True
    )
    sentiment_csv
    return (sentiment_csv,)
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _(Path, mo, pd, sentiment_csv):
    # The dropdown has no value until a file is picked. Halt this cell (and the
    # cells that depend on it) with a hint instead of failing in Path(None).
    mo.stop(
        sentiment_csv.value is None,
        mo.md("Select a sentiment CSV file to continue."),
    )

    input_csv_name = Path(sentiment_csv.value).stem

    # NOTE(review): assumes stems look like "<doc>_sentiments_<timestamp>" --
    # confirm against the notebook that writes these files.
    # Split from the right so underscores inside <doc> are preserved; the last
    # piece is the timestamp, the first piece is the document name.
    _name_parts = input_csv_name.rsplit('_', 2)
    timestamp = _name_parts[-1]
    doc = _name_parts[0]

    sentiment_df = pd.read_csv(sentiment_csv.value)
    sentiment_df
    return doc, sentiment_df, timestamp
|
||||||
|
|
||||||
|
|
||||||
@app.cell(hide_code=True)
|
@app.cell(hide_code=True)
|
||||||
@@ -31,10 +63,10 @@ def _(mo):
|
|||||||
|
|
||||||
|
|
||||||
@app.cell
|
@app.cell
|
||||||
def _(pd):
|
def _(document_name, pd):
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
def create_sentiment_matrix(df, document_name, column_prefix='VT - |CT - ', row_prefix='_V-|_C-'):
|
def create_sentiment_matrix(doc_df, column_prefix='VT - |CT - ', row_prefix='_V-|_C-'):
|
||||||
"""
|
"""
|
||||||
Create a sentiment matrix for a specific document.
|
Create a sentiment matrix for a specific document.
|
||||||
|
|
||||||
@@ -45,8 +77,6 @@ def _(pd):
|
|||||||
Returns:
|
Returns:
|
||||||
- DataFrame representing the sentiment matrix
|
- DataFrame representing the sentiment matrix
|
||||||
"""
|
"""
|
||||||
# Filter for the specific document
|
|
||||||
doc_df = df[df['document'] == document_name].copy()
|
|
||||||
|
|
||||||
# Filter for rows where the tag matches the sentiment prefixes (VT-/CT-)
|
# Filter for rows where the tag matches the sentiment prefixes (VT-/CT-)
|
||||||
sentiment_rows = doc_df[
|
sentiment_rows = doc_df[
|
||||||
@@ -74,14 +104,10 @@ def _(pd):
|
|||||||
# Pivot to create the matrix
|
# Pivot to create the matrix
|
||||||
matrix = matrix_data.pivot(index='_context', columns='tag', values='sentiment')
|
matrix = matrix_data.pivot(index='_context', columns='tag', values='sentiment')
|
||||||
|
|
||||||
# Fill NaN with 0 (no sentiment data for that combination)
|
# # Convert to integers for cleaner display
|
||||||
matrix = matrix.fillna(0)
|
# matrix = matrix.astype(int)
|
||||||
|
|
||||||
# Convert to integers for cleaner display
|
|
||||||
matrix = matrix.astype(int)
|
|
||||||
|
|
||||||
return matrix
|
return matrix
|
||||||
|
|
||||||
return (create_sentiment_matrix,)
|
return (create_sentiment_matrix,)
|
||||||
|
|
||||||
|
|
||||||
@@ -99,31 +125,28 @@ def _(mo):
|
|||||||
|
|
||||||
|
|
||||||
@app.cell
|
@app.cell
|
||||||
def _(WORKING_DIR, all_tags_df, create_sentiment_matrix, mo):
|
def _(create_sentiment_matrix, sentiment_df):
|
||||||
|
voice_matrix = create_sentiment_matrix(sentiment_df, column_prefix='VT - ', row_prefix='_V-')
|
||||||
|
voice_matrix
|
||||||
|
return (voice_matrix,)
|
||||||
|
|
||||||
# Create matrices for each unique document
|
|
||||||
documents = all_tags_df['document'].unique()
|
|
||||||
matrices = {}
|
|
||||||
|
|
||||||
for doc in documents:
|
@app.cell(hide_code=True)
|
||||||
print(f"\n{'='*60}")
|
def _(mo):
|
||||||
print(f"Document: {doc}")
|
mo.md(r"""
|
||||||
print('='*60)
|
SAVE TO CSV
|
||||||
matrix = create_sentiment_matrix(all_tags_df, doc, column_prefix='VT - ', row_prefix='_V-')
|
""")
|
||||||
if not matrix.empty:
|
return
|
||||||
matrices[doc] = matrix
|
|
||||||
print(matrix)
|
|
||||||
else:
|
|
||||||
print("No matrix data available")
|
|
||||||
|
|
||||||
# Save to CSV
|
|
||||||
timestamp = mo.utils.get_timestamp(short=True)
|
|
||||||
filename = WORKING_DIR / f"{doc.replace(' ', '_')}_voice_theme_matrix_{timestamp}.csv"
|
|
||||||
matrix.to_csv(filename)
|
|
||||||
print(f"Matrix saved to: {filename}")
|
|
||||||
|
|
||||||
# Store matrices in a variable for further analysis
|
@app.cell
|
||||||
matrices
|
def _(WORKING_DIR, doc, timestamp, voice_matrix):
|
||||||
|
# Save to CSV
|
||||||
|
voice_filename = WORKING_DIR / f"{doc}_voice_theme_matrix_{timestamp}.csv"
|
||||||
|
|
||||||
|
voice_matrix.to_csv(voice_filename)
|
||||||
|
|
||||||
|
print(f"Saved to '{voice_filename}'")
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
@@ -140,6 +163,24 @@ def _(mo):
|
|||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _(create_sentiment_matrix, sentiment_df):
    # Build the character matrix by handing create_sentiment_matrix the
    # 'CT - ' column prefix and the '_C-' row prefix, then display it.
    character_matrix = create_sentiment_matrix(
        sentiment_df,
        column_prefix='CT - ',
        row_prefix='_C-',
    )
    character_matrix
    return (character_matrix,)
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _(WORKING_DIR, character_matrix, doc, timestamp):
    # Write the character matrix into WORKING_DIR. The file name is built from
    # the doc and timestamp values supplied by the upstream cell.
    character_filename = WORKING_DIR.joinpath(
        f"{doc}_character_theme_matrix_{timestamp}.csv"
    )
    character_matrix.to_csv(character_filename)
    print(f"Saved to '{character_filename}'")
    return
|
||||||
|
|
||||||
|
|
||||||
@app.cell(hide_code=True)
|
@app.cell(hide_code=True)
|
||||||
def _(mo):
|
def _(mo):
|
||||||
mo.md(r"""
|
mo.md(r"""
|
||||||
@@ -150,31 +191,5 @@ def _(mo):
|
|||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
@app.cell(hide_code=True)
|
|
||||||
def _(mo):
|
|
||||||
mo.md(r"""
|
|
||||||
## Step 1.x: Save Matrices to Files
|
|
||||||
|
|
||||||
Save the matrices to CSV files in the WORKING_DIR for intermediate storage. Include a short timestamp in the filename so we can track runs.
|
|
||||||
""")
|
|
||||||
return
|
|
||||||
|
|
||||||
|
|
||||||
@app.cell
|
|
||||||
def _():
|
|
||||||
# Save the matrices to CSV files in the WORKING_DIR for intermediate storage. Include a short timestamp in the filename so we can track runs.
|
|
||||||
return
|
|
||||||
|
|
||||||
|
|
||||||
@app.cell(hide_code=True)
|
|
||||||
def _(mo):
|
|
||||||
mo.md(r"""
|
|
||||||
# Phase 2: Overall Results
|
|
||||||
|
|
||||||
Aggregate results of all the interviews into master matrices.
|
|
||||||
""")
|
|
||||||
return
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
app.run()
|
app.run()
|
||||||
|
|||||||
86
04_Sentiment_Aggregation.py
Normal file
86
04_Sentiment_Aggregation.py
Normal file
@@ -0,0 +1,86 @@
|
|||||||
|
import marimo
|
||||||
|
|
||||||
|
__generated_with = "0.18.3"
|
||||||
|
app = marimo.App(width="medium")
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _():
    import marimo as mo
    import pandas as pd
    from pathlib import Path

    # INPUT_DIR is globbed by later cells for the per-interview matrix CSVs.
    INPUT_DIR = Path("./data/processing/03_sentiment_analysis")
    WORKING_DIR = Path('./data/processing/04_sentiment_aggregation')

    # exist_ok=True makes creation idempotent without a separate exists()
    # check, which could race with another process creating the directory.
    WORKING_DIR.mkdir(parents=True, exist_ok=True)
    return INPUT_DIR, mo, pd
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell(hide_code=True)
|
||||||
|
def _(mo):
|
||||||
|
mo.md(r"""
|
||||||
|
# Voices
|
||||||
|
""")
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _(INPUT_DIR, mo):
    # Collect the voice matrices in INPUT_DIR. Directory listings come back in
    # arbitrary order, so sort to keep the option list stable between runs.
    voice_csv_files = sorted(INPUT_DIR.glob("*voice*.csv"))
    # Option label (file stem) -> value (path string) consumed by the aggregation cell.
    file_options = {f.stem: str(f) for f in voice_csv_files}

    voice_multiselect = mo.ui.multiselect(options=file_options, label="Select Voice CSV Files for Aggregation")
    voice_multiselect
    return (voice_multiselect,)
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _(mo, voice_multiselect):
    # Show the selector side by side with a readout of its current value.
    _selection_note = mo.md(f"Has value: {voice_multiselect.value}")
    mo.hstack([voice_multiselect, _selection_note])
    return
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _(pd, voice_multiselect):
    # Sum the selected voice matrices cell by cell into one master matrix.
    KEY_COL = "_context"

    def _read_voice_csv(path: str) -> pd.DataFrame:
        """Read one matrix CSV, index it by KEY_COL and coerce every column to numeric.

        Values that cannot be parsed as numbers become NaN (errors="coerce").
        """
        indexed = pd.read_csv(path).set_index(KEY_COL)
        return indexed.apply(pd.to_numeric, errors="coerce")

    def aggregate_voice_data(files: list[str]) -> pd.DataFrame:
        """Add the matrices stored in *files* together, aligned on row and column labels.

        Returns an empty DataFrame when *files* is empty; otherwise the summed
        matrix with KEY_COL restored as a regular column.
        """
        if not files:
            return pd.DataFrame()

        first, *remaining = files
        total = _read_voice_csv(first)
        for csv_path in remaining:
            # fill_value=0 treats a cell missing from one operand as 0; a cell
            # missing from both operands stays NaN.
            total = total.add(_read_voice_csv(csv_path), fill_value=0)

        return total.reset_index()

    master_df = aggregate_voice_data(voice_multiselect.value)
    master_df
    return
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell(hide_code=True)
|
||||||
|
def _(mo):
|
||||||
|
mo.md(r"""
|
||||||
|
# Characters
|
||||||
|
""")
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _(INPUT_DIR):
    # List the character matrices in INPUT_DIR. Directory listings come back in
    # arbitrary order, so sort to keep the displayed list stable between runs.
    char_csv_files = sorted(INPUT_DIR.glob("*character*.csv"))
    char_csv_files
    return
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
app.run()
|
||||||
Reference in New Issue
Block a user