196 lines
5.1 KiB
Python
196 lines
5.1 KiB
Python
import marimo
|
|
|
|
# Version string stored with the notebook (presumably the marimo release that wrote this file).
__generated_with = "0.18.3"
|
|
# Notebook application object; each cell below registers itself on it via @app.cell.
app = marimo.App(width="medium")
|
|
|
|
|
|
@app.cell
def _():
    # Shared imports and directory constants consumed by the other cells.
    import marimo as mo
    import pandas as pd
    from pathlib import Path

    # Directory scanned for the input CSV files offered in the dropdown.
    INPUT_DIR = Path("./data/processing/02_taguette_postprocess")
    # Directory that receives the sentiment matrices written by the save cells.
    WORKING_DIR = Path('./data/processing/03_sentiment_analysis')

    # exist_ok=True makes the call idempotent and removes the window between
    # an exists() check and the actual creation. If the path exists but is
    # not a directory, mkdir still raises FileExistsError, surfacing the
    # problem here rather than at the first write.
    WORKING_DIR.mkdir(parents=True, exist_ok=True)

    return INPUT_DIR, Path, WORKING_DIR, mo, pd
|
|
|
|
|
|
# Markdown heading for the CSV-loading section (cell registered with hide_code=True).
@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    # Load Sentiment CSV
    """)
    return
|
|
|
|
|
|
@app.cell
def _(INPUT_DIR, mo):
    # Path.glob() yields entries in arbitrary, filesystem-dependent order;
    # sorting keeps the dropdown options in a stable order between runs.
    csv_files = sorted(INPUT_DIR.glob("*.csv"))
    # Displayed label (file name without extension) -> path string of the file.
    file_options = {f.stem: str(f) for f in csv_files}

    sentiment_csv = mo.ui.dropdown(
        options=file_options,
        label="Select Sentiment CSV File",
        full_width=True,
    )
    # Last expression of the cell: the dropdown itself is the cell output.
    sentiment_csv
    return (sentiment_csv,)
|
|
|
|
|
|
@app.cell
def _(Path, mo, pd, sentiment_csv):
    # The dropdown is created without a default, so its value is None until a
    # file is picked. Halt this cell (and the cells depending on it) with a
    # hint instead of letting Path(None) raise a TypeError.
    mo.stop(
        sentiment_csv.value is None,
        mo.md("Select a sentiment CSV file above to continue."),
    )

    # Presumably file names look like "<doc>_..._<timestamp>.csv": the first
    # underscore-separated token is used as the document id and the last one
    # as the timestamp. Both are reused when naming the output CSV files.
    input_csv_name = Path(sentiment_csv.value).stem
    _name_parts = input_csv_name.split('_')
    timestamp = _name_parts[-1]
    doc = _name_parts[0]

    sentiment_df = pd.read_csv(sentiment_csv.value)
    # Last expression of the cell: show the loaded table as the cell output.
    sentiment_df
    return doc, sentiment_df, timestamp
|
|
|
|
|
|
# Markdown introduction to Phase 1 (cell registered with hide_code=True).
@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    # Phase 1: Individual interview analysis
    - Create sentiment matrices for each interview (document)
    - Save the intermediate results to file in the `WORKING_DIR`
    """)
    return
|
|
|
|
|
|
@app.cell
def _(pd):
    def create_sentiment_matrix(
        doc_df,
        column_prefix='VT - |CT - ',
        row_prefix='_V-|_C-',
        document_name=None,
    ):
        """
        Build a context-by-tag matrix of summed sentiment scores.

        Rows of `doc_df` are kept only when their 'tag' matches
        `column_prefix` and their '_context' matches `row_prefix`. The
        remaining rows are grouped by ('_context', 'tag') and their
        'sentiment' values are summed.

        Parameters:
        - doc_df: DataFrame with at least the columns 'tag', '_context' and
          'sentiment'.
        - column_prefix: Pattern selecting the tags that become matrix
          columns. It is passed to `Series.str.contains`, so it is treated as
          a regular expression ('|' separates alternatives) and may match
          anywhere in the tag, not only at its start.
        - row_prefix: Pattern selecting the '_context' values that become
          matrix rows; matched the same way as `column_prefix`.
        - document_name: Optional label used only in the "nothing found"
          messages.

        Returns:
        - DataFrame indexed by '_context' with one column per tag; each cell
          is the summed sentiment for that pair, and pairs that never occur
          are NaN. An empty DataFrame is returned (after printing a message)
          when no row passes the filters.
        """
        # Label for the diagnostic messages; fall back to a placeholder when
        # the caller does not name the document.
        label = document_name if document_name is not None else "<unnamed>"

        # Keep rows whose tag matches the theme pattern; na=False makes rows
        # with a missing tag count as non-matching.
        tag_mask = doc_df['tag'].str.contains(column_prefix, na=False)
        sentiment_rows = doc_df[tag_mask]

        if sentiment_rows.empty:
            print(f"No sentiment data found for document: {label}")
            return pd.DataFrame()

        # na=False already rejects missing contexts, so no separate notna()
        # test is required.
        context_mask = sentiment_rows['_context'].str.contains(row_prefix, na=False)
        valid_rows = sentiment_rows[context_mask]

        if valid_rows.empty:
            print(f"No Voice/Character context found for document: {label}")
            return pd.DataFrame()

        # Sum sentiment per (context, tag) pair, then spread the tag level
        # across the columns to obtain the matrix.
        totals = valid_rows.groupby(['_context', 'tag'])['sentiment'].sum()
        return totals.unstack('tag')

    return (create_sentiment_matrix,)
|
|
|
|
|
|
# Markdown description of Step 1.1 (cell registered with hide_code=True).
@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    ## Step 1.1: Voice Sample vs. Theme Sentiment Matrix

    For each interview (document), create a matrix where:
    - Rows represent the different Voices (based on '_V-' tags)
    - Columns represent the different VoiceThemes (based on 'VT -' tags)
    - Each cell contains the aggregated sentiment score (sum) for that Voice/Theme combination
    """)
    return
|
|
|
|
|
|
@app.cell
def _(create_sentiment_matrix, sentiment_df):
    # Rows: contexts matching '_V-'; columns: tags matching 'VT - '.
    voice_matrix = create_sentiment_matrix(
        sentiment_df,
        column_prefix="VT - ",
        row_prefix="_V-",
    )
    # Last expression of the cell: show the matrix as the cell output.
    voice_matrix
    return (voice_matrix,)
|
|
|
|
|
|
# Markdown label shown before the CSV export cell (cell registered with hide_code=True).
@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    SAVE TO CSV
    """)
    return
|
|
|
|
|
|
@app.cell
def _(WORKING_DIR, doc, timestamp, voice_matrix):
    # Write the voice/theme matrix into WORKING_DIR; the file name carries the
    # document id and timestamp taken from the input file name.
    voice_filename = WORKING_DIR / f"{doc}_voice_theme_matrix_{timestamp}.csv"
    voice_matrix.to_csv(voice_filename)
    print(f"Saved to '{voice_filename}'")
    return
|
|
|
|
|
|
# Markdown description of Step 1.2 (cell registered with hide_code=True).
@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    ## Step 1.2: Character Sample vs. Theme Sentiment Matrix

    For each interview (document), create a matrix where:
    - Rows represent the different Characters (based on '_C-' tags)
    - Columns represent the different CharacterThemes (based on 'CT -' tags)
    - Each cell contains the aggregated sentiment score (sum) for that Character/Theme combination
    """)
    return
|
|
|
|
|
|
@app.cell
def _(create_sentiment_matrix, sentiment_df):
    # Rows: contexts matching '_C-'; columns: tags matching 'CT - '.
    character_matrix = create_sentiment_matrix(
        sentiment_df,
        column_prefix="CT - ",
        row_prefix="_C-",
    )
    # Last expression of the cell: show the matrix as the cell output.
    character_matrix
    return (character_matrix,)
|
|
|
|
|
|
@app.cell
def _(WORKING_DIR, character_matrix, doc, timestamp):
    # Write the character/theme matrix into WORKING_DIR; the file name carries
    # the document id and timestamp taken from the input file name.
    character_filename = WORKING_DIR / f"{doc}_character_theme_matrix_{timestamp}.csv"
    character_matrix.to_csv(character_filename)
    print(f"Saved to '{character_filename}'")
    return
|
|
|
|
|
|
# Markdown placeholder for Step 1.3, which has no implementation yet (cell registered with hide_code=True).
@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    ## Step 1.3: Chase Brand Sentiment

    TODO: not sure we have enough supporting data for this yet
    """)
    return
|
|
|
|
|
|
# Script entry point: run the app when this file is executed directly.
if __name__ == "__main__":
    app.run()
|