restructure analysis
This commit is contained in:
180
03_Sentiment_Analysis.py
Normal file
180
03_Sentiment_Analysis.py
Normal file
@@ -0,0 +1,180 @@
|
||||
import marimo
|
||||
|
||||
# Version string recorded in the notebook file (presumably the marimo release
# that generated it — confirm before editing by hand).
__generated_with = "0.18.3"
# Notebook application object; every cell below registers itself through
# the `@app.cell` decorator.
app = marimo.App(width="medium")
|
||||
|
||||
|
||||
@app.cell
def _():
    # Setup cell: imports the shared libraries and makes sure the data
    # directories used by this notebook exist.
    import marimo as mo
    import pandas as pd
    from pathlib import Path

    # Directory holding the Taguette export files (not returned by this cell).
    TAGUETTE_EXPORT_DIR = Path('./data/transcripts/taguette_results')
    # Directory that receives this notebook's intermediate results.
    WORKING_DIR = Path('./data/processing/03_sentiment_analysis')

    # exist_ok=True replaces the earlier "check exists(), then mkdir" pair:
    # it is a single call, and it cannot fail when the directory appears
    # between the check and the creation.
    WORKING_DIR.mkdir(parents=True, exist_ok=True)
    TAGUETTE_EXPORT_DIR.mkdir(parents=True, exist_ok=True)
    return WORKING_DIR, mo, pd
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
def _(mo):
    # Markdown-only cell: section header and task list for Phase 1.
    mo.md(r"""
    # Phase 1: Individual interview analysis
    - Create sentiment matrices for each interview (document)
    - Save the intermediate results to file in the `WORKING_DIR`
    """)
    return
|
||||
|
||||
|
||||
@app.cell
def _(pd):
    import numpy as np

    def create_sentiment_matrix(df, document_name, column_prefix='VT - |CT - ', row_prefix='_V-|_C-'):
        """
        Build the context-by-tag sentiment matrix of a single document.

        Parameters:
        - df: DataFrame with columns ['document', 'tag', '_context', 'sentiment']
        - document_name: value of the 'document' column to select
        - column_prefix: pattern handed to `str.contains` to choose the 'tag'
          values that become the matrix columns
        - row_prefix: pattern handed to `str.contains` to choose the '_context'
          values that become the matrix rows

        Returns:
        - DataFrame indexed by '_context' with one column per 'tag'; each cell
          is the summed 'sentiment' cast to int (0 where a pair never occurs).
          An empty DataFrame is returned when no row matches.
        """
        # Rows of the requested document whose tag matches the theme pattern.
        in_document = df['document'] == document_name
        doc_rows = df[in_document].copy()
        is_theme = doc_rows['tag'].str.contains(column_prefix, na=False)
        themed = doc_rows[is_theme].copy()
        if themed.empty:
            print(f"No sentiment data found for document: {document_name}")
            return pd.DataFrame()

        # na=False maps a missing '_context' to False, so this single mask
        # rejects both missing and non-matching contexts.
        has_context = themed['_context'].str.contains(row_prefix, na=False)
        scored = themed[has_context].copy()
        if scored.empty:
            print(f"No Voice/Character context found for document: {document_name}")
            return pd.DataFrame()

        # Sum the scores per (context, tag) pair, then spread the tags out
        # into columns to obtain the matrix.
        totals = scored.groupby(['_context', 'tag'])['sentiment'].sum()
        wide = totals.unstack('tag')

        # Pairs that never occur come out as NaN; report them as 0 and cast
        # the whole matrix to int for cleaner display.
        return wide.fillna(0).astype(int)

    return (create_sentiment_matrix,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
def _(mo):
    # Markdown-only cell: describes the Voice-vs-Theme matrix built in Step 1.1.
    # The wording mirrors the Step 1.2 description ("... Themes (based on ...").
    mo.md(r"""
    ## Step 1.1: Voice Sample vs. Theme Sentiment Matrix

    For each interview (document), create a matrix where:
    - Rows represent the different Voices (based on '_V-' tags)
    - Columns represent the different VoiceThemes (based on 'VT -' tags)
    - Each cell contains the aggregated sentiment score (sum) for that Voice/Theme combination
    """)
    return
|
||||
|
||||
|
||||
@app.cell
def _(WORKING_DIR, all_tags_df, create_sentiment_matrix, mo):
    # Step 1.1: build one Voice-vs-VoiceTheme sentiment matrix per document,
    # print it, and write every non-empty matrix to a CSV file in WORKING_DIR.
    #
    # NOTE(review): `all_tags_df` is not defined by any cell visible in this
    # file — presumably a cell loading the Taguette export still has to
    # provide it; confirm.
    from datetime import datetime as _datetime

    # One timestamp per run, so all files written by this execution share it.
    # Built with the standard library: `mo.utils.get_timestamp` is not part of
    # marimo's documented public API.
    timestamp = _datetime.now().strftime("%Y%m%d_%H%M%S")

    # Create matrices for each unique document
    documents = all_tags_df['document'].unique()
    matrices = {}

    for doc in documents:
        print(f"\n{'='*60}")
        print(f"Document: {doc}")
        print('='*60)
        matrix = create_sentiment_matrix(all_tags_df, doc, column_prefix='VT - ', row_prefix='_V-')
        if matrix.empty:
            # Nothing to keep or save for this document.
            print("No matrix data available")
            continue

        matrices[doc] = matrix
        print(matrix)

        # Save to CSV (only reached for non-empty matrices)
        filename = WORKING_DIR / f"{doc.replace(' ', '_')}_voice_theme_matrix_{timestamp}.csv"
        matrix.to_csv(filename)
        print(f"Matrix saved to: {filename}")

    # Store matrices in a variable for further analysis
    matrices
    return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
def _(mo):
    # Markdown-only cell: describes the Character-vs-Theme matrix of Step 1.2.
    mo.md(r"""
    ## Step 1.2: Character Sample vs. Theme Sentiment Matrix

    For each interview (document), create a matrix where:
    - Rows represent the different Characters (based on '_C-' tags)
    - Columns represent the different CharacterThemes (based on 'CT -' tags)
    - Each cell contains the aggregated sentiment score (sum) for that Character/Theme combination
    """)
    return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
def _(mo):
    # Markdown-only cell: header for Step 1.3, which is still an open TODO.
    mo.md(r"""
    ## Step 1.3: Chase Brand Sentiment

    TODO: not sure we have enough supporting data for this yet
    """)
    return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
def _(mo):
    # Markdown-only cell: header describing the "save matrices" step.
    mo.md(r"""
    ## Step 1.x: Save Matrices to Files

    Save the matrices to CSV files in the WORKING_DIR for intermediate storage. Include a short timestamp in the filename so we can track runs.
    """)
    return
|
||||
|
||||
|
||||
@app.cell
def _():
    # Placeholder cell: it contains no code yet, only the description of the
    # intended step.
    # Save the matrices to CSV files in the WORKING_DIR for intermediate storage. Include a short timestamp in the filename so we can track runs.
    # NOTE(review): the Step 1.1 cell already calls `matrix.to_csv(...)` into
    # WORKING_DIR — confirm whether this cell is still needed.
    return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
def _(mo):
    # Markdown-only cell: section header for Phase 2 (no Phase 2 code follows
    # in this file yet).
    mo.md(r"""
    # Phase 2: Overall Results

    Aggregate results of all the interviews into master matrices.
    """)
    return
|
||||
|
||||
|
||||
# Run the notebook application when this file is executed as a script.
if __name__ == "__main__":
    app.run()
|
||||
Reference in New Issue
Block a user