Files
Interview-Analysis/02_Taguette_Post-Process.py
2025-12-09 21:05:07 +01:00

333 lines
12 KiB
Python

import marimo

# Version of marimo that generated this notebook file.
__generated_with = "0.18.3"
app = marimo.App(width="medium")
@app.cell
def _():
    # Setup cell: imports plus the input/output directory layout.
    import marimo as mo
    import pandas as pd
    from pathlib import Path

    # Where the Taguette CSV export is dropped (Step 1) and where
    # post-processed artifacts will go.
    TAGUETTE_EXPORT_DIR = Path('./data/transcripts/taguette_results')
    WORKING_DIR = Path('./data/processing/02_taguette_postprocess')

    # exist_ok=True replaces the exists()-then-mkdir dance and avoids the
    # race between the check and the creation.
    WORKING_DIR.mkdir(parents=True, exist_ok=True)
    TAGUETTE_EXPORT_DIR.mkdir(parents=True, exist_ok=True)
    return TAGUETTE_EXPORT_DIR, mo, pd
@app.cell(hide_code=True)
def _(TAGUETTE_EXPORT_DIR, mo):
    # Manual export checklist. The destination path is interpolated so the
    # instructions always match the configured export directory.
    # (Fixed typo: "hightlights" -> "highlights".)
    mo.md(rf"""
# Step 1: Export All Highlights out of Taguette
1. Go to: http://taguette.tail44fa00.ts.net/project/1
2. Select 'Highlights' on left
3. Select 'See all highlights'
4. Top right 'Export this view' > 'CSV'
5. Save to '{TAGUETTE_EXPORT_DIR}/all_tags.csv'
""")
    return
@app.cell(hide_code=True)
def _(mo):
    # Section header only — no computation.
    mo.md(r"""
# Step 2: Import here for processing
""")
    return
@app.cell
def _(TAGUETTE_EXPORT_DIR, pd):
    # Load the Taguette CSV export. Reuse the directory constant from the
    # setup cell instead of repeating the path as a string literal, so the
    # Step 1 instructions and this reader can never drift apart.
    all_tags_df = pd.read_csv(TAGUETTE_EXPORT_DIR / 'all_tags.csv')
    # _seq_id records the original document order so later context
    # propagation can walk the rows top-to-bottom.
    all_tags_df['_seq_id'] = range(len(all_tags_df))
    all_tags_df.head(20)
    return (all_tags_df,)
@app.cell(hide_code=True)
def _(mo):
    # Section header only — no computation.
    mo.md(r"""
# Step 3: Process each 'Interview'
""")
    return
@app.cell
def _(all_tags_df, mo):
    # One dropdown entry per distinct source document in the export.
    _document_names = all_tags_df['document'].unique().tolist()
    file_dropdown = mo.ui.dropdown(
        options=_document_names,
        label="Select Interview to Process",
        full_width=True,
    )
    file_dropdown
    return (file_dropdown,)
@app.cell
def _(all_tags_df, file_dropdown):
    # Restrict the working frame to the interview chosen in the dropdown;
    # copy so later mutations never touch all_tags_df itself.
    _selected = all_tags_df['document'] == file_dropdown.value
    df = all_tags_df[_selected].copy()
    return (df,)
@app.cell(hide_code=True)
def _(mo):
    # Explanatory markdown for the next cell's context-propagation logic,
    # including a worked example of a multi-tag highlight.
    mo.md(r"""
### Add `_context` column to track Voice / Character is being referred to per highlight
Create a new column 'context', which is defined by the last '_V-' or '_C-' tag seen in the 'tags' column', when moving row by row from top to bottom.
1. Iterates through the dataframe in document order (row by row)
2. Uses a set to track which highlight IDs we've already processed
3. When we encounter a new highlight ID for the first time, we process all its rows
4. Collects all _V- or _C- tags within that highlight
5. Assigns the context to all rows with that ID
6. This preserves document order and handles multi-tag highlights correctly
Example of challenging case:
| id | document | tag | content | _seq_id | _context |
|-----|-------------|------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------|----------------------|
| 88 | P2 - Done | _V-54 | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 117 | _V-54, _V-41 |
| 88 | P2 - Done | _V-41 | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 118 | _V-54, _V-41 |
| 88 | P2 - Done | VT - Human / Artificial | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 119 | _V-54, _V-41 |
| 88 | P2 - Done | VT - Friendliness / Empathy | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 120 | _V-54, _V-41 |
""")
    return
@app.cell
def _(df):
    # Propagate the speaker context (_V-* voice / _C-* character tags) onto
    # every row. Rows are grouped by highlight id; a highlight's context is
    # the comma-joined list of its own _V-/_C- tags, or — when it has
    # none — the context of the most recent highlight that had one
    # (document order).
    df['_context'] = None
    last_context = None
    # drop_duplicates() keeps the first occurrence of each id, so highlight
    # ids are visited in document order (df is already in _seq_id order).
    for highlight_id in df['id'].drop_duplicates():
        group_tags = df.loc[df['id'] == highlight_id, 'tag']
        # astype(str) guards against NaN tags, which would otherwise make
        # the substring test raise TypeError on a float.
        context_tags = [t for t in group_tags.astype(str)
                        if '_V-' in t or '_C-' in t]
        if context_tags:
            last_context = ', '.join(context_tags)
        # Highlights without their own context inherit the previous one.
        df.loc[df['id'] == highlight_id, '_context'] = last_context
    df
    return
@app.cell(hide_code=True)
def _(mo):
    # Explanatory markdown for the multi-context splitting cell below.
    mo.md(r"""
## Resolve multi-context rows (only VT- and CT- theme tags)
For rows that have multiple contexts (e.g., both _V-54 and _V-41)
- split these into separate rows for each context.
- Then mark these for 'manual_analysis'
""")
    return
@app.cell
def _(df, pd):
    # Split multi-context rows (e.g. "_V-54, _V-41") into one row per
    # context and flag them for manual review; single-context rows pass
    # through unchanged with manual_analysis=False.
    _result_rows = []
    for _, _source in df.iterrows():
        _ctx = _source['_context']
        if pd.notna(_ctx) and ',' in str(_ctx):
            # One clone per context; sentiment on theme tags is cleared so
            # it must be re-assessed per context by hand.
            for _single in (_part.strip() for _part in str(_ctx).split(',')):
                _clone = _source.copy()
                _clone['_context'] = _single
                _clone['manual_analysis'] = True
                if str(_clone['tag']).startswith(('VT -', 'CT -')):
                    _clone['sentiment'] = None
                _result_rows.append(_clone)
        else:
            _clone = _source.copy()
            _clone['_context'] = _ctx
            _clone['manual_analysis'] = False
            _result_rows.append(_clone)
    expanded_df_raw = pd.DataFrame(_result_rows).reset_index(drop=True)
    manual_rows = expanded_df_raw[expanded_df_raw['manual_analysis']]
    if manual_rows.empty:
        print("✓ No multi-context rows found")
    else:
        print(
            f"⚠️ {len(manual_rows)} rows were created from multi-context splits. "
            "See next cell for manual review."
        )
    return (expanded_df_raw,)
@app.cell
def _(expanded_df_raw, mo):
    # Only split rows whose tag is a theme tag ('VT -'/'CT -') need a
    # sentiment judgement, so those are the only ones surfaced for editing.
    _is_manual = expanded_df_raw['manual_analysis']
    _is_theme = expanded_df_raw['tag'].str.startswith(('VT -', 'CT -'), na=False)
    rows_to_edit = expanded_df_raw[_is_manual & _is_theme]
    # The editor is wrapped in a form so edits only apply on Submit.
    split_rows_editor = mo.ui.data_editor(rows_to_edit).form(
        label="Update Sentiment / Manual Flag"
    )
    return rows_to_edit, split_rows_editor
@app.cell(hide_code=True)
def _(mo, rows_to_edit, split_rows_editor):
    # Review UI: instructions plus the data editor stacked vertically.
    mo.vstack([
        mo.md(f"""
### ⚠️ Manual Review Required
**{len(rows_to_edit)} rows** were split from multi-context entries.
Please review them below:
1. Update the `sentiment` column (-1, 0, 1) for each row based on the specific context.
2. Uncheck `manual_analysis` when you are done reviewing a row.
3. Click **Submit** to apply changes.
"""),
        split_rows_editor
    ])
    return
@app.cell
def _(expanded_df_raw, mo, pd, split_rows_editor):
    # Reconstruct the full dataframe using the editor's current value.
    # This will update whenever the user edits the table.
    # mo.stop halts this cell (and its dependents) until the form has been
    # submitted at least once.
    mo.stop(split_rows_editor.value is None, mo.md("Submit your changes."))
    _edited_rows = split_rows_editor.value
    _static_rows = expanded_df_raw[~expanded_df_raw['manual_analysis']]
    # NOTE(review): sort_index() assumes the editor preserves the original
    # row index of rows_to_edit — confirm, otherwise edited rows may land
    # out of document order after the concat.
    expanded_df2 = pd.concat([_static_rows, _edited_rows]).sort_index()
    return (expanded_df2,)
@app.cell
def _(expanded_df2, pd):
    # Sanity-check the reviewed dataframe before continuing:
    #  1. no row may still carry a comma-separated multi-context value;
    #  2. theme rows ('VT -'/'CT -') still flagged manual_analysis must
    #     have a sentiment filled in from the review step.
    # NOTE(review): assert statements vanish under `python -O`, so these
    # checks silently disappear in optimized runs — consider explicit
    # raises instead.
    try:
        has_comma = expanded_df2['_context'].apply(lambda x: ',' in str(x) if pd.notna(x) else False)
        assert not has_comma.any(), "Some rows still have multiple contexts (comma-separated)"
        # Verify that rows still marked for manual analysis have sentiment values
        manual_sent_rows = expanded_df2[expanded_df2['manual_analysis']]
        theme_rows = manual_sent_rows[manual_sent_rows['tag'].str.startswith(('VT -', 'CT -'), na=False)]
        missing_sentiment = theme_rows[theme_rows['sentiment'].isna()]
        assert missing_sentiment.empty, (
            f"{len(missing_sentiment)} rows marked for manual analysis "
            "have missing sentiment values"
        )
        print("\n✓ Verification passed: Manual-analysis rows are consistent")
        # NOTE(review): expanded_df_final is not in this cell's return, so
        # downstream cells cannot reference it — verify this is intended.
        expanded_df_final = expanded_df2
        expanded_df_final
    except AssertionError as e:
        print(f"\n❌ Verification failed: {e}")
        print("Please review the data before proceeding")
    return
@app.cell(hide_code=True)
def _(mo):
    # Explanatory markdown for the sentiment-analysis cell below.
    mo.md(r"""
# Highlight Sentiment Analysis
For each row in the dataframe, analyze the sentiment of the 'content' regarding the respective tag. This should be done for all 'VT -' and 'CT -' tags, since these represent the 'VoiceThemes' and 'CharacterThemes' respectively. The results should be stored in a new 'sentiment' column.
Values to be used:
- Positive: +1
- Neutral: 0
- Negative: -1
""")
    return
@app.cell
def _(df):
    # TODO: Implement real sentiment analysis and populate 'sentiment'.
    # For now, attach randomized placeholder values so downstream cells
    # can be exercised; only 'VT -' and 'CT -' theme tags get a value.
    import random

    def dummy_sentiment_analysis(content, tag):
        """Return a random sentiment (-1/0/+1) for theme tags, else None.

        `content` is unused by the placeholder but kept in the signature so
        a real implementation can drop in without changing the call site.
        """
        # str() guards against NaN tags (floats have no .startswith);
        # the tuple argument checks both prefixes in a single call.
        if str(tag).startswith(('VT -', 'CT -')):
            return random.choice([-1, 0, 1])  # Random sentiment for testing
        return None

    df['sentiment'] = df.apply(
        lambda row: dummy_sentiment_analysis(row['content'], row['tag']), axis=1
    )
    df
    return
@app.cell(hide_code=True)
def _(mo):
    # Section header only — no computation.
    mo.md(r"""
# Step 3: Process 'Other' tags
These need to be reviewed manually for interesting content
""")
    return
@app.cell
def _(mo):
    # Placeholder cell — empty markdown, reserved for the 'Other' tag review.
    mo.md(r"""
""")
    return
@app.cell
def _():
    # Empty scratch cell generated by marimo.
    return
# Run the notebook as a script: executes all cells via the marimo runtime.
if __name__ == "__main__":
    app.run()