463 lines
14 KiB
Python
463 lines
14 KiB
Python
# Marimo notebook: post-processes Taguette qualitative-coding exports
# (highlights + codebook CSVs) into per-interview sentiment CSVs.
import marimo

# Version of marimo this notebook was generated with.
__generated_with = "0.18.3"

app = marimo.App(width="medium")
|
|
|
|
|
|
@app.cell
def _():
    # Setup cell: imports, Ollama connection, working directories, and the
    # model-selection dropdown shared by the downstream cells.
    import marimo as mo
    import pandas as pd
    from pathlib import Path
    from datetime import datetime

    from utils import connect_qumo_ollama

    OLLAMA_LOCATION = 'localhost'
    # VM_NAME = 'ollama-lite'

    client, _models = connect_qumo_ollama(OLLAMA_LOCATION, print_models=False)

    TAGUETTE_EXPORT_DIR = Path('./data/processing/02_taguette_export')
    WORKING_DIR = Path('./data/processing/02_taguette_postprocess')

    # exist_ok makes directory creation idempotent (replaces the previous
    # explicit exists() checks).
    WORKING_DIR.mkdir(parents=True, exist_ok=True)
    TAGUETTE_EXPORT_DIR.mkdir(parents=True, exist_ok=True)

    model_select = mo.ui.dropdown(
        options=_models,
        # Guard against an empty model list so this cell doesn't crash with
        # IndexError when the Ollama host reports no models.
        value=_models[0] if _models else None,
        label="Select Ollama Model to use",
        searchable=True,
    )
    model_select
    return (
        TAGUETTE_EXPORT_DIR,
        WORKING_DIR,
        client,
        datetime,
        mo,
        model_select,
        pd,
    )
|
|
|
|
|
|
@app.cell(hide_code=True)
def _(TAGUETTE_EXPORT_DIR, mo):
    # Instructions for exporting the highlights and codebook CSVs from Taguette.
    # Fix: "hightlights" typo corrected to "highlights".
    mo.md(rf"""
    # Step 1: Export Data out of Taguette

    **Highlights**
    1. Go to: https://taguette.qumo.io/project/1
    2. Select 'Highlights' (left side) > 'See all highlights' > 'Export this view' (top right) > 'CSV'
    3. Save to '{TAGUETTE_EXPORT_DIR}/all_tags.csv'

    **Tags Codebook**
    1. Select 'Project Info' (left side) > 'Export codebook' > 'CSV'
    2. Save to '{TAGUETTE_EXPORT_DIR}/codebook.csv'

    _NOTE: Sometimes you need to explicitly allow 'Unsafe Download' in the browser's download manager_
    """)
    return
|
|
|
|
|
|
@app.cell(hide_code=True)
def _(mo):
    # Section header for the import/processing step.
    mo.md(r"""
    # Step 2: Import here for processing
    """)
    return
|
|
|
|
|
|
@app.cell
def _(TAGUETTE_EXPORT_DIR, pd):
    # Load every exported highlight; one row per (highlight, tag) pair.
    # Path `/` join replaces the previous f-string path concatenation.
    all_tags_df = pd.read_csv(TAGUETTE_EXPORT_DIR / 'all_tags.csv')
    # Stable sequence id preserving document order; used later to re-sort
    # after the split/recombine steps.
    all_tags_df['_seq_id'] = range(len(all_tags_df))
    all_tags_df
    return (all_tags_df,)
|
|
|
|
|
|
@app.cell
def _(TAGUETTE_EXPORT_DIR, pd):
    # Load the tag codebook; rename 'description' so it cannot collide with
    # other description-like columns when merged into the sentiment frame.
    # Path `/` join + non-inplace rename replace the previous f-string path
    # and `inplace=True` anti-pattern.
    codebook_df = pd.read_csv(TAGUETTE_EXPORT_DIR / 'codebook.csv')
    codebook_df = codebook_df.rename(columns={'description': 'theme_description'})
    codebook_df
    return (codebook_df,)
|
|
|
|
|
|
@app.cell(hide_code=True)
def _(mo):
    # Section header for the per-interview processing step.
    mo.md(r"""
    # Step 3: Process each 'Interview'
    """)
    return
|
|
|
|
|
|
@app.cell
def _(all_tags_df, mo):
    # Dropdown listing every distinct interview document found in the export.
    _documents = all_tags_df['document'].unique().tolist()
    interview_select = mo.ui.dropdown(
        options=_documents,
        label="Select Interview to Process",
        full_width=True,
    )
    interview_select
    return (interview_select,)
|
|
|
|
|
|
@app.cell
def _(all_tags_df, interview_select, mo):
    # Halt downstream cells until an interview has been chosen.
    mo.stop(not interview_select.value, mo.md("Select interview to continue"))

    # Keep only the rows belonging to the selected interview document.
    _mask = all_tags_df['document'] == interview_select.value
    df = all_tags_df.loc[_mask].copy()
    return (df,)
|
|
|
|
|
|
@app.cell(hide_code=True)
def _(mo):
    # Explains the `_context` carry-forward algorithm implemented in the next cell.
    mo.md(r"""
    ## Add `_context` column to track Voice / Character is being referred to per highlight
    Create a new column 'context', which is defined by the last '_V-' or '_C-' tag seen in the 'tags' column', when moving row by row from top to bottom.

    1. Iterates through the dataframe in document order (row by row)
    2. Uses a set to track which highlight IDs we've already processed
    3. When we encounter a new highlight ID for the first time, we process all its rows
    4. Collects all _V- or _C- tags within that highlight
    5. Assigns the context to all rows with that ID
    6. This preserves document order and handles multi-tag highlights correctly


    Example of challenging case:

    | tag | content | _seq_id | _context |
    |------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------|----------------------|
    | _V-54 | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 117 | _V-54, _V-41 |
    | _V-41 | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 118 | _V-54, _V-41 |
    | VT - Human / Artificial | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 119 | _V-54, _V-41 |
    | VT - Friendliness / Empathy | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 120 | _V-54, _V-41 |
    """)
    return
|
|
|
|
|
|
@app.cell
def _(df):
    # Derive `_context`: the Voice (_V-) / Character (_C-) tag(s) each highlight
    # refers to. Walks the dataframe in document order; a highlight without its
    # own context tag inherits the most recent one seen (carry-forward).
    # NOTE: mutates `df` in place.
    df['_context'] = None
    last_context = None
    processed_ids = set()

    # Process in document order
    for idx, row in df.iterrows():
        highlight_id = row['id']

        # Each highlight spans one row per tag; process the whole group the
        # first time its id is encountered.
        if highlight_id not in processed_ids:
            processed_ids.add(highlight_id)

            # Get all rows for this highlight
            highlight_rows = df[df['id'] == highlight_id]

            # Collect every _V- / _C- tag attached to this highlight.
            context_tags = []
            for _, h_row in highlight_rows.iterrows():
                tag = h_row.get('tag', '')
                # isinstance guard: a missing tag is NaN (a float), and the
                # previous bare `in` test would raise TypeError on it.
                if isinstance(tag, str) and ('_V-' in tag or '_C-' in tag):
                    context_tags.append(tag)

            if context_tags:
                # Multi-tag highlights yield a comma-joined context; a later
                # cell splits these into separate rows for manual review.
                context_tag = ', '.join(context_tags)
                last_context = context_tag
            else:
                # No context tag in this highlight: inherit the last one seen.
                context_tag = last_context

            # Stamp the context onto every row of this highlight group.
            df.loc[df['id'] == highlight_id, '_context'] = context_tag

    df
    return
|
|
|
|
|
|
@app.cell(hide_code=True)
def _(mo):
    # Explains the multi-context row splitting performed in the next cell.
    mo.md(r"""
    ## Split multi-context rows (only VT- and CT- theme tags)

    For rows that have multiple contexts (e.g., both _V-54 and _V-41)
    - split these into separate rows for each context.
    - Then mark these for 'manual_analysis'
    """)
    return
|
|
|
|
|
|
@app.cell
def _(df, pd):
    # Expand rows whose `_context` holds several comma-separated contexts into
    # one row per context. Split rows are flagged manual_analysis=True because
    # the sentiment may differ per context and must be judged by a human.
    # (The previous version computed `has_multiple` and then branched on it
    # twice; the two branches are merged here — behavior is unchanged.)
    expanded_rows = []

    for _, _row in df.iterrows():
        context_value = _row['_context']

        if pd.notna(context_value) and ',' in str(context_value):
            # Multi-context: emit one copy per individual context.
            for ctx in (c.strip() for c in str(context_value).split(',')):
                new_row = _row.copy()
                new_row['_context'] = ctx
                new_row['manual_analysis'] = True

                # Theme rows lose any automatic sentiment; it must be
                # re-assessed per context during manual review.
                if str(new_row['tag']).startswith(('VT -', 'CT -')):
                    new_row['sentiment'] = None

                expanded_rows.append(new_row)
        else:
            # Single (or missing) context: keep the row as-is.
            new_row = _row.copy()
            new_row['manual_analysis'] = False
            expanded_rows.append(new_row)

    expanded_df_raw = pd.DataFrame(expanded_rows).reset_index(drop=True)

    # Only VT-/CT- theme tags take part in sentiment analysis.
    sentiment_df = expanded_df_raw.loc[
        expanded_df_raw['tag'].str.startswith(('VT -', 'CT -'), na=False)
    ].copy()

    print(f"{len(sentiment_df[sentiment_df['manual_analysis']])} Rows with multiple contexts")

    sentiment_df[sentiment_df['manual_analysis']]
    return (sentiment_df,)
|
|
|
|
|
|
@app.cell(hide_code=True)
def _(mo):
    # Section header for theme extraction.
    mo.md(r"""
    ## Create 'theme' column
    """)
    return
|
|
|
|
|
|
@app.cell
def _(sentiment_df):
    from utils import extract_theme

    # Series.map is the idiomatic (and faster) form of the previous row-wise
    # DataFrame.apply for a single-column transformation; extract_theme still
    # receives exactly the 'tag' value for each row.
    sentiment_df['theme'] = sentiment_df['tag'].map(extract_theme)
    sentiment_df
    return
|
|
|
|
|
|
@app.cell(hide_code=True)
def _(mo):
    # Describes the LLM sentiment-extraction step that follows.
    mo.md(r"""
    # Extract Sentiment + Reasoning

    For each row in the dataframe, analyze the sentiment of the 'content' regarding the respective tag. This should be done for all 'VT -' and 'CT -' tags, since these represent the 'VoiceThemes' and 'CharacterThemes' respectively. The results should be stored in a new 'sentiment' column.

    Values to be used:
    - Positive: +1
    - Neutral: 0
    - Negative: -1
    """)
    return
|
|
|
|
|
|
@app.cell
def _(mo):
    # Gate for the LLM step: clicking flips the button's value to True, which
    # the extraction cell waits on via mo.stop.
    start_processing_btn = mo.ui.button(
        label="Start Sentiment Extraction",
        kind="warn",
        on_click=lambda _: True,
    )
    start_processing_btn
    return (start_processing_btn,)
|
|
|
|
|
|
@app.cell
def _(
    client,
    codebook_df,
    mo,
    model_select,
    pd,
    sentiment_df,
    start_processing_btn,
):
    # Run Ollama sentiment analysis for every automatically-analyzable row.
    # NOTE: mutates `sentiment_df` in place.
    from utils import dummy_sentiment_analysis, ollama_sentiment_analysis

    # add theme_description to be used in LLM prompt
    _df = sentiment_df.merge(codebook_df, on='tag', how='left', suffixes=('', '_codebook'))

    # Wait for start processing button
    mo.stop(not start_processing_btn.value, "Click button above to start processing")

    # One LLM call per non-manual row. ollama_sentiment_analysis presumably
    # returns a (keywords, sentiment, reason)-shaped value — order inferred
    # from the target columns; confirm against utils. Rows flagged
    # manual_analysis are excluded here, so those rows end up NaN in these
    # columns and are filled during manual review instead.
    sentiment_df[['keywords', 'sentiment', 'reason']] = _df[~_df['manual_analysis']].apply(
        lambda row: pd.Series(ollama_sentiment_analysis(
            content=row['content'],
            theme=row['theme'],
            theme_description=row['theme_description'],
            client=client,
            model=model_select.value
        )),
        axis=1
    )
    return
|
|
|
|
|
|
@app.cell
def _(mo, sentiment_df):
    # Show the automatically-extracted results; blocked until sentiment exists.
    mo.stop('sentiment' not in sentiment_df.columns, "Run above cells to extract sentiment analysis")
    _auto = ~sentiment_df['manual_analysis']
    sentiment_df.loc[_auto, ['theme', 'content', 'sentiment', 'reason', 'keywords']]
    return
|
|
|
|
|
|
@app.cell(hide_code=True)
def _(mo):
    # Section header for the manual-review workflow.
    mo.md(r"""
    ## Multi-context tags
    """)
    return
|
|
|
|
|
|
@app.cell
def _(mo, sentiment_df):
    # Build a data editor over the rows created by multi-context splits so a
    # human can assign a per-context sentiment.
    manual_rows = sentiment_df[sentiment_df['manual_analysis']]
    split_rows_editor = None
    rows_to_edit = []

    if not manual_rows.empty:
        print(
            f"⚠️ {len(manual_rows)} rows were created from multi-context splits. "
            "See next cell for manual review."
        )

        # The manually-flagged rows are exactly the ones needing review —
        # sentiment_df was already restricted to VT-/CT- theme tags upstream,
        # so no additional tag filter is required (the previous version
        # re-applied the identical manual_analysis mask here).
        rows_to_edit = manual_rows

        # Editable grid; .form() adds a Submit button so edits apply atomically.
        split_rows_editor = mo.ui.data_editor(
            rows_to_edit
        ).form(label="Update Sentiment / Manual Flag")

    else:
        print("✓ No multi-context rows found")
    return rows_to_edit, split_rows_editor
|
|
|
|
|
|
@app.cell
def _(split_rows_editor):
    # Render the editor form (renders nothing when it is None).
    split_rows_editor
    return
|
|
|
|
|
|
@app.cell(hide_code=True)
def _(mo, rows_to_edit, split_rows_editor):
    # marimo renders only the cell's LAST top-level expression; the previous
    # version left the mo.vstack inside the `if` body, so it was silently
    # discarded. Bind the output to a local and make it the final expression.
    _out = None
    if split_rows_editor is not None:
        _out = mo.vstack([
            mo.md(f"""
            ### ⚠️ Manual Review Required

            **{len(rows_to_edit)} rows** were split from multi-context entries.
            Please review them below:
            1. Update the `sentiment` column (-1, 0, 1) for each row based on the specific context.
            2. Click **Submit** to apply changes.
            """),
            split_rows_editor
        ])
    _out
    return
|
|
|
|
|
|
@app.cell
def _(mo, split_rows_editor):
    # Capture the edited manual-analysis rows for validation.
    # Sentinel semantics: '' (the getattr default) means "no editor exists"
    # (i.e. there were no manual rows), while None means "editor exists but
    # the form has not been submitted yet" — only the latter halts the cell.
    reviewed_manual_rows = getattr(split_rows_editor, 'value', '')
    mo.stop(reviewed_manual_rows is None, mo.md("Submit your sentiment analysis changes before continuing."))

    # Ensure all manual-analysis rows include a sentiment of -1, 0, or 1

    if (reviewed_manual_rows != '') and (not reviewed_manual_rows.empty):
        valid_sentiments = {-1, 0, 1}
        # Rows still flagged manual_analysis whose sentiment was not set to a
        # valid value by the reviewer.
        needs_review = reviewed_manual_rows[
            reviewed_manual_rows['manual_analysis']
            & ~reviewed_manual_rows['sentiment'].isin(valid_sentiments)
        ]
        assert needs_review.empty, f"{len(needs_review)} manual-analysis rows missing sentiment -1/0/1"

    print("Verification: ✓ All Manual-analysis rows have valid sentiment values")
    return (reviewed_manual_rows,)
|
|
|
|
|
|
@app.cell(hide_code=True)
def _(mo):
    # Section header for merging reviewed and automatic rows back together.
    mo.md(r"""
    ## Recombine
    """)
    return
|
|
|
|
|
|
@app.cell
def _(pd, reviewed_manual_rows, sentiment_df):
    # Merge the human-reviewed manual rows back with the automatic rows,
    # restoring original document order via _seq_id.
    _auto_rows = sentiment_df[~sentiment_df['manual_analysis']]
    if isinstance(reviewed_manual_rows, pd.DataFrame):
        recombined_df = (
            pd.concat([_auto_rows, reviewed_manual_rows])
            .sort_values(by='_seq_id')
            .reset_index(drop=True)
        )
    else:
        # No reviewed rows (sentinel value): fall through to the full frame.
        recombined_df = sentiment_df

    recombined_df
    return (recombined_df,)
|
|
|
|
|
|
@app.cell(hide_code=True)
def _(mo):
    # Section header. Renumbered to "Step 4": "Step 3" was already used above
    # for "Process each 'Interview'", so this heading duplicated the number.
    mo.md(r"""
    # Step 4: Process 'Other' tags

    These need to be reviewed manually for interesting content
    """)
    return
|
|
|
|
|
|
@app.cell
def _(mo):
    # Placeholder (empty markdown) reserved for notes on 'Other' tags.
    mo.md(r"""

    """)
    return


@app.cell
def _():
    # Empty scratch cell left by the notebook author.
    return
|
|
|
|
|
|
@app.cell(hide_code=True)
def _(mo):
    # Section header for the CSV export step.
    mo.md(r"""
    # Save to CSV
    """)
    return
|
|
|
|
|
|
@app.cell
def _(WORKING_DIR, datetime, interview_select, recombined_df):
    # Save the recombined sentiment table to the working directory, keyed by
    # the interview's identifier (first whitespace-separated token of the
    # selected document name).
    # NOTE(review): `timestamp` is computed but never used — presumably it was
    # once part of the filename; confirm intent before removing or restoring.
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    filename = WORKING_DIR / f"{interview_select.value.split(' ')[0]}_sentiments.csv"
    recombined_df.to_csv(filename, index=False)

    # Fix: report the actual output path instead of a literal placeholder.
    print(f"✓ Saved processed data to '{filename}'")
    return
|
|
|
|
|
|
if __name__ == "__main__":
    # Allow running the notebook as a script: executes the full cell DAG.
    app.run()
|