import marimo __generated_with = "0.18.3" app = marimo.App(width="medium") @app.cell def _(): import marimo as mo import pandas as pd from pathlib import Path from datetime import datetime from utils import connect_qumo_ollama OLLAMA_LOCATION= 'localhost' # VM_NAME = 'ollama-lite' client, _models = connect_qumo_ollama(OLLAMA_LOCATION, print_models=False) TAGUETTE_EXPORT_DIR = Path('./data/transcripts/taguette_results') WORKING_DIR = Path('./data/processing/02_taguette_postprocess') if not WORKING_DIR.exists(): WORKING_DIR.mkdir(parents=True) if not TAGUETTE_EXPORT_DIR.exists(): TAGUETTE_EXPORT_DIR.mkdir(parents=True) model_select = mo.ui.dropdown( options=_models, value=_models[0], label="Select Ollama Model to use", searchable=True, ) model_select return ( TAGUETTE_EXPORT_DIR, WORKING_DIR, client, datetime, mo, model_select, pd, ) @app.cell(hide_code=True) def _(TAGUETTE_EXPORT_DIR, mo): mo.md(rf""" # Step 1: Export All Highlights out of Taguette 1. Go to: http://taguette.tail44fa00.ts.net/project/1 2. Select 'Highlights' on left 3. Select 'See all hightlights' 4. Top right 'Export this view' > 'CSV' 5. Save to '{TAGUETTE_EXPORT_DIR}/all_tags.csv' """) return @app.cell(hide_code=True) def _(mo): mo.md(r""" # Step 2: Import here for processing """) return @app.cell def _(pd): all_tags_df = pd.read_csv('data/transcripts/taguette_results/all_tags.csv') all_tags_df['_seq_id'] = range(len(all_tags_df)) all_tags_df.head(20) return (all_tags_df,) @app.cell(hide_code=True) def _(mo): mo.md(r""" # Step 3: Process each 'Interview' """) return @app.cell def _(all_tags_df, mo): interview_select = mo.ui.dropdown( options=all_tags_df['document'].unique().tolist(), label="Select Interview to Process", full_width=True ) interview_select return (interview_select,) @app.cell def _(all_tags_df, interview_select, mo): mo.stop(not interview_select.value, mo.md("Select interview to continue")) # filter all_tags_df to only the document = file_dropdown.value df = all_tags_df.loc[all_tags_df['document'] == interview_select.value].copy() return (df,) @app.cell(hide_code=True) def _(mo): mo.md(r""" ## Add `_context` column to track Voice / Character is being referred to per highlight Create a new column 'context', which is defined by the last '_V-' or '_C-' tag seen in the 'tags' column', when moving row by row from top to bottom. 1. Iterates through the dataframe in document order (row by row) 2. Uses a set to track which highlight IDs we've already processed 3. When we encounter a new highlight ID for the first time, we process all its rows 4. Collects all _V- or _C- tags within that highlight 5. Assigns the context to all rows with that ID 6. This preserves document order and handles multi-tag highlights correctly Example of challenging case: | tag | content | _seq_id | _context | |------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------|----------------------| | _V-54 | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 117 | _V-54, _V-41 | | _V-41 | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 118 | _V-54, _V-41 | | VT - Human / Artificial | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 119 | _V-54, _V-41 | | VT - Friendliness / Empathy | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 120 | _V-54, _V-41 | """) return @app.cell def _(df): # First pass: identify context tags within each highlight group df['_context'] = None last_context = None processed_ids = set() # Process in document order for idx, row in df.iterrows(): highlight_id = row['id'] # If we haven't processed this highlight yet if highlight_id not in processed_ids: processed_ids.add(highlight_id) # Get all rows for this highlight highlight_rows = df[df['id'] == highlight_id] # Collect all context tags in this highlight context_tags = [] for _, h_row in highlight_rows.iterrows(): tag = h_row.get('tag', '') if '_V-' in tag or '_C-' in tag: context_tags.append(tag) # If we found context tags, join them with comma if context_tags: context_tag = ', '.join(context_tags) last_context = context_tag else: # If no context tag in this highlight, use the last context context_tag = last_context # Assign the context to all rows in this highlight df.loc[df['id'] == highlight_id, '_context'] = context_tag df return @app.cell(hide_code=True) def _(mo): mo.md(r""" ## Split multi-context rows (only VT- and CT- theme tags) For rows that have multiple contexts (e.g., both _V-54 and _V-41) - split these into separate rows for each context. - Then mark these for 'manual_analysis' """) return @app.cell def _(df, pd): # Expand rows that contain multiple contexts (comma-separated) expanded_rows = [] for _, _row in df.iterrows(): context_value = _row['_context'] has_multiple = pd.notna(context_value) and ',' in str(context_value) if has_multiple: contexts = [c.strip() for c in str(context_value).split(',')] else: contexts = [context_value] if has_multiple: for ctx in contexts: new_row = _row.copy() new_row['_context'] = ctx new_row['manual_analysis'] = True if str(new_row['tag']).startswith(('VT -', 'CT -')): new_row['sentiment'] = None expanded_rows.append(new_row) else: new_row = _row.copy() new_row['_context'] = contexts[0] new_row['manual_analysis'] = False expanded_rows.append(new_row) expanded_df_raw = pd.DataFrame(expanded_rows).reset_index(drop=True) sentiment_df = expanded_df_raw.loc[ expanded_df_raw['tag'].str.startswith(('VT -', 'CT -'), na=False) ].copy() print(f"{len(sentiment_df[sentiment_df['manual_analysis']])} Rows with multiple contexts") sentiment_df[sentiment_df['manual_analysis']] return (sentiment_df,) @app.cell(hide_code=True) def _(mo): mo.md(r""" ## Create 'theme' column """) return @app.cell def _(sentiment_df): from utils import extract_theme sentiment_df['theme'] = sentiment_df.apply(lambda row: extract_theme(row['tag']), axis=1) sentiment_df return @app.cell(hide_code=True) def _(mo): mo.md(r""" # Extract Sentiment + Reasoning For each row in the dataframe, analyze the sentiment of the 'content' regarding the respective tag. This should be done for all 'VT -' and 'CT -' tags, since these represent the 'VoiceThemes' and 'CharacterThemes' respectively. The results should be stored in a new 'sentiment' column. Values to be used: - Positive: +1 - Neutral: 0 - Negative: -1 """) return @app.cell def _(client, model_select, pd, sentiment_df): # for now, create an empty sentiment column with randomized dummy values for testing # only for 'VT -' and 'CT -' tags from utils import dummy_sentiment_analysis, ollama_sentiment_analysis # Only run on rows without manual_analysis # sentiment_df[['sentiment', 'reason']] = sentiment_df[~sentiment_df['manual_analysis']].apply( # lambda row: pd.Series(dummy_sentiment_analysis(row['content'], row['tag'])), # axis=1 # ) sentiment_df[['keywords', 'sentiment', 'reason']] = sentiment_df[~sentiment_df['manual_analysis']].apply( lambda row: pd.Series(ollama_sentiment_analysis(row['content'], row['theme'], client=client, model=model_select.value)), axis=1 ) return @app.cell def _(sentiment_df): sentiment_df.loc[~sentiment_df['manual_analysis'], ['theme', 'content', 'sentiment', 'reason', 'keywords']] return @app.cell(hide_code=True) def _(mo): mo.md(r""" ## Multi-context tags """) return @app.cell def _(mo, sentiment_df): manual_rows = sentiment_df[sentiment_df['manual_analysis']] split_rows_editor = None rows_to_edit = [] if not manual_rows.empty: print( f"⚠️ {len(manual_rows)} rows were created from multi-context splits. " "See next cell for manual review." ) # Filter for rows that need review. Manual analysis and the tag starts with 'VT -' or 'CT -' rows_to_edit = sentiment_df[ (sentiment_df['manual_analysis']) ] # Create data editor for split rows split_rows_editor = mo.ui.data_editor( rows_to_edit ).form(label="Update Sentiment / Manual Flag") else: print("✓ No multi-context rows found") return rows_to_edit, split_rows_editor @app.cell(hide_code=True) def _(mo, rows_to_edit, split_rows_editor): if split_rows_editor is not None: mo.vstack([ mo.md(f""" ### ⚠️ Manual Review Required **{len(rows_to_edit)} rows** were split from multi-context entries. Please review them below: 1. Update the `sentiment` column (-1, 0, 1) for each row based on the specific context. 2. Click **Submit** to apply changes. """), split_rows_editor ]) return @app.cell def _(mo, split_rows_editor): # Capture the edited manual-analysis rows for validation reviewed_manual_rows = getattr(split_rows_editor, 'value', '') mo.stop(reviewed_manual_rows is None, mo.md("Submit your sentiment analysis changes before continuing.")) # Ensure all manual-analysis rows include a sentiment of -1, 0, or 1 if (reviewed_manual_rows != '') and (not reviewed_manual_rows.empty): valid_sentiments = {-1, 0, 1} needs_review = reviewed_manual_rows[ reviewed_manual_rows['manual_analysis'] & ~reviewed_manual_rows['sentiment'].isin(valid_sentiments) ] assert needs_review.empty, f"{len(needs_review)} manual-analysis rows missing sentiment -1/0/1" print("Verification: ✓ All Manual-analysis rows have valid sentiment values") return (reviewed_manual_rows,) @app.cell(hide_code=True) def _(mo): mo.md(r""" ## Recombine """) return @app.cell def _(pd, reviewed_manual_rows, sentiment_df): _static_analysis_rows = sentiment_df[~sentiment_df['manual_analysis']] if isinstance(reviewed_manual_rows, pd.DataFrame): recombined_df = pd.concat([_static_analysis_rows, reviewed_manual_rows]).sort_values(by='_seq_id').reset_index(drop=True) else: recombined_df = sentiment_df recombined_df return (recombined_df,) @app.cell(hide_code=True) def _(mo): mo.md(r""" # Step 3: Process 'Other' tags These need to be reviewed manually for interesting content """) return @app.cell def _(mo): mo.md(r""" """) return @app.cell def _(): return @app.cell(hide_code=True) def _(mo): mo.md(r""" # Save to CSV """) return @app.cell def _(WORKING_DIR, datetime, interview_select, recombined_df): # Save to CSV in working dir timestamp = datetime.now().strftime("%Y%m%d-%H%M%S") filename = WORKING_DIR / f"{interview_select.value.split(' ')[0]}_sentiments.csv" recombined_df.to_csv(filename, index=False) print(f"✓ Saved processed data to '{filename}'") return if __name__ == "__main__": app.run()