import marimo

__generated_with = "0.18.3"
app = marimo.App(width="medium")


@app.cell
def _():
    import marimo as mo
    import pandas as pd
    return mo, pd


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    # Step 1: Export All Highlights

    1. Go to: http://taguette.tail44fa00.ts.net/project/1
    2. Select 'Highlights' on left
    3. Select 'See all highlights'
    4. Top right 'Export this view' > 'CSV'
    5.
    """)
    return


@app.cell
def _(pd):
    # Load the Taguette CSV export and record the original row order in
    # '_seq_id'; later steps rely on document order for context propagation.
    all_tags_df = pd.read_csv('data/transcripts/taguette_results/all_tags.csv')
    all_tags_df['_seq_id'] = range(len(all_tags_df))
    all_tags_df.head(20)
    return (all_tags_df,)


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    ### Post-process the dataframe so it can be easily analyzed

    Create a new column 'context', which is defined by the last '_V-' or '_C-' tag seen
    in the 'tags' column', when moving row by row from top to bottom.

    1. Iterates through the dataframe in document order (row by row)
    2. Uses a set to track which highlight IDs we've already processed
    3. When we encounter a new highlight ID for the first time, we process all its rows
    4. Collects all _V- or _C- tags within that highlight
    5. Assigns the context to all rows with that ID
    6. This preserves document order and handles multi-tag highlights correctly

    Example of challenging case:

    | id | document | tag | content | _seq_id | _context |
    |-----|-------------|------------------------------------|---------|------------|----------------------|
    | 252 | P2 - Done | _C-Counselor | So we've pulled through your top personality, which was the counselor, and then we've included those same twelve voices from before. And your task now is to select which of the voices you feel best suits this character that would be, the personality and voice for Chase's digital assistant. | 115 | _C-Counselor |
    | 88 | P2 - Done | VT - Knowledgeable / Trust | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 116 | _V-54, _V-41 |
    | 88 | P2 - Done | _V-54 | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 117 | _V-54, _V-41 |
    | 88 | P2 - Done | _V-41 | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 118 | _V-54, _V-41 |
    | 88 | P2 - Done | VT - Human / Artificial | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 119 | _V-54, _V-41 |
    | 88 | P2 - Done | VT - Friendliness / Empathy | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 120 | _V-54, _V-41 |
    | 90 | P2 - Done | VT - Personal 'click' | I picked the female because her voice is so unique. | 121 | _V-41 |
    | 90 | P2 - Done | _V-41 | I picked the female because her voice is so unique. | 122 | _V-41 |
    """)
    return


@app.cell
def _(all_tags_df):
    # Assign a '_context' (the Voice/Character being discussed) to every row.
    #
    # Highlights are processed in order of first appearance of their 'id'
    # (document order). For each highlight we collect all of its '_V-'/'_C-'
    # tags; a highlight with none inherits the previous highlight's context.
    all_tags_df['_context'] = None
    last_context = None

    # drop_duplicates() preserves first-appearance order, which matches the
    # original row-by-row walk but avoids re-scanning already-processed ids.
    for highlight_id in all_tags_df['id'].drop_duplicates():
        mask = all_tags_df['id'] == highlight_id

        # Collect the Voice (_V-) / Character (_C-) marker tags of this
        # highlight. isinstance guards against NaN cells in the CSV export.
        context_tags = [
            tag
            for tag in all_tags_df.loc[mask, 'tag']
            if isinstance(tag, str) and ('_V-' in tag or '_C-' in tag)
        ]

        if context_tags:
            context_tag = ', '.join(context_tags)
            last_context = context_tag
        else:
            # No context tag in this highlight: carry the last one forward.
            context_tag = last_context

        # Assign the context to every row of this highlight.
        all_tags_df.loc[mask, '_context'] = context_tag

    all_tags_df
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    # Step 2: Sentiment Analysis

    For each row in the dataframe, analyze the sentiment of the 'content' regarding
    the respective tag. This should be done for all 'VT -' and 'CT -' tags, since
    these represent the 'VoiceThemes' and 'CharacterThemes' respectively.
    The results should be stored in a new 'sentiment' column.

    Values to be used:

    - Positive: +1
    - Neutral: 0
    - Negative: -1
    """)
    return


@app.cell
def _(all_tags_df):
    # TODO: Implement sentiment analysis and add 'sentiment' column
    # For now, fill the column with randomized dummy values so downstream
    # cells can be developed; only 'VT -' / 'CT -' theme tags get a value.
    import random

    def dummy_sentiment_analysis(content, tag):
        """Return a random sentiment in {-1, 0, +1} for theme tags, else None."""
        # isinstance guards against NaN/float tag cells from the CSV export.
        if isinstance(tag, str) and (tag.startswith('VT -') or tag.startswith('CT -')):
            return random.choice([-1, 0, 1])  # Random sentiment for testing
        return None

    all_tags_df['sentiment'] = all_tags_df.apply(
        lambda row: dummy_sentiment_analysis(row['content'], row['tag']), axis=1
    )
    all_tags_df
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    ## Step 2b: Resolve multi-context rows

    For rows that have multiple contexts (e.g., both _V-54 and _V-41), split these
    into separate rows for each context, removing the content and sentiment analysis
    for each new row. Then mark these for manual review. Use marimo's interactive
    notebook editing features to facilitate this process.

    This ensures that each row corresponds to a single context for clearer analysis
    in subsequent steps. Add verification column to mark these rows for review.
    Run assert at the end to ensure no rows have multiple contexts and if that
    passes, drop the verification column.
    """)
    return


@app.cell
def _(all_tags_df, pd):
    # Rows whose '_context' holds several comma-separated contexts must be
    # duplicated: one row per context, flagged '_was_split' for manual review
    # (the copied sentiment may not apply to every context equally).
    all_tags_df['_needs_split'] = all_tags_df['_context'].apply(
        lambda x: ',' in str(x) if pd.notna(x) else False
    )

    expanded_rows = []
    for _, _row in all_tags_df.iterrows():
        if _row['_needs_split']:
            # One new row per individual context.
            for ctx in (c.strip() for c in str(_row['_context']).split(',')):
                new_row = _row.copy()
                new_row['_context'] = ctx
                new_row['_was_split'] = True  # Mark for manual review
                expanded_rows.append(new_row)
        else:
            # Keep single-context rows as-is.
            new_row = _row.copy()
            new_row['_was_split'] = False
            expanded_rows.append(new_row)

    expanded_df2 = pd.DataFrame(expanded_rows).reset_index(drop=True)

    if expanded_df2[expanded_df2['_was_split']].empty:
        print("✓ No multi-context rows found")

    # Display only the rows created by splitting, for manual review.
    expanded_df2[expanded_df2['_was_split']]
    return (expanded_df2,)


@app.cell
def _():
    # Using marimo's interactive notebook editing features, have the user
    # manually update the sentiment values for the split rows as needed.
    # (only for 'VT -' and 'CT -' tags)
    return


@app.cell
def _(expanded_df2, pd):
    # Verify the split produced single-context rows only, then drop the
    # bookkeeping columns.
    try:
        has_comma = expanded_df2['_context'].apply(
            lambda x: ',' in str(x) if pd.notna(x) else False
        )
        assert not has_comma.any(), "Some rows still have multiple contexts (comma-separated)"

        # NOTE(review): '_was_split' is never cleared after manual review, so
        # this assert fails whenever any split occurred — confirm the intended
        # workflow (e.g. reset the flag once a row has been checked).
        assert expanded_df2['_was_split'].sum() == 0, "Some rows still need manual review"

        print("\n✓ Verification passed: All rows have single contexts")

        # Drop verification columns since verification passed.
        expanded_df_final = expanded_df2.drop(columns=['_needs_split', '_was_split'])
        print("✓ Verification columns dropped")
        # NOTE(review): expanded_df_final is not returned from this cell, so
        # later cells cannot reference it — Step 3 below still operates on
        # all_tags_df. Confirm whether it should be exported instead.
        expanded_df_final
    except AssertionError as e:
        print(f"\n❌ Verification failed: {e}")
        print("Please review the data before proceeding")
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    # Step 3: Create Matrices for each interview

    For each interview (document), create a matrix where:

    - Rows represent the different Voices/Characters (based on '_V-' and '_C-' tags)
    - Columns represent the different VoiceThemes/CharacterThemes (based on 'VT -' and 'CT -' tags)
    - Each cell contains the aggregated sentiment score for that Voice/Character regarding that combination
    """)
    return


@app.cell
def _(all_tags_df, pd):
    # NOTE(review): this consumes all_tags_df (multi-context rows still joined
    # with commas); presumably Step 3 should consume the split/verified
    # dataframe from Step 2b instead — confirm.

    def create_sentiment_matrix(df, document_name):
        """
        Create a sentiment matrix for a specific document.

        Rows are Voices/Characters ('_context'), columns are theme tags
        ('tag'), and each cell is the summed sentiment for that pair.

        Parameters:
        - df: DataFrame with columns ['document', 'tag', '_context', 'sentiment']
        - document_name: Name of the document to filter by

        Returns:
        - DataFrame representing the sentiment matrix (empty if no data)
        """
        # Filter for the specific document.
        doc_df = df[df['document'] == document_name].copy()

        # Only theme-tag rows ('VT -' / 'CT -') carry sentiment values.
        sentiment_rows = doc_df[doc_df['sentiment'].notna()].copy()
        if sentiment_rows.empty:
            print(f"No sentiment data found for document: {document_name}")
            return pd.DataFrame()

        # Keep rows attributed to a Voice/Character context.
        valid_rows = sentiment_rows[
            sentiment_rows['_context'].notna()
            & sentiment_rows['_context'].str.contains('_V-|_C-', na=False)
        ].copy()
        if valid_rows.empty:
            print(f"No Voice/Character context found for document: {document_name}")
            return pd.DataFrame()

        # Sum sentiment per (Voice/Character, theme) pair and pivot into a
        # matrix; combinations with no data become 0.
        matrix_data = valid_rows.groupby(['_context', 'tag'])['sentiment'].sum().reset_index()
        matrix = matrix_data.pivot(index='_context', columns='tag', values='sentiment')
        # Integers give a cleaner display than floats with trailing .0.
        return matrix.fillna(0).astype(int)

    # Create matrices for each unique document.
    documents = all_tags_df['document'].unique()
    matrices = {}
    for doc in documents:
        print(f"\n{'='*60}")
        print(f"Document: {doc}")
        print('='*60)

        matrix = create_sentiment_matrix(all_tags_df, doc)
        if not matrix.empty:
            matrices[doc] = matrix
            print(matrix)
        else:
            print("No matrix data available")

    # Dict of document -> sentiment matrix for further analysis.
    matrices
    return


if __name__ == "__main__":
    app.run()