import marimo

# Version of marimo that generated this notebook file.
__generated_with = "0.18.3"

# Notebook app object; every cell below registers itself against it.
app = marimo.App(width="medium")
@app.cell
def _():
    # Shared imports for the whole notebook; returned so downstream cells
    # receive them through marimo's dependency graph.
    import marimo as mo
    import pandas as pd
    return mo, pd
@app.cell(hide_code=True)
def _(mo):
    # Narrative cell: manual instructions for exporting highlights from Taguette.
    # Fixes: typo 'hightlights' -> 'highlights'; removed a dangling empty '5.' item.
    mo.md(r"""
    # Step 1: Export All Highlights

    1. Go to: http://taguette.tail44fa00.ts.net/project/1
    2. Select 'Highlights' on left
    3. Select 'See all highlights'
    4. Top right 'Export this view' > 'CSV'
    """)
    return
@app.cell
def _(pd):
    # Load the Taguette highlight export; '_seq_id' records the original
    # document order so later context propagation can rely on row sequence.
    _csv_path = 'data/transcripts/taguette_results/all_tags.csv'
    all_tags_df = pd.read_csv(_csv_path)
    all_tags_df['_seq_id'] = range(len(all_tags_df))
    # Cell output: a quick preview of the first rows.
    all_tags_df.head(20)
    return (all_tags_df,)
@app.cell(hide_code=True)
def _(mo):
    # Narrative cell: documents the contract for the '_context' column that the
    # next cell derives, including a worked multi-tag example.
    mo.md(r"""
    ### Post-process the dataframe so it can be easily analyzed

    Create a new column 'context', which is defined by the last '_V-' or '_C-' tag seen in the 'tags' column', when moving row by row from top to bottom.

    1. Iterates through the dataframe in document order (row by row)
    2. Uses a set to track which highlight IDs we've already processed
    3. When we encounter a new highlight ID for the first time, we process all its rows
    4. Collects all _V- or _C- tags within that highlight
    5. Assigns the context to all rows with that ID
    6. This preserves document order and handles multi-tag highlights correctly

    Example of challenging case:

    | id | document | tag | content | _seq_id | _context |
    |-----|-------------|------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------|----------------------|
    | 252 | P2 - Done | _C-Counselor | So we've pulled through your top personality, which was the counselor, and then we've included those same twelve voices from before. And your task now is to select which of the voices you feel best suits this character that would be, the personality and voice for Chase's digital assistant. | 115 | _C-Counselor |
    | 88 | P2 - Done | VT - Knowledgeable / Trust | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 116 | _V-54, _V-41 |
    | 88 | P2 - Done | _V-54 | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 117 | _V-54, _V-41 |
    | 88 | P2 - Done | _V-41 | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 118 | _V-54, _V-41 |
    | 88 | P2 - Done | VT - Human / Artificial | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 119 | _V-54, _V-41 |
    | 88 | P2 - Done | VT - Friendliness / Empathy | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 120 | _V-54, _V-41 |
    | 90 | P2 - Done | VT - Personal 'click' | I picked the female because her voice is so unique. | 121 | _V-41 |
    | 90 | P2 - Done | _V-41 | I picked the female because her voice is so unique. | 122 | _V-41 |
    """)
    return
@app.cell
def _(all_tags_df):
    # Derive '_context': the Voice/Character (_V-/_C-) tags attached to each
    # highlight, carried forward to later highlights that have none of their own.
    #
    # Fixes vs. the original:
    # - '_V-' in tag crashed with TypeError when 'tag' is NaN (float) in the
    #   CSV export; NaN tags are now skipped explicitly.
    # - The O(n^2) pattern (re-filtering the whole frame per highlight, plus a
    #   per-row iterrows pass) is replaced by one groupby pass.
    all_tags_df['_context'] = None
    last_context = None

    # groupby(sort=False) yields highlight ids in order of first appearance,
    # which matches the original "first time we see this id" document-order walk.
    for _highlight_id, _group in all_tags_df.groupby('id', sort=False):
        # Collect every Voice/Character tag inside this highlight.
        context_tags = [
            t for t in _group['tag']
            if isinstance(t, str) and ('_V-' in t or '_C-' in t)
        ]
        if context_tags:
            context_tag = ', '.join(context_tags)
            last_context = context_tag
        else:
            # No context tag in this highlight: inherit the last one seen.
            context_tag = last_context
        # Assign the context to all rows of this highlight.
        all_tags_df.loc[_group.index, '_context'] = context_tag

    all_tags_df
    return
@app.cell(hide_code=True)
def _(mo):
    # Narrative cell: defines the sentiment-coding scheme applied in Step 2.
    mo.md(r"""
    # Step 2: Sentiment Analysis

    For each row in the dataframe, analyze the sentiment of the 'content' regarding the respective tag. This should be done for all 'VT -' and 'CT -' tags, since these represent the 'VoiceThemes' and 'CharacterThemes' respectively. The results should be stored in a new 'sentiment' column.

    Values to be used:

    - Positive: +1
    - Neutral: 0
    - Negative: -1
    """)
    return
@app.cell
def _(all_tags_df):
    # TODO: Implement real sentiment analysis; for now fill the 'sentiment'
    # column with randomized dummy values for testing, but only for theme rows.
    import random

    def dummy_sentiment_analysis(content, tag):
        """Return a random sentiment (-1, 0, or +1) for theme tags, else None.

        Only 'VT -' (VoiceTheme) and 'CT -' (CharacterTheme) rows get a score.
        Guard: 'tag' can be NaN (a float) in a CSV export, which would make
        str.startswith raise AttributeError.
        """
        if isinstance(tag, str) and (tag.startswith('VT -') or tag.startswith('CT -')):
            return random.choice([-1, 0, 1])  # Random sentiment for testing
        return None

    all_tags_df['sentiment'] = all_tags_df.apply(
        lambda row: dummy_sentiment_analysis(row['content'], row['tag']), axis=1
    )

    all_tags_df
    return
@app.cell(hide_code=True)
def _(mo):
    # Narrative cell: describes the multi-context splitting and verification
    # workflow implemented by the next three cells.
    mo.md(r"""
    ## Step 2b: Resolve multi-context rows

    For rows that have multiple contexts (e.g., both _V-54 and _V-41), split these into separate rows for each context, removing the content and sentiment analysis for each new row. Then mark these for manual review. Use marimo's interactive notebook editing features to facilitate this process.

    This ensures that each row corresponds to a single context for clearer analysis in subsequent steps. Add verification column to mark these rows for review. Run assert at the end to ensure no rows have multiple contexts and if that passes, drop the verification column.
    """)
    return
@app.cell
def _(all_tags_df, pd):
    # Step 2b: split rows whose '_context' names several voices/characters
    # (e.g. "_V-54, _V-41") into one row per context, flagged for manual review.
    #
    # Fixes vs. the original:
    # - Per the Step 2b spec, a split row's sentiment no longer refers to a
    #   single context, so it is cleared for manual re-scoring (it was kept).
    # - A bare 'split_rows' expression inside the if-branch did nothing in a
    #   function body (only the cell's last expression is displayed); removed.
    all_tags_df['_needs_split'] = all_tags_df['_context'].apply(
        lambda x: ',' in str(x) if pd.notna(x) else False
    )

    expanded_rows = []
    for _, _row in all_tags_df.iterrows():
        if _row['_needs_split']:
            # Split the context list and emit one copy of the row per context.
            contexts = [c.strip() for c in str(_row['_context']).split(',')]
            for ctx in contexts:
                new_row = _row.copy()
                new_row['_context'] = ctx
                new_row['sentiment'] = None   # ambiguous after the split; re-score manually
                new_row['_was_split'] = True  # mark for manual review
                expanded_rows.append(new_row)
        else:
            # Keep single-context rows as-is.
            new_row = _row.copy()
            new_row['_was_split'] = False
            expanded_rows.append(new_row)

    expanded_df2 = pd.DataFrame(expanded_rows).reset_index(drop=True)

    if expanded_df2[expanded_df2['_was_split']].empty:
        print("✓ No multi-context rows found")

    # Cell output: the rows that need manual review (empty frame if none).
    expanded_df2[expanded_df2['_was_split']]
    return (expanded_df2,)
@app.cell
def _():
    # Placeholder: using marimo's interactive notebook editing features, the user
    # manually updates the sentiment values for the split rows as needed
    # (only rows whose tag starts with 'VT -' or 'CT -' carry sentiment).
    return
@app.cell
def _(expanded_df2, pd):
    # Verify every row has a single context and all splits were reviewed, then
    # drop the helper columns.
    #
    # Fix vs. the original: 'expanded_df_final' was computed but never returned,
    # so the cleaned frame was inaccessible to other cells; it is now exported
    # (None when verification fails).
    expanded_df_final = None
    try:
        has_comma = expanded_df2['_context'].apply(
            lambda x: ',' in str(x) if pd.notna(x) else False
        )
        assert not has_comma.any(), "Some rows still have multiple contexts (comma-separated)"

        # All split rows must have had their review flag cleared manually.
        assert expanded_df2['_was_split'].sum() == 0, "Some rows still need manual review"
        print("\n✓ Verification passed: All rows have single contexts")

        # Drop verification columns since verification passed.
        expanded_df_final = expanded_df2.drop(columns=['_needs_split', '_was_split'])
        print("✓ Verification columns dropped")
    except AssertionError as e:
        print(f"\n❌ Verification failed: {e}")
        print("Please review the data before proceeding")

    expanded_df_final
    return (expanded_df_final,)
@app.cell(hide_code=True)
def _(mo):
    # Narrative cell: describes the per-interview sentiment matrices built below.
    mo.md(r"""
    # Step 3: Create Matrices for each interview

    For each interview (document), create a matrix where:

    - Rows represent the different Voices/Characters (based on '_V-' and '_C-' tags)
    - Columns represent the different VoiceThemes/CharacterThemes (based on 'VT -' and 'CT -' tags)
    - Each cell contains the aggregated sentiment score for that Voice/Character regarding that combination
    """)
    return
@app.cell
def _(all_tags_df, pd):
    # Build one sentiment matrix per interview document.
    #
    # Fixes vs. the original:
    # - Unused 'import numpy as np' removed.
    # - 'matrices' was computed but never returned, so no other cell could use
    #   it; it is now exported.
    def create_sentiment_matrix(df, document_name):
        """
        Create a sentiment matrix for a specific document.

        Rows are Voice/Character contexts (_V-/_C-), columns are theme tags,
        and each cell is the summed sentiment for that combination.

        Parameters:
        - df: DataFrame with columns ['document', 'tag', '_context', 'sentiment']
        - document_name: Name of the document to filter by

        Returns:
        - DataFrame representing the sentiment matrix (empty if no data)
        """
        # Filter for the specific document.
        doc_df = df[df['document'] == document_name].copy()

        # Only theme-tag rows carry sentiment (set in Step 2).
        sentiment_rows = doc_df[doc_df['sentiment'].notna()].copy()
        if sentiment_rows.empty:
            print(f"No sentiment data found for document: {document_name}")
            return pd.DataFrame()

        # Keep rows attributed to a Voice/Character context.
        valid_rows = sentiment_rows[
            sentiment_rows['_context'].notna()
            & sentiment_rows['_context'].str.contains('_V-|_C-', na=False)
        ].copy()
        if valid_rows.empty:
            print(f"No Voice/Character context found for document: {document_name}")
            return pd.DataFrame()

        # Sum sentiment per (context, theme) pair and pivot to a matrix;
        # combinations with no data become 0. Integer cast for cleaner display.
        return (
            valid_rows.groupby(['_context', 'tag'])['sentiment']
            .sum()
            .unstack(fill_value=0)
            .astype(int)
        )

    # Create matrices for each unique document.
    documents = all_tags_df['document'].unique()
    matrices = {}

    for doc in documents:
        print(f"\n{'='*60}")
        print(f"Document: {doc}")
        print('='*60)
        matrix = create_sentiment_matrix(all_tags_df, doc)
        if not matrix.empty:
            matrices[doc] = matrix
            print(matrix)
        else:
            print("No matrix data available")

    # Cell output and export: the per-document matrices for further analysis.
    matrices
    return (matrices,)
# Serve/run the marimo notebook when this file is executed directly.
if __name__ == "__main__":
    app.run()