# Interview-Analysis/02_Taguette_Post-Process.py

import marimo

# Version string recorded alongside the notebook (presumably the marimo
# release that wrote this file -- confirm before relying on it).
__generated_with = "0.18.3"
# Notebook application object; each cell below registers itself through the
# `@app.cell` decorator.
app = marimo.App(width="medium")


@app.cell
def _():
    # Setup cell: shared imports plus the input/output directories used by the
    # rest of the notebook.
    import marimo as mo
    import pandas as pd
    from pathlib import Path
    from datetime import datetime

    TAGUETTE_EXPORT_DIR = Path('./data/transcripts/taguette_results')
    WORKING_DIR = Path('./data/processing/02_taguette_postprocess')

    # exist_ok=True makes directory creation idempotent and removes the
    # check-then-create race of a separate exists() test. It still raises if
    # the path exists but is not a directory, which surfaces a broken layout
    # early instead of at the first read/write.
    for _directory in (WORKING_DIR, TAGUETTE_EXPORT_DIR):
        _directory.mkdir(parents=True, exist_ok=True)
    return TAGUETTE_EXPORT_DIR, WORKING_DIR, datetime, mo, pd


# Markdown cell (code hidden): manual export instructions. The save location
# shown in the last step is interpolated from TAGUETTE_EXPORT_DIR.
@app.cell(hide_code=True)
def _(TAGUETTE_EXPORT_DIR, mo):
    mo.md(rf"""
    # Step 1: Export All Highlights out of Taguette

    1. Go to: http://taguette.tail44fa00.ts.net/project/1
    2. Select 'Highlights' on left
    3. Select 'See all highlights'
    4. Top right 'Export this view' > 'CSV'
    5. Save to '{TAGUETTE_EXPORT_DIR}/all_tags.csv'
    """)
    return


# Markdown cell (code hidden): section heading for the import step.
@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    # Step 2: Import here for processing
    """)
    return


@app.cell
def _(TAGUETTE_EXPORT_DIR, pd):
    # Load the Taguette export from the directory the Step 1 instructions tell
    # the user to save into, rather than repeating the path as a second
    # string literal that could drift out of sync.
    all_tags_df = pd.read_csv(TAGUETTE_EXPORT_DIR / 'all_tags.csv')

    # Sequential id capturing the export's row order; the recombine cell sorts
    # on this column to restore that order.
    all_tags_df['_seq_id'] = range(len(all_tags_df))

    # Last expression = cell output: preview of the first 20 rows.
    all_tags_df.head(20)
    return (all_tags_df,)


# Markdown cell (code hidden): section heading for per-interview processing.
@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    # Step 3: Process each 'Interview'
    """)
    return


@app.cell
def _(all_tags_df, mo):
    # One dropdown entry per distinct value in the 'document' column.
    _document_names = all_tags_df['document'].unique().tolist()

    interview_select = mo.ui.dropdown(
        label="Select Interview to Process",
        options=_document_names,
        full_width=True,
    )

    # Last expression = cell output: render the dropdown.
    interview_select
    return (interview_select,)


@app.cell
def _(all_tags_df, interview_select):
    # Restrict the export to the interview chosen in the dropdown. The copy
    # gives later cells their own frame to add columns to.
    _is_selected_document = all_tags_df['document'] == interview_select.value
    df = all_tags_df[_is_selected_document].copy()
    return (df,)


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    ### Add `_context` column to track Voice / Character is being referred to per highlight
    Create a new column 'context', which is defined by the last '_V-' or '_C-' tag seen in the 'tags' column', when moving row by row from top to bottom.

    1. Iterates through the dataframe in document order (row by row)
    2. Uses a set to track which highlight IDs we've already processed
    3. When we encounter a new highlight ID for the first time, we process all its rows
    4. Collects all _V- or _C- tags within that highlight
    5. Assigns the context to all rows with that ID
    6. This preserves document order and handles multi-tag highlights correctly


    Example of challenging case:

    | id  | document | tag                                | content | _seq_id | _context         |
    |-----|-------------|------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------|----------------------|
    | 88  | P2 - Done   | _V-54                              | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them.                                                                                                                                                    | 117        | _V-54, _V-41         |
    | 88  | P2 - Done   | _V-41                              | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them.                                                                                                                                                    | 118        | _V-54, _V-41         |
    | 88  | P2 - Done   | VT - Human / Artificial            | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them.                                                                                                                                                    | 119        | _V-54, _V-41         |
    | 88  | P2 - Done   | VT - Friendliness / Empathy        | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them.                                                                                                                                                    | 120        | _V-54, _V-41         |
    """)
    return


@app.cell
def _(df):
    # Annotate `df` in place with a `_context` column. Each highlight (rows
    # sharing an 'id') gets the '_V-' / '_C-' tag(s) attached to it; a
    # highlight without such a tag inherits the context of the most recent
    # highlight that had one.
    df['_context'] = None
    _last_context = None
    _processed_ids = set()

    # Walk highlight ids in document order; each id is handled once, the first
    # time it appears.
    for _highlight_id in df['id']:
        if _highlight_id in _processed_ids:
            continue
        _processed_ids.add(_highlight_id)

        # One mask per highlight, reused for both reading tags and writing
        # the context.
        _in_highlight = df['id'] == _highlight_id

        # Only string tags can be context tags. Non-string values (e.g. NaN
        # from an empty CSV field) are skipped instead of raising TypeError
        # on the substring test.
        _context_tags = [
            _tag
            for _tag in df.loc[_in_highlight, 'tag']
            if isinstance(_tag, str) and ('_V-' in _tag or '_C-' in _tag)
        ]

        if _context_tags:
            # Several context tags on one highlight are kept together,
            # comma-separated; a later cell splits them into separate rows.
            _last_context = ', '.join(_context_tags)

        # Highlights without their own context tag fall back to the last one
        # seen (None until the first context tag appears).
        df.loc[_in_highlight, '_context'] = _last_context

    # Last expression = cell output: the annotated frame.
    df
    return


# Markdown cell (code hidden): describes how rows with several contexts are
# split and flagged for manual analysis.
@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    ## Resolve multi-context rows (only VT- and CT- theme tags)

    For rows that have multiple contexts (e.g., both _V-54 and _V-41)
    - split these into separate rows for each context.
    - Then mark these for 'manual_analysis'
    """)
    return


@app.cell
def _(df, pd):
    # Split every row whose `_context` lists several comma-separated contexts
    # into one row per context, flag those rows for manual analysis, and keep
    # only the theme rows ('VT -' / 'CT -' tags) in `sentiment_df`.
    _theme_prefixes = ('VT -', 'CT -')
    _expanded_rows = []

    for _, _row in df.iterrows():
        _context_value = _row['_context']

        # Missing or single-context rows pass through unchanged.
        if pd.isna(_context_value) or ',' not in str(_context_value):
            _single = _row.copy()
            _single['manual_analysis'] = False
            _expanded_rows.append(_single)
            continue

        # Multi-context rows: one copy per context, each needing manual review.
        for _ctx in str(_context_value).split(','):
            _split = _row.copy()
            _split['_context'] = _ctx.strip()
            _split['manual_analysis'] = True

            # Theme rows get an empty sentiment for the reviewer to fill in.
            if str(_split['tag']).startswith(_theme_prefixes):
                _split['sentiment'] = None

            _expanded_rows.append(_split)

    if _expanded_rows:
        _expanded_df = pd.DataFrame(_expanded_rows).reset_index(drop=True)
    else:
        # pd.DataFrame([]) has no columns at all, so the 'tag' lookup below
        # would raise KeyError when `df` is empty (e.g. no rows match the
        # dropdown selection). Keep df's columns and add an empty boolean
        # flag column instead.
        _expanded_df = df.assign(
            manual_analysis=pd.Series(False, index=df.index, dtype=bool)
        ).reset_index(drop=True)

    # na=False: rows whose tag is missing are not theme rows.
    sentiment_df = _expanded_df.loc[
        _expanded_df['tag'].str.startswith(_theme_prefixes, na=False)
    ].copy()

    _manual_rows = sentiment_df[sentiment_df['manual_analysis']]
    if _manual_rows.empty:
        print("✓ No multi-context rows found")
    else:
        print(
            f"⚠️  {len(_manual_rows)} rows were created from multi-context splits. "
            "See next cell for manual review."
        )
    return (sentiment_df,)


@app.cell
def _(mo, sentiment_df):
    # Rows flagged for manual analysis. `sentiment_df` already holds only
    # 'VT -' / 'CT -' theme rows, so no extra tag filter is applied here.
    _needs_manual_review = sentiment_df['manual_analysis']
    rows_to_edit = sentiment_df[_needs_manual_review]

    # Editable table wrapped in a form; the validation cell reads the form's
    # value and waits while it is still None.
    _editor = mo.ui.data_editor(rows_to_edit)
    split_rows_editor = _editor.form(label="Update Sentiment / Manual Flag")
    return rows_to_edit, split_rows_editor


@app.cell(hide_code=True)
def _(mo, rows_to_edit, split_rows_editor):
    # Review instructions (with the number of split rows) shown above the
    # editable form.
    _instructions = mo.md(f"""
        ### ⚠️ Manual Review Required

        **{len(rows_to_edit)} rows** were split from multi-context entries.
        Please review them below:
        1. Update the `sentiment` column (-1, 0, 1) for each row based on the specific context.
        2. Click **Submit** to apply changes.
        """)

    # Last expression = cell output: instructions stacked over the editor.
    mo.vstack([_instructions, split_rows_editor])
    return


@app.cell
def _(mo, split_rows_editor):
    # Validate the manually reviewed rows once the review form has a value.
    # Until then, mo.stop is given the "no value yet" condition together with
    # the prompt to display.
    mo.stop(split_rows_editor.value is None, mo.md("Submit your changes."))
    reviewed_manual_rows = split_rows_editor.value

    # Every manual-analysis row must carry a sentiment of -1, 0 or 1.
    if not reviewed_manual_rows.empty:
        _valid_sentiments = {-1, 0, 1}
        _unresolved = reviewed_manual_rows[
            reviewed_manual_rows['manual_analysis']
            & ~reviewed_manual_rows['sentiment'].isin(_valid_sentiments)
        ]
        # Raise explicitly rather than `assert`: assertions are stripped when
        # Python runs with -O, which would silently skip this check.
        if not _unresolved.empty:
            raise ValueError(
                f"{len(_unresolved)} manual-analysis rows missing sentiment -1/0/1"
            )

    print("Verification: ✓ All Manual-analysis rows have valid sentiment values")
    return (reviewed_manual_rows,)


# Markdown cell (code hidden): defines the sentiment scale (+1 / 0 / -1) used
# for 'VT -' and 'CT -' theme tags.
@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    # Highlight Sentiment Analysis

    For each row in the dataframe, analyze the sentiment of the 'content' regarding the respective tag. This should be done for all 'VT -' and 'CT -' tags, since these represent the 'VoiceThemes' and 'CharacterThemes' respectively. The results should be stored in a new 'sentiment' column.

    Values to be used:
    - Positive: +1
    - Neutral: 0
    - Negative: -1
    """)
    return


@app.cell
def _(sentiment_df):
    # Placeholder sentiment scoring: fills `sentiment` with random values for
    # testing. Only rows NOT flagged for manual analysis are scored; manual
    # rows are left without a value in this frame.
    import random

    def dummy_sentiment_analysis(content, tag):
        # `content` is accepted but ignored by this random stand-in.
        if tag.startswith(('VT -', 'CT -')):
            return random.choice([-1, 0, 1])  # Random sentiment for testing
        return None

    _auto_rows = sentiment_df[~sentiment_df['manual_analysis']]
    if _auto_rows.empty:
        # apply(axis=1) on an empty selection returns a DataFrame rather than
        # a Series, which cannot be assigned to a single column. With nothing
        # to score, just create the column with no values.
        sentiment_df['sentiment'] = None
    else:
        # Index alignment leaves rows outside `_auto_rows` (the manual ones)
        # as missing values.
        sentiment_df['sentiment'] = _auto_rows.apply(
            lambda row: dummy_sentiment_analysis(row['content'], row['tag']),
            axis=1,
        )

    # Last expression = cell output: the automatically scored rows.
    sentiment_df[~sentiment_df['manual_analysis']]
    return


# Markdown cell (code hidden): section heading for the recombine step.
@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    ## Recombine
    """)
    return


@app.cell
def _(pd, reviewed_manual_rows, sentiment_df):
    # Merge the automatically scored rows with the manually reviewed ones and
    # order the result by the `_seq_id` column.
    _auto_scored_rows = sentiment_df.loc[~sentiment_df['manual_analysis']]
    _combined = pd.concat([_auto_scored_rows, reviewed_manual_rows])
    recombined_df = _combined.sort_values(by='_seq_id').reset_index(drop=True)

    # Last expression = cell output: the recombined frame.
    recombined_df
    return (recombined_df,)


# Markdown cell (code hidden): heading for the manual review of 'Other' tags.
# Numbered Step 4 because "Step 3" is already used by the per-interview
# processing section.
@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    # Step 4: Process 'Other' tags

    These need to be reviewed manually for interesting content
    """)
    return


# Markdown cell whose text is blank (whitespace only); apparently a
# placeholder for notes on the 'Other' tags -- TODO confirm or remove.
@app.cell
def _(mo):
    mo.md(r"""

    """)
    return


# Empty cell: takes no inputs and defines nothing.
@app.cell
def _():
    return


# Markdown cell (code hidden): section heading for the CSV export step.
@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    # Save to CSV
    """)
    return


@app.cell
def _(WORKING_DIR, datetime, interview_select, recombined_df):
    # Write the recombined rows to a timestamped CSV in WORKING_DIR, named
    # after the first space-separated token of the selected interview.
    _selected_interview = interview_select.value
    if _selected_interview is None:
        # Without a selection, `.split` on None would fail with an opaque
        # AttributeError; report the real problem instead.
        raise ValueError("No interview selected; choose one before saving.")

    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    _interview_prefix = _selected_interview.split(' ')[0]
    filename = WORKING_DIR / f"{_interview_prefix}_sentiments_{timestamp}.csv"
    recombined_df.to_csv(filename, index=False)

    print(f"✓ Saved processed data to '{filename}'")
    return


# Run the notebook app when this file is executed directly as a script.
if __name__ == "__main__":
    app.run()