Files
Interview-Analysis/02_Taguette_Post-Process.py
2025-12-09 21:05:07 +01:00

333 lines
12 KiB
Python

import marimo

# Version of marimo that generated this notebook file.
__generated_with = "0.18.3"
app = marimo.App(width="medium")
@app.cell
def _():
    # Setup cell: imports plus the input/output directory layout.
    import marimo as mo
    import pandas as pd
    from pathlib import Path

    # Where the Taguette CSV export is dropped (Step 1) and where
    # post-processed artifacts will go.
    TAGUETTE_EXPORT_DIR = Path('./data/transcripts/taguette_results')
    WORKING_DIR = Path('./data/processing/02_taguette_postprocess')

    # exist_ok=True replaces the exists()-then-mkdir dance and avoids the
    # race between the check and the creation.
    WORKING_DIR.mkdir(parents=True, exist_ok=True)
    TAGUETTE_EXPORT_DIR.mkdir(parents=True, exist_ok=True)
    return TAGUETTE_EXPORT_DIR, mo, pd
@app.cell(hide_code=True)
def _(TAGUETTE_EXPORT_DIR, mo):
    # Manual export checklist. The destination path is interpolated so the
    # instructions always match the configured export directory.
    # (Fixed typo: "hightlights" -> "highlights".)
    mo.md(rf"""
# Step 1: Export All Highlights out of Taguette
1. Go to: http://taguette.tail44fa00.ts.net/project/1
2. Select 'Highlights' on left
3. Select 'See all highlights'
4. Top right 'Export this view' > 'CSV'
5. Save to '{TAGUETTE_EXPORT_DIR}/all_tags.csv'
""")
    return
@app.cell(hide_code=True)
def _(mo):
    # Section header only — no computation.
    mo.md(r"""
# Step 2: Import here for processing
""")
    return
@app.cell
def _(TAGUETTE_EXPORT_DIR, pd):
    # Load the Taguette CSV export. Reuse the directory constant from the
    # setup cell instead of repeating the path as a string literal, so the
    # Step 1 instructions and this reader can never drift apart.
    all_tags_df = pd.read_csv(TAGUETTE_EXPORT_DIR / 'all_tags.csv')
    # _seq_id records the original document order so later context
    # propagation can walk the rows top-to-bottom.
    all_tags_df['_seq_id'] = range(len(all_tags_df))
    all_tags_df.head(20)
    return (all_tags_df,)
@app.cell(hide_code=True)
def _(mo):
    # Section header only — no computation.
    mo.md(r"""
# Step 3: Process each 'Interview'
""")
    return
@app.cell
def _(all_tags_df, mo):
    # One dropdown entry per distinct source document in the export.
    _document_names = all_tags_df['document'].unique().tolist()
    file_dropdown = mo.ui.dropdown(
        options=_document_names,
        label="Select Interview to Process",
        full_width=True,
    )
    file_dropdown
    return (file_dropdown,)
@app.cell
def _(all_tags_df, file_dropdown):
    # Restrict the working frame to the interview chosen in the dropdown;
    # copy so later mutations never touch all_tags_df itself.
    _selected = all_tags_df['document'] == file_dropdown.value
    df = all_tags_df[_selected].copy()
    return (df,)
@app.cell(hide_code=True)
def _(mo):
    # Explanatory markdown for the next cell's context-propagation logic,
    # including a worked example of a multi-tag highlight.
    mo.md(r"""
### Add `_context` column to track Voice / Character is being referred to per highlight
Create a new column 'context', which is defined by the last '_V-' or '_C-' tag seen in the 'tags' column', when moving row by row from top to bottom.
1. Iterates through the dataframe in document order (row by row)
2. Uses a set to track which highlight IDs we've already processed
3. When we encounter a new highlight ID for the first time, we process all its rows
4. Collects all _V- or _C- tags within that highlight
5. Assigns the context to all rows with that ID
6. This preserves document order and handles multi-tag highlights correctly
Example of challenging case:
| id | document | tag | content | _seq_id | _context |
|-----|-------------|------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------|----------------------|
| 88 | P2 - Done | _V-54 | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 117 | _V-54, _V-41 |
| 88 | P2 - Done | _V-41 | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 118 | _V-54, _V-41 |
| 88 | P2 - Done | VT - Human / Artificial | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 119 | _V-54, _V-41 |
| 88 | P2 - Done | VT - Friendliness / Empathy | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 120 | _V-54, _V-41 |
""")
    return
@app.cell
def _(df):
    # Propagate the speaker context (_V-* voice / _C-* character tags) onto
    # every row. Rows are grouped by highlight id; a highlight's context is
    # the comma-joined list of its own _V-/_C- tags, or — when it has
    # none — the context of the most recent highlight that had one
    # (document order).
    df['_context'] = None
    last_context = None
    # drop_duplicates() keeps the first occurrence of each id, so highlight
    # ids are visited in document order (df is already in _seq_id order).
    for highlight_id in df['id'].drop_duplicates():
        group_tags = df.loc[df['id'] == highlight_id, 'tag']
        # astype(str) guards against NaN tags, which would otherwise make
        # the substring test raise TypeError on a float.
        context_tags = [t for t in group_tags.astype(str)
                        if '_V-' in t or '_C-' in t]
        if context_tags:
            last_context = ', '.join(context_tags)
        # Highlights without their own context inherit the previous one.
        df.loc[df['id'] == highlight_id, '_context'] = last_context
    df
    return
@app.cell(hide_code=True)
def _(mo):
    # Explanatory markdown for the multi-context splitting cell below.
    mo.md(r"""
## Resolve multi-context rows (only VT- and CT- theme tags)
For rows that have multiple contexts (e.g., both _V-54 and _V-41)
- split these into separate rows for each context.
- Then mark these for 'manual_analysis'
""")
    return
@app.cell
def _(df, pd):
    # Split multi-context rows (e.g. "_V-54, _V-41") into one row per
    # context and flag them for manual review; single-context rows pass
    # through unchanged with manual_analysis=False.
    _result_rows = []
    for _, _source in df.iterrows():
        _ctx = _source['_context']
        if pd.notna(_ctx) and ',' in str(_ctx):
            # One clone per context; sentiment on theme tags is cleared so
            # it must be re-assessed per context by hand.
            for _single in (_part.strip() for _part in str(_ctx).split(',')):
                _clone = _source.copy()
                _clone['_context'] = _single
                _clone['manual_analysis'] = True
                if str(_clone['tag']).startswith(('VT -', 'CT -')):
                    _clone['sentiment'] = None
                _result_rows.append(_clone)
        else:
            _clone = _source.copy()
            _clone['_context'] = _ctx
            _clone['manual_analysis'] = False
            _result_rows.append(_clone)
    expanded_df_raw = pd.DataFrame(_result_rows).reset_index(drop=True)
    manual_rows = expanded_df_raw[expanded_df_raw['manual_analysis']]
    if manual_rows.empty:
        print("✓ No multi-context rows found")
    else:
        print(
            f"⚠️ {len(manual_rows)} rows were created from multi-context splits. "
            "See next cell for manual review."
        )
    return (expanded_df_raw,)
@app.cell
def _(expanded_df_raw, mo):
    # Only split rows whose tag is a theme tag ('VT -'/'CT -') need a
    # sentiment judgement, so those are the only ones surfaced for editing.
    _is_manual = expanded_df_raw['manual_analysis']
    _is_theme = expanded_df_raw['tag'].str.startswith(('VT -', 'CT -'), na=False)
    rows_to_edit = expanded_df_raw[_is_manual & _is_theme]
    # The editor is wrapped in a form so edits only apply on Submit.
    split_rows_editor = mo.ui.data_editor(rows_to_edit).form(
        label="Update Sentiment / Manual Flag"
    )
    return rows_to_edit, split_rows_editor
@app.cell(hide_code=True)
def _(mo, rows_to_edit, split_rows_editor):
    # Review UI: instructions plus the data editor stacked vertically.
    mo.vstack([
        mo.md(f"""
### ⚠️ Manual Review Required
**{len(rows_to_edit)} rows** were split from multi-context entries.
Please review them below:
1. Update the `sentiment` column (-1, 0, 1) for each row based on the specific context.
2. Uncheck `manual_analysis` when you are done reviewing a row.
3. Click **Submit** to apply changes.
"""),
        split_rows_editor
    ])
    return
@app.cell
def _(expanded_df_raw, mo, pd, split_rows_editor):
    # Reconstruct the full dataframe using the editor's current value.
    # This will update whenever the user edits the table.
    # mo.stop halts this cell (and its dependents) until the form has been
    # submitted at least once.
    mo.stop(split_rows_editor.value is None, mo.md("Submit your changes."))
    _edited_rows = split_rows_editor.value
    _static_rows = expanded_df_raw[~expanded_df_raw['manual_analysis']]
    # NOTE(review): sort_index() assumes the editor preserves the original
    # row index of rows_to_edit — confirm, otherwise edited rows may land
    # out of document order after the concat.
    expanded_df2 = pd.concat([_static_rows, _edited_rows]).sort_index()
    return (expanded_df2,)
@app.cell
def _(expanded_df2, pd):
    # Sanity-check the reviewed dataframe before continuing:
    #  1. no row may still carry a comma-separated multi-context value;
    #  2. theme rows ('VT -'/'CT -') still flagged manual_analysis must
    #     have a sentiment filled in from the review step.
    # NOTE(review): assert statements vanish under `python -O`, so these
    # checks silently disappear in optimized runs — consider explicit
    # raises instead.
    try:
        has_comma = expanded_df2['_context'].apply(lambda x: ',' in str(x) if pd.notna(x) else False)
        assert not has_comma.any(), "Some rows still have multiple contexts (comma-separated)"
        # Verify that rows still marked for manual analysis have sentiment values
        manual_sent_rows = expanded_df2[expanded_df2['manual_analysis']]
        theme_rows = manual_sent_rows[manual_sent_rows['tag'].str.startswith(('VT -', 'CT -'), na=False)]
        missing_sentiment = theme_rows[theme_rows['sentiment'].isna()]
        assert missing_sentiment.empty, (
            f"{len(missing_sentiment)} rows marked for manual analysis "
            "have missing sentiment values"
        )
        print("\n✓ Verification passed: Manual-analysis rows are consistent")
        # NOTE(review): expanded_df_final is not in this cell's return, so
        # downstream cells cannot reference it — verify this is intended.
        expanded_df_final = expanded_df2
        expanded_df_final
    except AssertionError as e:
        print(f"\n❌ Verification failed: {e}")
        print("Please review the data before proceeding")
    return
@app.cell(hide_code=True)
def _(mo):
    # Explanatory markdown for the sentiment-analysis cell below.
    mo.md(r"""
# Highlight Sentiment Analysis
For each row in the dataframe, analyze the sentiment of the 'content' regarding the respective tag. This should be done for all 'VT -' and 'CT -' tags, since these represent the 'VoiceThemes' and 'CharacterThemes' respectively. The results should be stored in a new 'sentiment' column.
Values to be used:
- Positive: +1
- Neutral: 0
- Negative: -1
""")
    return
@app.cell
def _(df):
    # TODO: Implement real sentiment analysis and populate 'sentiment'.
    # For now, attach randomized placeholder values so downstream cells
    # can be exercised; only 'VT -' and 'CT -' theme tags get a value.
    import random

    def dummy_sentiment_analysis(content, tag):
        """Return a random sentiment (-1/0/+1) for theme tags, else None.

        `content` is unused by the placeholder but kept in the signature so
        a real implementation can drop in without changing the call site.
        """
        # str() guards against NaN tags (floats have no .startswith);
        # the tuple argument checks both prefixes in a single call.
        if str(tag).startswith(('VT -', 'CT -')):
            return random.choice([-1, 0, 1])  # Random sentiment for testing
        return None

    df['sentiment'] = df.apply(
        lambda row: dummy_sentiment_analysis(row['content'], row['tag']), axis=1
    )
    df
    return
@app.cell(hide_code=True)
def _(mo):
    # Section header only — no computation.
    mo.md(r"""
# Step 3: Process 'Other' tags
These need to be reviewed manually for interesting content
""")
    return
@app.cell
def _(mo):
    # Placeholder cell — empty markdown, reserved for the 'Other' tag review.
    mo.md(r"""
""")
    return
@app.cell
def _():
    # Empty scratch cell generated by marimo.
    return
# Run the notebook as a script: executes all cells via the marimo runtime.
if __name__ == "__main__":
    app.run()