463 lines
14 KiB
Python
463 lines
14 KiB
Python
# Marimo notebook: post-processes Taguette qualitative-coding exports
# (highlights + codebook CSVs) into per-interview sentiment CSVs.
import marimo

# Version of marimo this notebook was generated with.
__generated_with = "0.18.3"

app = marimo.App(width="medium")
|
|
|
|
|
|
@app.cell
def _():
    # Setup cell: imports, Ollama connection, working directories, and the
    # model-selection dropdown shared by the downstream cells.
    import marimo as mo
    import pandas as pd
    from pathlib import Path
    from datetime import datetime

    from utils import connect_qumo_ollama

    OLLAMA_LOCATION = 'localhost'
    # VM_NAME = 'ollama-lite'

    client, _models = connect_qumo_ollama(OLLAMA_LOCATION, print_models=False)

    TAGUETTE_EXPORT_DIR = Path('./data/processing/02_taguette_export')
    WORKING_DIR = Path('./data/processing/02_taguette_postprocess')

    # exist_ok makes directory creation idempotent (replaces the previous
    # explicit exists() checks).
    WORKING_DIR.mkdir(parents=True, exist_ok=True)
    TAGUETTE_EXPORT_DIR.mkdir(parents=True, exist_ok=True)

    model_select = mo.ui.dropdown(
        options=_models,
        # Guard against an empty model list so this cell doesn't crash with
        # IndexError when the Ollama host reports no models.
        value=_models[0] if _models else None,
        label="Select Ollama Model to use",
        searchable=True,
    )
    model_select
    return (
        TAGUETTE_EXPORT_DIR,
        WORKING_DIR,
        client,
        datetime,
        mo,
        model_select,
        pd,
    )
|
|
|
|
|
|
@app.cell(hide_code=True)
def _(TAGUETTE_EXPORT_DIR, mo):
    # Instructions for exporting the highlights and codebook CSVs from Taguette.
    # Fix: "hightlights" typo corrected to "highlights".
    mo.md(rf"""
    # Step 1: Export Data out of Taguette

    **Highlights**
    1. Go to: https://taguette.qumo.io/project/1
    2. Select 'Highlights' (left side) > 'See all highlights' > 'Export this view' (top right) > 'CSV'
    3. Save to '{TAGUETTE_EXPORT_DIR}/all_tags.csv'

    **Tags Codebook**
    1. Select 'Project Info' (left side) > 'Export codebook' > 'CSV'
    2. Save to '{TAGUETTE_EXPORT_DIR}/codebook.csv'

    _NOTE: Sometimes you need to explicitly allow 'Unsafe Download' in the browser's download manager_
    """)
    return
|
|
|
|
|
|
@app.cell(hide_code=True)
def _(mo):
    # Section header for the import/processing step.
    mo.md(r"""
    # Step 2: Import here for processing
    """)
    return
|
|
|
|
|
|
@app.cell
def _(TAGUETTE_EXPORT_DIR, pd):
    # Load every exported highlight; one row per (highlight, tag) pair.
    # Path `/` join replaces the previous f-string path concatenation.
    all_tags_df = pd.read_csv(TAGUETTE_EXPORT_DIR / 'all_tags.csv')
    # Stable sequence id preserving document order; used later to re-sort
    # after the split/recombine steps.
    all_tags_df['_seq_id'] = range(len(all_tags_df))
    all_tags_df
    return (all_tags_df,)
|
|
|
|
|
|
@app.cell
def _(TAGUETTE_EXPORT_DIR, pd):
    # Load the tag codebook; rename 'description' so it cannot collide with
    # other description-like columns when merged into the sentiment frame.
    # Path `/` join + non-inplace rename replace the previous f-string path
    # and `inplace=True` anti-pattern.
    codebook_df = pd.read_csv(TAGUETTE_EXPORT_DIR / 'codebook.csv')
    codebook_df = codebook_df.rename(columns={'description': 'theme_description'})
    codebook_df
    return (codebook_df,)
|
|
|
|
|
|
@app.cell(hide_code=True)
def _(mo):
    # Section header for the per-interview processing step.
    mo.md(r"""
    # Step 3: Process each 'Interview'
    """)
    return
|
|
|
|
|
|
@app.cell
def _(all_tags_df, mo):
    # Dropdown listing every distinct interview document found in the export.
    _documents = all_tags_df['document'].unique().tolist()
    interview_select = mo.ui.dropdown(
        options=_documents,
        label="Select Interview to Process",
        full_width=True,
    )
    interview_select
    return (interview_select,)
|
|
|
|
|
|
@app.cell
def _(all_tags_df, interview_select, mo):
    # Halt downstream cells until an interview has been chosen.
    mo.stop(not interview_select.value, mo.md("Select interview to continue"))

    # Keep only the rows belonging to the selected interview document.
    _mask = all_tags_df['document'] == interview_select.value
    df = all_tags_df.loc[_mask].copy()
    return (df,)
|
|
|
|
|
|
@app.cell(hide_code=True)
def _(mo):
    # Explains the `_context` carry-forward algorithm implemented in the next cell.
    mo.md(r"""
    ## Add `_context` column to track Voice / Character is being referred to per highlight
    Create a new column 'context', which is defined by the last '_V-' or '_C-' tag seen in the 'tags' column', when moving row by row from top to bottom.

    1. Iterates through the dataframe in document order (row by row)
    2. Uses a set to track which highlight IDs we've already processed
    3. When we encounter a new highlight ID for the first time, we process all its rows
    4. Collects all _V- or _C- tags within that highlight
    5. Assigns the context to all rows with that ID
    6. This preserves document order and handles multi-tag highlights correctly


    Example of challenging case:

    | tag | content | _seq_id | _context |
    |------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------|----------------------|
    | _V-54 | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 117 | _V-54, _V-41 |
    | _V-41 | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 118 | _V-54, _V-41 |
    | VT - Human / Artificial | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 119 | _V-54, _V-41 |
    | VT - Friendliness / Empathy | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 120 | _V-54, _V-41 |
    """)
    return
|
|
|
|
|
|
@app.cell
def _(df):
    # Derive `_context`: the Voice (_V-) / Character (_C-) tag(s) each highlight
    # refers to. Walks the dataframe in document order; a highlight without its
    # own context tag inherits the most recent one seen (carry-forward).
    # NOTE: mutates `df` in place.
    df['_context'] = None
    last_context = None
    processed_ids = set()

    # Process in document order
    for idx, row in df.iterrows():
        highlight_id = row['id']

        # Each highlight spans one row per tag; process the whole group the
        # first time its id is encountered.
        if highlight_id not in processed_ids:
            processed_ids.add(highlight_id)

            # Get all rows for this highlight
            highlight_rows = df[df['id'] == highlight_id]

            # Collect every _V- / _C- tag attached to this highlight.
            context_tags = []
            for _, h_row in highlight_rows.iterrows():
                tag = h_row.get('tag', '')
                # isinstance guard: a missing tag is NaN (a float), and the
                # previous bare `in` test would raise TypeError on it.
                if isinstance(tag, str) and ('_V-' in tag or '_C-' in tag):
                    context_tags.append(tag)

            if context_tags:
                # Multi-tag highlights yield a comma-joined context; a later
                # cell splits these into separate rows for manual review.
                context_tag = ', '.join(context_tags)
                last_context = context_tag
            else:
                # No context tag in this highlight: inherit the last one seen.
                context_tag = last_context

            # Stamp the context onto every row of this highlight group.
            df.loc[df['id'] == highlight_id, '_context'] = context_tag

    df
    return
|
|
|
|
|
|
@app.cell(hide_code=True)
def _(mo):
    # Explains the multi-context row splitting performed in the next cell.
    mo.md(r"""
    ## Split multi-context rows (only VT- and CT- theme tags)

    For rows that have multiple contexts (e.g., both _V-54 and _V-41)
    - split these into separate rows for each context.
    - Then mark these for 'manual_analysis'
    """)
    return
|
|
|
|
|
|
@app.cell
def _(df, pd):
    # Expand rows whose `_context` holds several comma-separated contexts into
    # one row per context. Split rows are flagged manual_analysis=True because
    # the sentiment may differ per context and must be judged by a human.
    # (The previous version computed `has_multiple` and then branched on it
    # twice; the two branches are merged here — behavior is unchanged.)
    expanded_rows = []

    for _, _row in df.iterrows():
        context_value = _row['_context']

        if pd.notna(context_value) and ',' in str(context_value):
            # Multi-context: emit one copy per individual context.
            for ctx in (c.strip() for c in str(context_value).split(',')):
                new_row = _row.copy()
                new_row['_context'] = ctx
                new_row['manual_analysis'] = True

                # Theme rows lose any automatic sentiment; it must be
                # re-assessed per context during manual review.
                if str(new_row['tag']).startswith(('VT -', 'CT -')):
                    new_row['sentiment'] = None

                expanded_rows.append(new_row)
        else:
            # Single (or missing) context: keep the row as-is.
            new_row = _row.copy()
            new_row['manual_analysis'] = False
            expanded_rows.append(new_row)

    expanded_df_raw = pd.DataFrame(expanded_rows).reset_index(drop=True)

    # Only VT-/CT- theme tags take part in sentiment analysis.
    sentiment_df = expanded_df_raw.loc[
        expanded_df_raw['tag'].str.startswith(('VT -', 'CT -'), na=False)
    ].copy()

    print(f"{len(sentiment_df[sentiment_df['manual_analysis']])} Rows with multiple contexts")

    sentiment_df[sentiment_df['manual_analysis']]
    return (sentiment_df,)
|
|
|
|
|
|
@app.cell(hide_code=True)
def _(mo):
    # Section header for theme extraction.
    mo.md(r"""
    ## Create 'theme' column
    """)
    return
|
|
|
|
|
|
@app.cell
def _(sentiment_df):
    from utils import extract_theme

    # Series.map is the idiomatic (and faster) form of the previous row-wise
    # DataFrame.apply for a single-column transformation; extract_theme still
    # receives exactly the 'tag' value for each row.
    sentiment_df['theme'] = sentiment_df['tag'].map(extract_theme)
    sentiment_df
    return
|
|
|
|
|
|
@app.cell(hide_code=True)
def _(mo):
    # Describes the LLM sentiment-extraction step that follows.
    mo.md(r"""
    # Extract Sentiment + Reasoning

    For each row in the dataframe, analyze the sentiment of the 'content' regarding the respective tag. This should be done for all 'VT -' and 'CT -' tags, since these represent the 'VoiceThemes' and 'CharacterThemes' respectively. The results should be stored in a new 'sentiment' column.

    Values to be used:
    - Positive: +1
    - Neutral: 0
    - Negative: -1
    """)
    return
|
|
|
|
|
|
@app.cell
def _(mo):
    # Gate for the LLM step: clicking flips the button's value to True, which
    # the extraction cell waits on via mo.stop.
    start_processing_btn = mo.ui.button(
        label="Start Sentiment Extraction",
        kind="warn",
        on_click=lambda _: True,
    )
    start_processing_btn
    return (start_processing_btn,)
|
|
|
|
|
|
@app.cell
def _(
    client,
    codebook_df,
    mo,
    model_select,
    pd,
    sentiment_df,
    start_processing_btn,
):
    # Run Ollama sentiment analysis for every automatically-analyzable row.
    # NOTE: mutates `sentiment_df` in place.
    from utils import dummy_sentiment_analysis, ollama_sentiment_analysis

    # add theme_description to be used in LLM prompt
    _df = sentiment_df.merge(codebook_df, on='tag', how='left', suffixes=('', '_codebook'))

    # Wait for start processing button
    mo.stop(not start_processing_btn.value, "Click button above to start processing")

    # One LLM call per non-manual row. ollama_sentiment_analysis presumably
    # returns a (keywords, sentiment, reason)-shaped value — order inferred
    # from the target columns; confirm against utils. Rows flagged
    # manual_analysis are excluded here, so those rows end up NaN in these
    # columns and are filled during manual review instead.
    sentiment_df[['keywords', 'sentiment', 'reason']] = _df[~_df['manual_analysis']].apply(
        lambda row: pd.Series(ollama_sentiment_analysis(
            content=row['content'],
            theme=row['theme'],
            theme_description=row['theme_description'],
            client=client,
            model=model_select.value
        )),
        axis=1
    )
    return
|
|
|
|
|
|
@app.cell
def _(mo, sentiment_df):
    # Show the automatically-extracted results; blocked until sentiment exists.
    mo.stop('sentiment' not in sentiment_df.columns, "Run above cells to extract sentiment analysis")
    _auto = ~sentiment_df['manual_analysis']
    sentiment_df.loc[_auto, ['theme', 'content', 'sentiment', 'reason', 'keywords']]
    return
|
|
|
|
|
|
@app.cell(hide_code=True)
def _(mo):
    # Section header for the manual-review workflow.
    mo.md(r"""
    ## Multi-context tags
    """)
    return
|
|
|
|
|
|
@app.cell
def _(mo, sentiment_df):
    # Build a data editor over the rows created by multi-context splits so a
    # human can assign a per-context sentiment.
    manual_rows = sentiment_df[sentiment_df['manual_analysis']]
    split_rows_editor = None
    rows_to_edit = []

    if not manual_rows.empty:
        print(
            f"⚠️ {len(manual_rows)} rows were created from multi-context splits. "
            "See next cell for manual review."
        )

        # The manually-flagged rows are exactly the ones needing review —
        # sentiment_df was already restricted to VT-/CT- theme tags upstream,
        # so no additional tag filter is required (the previous version
        # re-applied the identical manual_analysis mask here).
        rows_to_edit = manual_rows

        # Editable grid; .form() adds a Submit button so edits apply atomically.
        split_rows_editor = mo.ui.data_editor(
            rows_to_edit
        ).form(label="Update Sentiment / Manual Flag")

    else:
        print("✓ No multi-context rows found")
    return rows_to_edit, split_rows_editor
|
|
|
|
|
|
@app.cell
def _(split_rows_editor):
    # Render the editor form (renders nothing when it is None).
    split_rows_editor
    return
|
|
|
|
|
|
@app.cell(hide_code=True)
def _(mo, rows_to_edit, split_rows_editor):
    # marimo renders only the cell's LAST top-level expression; the previous
    # version left the mo.vstack inside the `if` body, so it was silently
    # discarded. Bind the output to a local and make it the final expression.
    _out = None
    if split_rows_editor is not None:
        _out = mo.vstack([
            mo.md(f"""
            ### ⚠️ Manual Review Required

            **{len(rows_to_edit)} rows** were split from multi-context entries.
            Please review them below:
            1. Update the `sentiment` column (-1, 0, 1) for each row based on the specific context.
            2. Click **Submit** to apply changes.
            """),
            split_rows_editor
        ])
    _out
    return
|
|
|
|
|
|
@app.cell
def _(mo, split_rows_editor):
    # Capture the edited manual-analysis rows for validation.
    # Sentinel semantics: '' (the getattr default) means "no editor exists"
    # (i.e. there were no manual rows), while None means "editor exists but
    # the form has not been submitted yet" — only the latter halts the cell.
    reviewed_manual_rows = getattr(split_rows_editor, 'value', '')
    mo.stop(reviewed_manual_rows is None, mo.md("Submit your sentiment analysis changes before continuing."))

    # Ensure all manual-analysis rows include a sentiment of -1, 0, or 1

    if (reviewed_manual_rows != '') and (not reviewed_manual_rows.empty):
        valid_sentiments = {-1, 0, 1}
        # Rows still flagged manual_analysis whose sentiment was not set to a
        # valid value by the reviewer.
        needs_review = reviewed_manual_rows[
            reviewed_manual_rows['manual_analysis']
            & ~reviewed_manual_rows['sentiment'].isin(valid_sentiments)
        ]
        assert needs_review.empty, f"{len(needs_review)} manual-analysis rows missing sentiment -1/0/1"

    print("Verification: ✓ All Manual-analysis rows have valid sentiment values")
    return (reviewed_manual_rows,)
|
|
|
|
|
|
@app.cell(hide_code=True)
def _(mo):
    # Section header for merging reviewed and automatic rows back together.
    mo.md(r"""
    ## Recombine
    """)
    return
|
|
|
|
|
|
@app.cell
def _(pd, reviewed_manual_rows, sentiment_df):
    # Merge the human-reviewed manual rows back with the automatic rows,
    # restoring original document order via _seq_id.
    _auto_rows = sentiment_df[~sentiment_df['manual_analysis']]
    if isinstance(reviewed_manual_rows, pd.DataFrame):
        recombined_df = (
            pd.concat([_auto_rows, reviewed_manual_rows])
            .sort_values(by='_seq_id')
            .reset_index(drop=True)
        )
    else:
        # No reviewed rows (sentinel value): fall through to the full frame.
        recombined_df = sentiment_df

    recombined_df
    return (recombined_df,)
|
|
|
|
|
|
@app.cell(hide_code=True)
def _(mo):
    # Section header. Renumbered to "Step 4": "Step 3" was already used above
    # for "Process each 'Interview'", so this heading duplicated the number.
    mo.md(r"""
    # Step 4: Process 'Other' tags

    These need to be reviewed manually for interesting content
    """)
    return
|
|
|
|
|
|
@app.cell
def _(mo):
    # Placeholder (empty markdown) reserved for notes on 'Other' tags.
    mo.md(r"""

    """)
    return


@app.cell
def _():
    # Empty scratch cell left by the notebook author.
    return
|
|
|
|
|
|
@app.cell(hide_code=True)
def _(mo):
    # Section header for the CSV export step.
    mo.md(r"""
    # Save to CSV
    """)
    return
|
|
|
|
|
|
@app.cell
def _(WORKING_DIR, datetime, interview_select, recombined_df):
    # Save the recombined sentiment table to the working directory, keyed by
    # the interview's identifier (first whitespace-separated token of the
    # selected document name).
    # NOTE(review): `timestamp` is computed but never used — presumably it was
    # once part of the filename; confirm intent before removing or restoring.
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    filename = WORKING_DIR / f"{interview_select.value.split(' ')[0]}_sentiments.csv"
    recombined_df.to_csv(filename, index=False)

    # Fix: report the actual output path instead of a literal placeholder.
    print(f"✓ Saved processed data to '{filename}'")
    return
|
|
|
|
|
|
if __name__ == "__main__":
    # Allow running the notebook as a script: executes the full cell DAG.
    app.run()
|