import marimo

# Version of marimo that generated this notebook file.
__generated_with = "0.18.3"

# Notebook app object; every cell below registers itself against it.
app = marimo.App(width="medium")
@app.cell
def _():
    # Shared imports for the whole notebook; returned so downstream cells
    # receive them through marimo's dependency graph.
    import marimo as mo
    import pandas as pd
    return mo, pd
@app.cell(hide_code=True)
def _(mo):
    # Narrative cell: manual instructions for exporting highlights from Taguette.
    # Fixes: typo 'hightlights' -> 'highlights'; removed a dangling empty '5.' item.
    mo.md(r"""
    # Step 1: Export All Highlights

    1. Go to: http://taguette.tail44fa00.ts.net/project/1
    2. Select 'Highlights' on left
    3. Select 'See all highlights'
    4. Top right 'Export this view' > 'CSV'
    """)
    return
@app.cell
def _(pd):
    # Load the Taguette highlight export; '_seq_id' records the original
    # document order so later context propagation can rely on row sequence.
    _csv_path = 'data/transcripts/taguette_results/all_tags.csv'
    all_tags_df = pd.read_csv(_csv_path)
    all_tags_df['_seq_id'] = range(len(all_tags_df))
    # Cell output: a quick preview of the first rows.
    all_tags_df.head(20)
    return (all_tags_df,)
@app.cell(hide_code=True)
def _(mo):
    # Narrative cell: documents the contract for the '_context' column that the
    # next cell derives, including a worked multi-tag example.
    mo.md(r"""
    ### Post-process the dataframe so it can be easily analyzed

    Create a new column 'context', which is defined by the last '_V-' or '_C-' tag seen in the 'tags' column', when moving row by row from top to bottom.

    1. Iterates through the dataframe in document order (row by row)
    2. Uses a set to track which highlight IDs we've already processed
    3. When we encounter a new highlight ID for the first time, we process all its rows
    4. Collects all _V- or _C- tags within that highlight
    5. Assigns the context to all rows with that ID
    6. This preserves document order and handles multi-tag highlights correctly

    Example of challenging case:

    | id | document | tag | content | _seq_id | _context |
    |-----|-------------|------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------|----------------------|
    | 252 | P2 - Done | _C-Counselor | So we've pulled through your top personality, which was the counselor, and then we've included those same twelve voices from before. And your task now is to select which of the voices you feel best suits this character that would be, the personality and voice for Chase's digital assistant. | 115 | _C-Counselor |
    | 88 | P2 - Done | VT - Knowledgeable / Trust | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 116 | _V-54, _V-41 |
    | 88 | P2 - Done | _V-54 | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 117 | _V-54, _V-41 |
    | 88 | P2 - Done | _V-41 | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 118 | _V-54, _V-41 |
    | 88 | P2 - Done | VT - Human / Artificial | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 119 | _V-54, _V-41 |
    | 88 | P2 - Done | VT - Friendliness / Empathy | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 120 | _V-54, _V-41 |
    | 90 | P2 - Done | VT - Personal 'click' | I picked the female because her voice is so unique. | 121 | _V-41 |
    | 90 | P2 - Done | _V-41 | I picked the female because her voice is so unique. | 122 | _V-41 |
    """)
    return
@app.cell
def _(all_tags_df):
    # Derive '_context': the Voice/Character (_V-/_C-) tags attached to each
    # highlight, carried forward to later highlights that have none of their own.
    #
    # Fixes vs. the original:
    # - '_V-' in tag crashed with TypeError when 'tag' is NaN (float) in the
    #   CSV export; NaN tags are now skipped explicitly.
    # - The O(n^2) pattern (re-filtering the whole frame per highlight, plus a
    #   per-row iterrows pass) is replaced by one groupby pass.
    all_tags_df['_context'] = None
    last_context = None

    # groupby(sort=False) yields highlight ids in order of first appearance,
    # which matches the original "first time we see this id" document-order walk.
    for _highlight_id, _group in all_tags_df.groupby('id', sort=False):
        # Collect every Voice/Character tag inside this highlight.
        context_tags = [
            t for t in _group['tag']
            if isinstance(t, str) and ('_V-' in t or '_C-' in t)
        ]
        if context_tags:
            context_tag = ', '.join(context_tags)
            last_context = context_tag
        else:
            # No context tag in this highlight: inherit the last one seen.
            context_tag = last_context
        # Assign the context to all rows of this highlight.
        all_tags_df.loc[_group.index, '_context'] = context_tag

    all_tags_df
    return
@app.cell(hide_code=True)
def _(mo):
    # Narrative cell: defines the sentiment-coding scheme applied in Step 2.
    mo.md(r"""
    # Step 2: Sentiment Analysis

    For each row in the dataframe, analyze the sentiment of the 'content' regarding the respective tag. This should be done for all 'VT -' and 'CT -' tags, since these represent the 'VoiceThemes' and 'CharacterThemes' respectively. The results should be stored in a new 'sentiment' column.

    Values to be used:

    - Positive: +1
    - Neutral: 0
    - Negative: -1
    """)
    return
@app.cell
def _(all_tags_df):
    # TODO: Implement real sentiment analysis; for now fill the 'sentiment'
    # column with randomized dummy values for testing, but only for theme rows.
    import random

    def dummy_sentiment_analysis(content, tag):
        """Return a random sentiment (-1, 0, or +1) for theme tags, else None.

        Only 'VT -' (VoiceTheme) and 'CT -' (CharacterTheme) rows get a score.
        Guard: 'tag' can be NaN (a float) in a CSV export, which would make
        str.startswith raise AttributeError.
        """
        if isinstance(tag, str) and (tag.startswith('VT -') or tag.startswith('CT -')):
            return random.choice([-1, 0, 1])  # Random sentiment for testing
        return None

    all_tags_df['sentiment'] = all_tags_df.apply(
        lambda row: dummy_sentiment_analysis(row['content'], row['tag']), axis=1
    )

    all_tags_df
    return
@app.cell(hide_code=True)
def _(mo):
    # Narrative cell: describes the multi-context splitting and verification
    # workflow implemented by the next three cells.
    mo.md(r"""
    ## Step 2b: Resolve multi-context rows

    For rows that have multiple contexts (e.g., both _V-54 and _V-41), split these into separate rows for each context, removing the content and sentiment analysis for each new row. Then mark these for manual review. Use marimo's interactive notebook editing features to facilitate this process.

    This ensures that each row corresponds to a single context for clearer analysis in subsequent steps. Add verification column to mark these rows for review. Run assert at the end to ensure no rows have multiple contexts and if that passes, drop the verification column.
    """)
    return
@app.cell
def _(all_tags_df, pd):
    # Step 2b: split rows whose '_context' names several voices/characters
    # (e.g. "_V-54, _V-41") into one row per context, flagged for manual review.
    #
    # Fixes vs. the original:
    # - Per the Step 2b spec, a split row's sentiment no longer refers to a
    #   single context, so it is cleared for manual re-scoring (it was kept).
    # - A bare 'split_rows' expression inside the if-branch did nothing in a
    #   function body (only the cell's last expression is displayed); removed.
    all_tags_df['_needs_split'] = all_tags_df['_context'].apply(
        lambda x: ',' in str(x) if pd.notna(x) else False
    )

    expanded_rows = []
    for _, _row in all_tags_df.iterrows():
        if _row['_needs_split']:
            # Split the context list and emit one copy of the row per context.
            contexts = [c.strip() for c in str(_row['_context']).split(',')]
            for ctx in contexts:
                new_row = _row.copy()
                new_row['_context'] = ctx
                new_row['sentiment'] = None   # ambiguous after the split; re-score manually
                new_row['_was_split'] = True  # mark for manual review
                expanded_rows.append(new_row)
        else:
            # Keep single-context rows as-is.
            new_row = _row.copy()
            new_row['_was_split'] = False
            expanded_rows.append(new_row)

    expanded_df2 = pd.DataFrame(expanded_rows).reset_index(drop=True)

    if expanded_df2[expanded_df2['_was_split']].empty:
        print("✓ No multi-context rows found")

    # Cell output: the rows that need manual review (empty frame if none).
    expanded_df2[expanded_df2['_was_split']]
    return (expanded_df2,)
@app.cell
def _():
    # Placeholder: using marimo's interactive notebook editing features, the user
    # manually updates the sentiment values for the split rows as needed
    # (only rows whose tag starts with 'VT -' or 'CT -' carry sentiment).
    return
@app.cell
def _(expanded_df2, pd):
    # Verify every row has a single context and all splits were reviewed, then
    # drop the helper columns.
    #
    # Fix vs. the original: 'expanded_df_final' was computed but never returned,
    # so the cleaned frame was inaccessible to other cells; it is now exported
    # (None when verification fails).
    expanded_df_final = None
    try:
        has_comma = expanded_df2['_context'].apply(
            lambda x: ',' in str(x) if pd.notna(x) else False
        )
        assert not has_comma.any(), "Some rows still have multiple contexts (comma-separated)"

        # All split rows must have had their review flag cleared manually.
        assert expanded_df2['_was_split'].sum() == 0, "Some rows still need manual review"
        print("\n✓ Verification passed: All rows have single contexts")

        # Drop verification columns since verification passed.
        expanded_df_final = expanded_df2.drop(columns=['_needs_split', '_was_split'])
        print("✓ Verification columns dropped")
    except AssertionError as e:
        print(f"\n❌ Verification failed: {e}")
        print("Please review the data before proceeding")

    expanded_df_final
    return (expanded_df_final,)
@app.cell(hide_code=True)
def _(mo):
    # Narrative cell: describes the per-interview sentiment matrices built below.
    mo.md(r"""
    # Step 3: Create Matrices for each interview

    For each interview (document), create a matrix where:

    - Rows represent the different Voices/Characters (based on '_V-' and '_C-' tags)
    - Columns represent the different VoiceThemes/CharacterThemes (based on 'VT -' and 'CT -' tags)
    - Each cell contains the aggregated sentiment score for that Voice/Character regarding that combination
    """)
    return
@app.cell
def _(all_tags_df, pd):
    # Build one sentiment matrix per interview document.
    #
    # Fixes vs. the original:
    # - Unused 'import numpy as np' removed.
    # - 'matrices' was computed but never returned, so no other cell could use
    #   it; it is now exported.
    def create_sentiment_matrix(df, document_name):
        """
        Create a sentiment matrix for a specific document.

        Rows are Voice/Character contexts (_V-/_C-), columns are theme tags,
        and each cell is the summed sentiment for that combination.

        Parameters:
        - df: DataFrame with columns ['document', 'tag', '_context', 'sentiment']
        - document_name: Name of the document to filter by

        Returns:
        - DataFrame representing the sentiment matrix (empty if no data)
        """
        # Filter for the specific document.
        doc_df = df[df['document'] == document_name].copy()

        # Only theme-tag rows carry sentiment (set in Step 2).
        sentiment_rows = doc_df[doc_df['sentiment'].notna()].copy()
        if sentiment_rows.empty:
            print(f"No sentiment data found for document: {document_name}")
            return pd.DataFrame()

        # Keep rows attributed to a Voice/Character context.
        valid_rows = sentiment_rows[
            sentiment_rows['_context'].notna()
            & sentiment_rows['_context'].str.contains('_V-|_C-', na=False)
        ].copy()
        if valid_rows.empty:
            print(f"No Voice/Character context found for document: {document_name}")
            return pd.DataFrame()

        # Sum sentiment per (context, theme) pair and pivot to a matrix;
        # combinations with no data become 0. Integer cast for cleaner display.
        return (
            valid_rows.groupby(['_context', 'tag'])['sentiment']
            .sum()
            .unstack(fill_value=0)
            .astype(int)
        )

    # Create matrices for each unique document.
    documents = all_tags_df['document'].unique()
    matrices = {}

    for doc in documents:
        print(f"\n{'='*60}")
        print(f"Document: {doc}")
        print('='*60)
        matrix = create_sentiment_matrix(all_tags_df, doc)
        if not matrix.empty:
            matrices[doc] = matrix
            print(matrix)
        else:
            print("No matrix data available")

    # Cell output and export: the per-document matrices for further analysis.
    matrices
    return (matrices,)
# Serve/run the marimo notebook when this file is executed directly.
if __name__ == "__main__":
    app.run()