# Interview-Analysis/03_Sentiment_Analysis.py

import marimo

# Version string stored alongside the notebook (presumably written by marimo
# when the file is saved — confirm before editing by hand).
__generated_with = "0.18.3"
# Application object; each cell below is a function decorated with `@app.cell`.
app = marimo.App(width="medium")


@app.cell
def _():
    import marimo as mo
    import pandas as pd
    from pathlib import Path

    # Directory that is scanned for the input CSV files.
    INPUT_DIR = Path("./data/processing/02_taguette_postprocess")
    # Directory that receives the matrices written by this notebook.
    WORKING_DIR = Path('./data/processing/03_sentiment_analysis')

    # exist_ok=True makes the creation idempotent and avoids the
    # check-then-create race of a separate exists() test. It still raises if
    # the path exists but is not a directory, which surfaces the problem here
    # instead of at the first write.
    WORKING_DIR.mkdir(parents=True, exist_ok=True)

    return INPUT_DIR, Path, WORKING_DIR, mo, pd


# Markdown-only cell: section heading for the CSV loading step.
@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    # Load Sentiment CSV
    """)
    return


@app.cell
def _(INPUT_DIR, mo):
    # Every CSV directly inside INPUT_DIR is a candidate input file.
    csv_files = [*INPUT_DIR.glob("*.csv")]

    # Dropdown options map the file stem (shown to the user) to its path string.
    file_options = dict(
        (csv_path.stem, str(csv_path)) for csv_path in csv_files
    )

    sentiment_csv = mo.ui.dropdown(
        label="Select Sentiment CSV File",
        options=file_options,
        full_width=True,
    )
    # Keep the widget as the final bare expression of the cell.
    sentiment_csv
    return (sentiment_csv,)


@app.cell
def _(Path, mo, pd, sentiment_csv):
    # The dropdown is created without a default, so its value is None until
    # the user picks a file. Halt this cell (and everything that depends on
    # it) with a hint instead of letting Path(None) raise a TypeError.
    mo.stop(
        sentiment_csv.value is None,
        mo.md("Select a sentiment CSV file above to continue."),
    )

    # The selected value is the path string of the chosen CSV file.
    input_csv_name = Path(sentiment_csv.value).stem

    # The first and last '_'-separated pieces of the file stem are reused
    # later to name the output files.
    timestamp = input_csv_name.split('_')[-1]
    doc = input_csv_name.split('_')[0]

    sentiment_df = pd.read_csv(sentiment_csv.value)
    sentiment_df
    return doc, sentiment_df, timestamp


# Markdown-only cell: introduces Phase 1 (per-interview sentiment matrices).
@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    # Phase 1: Individual interview analysis
    - Create sentiment matrices for each interview (document)
    - Save the intermediate results to file in the `WORKING_DIR`
    """)
    return


@app.cell
def _(pd):
    def create_sentiment_matrix(
        doc_df,
        column_prefix='VT - |CT - ',
        row_prefix='_V-|_C-',
        document_name=None,
    ):
        """
        Build a context-by-tag matrix of summed sentiment scores.

        Parameters:
        - doc_df: DataFrame with at least the columns 'tag', '_context' and
          'sentiment'. A 'document' column, when present, is only used to
          label the diagnostic messages.
        - column_prefix: Regular expression matched against 'tag'; matching
          tags become the matrix columns.
        - row_prefix: Regular expression matched against '_context'; matching
          contexts become the matrix rows.
        - document_name: Optional label used in the "no data" messages. When
          omitted, the distinct values of the 'document' column are used.

        Returns:
        - DataFrame indexed by '_context' with one column per 'tag', each cell
          holding the summed sentiment for that pair (NaN where a pair never
          occurs). An empty DataFrame is returned when no row survives
          either filter.
        """

        def _document_label():
            # Only evaluated on the "no data" paths, so the common case pays
            # nothing for building the label.
            if document_name is not None:
                return document_name
            if 'document' in doc_df.columns:
                names = doc_df['document'].dropna().astype(str).unique()
                if len(names):
                    return ', '.join(names)
            return '<unknown>'

        # Keep only rows whose tag matches the theme pattern (VT - / CT - ).
        sentiment_rows = doc_df[
            doc_df['tag'].str.contains(column_prefix, na=False)
        ]

        if sentiment_rows.empty:
            print(f"No sentiment data found for document: {_document_label()}")
            return pd.DataFrame()

        # Keep only rows with a Voice/Character context. na=False already
        # excludes missing contexts, so no separate notna() mask is needed.
        valid_rows = sentiment_rows[
            sentiment_rows['_context'].str.contains(row_prefix, na=False)
        ]

        if valid_rows.empty:
            print(f"No Voice/Character context found for document: {_document_label()}")
            return pd.DataFrame()

        # Sum the sentiment scores for every (context, tag) combination.
        matrix_data = (
            valid_rows.groupby(['_context', 'tag'])['sentiment']
            .sum()
            .reset_index()
        )

        # Reshape to contexts x tags. Values are left as-is (not cast to int)
        # because combinations absent from the data come out as NaN.
        matrix = matrix_data.pivot(index='_context', columns='tag', values='sentiment')

        return matrix
    return (create_sentiment_matrix,)


# Markdown-only cell: describes the Voice vs. VoiceTheme matrix built next.
@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    ## Step 1.1: Voice Sample vs. Theme Sentiment Matrix

    For each interview (document), create a matrix where:
    - Rows represent the different Voices (based on '_V-' tags)
    - Columns represent the different VoiceThemes(based on 'VT -' tags)
    - Each cell contains the aggregated sentiment score (sum) for that Voice/Theme combination
    """)
    return


@app.cell
def _(create_sentiment_matrix, sentiment_df):
    # Voice matrix: rows come from '_V-' contexts, columns from 'VT - ' tags.
    voice_matrix = create_sentiment_matrix(
        sentiment_df,
        row_prefix='_V-',
        column_prefix='VT - ',
    )
    voice_matrix
    return (voice_matrix,)


# Markdown-only cell: label for the CSV export step that follows.
@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    SAVE TO CSV
    """)
    return


@app.cell
def _(WORKING_DIR, doc, timestamp, voice_matrix):
    # Write the voice/theme matrix into WORKING_DIR; the file name embeds the
    # `doc` and `timestamp` values supplied by an upstream cell.
    voice_filename = WORKING_DIR.joinpath(
        f"{doc}_voice_theme_matrix_{timestamp}.csv"
    )
    voice_matrix.to_csv(voice_filename)
    print(f"Saved to '{voice_filename}'")
    return


# Markdown-only cell: describes the Character vs. CharacterTheme matrix built next.
@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    ## Step 1.2: Character Sample vs. Theme Sentiment Matrix

    For each interview (document), create a matrix where:
    - Rows represent the different Characters (based on  '_C-' tags)
    - Columns represent the different CharacterThemes (based on 'CT -' tags)
    - Each cell contains the aggregated sentiment score (sum) for that Character/Theme combination
    """)
    return


@app.cell
def _(create_sentiment_matrix, sentiment_df):
    # Character matrix: rows come from '_C-' contexts, columns from 'CT - ' tags.
    character_matrix = create_sentiment_matrix(
        sentiment_df,
        row_prefix='_C-',
        column_prefix='CT - ',
    )
    character_matrix
    return (character_matrix,)


@app.cell
def _(WORKING_DIR, character_matrix, doc, timestamp):
    # Write the character/theme matrix into WORKING_DIR; the file name embeds
    # the `doc` and `timestamp` values supplied by an upstream cell.
    character_filename = WORKING_DIR.joinpath(
        f"{doc}_character_theme_matrix_{timestamp}.csv"
    )
    character_matrix.to_csv(character_filename)
    print(f"Saved to '{character_filename}'")
    return


# Markdown-only cell: placeholder for Step 1.3, which has no code yet.
@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    ## Step 1.3: Chase Brand Sentiment

    TODO: not sure we have enough supporting data for this yet
    """)
    return


# Run the app when this file is executed directly as a script.
if __name__ == "__main__":
    app.run()