JPMC-quant/plots.py

"""Plotting functions for Voice Branding analysis."""

import plotly.graph_objects as go
import polars as pl
from theme import ColorPalette


def plot_average_scores_with_counts(
    df: pl.DataFrame,
    title: str = "General Impression (1-10)<br>Per Voice with Number of Participants Who Rated It",
    x_label: str = "Stimuli",
    y_label: str = "Average General Impression Rating (1-10)",
    color: str = ColorPalette.PRIMARY,
    height: int = 500,
    width: int = 1000,
) -> go.Figure:
    """
    Create a bar plot of the per-column average, annotated with the non-null count.

    Every column except ``_recordId`` becomes one bar. Bars are ordered by
    average (highest first) and the number of non-null values in the column
    is printed inside each bar.

    Parameters
    ----------
    df : pl.DataFrame
        DataFrame containing numeric columns to analyze.
    title : str, optional
        Plot title.
    x_label : str, optional
        X-axis label.
    y_label : str, optional
        Y-axis label.
    color : str, optional
        Bar color (hex code or named color).
    height : int, optional
        Plot height in pixels.
    width : int, optional
        Plot width in pixels.

    Returns
    -------
    go.Figure
        Plotly figure object. An empty figure is returned when ``df`` has no
        columns other than ``_recordId``.
    """
    score_cols = [c for c in df.columns if c != '_recordId']

    # Without this guard pl.DataFrame([]) has no 'average' column and the
    # sort below raises; return an empty figure like the ranking plots do.
    if not score_cols:
        return go.Figure()

    # Average and number of non-null values for each score column
    stats = [
        {
            'column': col,
            'average': df[col].mean(),
            'count': df[col].drop_nulls().len(),
        }
        for col in score_cols
    ]

    # Sort by average score in descending order
    stats_df = pl.DataFrame(stats).sort('average', descending=True)

    # Extract voice identifiers from column names (e.g., "V14" from "Voice_Scale_1_10__V14").
    # split() returns the whole name when there is no '__' separator.
    labels = [col.split('__')[-1] for col in stats_df['column']]

    fig = go.Figure()

    fig.add_trace(go.Bar(
        x=labels,
        y=stats_df['average'],
        text=stats_df['count'],
        textposition='inside',
        textfont=dict(size=10, color='black'),
        marker_color=color,
        hovertemplate='<b>%{x}</b><br>Average: %{y:.2f}<br>Count: %{text}<extra></extra>'
    ))

    fig.update_layout(
        title=title,
        xaxis_title=x_label,
        yaxis_title=y_label,
        height=height,
        width=width,
        plot_bgcolor=ColorPalette.BACKGROUND,
        xaxis=dict(
            showgrid=True,
            gridcolor=ColorPalette.GRID,
            tickangle=-45
        ),
        yaxis=dict(
            # Fixed axis range: the y-axis always spans 0 to 10
            range=[0, 10],
            showgrid=True,
            gridcolor=ColorPalette.GRID
        ),
        font=dict(size=11)
    )

    return fig


def plot_top3_ranking_distribution(
    df: pl.DataFrame,
    title: str = "Top 3 Rankings Distribution<br>Count of 1st, 2nd, and 3rd Place Votes per Voice",
    x_label: str = "Voices",
    y_label: str = "Number of Mentions in Top 3",
    height: int = 500,
    width: int = 1000,
) -> go.Figure:
    """
    Create a stacked bar chart showing how often each voice was ranked 1st, 2nd, or 3rd.

    The total height of the bar represents the popularity (frequency of being in Top 3),
    while the segments show the quality of those rankings. Columns that never
    received a rank of 1, 2 or 3 are left out of the chart.

    Parameters
    ----------
    df : pl.DataFrame
        DataFrame containing ranking columns (values 1, 2, 3). A column named
        ``_recordId`` is ignored.
    title : str, optional
        Plot title.
    x_label : str, optional
        X-axis label.
    y_label : str, optional
        Y-axis label.
    height : int, optional
        Plot height in pixels.
    width : int, optional
        Plot width in pixels.

    Returns
    -------
    go.Figure
        Plotly figure object. An empty figure is returned when no column
        received any vote.
    """
    stats = []
    for col in (c for c in df.columns if c != '_recordId'):
        # Count the rows holding each rank value (1, 2, 3) in this column
        counts = {
            f'Rank {rank}': df.filter(pl.col(col) == rank).height
            for rank in (1, 2, 3)
        }
        total = sum(counts.values())

        # Only include voices that received at least one vote (keeps chart clean)
        if total > 0:
            stats.append({'column': col, **counts, 'Total': total})

    # With nothing to plot, pl.DataFrame([]) has no 'Total' column and the
    # sort below would raise; mirror plot_ranking_distribution instead.
    if not stats:
        return go.Figure()

    # Sort by Total count descending (most popular overall),
    # tie-break with Rank 1 count
    stats_df = pl.DataFrame(stats).sort(['Total', 'Rank 1'], descending=[True, True])

    # Extract voice identifiers from column names (text after the last '__')
    labels = [col.split('__')[-1] for col in stats_df['column']]

    fig = go.Figure()

    # Stack order: Rank 1 at bottom (base) -> Rank 2 -> Rank 3.
    # This makes it easy to compare the "First Choice" volume across bars.
    trace_specs = (
        ('Rank 1', 'Rank 1 (1st Choice)', ColorPalette.RANK_1),
        ('Rank 2', 'Rank 2 (2nd Choice)', ColorPalette.RANK_2),
        ('Rank 3', 'Rank 3 (3rd Choice)', ColorPalette.RANK_3),
    )
    for key, legend_name, bar_color in trace_specs:
        fig.add_trace(go.Bar(
            name=legend_name,
            x=labels,
            y=stats_df[key],
            marker_color=bar_color,
            hovertemplate='<b>%{x}</b><br>' + key + ': %{y}<extra></extra>'
        ))

    fig.update_layout(
        barmode='stack',
        title=title,
        xaxis_title=x_label,
        yaxis_title=y_label,
        height=height,
        width=width,
        plot_bgcolor=ColorPalette.BACKGROUND,
        xaxis=dict(
            showgrid=True,
            gridcolor=ColorPalette.GRID,
            tickangle=-45
        ),
        yaxis=dict(
            showgrid=True,
            gridcolor=ColorPalette.GRID
        ),
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1,
            traceorder="normal"
        ),
        font=dict(size=11)
    )

    return fig


def plot_ranking_distribution(
    df: pl.DataFrame,
    title: str = "Rankings Distribution<br>(1st to 4th Place)",
    x_label: str = "Item",
    y_label: str = "Number of Votes",
    height: int = 500,
    width: int = 1000,
) -> go.Figure:
    """
    Build a stacked bar chart of how often each item was ranked 1st through 4th.

    Items are ordered by their number of Rank 1 votes, with Rank 2 votes as
    the tie-breaker. Items that received no rank between 1 and 4 are omitted.

    Parameters
    ----------
    df : pl.DataFrame
        DataFrame containing ranking columns. Every column except
        ``_recordId`` is treated as a ranking column.
    title : str, optional
        Plot title.
    x_label : str, optional
        X-axis label.
    y_label : str, optional
        Y-axis label.
    height : int, optional
        Plot height in pixels.
    width : int, optional
        Plot width in pixels.

    Returns
    -------
    go.Figure
        Plotly figure object; an empty figure when nothing was ranked.
    """
    rank_keys = ('Rank 1', 'Rank 2', 'Rank 3', 'Rank 4')

    rows = []
    for name in df.columns:
        if name == '_recordId':
            continue

        # Number of rows holding each rank value (1..4) in this column
        tally = {
            key: df.filter(pl.col(name) == place).height
            for place, key in enumerate(rank_keys, start=1)
        }
        if sum(tally.values()) > 0:
            rows.append({'column': name, **tally})

    if not rows:
        return go.Figure()

    # Winner first: most Rank 1 votes, then most Rank 2 votes
    ranked = pl.DataFrame(rows).sort(['Rank 1', 'Rank 2'], descending=[True, True])

    def _pretty(raw_name):
        # e.g. "Character_Ranking_The_Coach" -> "The Coach"
        for prefix in ('Character_Ranking_', 'Top_3_Voices_ranking__'):
            raw_name = raw_name.replace(prefix, '')
        return raw_name.replace('_', ' ').strip()

    labels = [_pretty(raw_name) for raw_name in ranked['column']]

    # (data column, legend entry, bar colour), listed bottom-to-top of the stack
    segments = (
        ('Rank 1', 'Rank 1 (Best)', ColorPalette.RANK_1),
        ('Rank 2', 'Rank 2', ColorPalette.RANK_2),
        ('Rank 3', 'Rank 3', ColorPalette.RANK_3),
        ('Rank 4', 'Rank 4 (Worst)', ColorPalette.RANK_4),
    )

    fig = go.Figure()
    for key, legend_entry, segment_color in segments:
        fig.add_trace(go.Bar(
            name=legend_entry,
            x=labels,
            y=ranked[key],
            marker_color=segment_color,
            hovertemplate='<b>%{x}</b><br>' + key + ': %{y}<extra></extra>',
        ))

    grid_style = dict(showgrid=True, gridcolor=ColorPalette.GRID)
    legend_style = dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1,
        traceorder="normal",
    )

    fig.update_layout(
        barmode='stack',
        title=title,
        xaxis_title=x_label,
        yaxis_title=y_label,
        height=height,
        width=width,
        plot_bgcolor=ColorPalette.BACKGROUND,
        xaxis=dict(grid_style, tickangle=-45),
        yaxis=dict(grid_style),
        legend=legend_style,
        font=dict(size=11),
    )

    return fig


def plot_most_ranked_1(
    df: pl.DataFrame,
    title: str = "Most Popular Choice<br>(Number of Times Ranked 1st)",
    x_label: str = "Item",
    y_label: str = "Count of 1st Place Rankings",
    height: int = 500,
    width: int = 1000,
) -> go.Figure:
    """
    Create a bar chart showing which item (character/voice) was ranked #1 the most.

    Bars are ordered by their number of 1st-place rankings and the first three
    bars are highlighted. Items with zero 1st-place rankings are still shown.

    Parameters
    ----------
    df : pl.DataFrame
        DataFrame containing ranking columns. Every column except
        ``_recordId`` is treated as a ranking column.
    title : str, optional
        Plot title.
    x_label : str, optional
        X-axis label.
    y_label : str, optional
        Y-axis label.
    height : int, optional
        Plot height in pixels.
    width : int, optional
        Plot width in pixels.

    Returns
    -------
    go.Figure
        Plotly figure object. An empty figure is returned when ``df`` has no
        ranking columns.
    """
    # Identify ranking columns (assume all columns except _recordId)
    ranking_cols = [c for c in df.columns if c != '_recordId']

    # Without ranking columns pl.DataFrame([]) has no 'count' column and the
    # sort below would raise; mirror plot_ranking_distribution instead.
    if not ranking_cols:
        return go.Figure()

    # Count occurrences of rank 1 per column
    stats = [
        {'column': col, 'count': df.filter(pl.col(col) == 1).height}
        for col in ranking_cols
    ]

    # Sort by count descending
    stats_df = pl.DataFrame(stats).sort('count', descending=True)

    # Clean up labels: drop known prefixes, turn underscores into spaces
    labels = [
        col.replace('Character_Ranking_', '').replace('Top_3_Voices_ranking__', '').replace('_', ' ').strip()
        for col in stats_df['column']
    ]

    # Assign colors: the first 3 bars get PRIMARY, all others get NEUTRAL
    colors = [
        ColorPalette.PRIMARY if position < 3 else ColorPalette.NEUTRAL
        for position in range(stats_df.height)
    ]

    fig = go.Figure()

    fig.add_trace(go.Bar(
        x=labels,
        y=stats_df['count'],
        text=stats_df['count'],
        textposition='inside',
        textfont=dict(size=10, color='white'),
        marker_color=colors,
        hovertemplate='<b>%{x}</b><br>1st Place Votes: %{y}<extra></extra>'
    ))

    fig.update_layout(
        title=title,
        xaxis_title=x_label,
        yaxis_title=y_label,
        height=height,
        width=width,
        plot_bgcolor=ColorPalette.BACKGROUND,
        xaxis=dict(
            showgrid=True,
            gridcolor=ColorPalette.GRID,
            tickangle=-45
        ),
        yaxis=dict(
            showgrid=True,
            gridcolor=ColorPalette.GRID
        ),
        font=dict(size=11)
    )

    return fig


def plot_weighted_ranking_score(
    weighted_df: pl.DataFrame,
    title: str = "Weighted Popularity Score<br>(1st=3pts, 2nd=2pts, 3rd=1pt)",
    x_label: str = "Character Personality",
    y_label: str = "Total Weighted Score",
    color: str = ColorPalette.PRIMARY,
    height: int = 500,
    width: int = 1000,
) -> go.Figure:
    """
    Draw one bar per character, sized by its weighted ranking score.

    The scores are plotted exactly as supplied, in the row order of
    ``weighted_df``; no weighting or sorting is done here.

    Parameters
    ----------
    weighted_df : pl.DataFrame
        DataFrame with a 'Character' column (bar labels) and a
        'Weighted Score' column (bar heights and in-bar text).
    title : str, optional
        Plot title.
    x_label : str, optional
        X-axis label.
    y_label : str, optional
        Y-axis label.
    color : str, optional
        Bar color.
    height : int, optional
        Plot height.
    width : int, optional
        Plot width.

    Returns
    -------
    go.Figure
        Plotly figure object.
    """
    characters = weighted_df['Character']
    scores = weighted_df['Weighted Score']

    bars = go.Bar(
        x=characters,
        y=scores,
        text=scores,
        textposition='inside',
        textfont=dict(size=11, color='white'),
        marker_color=color,
        hovertemplate='<b>%{x}</b><br>Score: %{y}<extra></extra>',
    )

    fig = go.Figure()
    fig.add_trace(bars)

    # Both axes share the same grid styling; only the x-axis tilts its ticks
    grid_style = dict(showgrid=True, gridcolor=ColorPalette.GRID)

    fig.update_layout(
        title=title,
        xaxis_title=x_label,
        yaxis_title=y_label,
        height=height,
        width=width,
        plot_bgcolor=ColorPalette.BACKGROUND,
        xaxis=dict(grid_style, tickangle=-45),
        yaxis=dict(grid_style),
        font=dict(size=11),
    )

    return fig


def plot_voice_selection_counts(
    df: pl.DataFrame,
    target_column: str = "8_Combined",
    title: str = "Most Frequently Chosen Voices<br>(Top 8 Highlighted)",
    x_label: str = "Voice",
    y_label: str = "Number of Times Chosen",
    height: int = 500,
    width: int = 1000,
) -> go.Figure:
    """
    Create a bar plot showing the frequency of voice selections.

    Takes a column containing comma-separated values (e.g. "Voice 1, Voice 2..."),
    counts occurrences of each value, and highlights the 8 most frequent ones.
    Values with equal counts are ordered alphabetically so the bar order, and
    therefore the highlighting, is the same on every run.

    Parameters
    ----------
    df : pl.DataFrame
        DataFrame containing the selection column.
    target_column : str, optional
        Name of the column containing comma-separated voice selections.
        Defaults to "8_Combined".
    title : str, optional
        Plot title.
    x_label : str, optional
        X-axis label.
    y_label : str, optional
        Y-axis label.
    height : int, optional
        Plot height in pixels.
    width : int, optional
        Plot width in pixels.

    Returns
    -------
    go.Figure
        Plotly figure object. An empty figure is returned when
        ``target_column`` is not a column of ``df``.
    """
    if target_column not in df.columns:
        return go.Figure()

    selection = pl.col(target_column)

    # Process the data:
    # 1. Select the relevant column and remove nulls
    # 2. Split the comma-separated string into a list
    # 3. Explode the list so each voice gets its own row
    # 4. Strip whitespace ensuring "Voice 1" and " Voice 1" match
    # 5. Drop empty entries (e.g. from trailing commas) and count occurrences
    # 6. Sort by count; group_by does not guarantee group order, so the voice
    #    name is used as a secondary key to keep ties in a stable order
    stats_df = (
        df.select(selection)
        .drop_nulls()
        .with_columns(selection.str.split(","))
        .explode(target_column)
        .with_columns(selection.str.strip_chars())
        .filter(selection != "")
        .group_by(target_column)
        .agg(pl.len().alias("count"))
        .sort(["count", target_column], descending=[True, False])
    )

    # Define colors: first 8 bars get PRIMARY, rest get NEUTRAL
    colors = [
        ColorPalette.PRIMARY if position < 8 else ColorPalette.NEUTRAL
        for position in range(stats_df.height)
    ]

    fig = go.Figure()

    fig.add_trace(go.Bar(
        x=stats_df[target_column],
        y=stats_df['count'],
        text=stats_df['count'],
        textposition='outside',
        marker_color=colors,
        hovertemplate='<b>%{x}</b><br>Selections: %{y}<extra></extra>'
    ))

    fig.update_layout(
        title=title,
        xaxis_title=x_label,
        yaxis_title=y_label,
        height=height,
        width=width,
        plot_bgcolor=ColorPalette.BACKGROUND,
        xaxis=dict(
            showgrid=True,
            gridcolor=ColorPalette.GRID,
            tickangle=-45
        ),
        yaxis=dict(
            showgrid=True,
            gridcolor=ColorPalette.GRID
        ),
        font=dict(size=11),
    )

    return fig


def plot_top3_selection_counts(
    df: pl.DataFrame,
    target_column: str = "3_Ranked",
    title: str = "Most Frequently Chosen Top 3 Voices<br>(Top 3 Highlighted)",
    x_label: str = "Voice",
    y_label: str = "Count of Mentions in Top 3",
    height: int = 500,
    width: int = 1000,
) -> go.Figure:
    """
    Question: Which 3 voices are chosen the most out of 18?

    How many times does each voice end up in the top 3?
    (this is based on the survey question where participants need to choose 3 out
    of the earlier selected 8 voices). So how often each of the 18 stimuli ended
    up in participants' Top 3, after they first selected 8 out of 18.

    The column is split on commas, each entry is counted, and the 3 most
    frequent voices are highlighted. Voices with equal counts are ordered
    alphabetically so the bar order, and therefore the highlighting, is the
    same on every run.

    Parameters
    ----------
    df : pl.DataFrame
        DataFrame containing the ranking column (comma-separated strings).
    target_column : str, optional
        Name of the column containing comma-separated Top 3 voice selections.
        Defaults to "3_Ranked".
    title : str, optional
        Plot title.
    x_label : str, optional
        X-axis label.
    y_label : str, optional
        Y-axis label.
    height : int, optional
        Plot height in pixels.
    width : int, optional
        Plot width in pixels.

    Returns
    -------
    go.Figure
        Plotly figure object. An empty figure is returned when
        ``target_column`` is not a column of ``df``.
    """
    if target_column not in df.columns:
        return go.Figure()

    selection = pl.col(target_column)

    # Process the data:
    # 1. Select the relevant column and remove nulls
    # 2. Split the comma-separated string and explode to one voice per row
    # 3. Strip surrounding whitespace and drop empty entries
    # 4. Count occurrences per voice
    # 5. Sort by count; group_by does not guarantee group order, so the voice
    #    name is used as a secondary key to keep ties in a stable order
    stats_df = (
        df.select(selection)
        .drop_nulls()
        .with_columns(selection.str.split(","))
        .explode(target_column)
        .with_columns(selection.str.strip_chars())
        .filter(selection != "")
        .group_by(target_column)
        .agg(pl.len().alias("count"))
        .sort(["count", target_column], descending=[True, False])
    )

    # Define colors: first 3 bars get PRIMARY, rest get NEUTRAL
    colors = [
        ColorPalette.PRIMARY if position < 3 else ColorPalette.NEUTRAL
        for position in range(stats_df.height)
    ]

    fig = go.Figure()

    fig.add_trace(go.Bar(
        x=stats_df[target_column],
        y=stats_df['count'],
        text=stats_df['count'],
        textposition='outside',
        marker_color=colors,
        hovertemplate='<b>%{x}</b><br>In Top 3: %{y} times<extra></extra>'
    ))

    fig.update_layout(
        title=title,
        xaxis_title=x_label,
        yaxis_title=y_label,
        height=height,
        width=width,
        plot_bgcolor=ColorPalette.BACKGROUND,
        xaxis=dict(
            showgrid=True,
            gridcolor=ColorPalette.GRID,
            tickangle=-45
        ),
        yaxis=dict(
            showgrid=True,
            gridcolor=ColorPalette.GRID
        ),
        font=dict(size=11),
    )

    return fig


def plot_speaking_style_trait_scores(
    df: pl.DataFrame,
    trait_description: str = None,
    left_anchor: str = None,
    right_anchor: str = None,
    title: str = "Speaking Style Trait Analysis",
    height: int = 500,
    width: int = 1000,
) -> go.Figure:
    """
    Plot scores for a single speaking style trait across multiple voices.

    The plot shows the average score per Voice, sorted by score (highest
    first), with the number of non-null scores printed on each bar.
    It expects the DataFrame to contain 'Voice' and 'score' columns,
    typically filtered for a single trait/description.

    Parameters
    ----------
    df : pl.DataFrame
        DataFrame containing at least 'Voice' and 'score' columns.
        Produced by utils.process_speaking_style_data and filtered.
    trait_description : str, optional
        Description of the trait being analyzed (e.g. "Indifferent : Attentive").
        If not provided, it is built from the two anchors when both are
        available, otherwise taken from the first non-null value of the
        'Description' column, otherwise left empty.
    left_anchor : str, optional
        Label for the lower end of the scale (e.g. "Indifferent").
        If not provided, the first non-null value of the 'Left_Anchor'
        column is used when that column exists.
    right_anchor : str, optional
        Label for the upper end of the scale (e.g. "Attentive").
        If not provided, the first non-null value of the 'Right_Anchor'
        column is used when that column exists.
    title : str, optional
        Plot title.
    height : int, optional
        Plot height.
    width : int, optional
        Plot width.

    Returns
    -------
    go.Figure
        Plotly figure object. An empty figure is returned when ``df`` is
        empty or lacks the 'Voice' or 'score' column.

    Notes
    -----
    The two anchors are looked up independently of each other, so a missing
    'Right_Anchor' column no longer raises and 'Right_Anchor' is still read
    when 'Left_Anchor' is absent. Only the text before the first '|' of an
    anchor is displayed.
    """
    if df.is_empty():
        return go.Figure()

    if "Voice" not in df.columns or "score" not in df.columns:
        return go.Figure()

    def first_non_null(column):
        # First non-null value of `column`; None if the column is absent or all-null
        if column not in df.columns:
            return None
        values = df[column].drop_nulls()
        return values[0] if values.len() > 0 else None

    # Calculate stats per voice: mean score and number of scores
    stats = (
        df.filter(pl.col("score").is_not_null())
        .group_by("Voice")
        .agg([
            pl.col("score").mean().alias("mean_score"),
            pl.col("score").count().alias("count")
        ])
        .sort("mean_score", descending=True)  # Highest average on the left
    )

    # Fall back to the anchor columns for whichever anchor was not supplied
    if left_anchor is None:
        left_anchor = first_non_null("Left_Anchor")
    if right_anchor is None:
        right_anchor = first_non_null("Right_Anchor")

    if trait_description is None:
        if left_anchor and right_anchor:
            trait_description = f"{left_anchor.split('|')[0]} vs. {right_anchor.split('|')[0]}"
        else:
            # Try getting it from the Description column, else leave it blank
            description = first_non_null("Description")
            trait_description = description if description is not None else ""

    fig = go.Figure()

    fig.add_trace(go.Bar(
        x=stats["Voice"],
        y=stats["mean_score"],
        text=stats["count"],
        textposition='inside',
        texttemplate='%{text}',  # Count on bar
        marker_color=ColorPalette.PRIMARY,
        hovertemplate='<b>%{x}</b><br>Average: %{y:.2f}<br>Count: %{text}<extra></extra>'
    ))

    # Anchor labels sit just right of the plot area (x in paper coordinates),
    # aligned with y=1 and y=5 on the score axis
    annotations = []
    if left_anchor:
        annotations.append(dict(
            xref='paper', yref='y',
            x=1.01, y=1,
            xanchor='left', yanchor='middle',
            text=f"<b>1: {left_anchor.split('|')[0]}</b>",
            showarrow=False,
            font=dict(size=10, color='gray')
        ))
    if right_anchor:
        annotations.append(dict(
            xref='paper', yref='y',
            x=1.01, y=5,
            xanchor='left', yanchor='middle',
            text=f"<b>5: {right_anchor.split('|')[0]}</b>",
            showarrow=False,
            font=dict(size=10, color='gray')
        ))

    fig.update_layout(
        title=dict(
            text=f"{title}<br><sub>{trait_description}</sub><br><sub>(Numbers on bars indicate respondent count)</sub>",
            y=0.92
        ),
        xaxis_title="Voice",
        yaxis_title="Average Score (1-5)",
        height=height,
        width=width,
        plot_bgcolor=ColorPalette.BACKGROUND,
        yaxis=dict(
            range=[1, 5],
            showgrid=True,
            gridcolor=ColorPalette.GRID,
            zeroline=False
        ),
        xaxis=dict(
            showgrid=False
        ),
        # Extra right margin leaves room for the anchor annotations
        margin=dict(r=150),
        annotations=annotations,
        font=dict(size=11)
    )
    return fig