speaking style trait scores vertical

2026-01-23 12:26:47 +01:00
parent 424355f4a1
commit 84a0f8052e
5 changed files with 615 additions and 90 deletions
--- a/plots.py
+++ b/plots.py
@@ -216,22 +216,22 @@ def plot_top3_ranking_distribution(
    return fig


-def plot_character_ranking_distribution(
+def plot_ranking_distribution(
    df: pl.DataFrame,
-    title: str = "Character Personality Rankings<br>Distribution of Votes (1st to 4th Place)",
-    x_label: str = "Character Personality",
+    title: str = "Rankings Distribution<br>(1st to 4th Place)",
+    x_label: str = "Item",
    y_label: str = "Number of Votes",
    height: int = 500,
    width: int = 1000,
 ) -> go.Figure:
    """
-    Create a stacked bar chart showing the distribution of rankings (1st to 4th) for character personalities.
-    Sorted by the number of Rank 1 votes to highlight the 'Best' options.
+    Create a stacked bar chart showing the distribution of rankings (1st to 4th) for characters or voices.
+    Sorted by the number of Rank 1 votes.

    Parameters
    ----------
    df : pl.DataFrame
-        DataFrame containing character ranking columns (prefix 'Character_Ranking').
+        DataFrame containing ranking columns.
    title : str, optional
        Plot title.
    x_label : str, optional
@@ -249,8 +249,8 @@ def plot_character_ranking_distribution(
        Plotly figure object.
    """
    stats = []
-    # Identify columns related to Character Ranking (excluding ID)
-    ranking_cols = [c for c in df.columns if 'Character_Ranking' in c]
+    # Identify ranking columns (assume all columns except _recordId)
+    ranking_cols = [c for c in df.columns if c != '_recordId']

    for col in ranking_cols:
        # Count occurrences of each rank (1, 2, 3, 4)
@@ -280,7 +280,7 @@ def plot_character_ranking_distribution(
    # Clean up labels: Remove prefix and underscores
    # e.g. "Character_Ranking_The_Coach" -> "The Coach"
    labels = [
-        col.replace('Character_Ranking_', '').replace('_', ' ').strip() 
+        col.replace('Character_Ranking_', '').replace('Top_3_Voices_ranking__', '').replace('_', ' ').strip() 
        for col in stats_df['column']
    ]

@@ -354,21 +354,22 @@ def plot_character_ranking_distribution(
    return fig


-def plot_most_ranked_1_character(
+def plot_most_ranked_1(
    df: pl.DataFrame,
-    title: str = "Most Popular Character Personality<br>(Number of Times Ranked 1st)",
-    x_label: str = "Character Personality",
+    title: str = "Most Popular Choice<br>(Number of Times Ranked 1st)",
+    x_label: str = "Item",
    y_label: str = "Count of 1st Place Rankings",
    height: int = 500,
    width: int = 1000,
 ) -> go.Figure:
    """
-    Create a bar chart showing which character personality was ranked #1 the most.
+    Create a bar chart showing which item (character/voice) was ranked #1 the most.
+    Top 3 items are highlighted.

    Parameters
    ----------
    df : pl.DataFrame
-        DataFrame containing character ranking columns.
+        DataFrame containing ranking columns.
    title : str, optional
        Plot title.
    x_label : str, optional
@@ -386,8 +387,8 @@ def plot_most_ranked_1_character(
        Plotly figure object.
    """
    stats = []
-    # Identify columns related to Character Ranking
-    ranking_cols = [c for c in df.columns if 'Character_Ranking' in c]
+    # Identify ranking columns (assume all columns except _recordId)
+    ranking_cols = [c for c in df.columns if c != '_recordId']

    for col in ranking_cols:
        # Count occurrences of rank 1
@@ -403,19 +404,25 @@ def plot_most_ranked_1_character(

    # Clean up labels
    labels = [
-        col.replace('Character_Ranking_', '').replace('_', ' ').strip() 
+        col.replace('Character_Ranking_', '').replace('Top_3_Voices_ranking__', '').replace('_', ' ').strip() 
        for col in stats_df['column']
    ]

-    fig = go.Figure()
+    # Assign colors: Top 3 get PRIMARY (Blue), others get NEUTRAL (Grey)
+    colors = [
+        ColorPalette.PRIMARY if i < 3 else ColorPalette.NEUTRAL
+        for i in range(len(stats_df))
+    ]

+    fig = go.Figure()
+    
    fig.add_trace(go.Bar(
        x=labels,
        y=stats_df['count'],
        text=stats_df['count'],
        textposition='inside',
        textfont=dict(size=10, color='white'),
-        marker_color=ColorPalette.PRIMARY,
+        marker_color=colors,
        hovertemplate='<b>%{x}</b><br>1st Place Votes: %{y}<extra></extra>'
    ))

@@ -444,7 +451,7 @@ def plot_most_ranked_1_character(

 def plot_weighted_ranking_score(
    weighted_df: pl.DataFrame,
-    title: str = "Character Popularity Score<br>(Weighted: 1st=3pts, 2nd=2pts, 3rd=1pt)",
+    title: str = "Weighted Popularity Score<br>(1st=3pts, 2nd=2pts, 3rd=1pt)",
    x_label: str = "Character Personality",
    y_label: str = "Total Weighted Score",
    color: str = ColorPalette.PRIMARY,
@@ -508,4 +515,339 @@ def plot_weighted_ranking_score(
        font=dict(size=11)
    )

-    return fig
+    return fig
+
+
+def plot_voice_selection_counts(
+    df: pl.DataFrame,
+    target_column: str = "8_Combined",
+    title: str = "Most Frequently Chosen Voices<br>(Top 8 Highlighted)",
+    x_label: str = "Voice",
+    y_label: str = "Number of Times Chosen",
+    height: int = 500,
+    width: int = 1000,
+) -> go.Figure:
+    """
+    Create a bar plot showing how often each voice was selected.
+
+    Each cell of ``target_column`` is treated as a comma-separated list
+    (e.g. "Voice 1, Voice 2"). Entries are split, trimmed, counted, and
+    drawn in descending order of count; the first 8 bars are highlighted.
+
+    Parameters
+    ----------
+    df : pl.DataFrame
+        DataFrame containing the selection column.
+    target_column : str, optional
+        Name of the column containing comma-separated voice selections.
+        Defaults to "8_Combined".
+    title : str, optional
+        Plot title.
+    x_label : str, optional
+        X-axis label.
+    y_label : str, optional
+        Y-axis label.
+    height : int, optional
+        Plot height in pixels.
+    width : int, optional
+        Plot width in pixels.
+
+    Returns
+    -------
+    go.Figure
+        Plotly figure object. An empty figure is returned when
+        ``target_column`` is not present in ``df``.
+    """
+    if target_column not in df.columns:
+        return go.Figure()
+
+    # One row per individual selection: drop nulls, split on commas,
+    # explode, and trim so "Voice 1" and " Voice 1" count as the same voice.
+    selections = (
+        df.select(pl.col(target_column))
+        .drop_nulls()
+        .with_columns(pl.col(target_column).str.split(","))
+        .explode(target_column)
+        .with_columns(pl.col(target_column).str.strip_chars())
+        .filter(pl.col(target_column) != "")
+    )
+
+    # group_by gives no ordering guarantee, so ties are broken by voice name;
+    # otherwise the bar order (and which bars land in the highlighted top 8)
+    # could differ between runs on identical data.
+    stats_df = (
+        selections.group_by(target_column)
+        .agg(pl.len().alias("count"))
+        .sort(["count", target_column], descending=[True, False])
+    )
+
+    # Top 8 get PRIMARY, rest get NEUTRAL
+    highlighted = min(8, stats_df.height)
+    colors = (
+        [ColorPalette.PRIMARY] * highlighted
+        + [ColorPalette.NEUTRAL] * (stats_df.height - highlighted)
+    )
+
+    fig = go.Figure()
+
+    fig.add_trace(go.Bar(
+        x=stats_df[target_column],
+        y=stats_df['count'],
+        text=stats_df['count'],
+        textposition='outside',
+        marker_color=colors,
+        hovertemplate='<b>%{x}</b><br>Selections: %{y}<extra></extra>'
+    ))
+
+    # Grid lines on both axes; x tick labels are rotated by -45 degrees.
+    fig.update_layout(
+        title=title,
+        xaxis_title=x_label,
+        yaxis_title=y_label,
+        height=height,
+        width=width,
+        plot_bgcolor=ColorPalette.BACKGROUND,
+        xaxis=dict(showgrid=True, gridcolor=ColorPalette.GRID, tickangle=-45),
+        yaxis=dict(showgrid=True, gridcolor=ColorPalette.GRID),
+        font=dict(size=11),
+    )
+
+    return fig
+
+
+def plot_top3_selection_counts(
+    df: pl.DataFrame,
+    target_column: str = "3_Ranked",
+    title: str = "Most Frequently Chosen Top 3 Voices<br>(Top 3 Highlighted)",
+    x_label: str = "Voice",
+    y_label: str = "Count of Mentions in Top 3",
+    height: int = 500,
+    width: int = 1000,
+) -> go.Figure:
+    """
+    Create a bar plot of how often each voice appears in a Top 3 selection.
+
+    Answers the question "Which 3 voices are chosen the most out of 18?",
+    i.e. how often each of the 18 stimuli ended up in participants' Top 3
+    (the survey question where participants choose 3 out of the 8 voices
+    they selected earlier). The three most frequent voices are highlighted.
+
+    Parameters
+    ----------
+    df : pl.DataFrame
+        DataFrame containing the ranking column (comma-separated strings).
+    target_column : str, optional
+        Name of the column containing comma-separated Top 3 voice selections.
+        Defaults to "3_Ranked".
+    title : str, optional
+        Plot title.
+    x_label : str, optional
+        X-axis label.
+    y_label : str, optional
+        Y-axis label.
+    height : int, optional
+        Plot height in pixels.
+    width : int, optional
+        Plot width in pixels.
+
+    Returns
+    -------
+    go.Figure
+        Plotly figure object. An empty figure is returned when
+        ``target_column`` is not present in ``df``.
+    """
+    if target_column not in df.columns:
+        return go.Figure()
+
+    # Split each comma-separated cell into one trimmed voice per row, drop
+    # empty entries, then count how many times every voice occurs.
+    voice = pl.col(target_column)
+    stats_df = (
+        df.select(voice)
+        .drop_nulls()
+        .with_columns(voice.str.split(","))
+        .explode(target_column)
+        .with_columns(voice.str.strip_chars())
+        .filter(voice != "")
+        .group_by(target_column)
+        .agg(pl.len().alias("count"))
+        # group_by gives no ordering guarantee, so equal counts are ordered
+        # by voice name; this keeps the bar order and the highlighted top 3
+        # reproducible across runs on the same data.
+        .sort(["count", target_column], descending=[True, False])
+    )
+
+    # Bars are already sorted by count, so the first three positions are
+    # the Top 3: those get PRIMARY, the rest get NEUTRAL.
+    colors = [
+        ColorPalette.PRIMARY if position < 3 else ColorPalette.NEUTRAL
+        for position in range(stats_df.height)
+    ]
+
+    fig = go.Figure()
+
+    fig.add_trace(go.Bar(
+        x=stats_df[target_column],
+        y=stats_df['count'],
+        text=stats_df['count'],
+        textposition='outside',
+        marker_color=colors,
+        hovertemplate='<b>%{x}</b><br>In Top 3: %{y} times<extra></extra>'
+    ))
+
+    # Grid lines on both axes; x tick labels are rotated by -45 degrees.
+    fig.update_layout(
+        title=title,
+        xaxis_title=x_label,
+        yaxis_title=y_label,
+        height=height,
+        width=width,
+        plot_bgcolor=ColorPalette.BACKGROUND,
+        xaxis=dict(showgrid=True, gridcolor=ColorPalette.GRID, tickangle=-45),
+        yaxis=dict(showgrid=True, gridcolor=ColorPalette.GRID),
+        font=dict(size=11),
+    )
+
+    return fig
+
+
+def _first_non_null(df: pl.DataFrame, column: str):
+    """Return the first non-null value in ``column``; None if absent or all null."""
+    if column not in df.columns:
+        return None
+    values = df.get_column(column).drop_nulls()
+    return values[0] if len(values) else None
+
+
+def plot_speaking_style_trait_scores(
+    df: pl.DataFrame,
+    trait_description: str = None,
+    left_anchor: str = None,
+    right_anchor: str = None,
+    title: str = "Speaking Style Trait Analysis",
+    height: int = 500,
+    width: int = 1000,
+) -> go.Figure:
+    """
+    Plot scores for a single speaking style trait across multiple voices.
+
+    Draws one bar per Voice showing its mean score, sorted from highest to
+    lowest, with the number of non-null scores printed on each bar. The scale
+    anchors are shown to the right of the plot at y=1 and y=5.
+
+    Parameters
+    ----------
+    df : pl.DataFrame
+        DataFrame containing at least 'Voice' and 'score' columns,
+        typically already filtered for a single trait/description
+        (e.g. the output of utils.process_speaking_style_data).
+    trait_description : str, optional
+        Description of the trait being analyzed (e.g. "Indifferent : Attentive").
+        If not provided, it is built from the two anchors, falling back to
+        the first non-null 'Description' value in df, then to "".
+    left_anchor : str, optional
+        Label for the lower end of the scale (e.g. "Indifferent").
+        If not provided, read from the 'Left_Anchor' column when present.
+    right_anchor : str, optional
+        Label for the upper end of the scale (e.g. "Attentive").
+        If not provided, read from the 'Right_Anchor' column when present.
+    title : str, optional
+        Plot title.
+    height : int, optional
+        Plot height.
+    width : int, optional
+        Plot width.
+
+    Returns
+    -------
+    go.Figure
+        Plotly figure object. An empty figure is returned when df is empty
+        or lacks the 'Voice' or 'score' column.
+    """
+    # Nothing to plot without data or without the two required columns.
+    if df.is_empty() or not {"Voice", "score"}.issubset(df.columns):
+        return go.Figure()
+
+    # Mean score and respondent count per voice. Ties on the mean are broken
+    # by voice name because group_by gives no ordering guarantee of its own.
+    stats = (
+        df.filter(pl.col("score").is_not_null())
+        .group_by("Voice")
+        .agg([
+            pl.col("score").mean().alias("mean_score"),
+            pl.col("score").count().alias("count"),
+        ])
+        .sort(["mean_score", "Voice"], descending=[True, False])
+    )
+
+    # Each anchor is looked up in its own column, so a frame carrying only
+    # one of 'Left_Anchor' / 'Right_Anchor' still works.
+    if left_anchor is None:
+        left_anchor = _first_non_null(df, "Left_Anchor")
+    if right_anchor is None:
+        right_anchor = _first_non_null(df, "Right_Anchor")
+
+    # Only the part before the first '|' of an anchor is displayed.
+    left_label = left_anchor.split('|')[0] if left_anchor else None
+    right_label = right_anchor.split('|')[0] if right_anchor else None
+
+    # Subtitle precedence: explicit argument, anchors, 'Description' column, "".
+    if trait_description is None:
+        if left_label is not None and right_label is not None:
+            trait_description = f"{left_label} vs. {right_label}"
+        else:
+            description = _first_non_null(df, "Description")
+            trait_description = "" if description is None else description
+
+    fig = go.Figure()
+
+    fig.add_trace(go.Bar(
+        x=stats["Voice"],
+        y=stats["mean_score"],
+        text=stats["count"],
+        textposition='inside',
+        texttemplate='%{text}',  # respondent count drawn on the bar
+        marker_color=ColorPalette.PRIMARY,
+        hovertemplate='<b>%{x}</b><br>Average: %{y:.2f}<br>Count: %{text}<extra></extra>'
+    ))
+
+    # Anchor labels sit just right of the plot area (paper x=1.01), aligned
+    # with the scale values 1 and 5 on the y axis.
+    annotations = [
+        dict(
+            xref='paper', yref='y',
+            x=1.01, y=scale_value,
+            xanchor='left', yanchor='middle',
+            text=f"<b>{scale_value}: {label}</b>",
+            showarrow=False,
+            font=dict(size=10, color='gray')
+        )
+        for scale_value, label in ((1, left_label), (5, right_label))
+        if label is not None
+    ]
+
+    fig.update_layout(
+        title=dict(
+            text=f"{title}<br><sub>{trait_description}</sub><br><sub>(Numbers on bars indicate respondent count)</sub>",
+            y=0.92
+        ),
+        xaxis_title="Voice",
+        yaxis_title="Average Score (1-5)",
+        height=height,
+        width=width,
+        plot_bgcolor=ColorPalette.BACKGROUND,
+        yaxis=dict(
+            range=[1, 5],
+            showgrid=True,
+            gridcolor=ColorPalette.GRID,
+            zeroline=False
+        ),
+        xaxis=dict(
+            showgrid=False
+        ),
+        margin=dict(r=150),  # room on the right for the anchor labels
+        annotations=annotations,
+        font=dict(size=11)
+    )
+
+    return fig