Character personality ranking complete

2026-01-23 10:50:26 +01:00
parent 5327b50ab0
commit 424355f4a1
5 changed files with 421 additions and 7 deletions
--- a/01_ingest_qualtrics_export.py
+++ b/01_ingest_qualtrics_export.py
@@ -44,6 +44,14 @@ def _(survey):
    return


+# NOTE(review): this registers a cell whose source is the incomplete expression `data.`;
+# presumably a leftover scratch cell saved by the notebook editor -- confirm and remove
+# before merging.
+app._unparsable_cell(
+    r"""
+    data.
+    """,
+    name="_"
+)
+
+
@app.cell
 def _(mo):
    mo.md(r"""
--- a/02_quant_analysis.py
+++ b/02_quant_analysis.py
@@ -11,15 +11,20 @@ def _():
    from pathlib import Path

    from validation import check_progress, duration_validation
-    from utils import JPMCSurvey, combine_exclusive_columns
-    from plots import plot_average_scores_with_counts, plot_top3_ranking_distribution
+    from utils import JPMCSurvey, combine_exclusive_columns, calculate_weighted_ranking_scores
+    from plots import plot_average_scores_with_counts, plot_top3_ranking_distribution, plot_character_ranking_distribution, plot_most_ranked_1_character, plot_weighted_ranking_score
    return (
        JPMCSurvey,
        Path,
+        calculate_weighted_ranking_scores,
        check_progress,
        duration_validation,
        mo,
        plot_average_scores_with_counts,
+        plot_character_ranking_distribution,
+        plot_most_ranked_1_character,
+        plot_top3_ranking_distribution,
+        plot_weighted_ranking_score,
    )


@@ -108,12 +113,49 @@ def _(mo):
    mo.md(r"""
    ## Character personality ranking

-    1. Which character personality is ranked best?
-    2. Which character personality is ranked number 1 the most?
+    ### 1. Which character personality is ranked best?
    """)
    return


+@app.cell
+def _(data, survey):
+    # get_character_ranking returns an indexable result; only its first element is used,
+    # and .collect() is called on it before sharing it with the plotting cells.
+    char_rank = survey.get_character_ranking(data)[0].collect()
+
+    return (char_rank,)
+
+
+@app.cell
+def _(char_rank, plot_character_ranking_distribution):
+    # Stacked rank-distribution chart for the character ranking data.
+    plot_character_ranking_distribution(char_rank, x_label='Character Personality', width=1000)
+    return
+
+
+@app.cell
+def _(mo):
+    # NOTE(review): the cell right after this heading plots the weighted ranking score;
+    # the "ranked number 1" chart only comes one cell later -- confirm the intended order.
+    mo.md(r"""
+    ### 2. Which character personality is ranked number 1 the most?
+    """)
+    return
+
+
+@app.cell
+def _(
+    calculate_weighted_ranking_scores,
+    char_rank,
+    plot_weighted_ranking_score,
+):
+    # Aggregate the character rankings into one weighted score per character.
+    char_rank_weighted = calculate_weighted_ranking_scores(char_rank)
+    # The x-axis lists character personalities (not voices), so label it accordingly.
+    plot_weighted_ranking_score(char_rank_weighted, x_label='Character Personality', width=1000)
+    return
+
+
+@app.cell
+def _(char_rank, plot_most_ranked_1_character):
+    # Bar chart of how often each character personality was ranked 1st.
+    plot_most_ranked_1_character(char_rank, x_label='Character Personality', width=1000)
+    return
+
+
@app.cell(hide_code=True)
 def _(mo):
    mo.md(r"""
@@ -122,6 +164,13 @@ def _(mo):
    return


+@app.cell
+def _(data, survey):
+    # Quick inspection only: prints the first rows of the 18-8-3 subset.
+    # v_18_8_3 is not returned, so other cells cannot depend on it.
+    v_18_8_3 = survey.get_18_8_3(data)[0].collect()
+    print(v_18_8_3.head())
+    return
+
+
@app.cell(hide_code=True)
 def _(mo):
    mo.md(r"""
@@ -147,6 +196,12 @@ def _(mo):
    return


+@app.cell
+def _(plot_top3_ranking_distribution, top3_voices):
+    # Top-3 ranking distribution chart; top3_voices is defined in another cell.
+    plot_top3_ranking_distribution(top3_voices, x_label='Voice', width=1000)
+    return
+
+
@app.cell(hide_code=True)
 def _(mo):
    mo.md(r"""
--- a/plots.py
+++ b/plots.py
@@ -5,6 +5,7 @@ import polars as pl
 from theme import ColorPalette


+
 def plot_average_scores_with_counts(
    df: pl.DataFrame,
    title: str = "General Impression (1-10)<br>Per Voice with Number of Participants Who Rated It",
@@ -213,3 +214,298 @@ def plot_top3_ranking_distribution(
    )

    return fig
+
+
+def plot_character_ranking_distribution(
+    df: pl.DataFrame,
+    title: str = "Character Personality Rankings<br>Distribution of Votes (1st to 4th Place)",
+    x_label: str = "Character Personality",
+    y_label: str = "Number of Votes",
+    height: int = 500,
+    width: int = 1000,
+) -> go.Figure:
+    """
+    Draw a stacked bar chart of how many times each character personality got rank 1 to 4.
+
+    Bars are ordered by the Rank 1 count, then by the Rank 2 count (both descending).
+    Columns that received no rank 1-4 votes at all are left out; when nothing remains,
+    an empty figure is returned.
+
+    Parameters
+    ----------
+    df : pl.DataFrame
+        Frame whose ranking columns contain 'Character_Ranking' in their name.
+    title : str, optional
+        Plot title.
+    x_label : str, optional
+        X-axis label.
+    y_label : str, optional
+        Y-axis label.
+    height : int, optional
+        Plot height in pixels.
+    width : int, optional
+        Plot width in pixels.
+
+    Returns
+    -------
+    go.Figure
+        Plotly figure object.
+    """
+    rank_keys = ('Rank 1', 'Rank 2', 'Rank 3', 'Rank 4')
+
+    # One row of vote counts per ranking column that received at least one vote
+    rows = []
+    for name in df.columns:
+        if 'Character_Ranking' not in name:
+            continue
+        counts = {
+            key: df.filter(pl.col(name) == rank).height
+            for rank, key in enumerate(rank_keys, start=1)
+        }
+        if sum(counts.values()) > 0:
+            rows.append({'column': name, **counts})
+
+    if not rows:
+        return go.Figure()
+
+    # Winner first: most Rank 1 votes, ties broken by Rank 2 votes
+    ordered = pl.DataFrame(rows).sort(['Rank 1', 'Rank 2'], descending=[True, True])
+
+    # "Character_Ranking_The_Coach" -> "The Coach"
+    tick_labels = [
+        raw.replace('Character_Ranking_', '').replace('_', ' ').strip()
+        for raw in ordered['column']
+    ]
+
+    # (legend name, stats column, bar colour), stacked from best to worst rank;
+    # the lowest rank uses a neutral grey to keep the focus on the top ranks
+    segments = [
+        ('Rank 1 (Best)', 'Rank 1', ColorPalette.RANK_1),
+        ('Rank 2', 'Rank 2', ColorPalette.RANK_2),
+        ('Rank 3', 'Rank 3', ColorPalette.RANK_3),
+        ('Rank 4 (Worst)', 'Rank 4', ColorPalette.RANK_4),
+    ]
+
+    fig = go.Figure()
+    for legend_name, key, bar_color in segments:
+        fig.add_trace(go.Bar(
+            name=legend_name,
+            x=tick_labels,
+            y=ordered[key],
+            marker_color=bar_color,
+            hovertemplate='<b>%{x}</b><br>' + key + ': %{y}<extra></extra>'
+        ))
+
+    x_axis = dict(showgrid=True, gridcolor=ColorPalette.GRID, tickangle=-45)
+    y_axis = dict(showgrid=True, gridcolor=ColorPalette.GRID)
+    legend_box = dict(
+        orientation="h",
+        yanchor="bottom",
+        y=1.02,
+        xanchor="right",
+        x=1,
+        traceorder="normal"
+    )
+
+    fig.update_layout(
+        barmode='stack',
+        title=title,
+        xaxis_title=x_label,
+        yaxis_title=y_label,
+        height=height,
+        width=width,
+        plot_bgcolor=ColorPalette.BACKGROUND,
+        xaxis=x_axis,
+        yaxis=y_axis,
+        legend=legend_box,
+        font=dict(size=11)
+    )
+
+    return fig
+
+
+def plot_most_ranked_1_character(
+    df: pl.DataFrame,
+    title: str = "Most Popular Character Personality<br>(Number of Times Ranked 1st)",
+    x_label: str = "Character Personality",
+    y_label: str = "Count of 1st Place Rankings",
+    height: int = 500,
+    width: int = 1000,
+) -> go.Figure:
+    """
+    Create a bar chart showing which character personality was ranked #1 the most.
+
+    Bars are sorted by the number of 1st-place rankings, descending. If `df` has no
+    character ranking columns, an empty figure is returned.
+
+    Parameters
+    ----------
+    df : pl.DataFrame
+        DataFrame whose ranking columns contain 'Character_Ranking' in their name.
+    title : str, optional
+        Plot title.
+    x_label : str, optional
+        X-axis label.
+    y_label : str, optional
+        Y-axis label.
+    height : int, optional
+        Plot height in pixels.
+    width : int, optional
+        Plot width in pixels.
+
+    Returns
+    -------
+    go.Figure
+        Plotly figure object.
+    """
+    # Identify columns related to Character Ranking
+    ranking_cols = [c for c in df.columns if 'Character_Ranking' in c]
+
+    # Nothing to count: return an empty figure (as plot_character_ranking_distribution
+    # does) rather than sorting a frame that has no 'count' column.
+    if not ranking_cols:
+        return go.Figure()
+
+    # Number of rows in which each character was ranked 1st
+    stats = [
+        {'column': col, 'count': df.filter(pl.col(col) == 1).height}
+        for col in ranking_cols
+    ]
+
+    # Sort by count descending
+    stats_df = pl.DataFrame(stats).sort('count', descending=True)
+
+    # Clean up labels, e.g. "Character_Ranking_The_Coach" -> "The Coach"
+    labels = [
+        col.replace('Character_Ranking_', '').replace('_', ' ').strip()
+        for col in stats_df['column']
+    ]
+
+    fig = go.Figure()
+
+    fig.add_trace(go.Bar(
+        x=labels,
+        y=stats_df['count'],
+        text=stats_df['count'],
+        textposition='inside',
+        textfont=dict(size=10, color='white'),
+        marker_color=ColorPalette.PRIMARY,
+        hovertemplate='<b>%{x}</b><br>1st Place Votes: %{y}<extra></extra>'
+    ))
+
+    fig.update_layout(
+        title=title,
+        xaxis_title=x_label,
+        yaxis_title=y_label,
+        height=height,
+        width=width,
+        plot_bgcolor=ColorPalette.BACKGROUND,
+        xaxis=dict(
+            showgrid=True,
+            gridcolor=ColorPalette.GRID,
+            tickangle=-45
+        ),
+        yaxis=dict(
+            showgrid=True,
+            gridcolor=ColorPalette.GRID
+        ),
+        font=dict(size=11)
+    )
+
+    return fig
+
+
+
+def plot_weighted_ranking_score(
+    weighted_df: pl.DataFrame,
+    title: str = "Character Popularity Score<br>(Weighted: 1st=3pts, 2nd=2pts, 3rd=1pt)",
+    x_label: str = "Character Personality",
+    y_label: str = "Total Weighted Score",
+    color: str = ColorPalette.PRIMARY,
+    height: int = 500,
+    width: int = 1000,
+) -> go.Figure:
+    """
+    Draw one bar per character showing its weighted ranking score.
+
+    Bars appear in the row order of `weighted_df`; no sorting is done here.
+
+    Parameters
+    ----------
+    weighted_df : pl.DataFrame
+        Frame with a 'Character' column (bar labels) and a 'Weighted Score'
+        column (bar heights and in-bar text).
+    title : str, optional
+        Plot title.
+    x_label : str, optional
+        X-axis label.
+    y_label : str, optional
+        Y-axis label.
+    color : str, optional
+        Bar color.
+    height : int, optional
+        Plot height.
+    width : int, optional
+        Plot width.
+
+    Returns
+    -------
+    go.Figure
+        Plotly figure object.
+    """
+    characters = weighted_df['Character']
+    scores = weighted_df['Weighted Score']
+
+    # The score doubles as the text printed inside each bar
+    score_bars = go.Bar(
+        x=characters,
+        y=scores,
+        text=scores,
+        textposition='inside',
+        textfont=dict(size=11, color='white'),
+        marker_color=color,
+        hovertemplate='<b>%{x}</b><br>Score: %{y}<extra></extra>'
+    )
+
+    fig = go.Figure()
+    fig.add_trace(score_bars)
+
+    x_axis = dict(showgrid=True, gridcolor=ColorPalette.GRID, tickangle=-45)
+    y_axis = dict(showgrid=True, gridcolor=ColorPalette.GRID)
+
+    fig.update_layout(
+        title=title,
+        xaxis_title=x_label,
+        yaxis_title=y_label,
+        height=height,
+        width=width,
+        plot_bgcolor=ColorPalette.BACKGROUND,
+        xaxis=x_axis,
+        yaxis=y_axis,
+        font=dict(size=11)
+    )
+
+    return fig
--- a/theme.py
+++ b/theme.py
@@ -14,6 +14,7 @@ class ColorPalette:
    RANK_1 = "#004C6D"   # Dark Blue (1st Choice)
    RANK_2 = "#008493"   # Teal (2nd Choice)
    RANK_3 = "#5AAE95"   # Sea Green (3rd Choice)
+    RANK_4 = "#9E9E9E"   # Grey (4th Choice / Worst)

    # General UI elements
    TEXT = "black"
--- a/utils.py
+++ b/utils.py
@@ -55,6 +55,45 @@ def combine_exclusive_columns(df: pl.DataFrame, id_col: str = "_recordId", targe



+def calculate_weighted_ranking_scores(df: pl.DataFrame) -> pl.DataFrame:
+    """
+    Calculate weighted scores for character rankings.
+    Points system: 1st place = 3 pts, 2nd place = 2 pts, 3rd place = 1 pt.
+    Any other rank contributes no points.
+
+    Parameters
+    ----------
+    df : pl.DataFrame
+        DataFrame whose ranking columns contain 'Character_Ranking' in their name.
+
+    Returns
+    -------
+    pl.DataFrame
+        DataFrame with columns 'Character' and 'Weighted Score', sorted by score
+        (highest first). Empty, but with both columns present, when `df` has no
+        character ranking columns.
+    """
+    # Points awarded for each rank
+    points_by_rank = {1: 3, 2: 2, 3: 1}
+
+    # Identify columns related to Character Ranking
+    ranking_cols = [c for c in df.columns if 'Character_Ranking' in c]
+
+    scores = []
+    for col in ranking_cols:
+        # (Count of Rank 1 * 3) + (Count of Rank 2 * 2) + (Count of Rank 3 * 1)
+        weighted_score = sum(
+            df.filter(pl.col(col) == rank).height * points
+            for rank, points in points_by_rank.items()
+        )
+
+        # Clean name, e.g. "Character_Ranking_The_Coach" -> "The Coach"
+        clean_name = col.replace('Character_Ranking_', '').replace('_', ' ').strip()
+
+        scores.append({
+            'Character': clean_name,
+            'Weighted Score': weighted_score
+        })
+
+    # With no ranking columns there is no 'Weighted Score' column to sort on;
+    # return an empty frame that still has the documented columns.
+    if not scores:
+        return pl.DataFrame(schema={'Character': pl.Utf8, 'Weighted Score': pl.Int64})
+
+    return pl.DataFrame(scores).sort('Weighted Score', descending=True)
+
+
 class JPMCSurvey:
    """Class to handle JPMorgan Chase survey data."""
    
@@ -249,9 +288,19 @@ class JPMCSurvey:
        rename_dict = {
            'QID29': '18-8_Set-A',
            'QID101': '18-8_Set-B',
-            'QID36_0_GROUP': '8-3_Ranked'
+            'QID36_0_GROUP': '3_Ranked'
        }
-        return self._get_subset(q, QIDs, rename_cols=False).rename(rename_dict), None
+        
+        subset = self._get_subset(q, QIDs, rename_cols=False).rename(rename_dict)
+        
+        # Combine 18-8 Set A and Set B into single column
+        subset = subset.with_columns(
+            pl.coalesce(['18-8_Set-A', '18-8_Set-B']).alias('8_Combined')
+        )
+        # Change order of columns
+        subset = subset.select(['_recordId', '18-8_Set-A', '18-8_Set-B', '8_Combined', '3_Ranked'])
+        
+        return subset, None
    
    
    def get_voice_scale_1_10(self, q: pl.LazyFrame) -> Union[pl.LazyFrame, None]:
@@ -364,3 +413,8 @@ class JPMCSurvey:
        QIDs = ['QID44', 'QID97', 'QID95', 'QID96']
        
        return self._get_subset(q, QIDs, rename_cols=True), None
+    
+
+
+
+