diff --git a/02_quant_analysis.py b/02_quant_analysis.py index 6ffa7f4..a7b6e23 100644 --- a/02_quant_analysis.py +++ b/02_quant_analysis.py @@ -12,7 +12,10 @@ def _(): from validation import check_progress, duration_validation from utils import JPMCSurvey, combine_exclusive_columns, calculate_weighted_ranking_scores - from plots import plot_average_scores_with_counts, plot_top3_ranking_distribution, plot_character_ranking_distribution, plot_most_ranked_1_character, plot_weighted_ranking_score + from plots import plot_average_scores_with_counts, plot_top3_ranking_distribution, plot_ranking_distribution, plot_most_ranked_1, plot_weighted_ranking_score, plot_voice_selection_counts, plot_top3_selection_counts + + import plots as plts + import utils as utl return ( JPMCSurvey, Path, @@ -20,27 +23,23 @@ def _(): check_progress, duration_validation, mo, + pl, plot_average_scores_with_counts, - plot_character_ranking_distribution, - plot_most_ranked_1_character, + plot_most_ranked_1, + plot_ranking_distribution, plot_top3_ranking_distribution, + plot_top3_selection_counts, + plot_voice_selection_counts, plot_weighted_ranking_score, + plts, + utl, ) -@app.cell(hide_code=True) -def _(mo): - mo.md(r""" - # Load Data - """) - return - - @app.cell -def _(Path, mo): +def _(): RESULTS_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase Brand Personality_Quant Round 1_January 21, 2026_Soft Launch_Labels.csv' QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf' - mo.md(f"**Dataset:** `{Path(RESULTS_FILE).name}`") return QSF_FILE, RESULTS_FILE @@ -52,17 +51,30 @@ def _(JPMCSurvey, QSF_FILE, RESULTS_FILE): return data_all, survey -@app.cell(hide_code=True) -def _(mo): - mo.md(r""" - ## Data Validation +@app.cell +def _(Path, RESULTS_FILE, data_all, mo): + mo.md(f""" + # Load Data + + **Dataset:** `{Path(RESULTS_FILE).name}` + + {mo.ui.table(data_all.collect())} """) return -@app.cell -def _(check_progress, 
data_all): - check_progress(data_all) +@app.cell(hide_code=True) +def _(check_progress, data_all, duration_validation, mo): + mo.md(f""" + ## Data Validation + + {check_progress(data_all)} + + + + {duration_validation(data_all)} + + """) return @@ -112,8 +124,6 @@ def _(mo): def _(mo): mo.md(r""" ## Character personality ranking - - ### 1. Which character personality is ranked best? """) return @@ -126,15 +136,23 @@ def _(data, survey): @app.cell -def _(char_rank, plot_character_ranking_distribution): - plot_character_ranking_distribution(char_rank, x_label='Character Personality', width=1000) +def _(char_rank, mo, plot_top3_ranking_distribution): + mo.md(f""" + ### 1. Which character personality is ranked best? + + + {mo.ui.plotly(plot_top3_ranking_distribution(char_rank, x_label='Character Personality', width=1000))} + """) return @app.cell -def _(mo): - mo.md(r""" - ### 2. Which character personality is ranked number 1 the most? +def _(char_rank, mo, plot_most_ranked_1): + mo.md(f""" + ### 2. Which character personality is ranked 1st the most? + + + {mo.ui.plotly(plot_most_ranked_1(char_rank, title="Most Popular Character
(Number of Times Ranked 1st)", x_label='Character Personality', width=1000))} """) return @@ -143,16 +161,18 @@ def _(mo): def _( calculate_weighted_ranking_scores, char_rank, + mo, plot_weighted_ranking_score, ): char_rank_weighted = calculate_weighted_ranking_scores(char_rank) - plot_weighted_ranking_score(char_rank_weighted, x_label='Voice', width=1000) - return + # plot_weighted_ranking_score(char_rank_weighted, x_label='Voice', width=1000) + + mo.md(f""" + ### 3. Which character personality most popular based on weighted scores? -@app.cell -def _(char_rank, plot_most_ranked_1_character): - plot_most_ranked_1_character(char_rank, x_label='Character Personality', width=1000) + {mo.ui.plotly(plot_weighted_ranking_score(char_rank_weighted, title="Most Popular Character - Weighted Popularity Score
(1st=3pts, 2nd=2pts, 3rd=1pt)", x_label='Voice', width=1000))} + """) return @@ -167,51 +187,74 @@ def _(mo): @app.cell def _(data, survey): v_18_8_3 = survey.get_18_8_3(data)[0].collect() - print(v_18_8_3.head()) - return + # print(v_18_8_3.head()) + return (v_18_8_3,) @app.cell(hide_code=True) -def _(mo): - mo.md(r""" - Which 8 voices are chosen the most out of 18? +def _(mo, plot_voice_selection_counts, v_18_8_3): + mo.md(f""" + ### Which 8 voices are chosen the most out of 18? + + {mo.ui.plotly(plot_voice_selection_counts(v_18_8_3, height=500, width=1000))} """) return @app.cell(hide_code=True) -def _(mo): - mo.md(r""" - Which 3 voices are chosen the most out of 18? How many times does each voice end up in the top 3? ( this is based on the survey question where participants need to choose 3 out of the earlier selected 8 voices. So how often each of the 18 stimuli ended up in participants’ Top 3, after they first selected 8 out of 18. +def _(mo, plot_top3_selection_counts, v_18_8_3): + mo.md(f""" + ### Which 3 voices are chosen the most out of 18? + + How many times does each voice end up in the top 3? ( this is based on the survey question where participants need to choose 3 out of the earlier selected 8 voices. So how often each of the 18 stimuli ended up in participants’ Top 3, after they first selected 8 out of 18. + + {mo.ui.plotly(plot_top3_selection_counts(v_18_8_3, height=500, width=1000))} """) return @app.cell(hide_code=True) -def _(mo): - mo.md(r""" - Which voice is ranked best in the ranking question for top 3.? (so not best 3 out of 8 question) - - E.g. 1 point for place 3. 2 points for place 2 and 3 points for place 1. The voice with most points is ranked best. 
+def _( + calculate_weighted_ranking_scores, + data, + mo, + plot_ranking_distribution, + survey, +): + top3_voices = survey.get_top_3_voices(data)[0].collect() + top3_voices_weighted = calculate_weighted_ranking_scores(top3_voices) + + mo.md(f""" + ### Which voice is ranked best in the ranking question for top 3? + + (not best 3 out of 8 question) + + {mo.ui.plotly(plot_ranking_distribution(top3_voices, x_label='Voice', width=1000))} + + """) + return top3_voices, top3_voices_weighted + + +@app.cell +def _(mo, plot_weighted_ranking_score, top3_voices_weighted): + mo.md(f""" + ### Most popular **voice** based on weighted scores? + - E.g. 1 point for place 3. 2 points for place 2 and 3 points for place 1. The voice with most points is ranked best. + Distribution of the rankings for each voice: + + {mo.ui.plotly(plot_weighted_ranking_score(top3_voices_weighted, title="Most Popular Voice - Weighted Popularity Score
(1st = 3pts, 2nd = 2pts, 3rd = 1pt)", height=500, width=1000))} """) return @app.cell -def _(plot_top3_ranking_distribution, top3_voices): - plot_top3_ranking_distribution(top3_voices, x_label='Voice', width=1000) - return +def _(mo, plot_most_ranked_1, top3_voices): + mo.md(f""" + ### Which voice is ranked number 1 the most? + (not always the voice with most points) -@app.cell(hide_code=True) -def _(mo): - mo.md(r""" - Which voice is ranked number 1 the most? (not always the voice with most points) - - - Each of the 350 participants gives exactly one 1st-place vote. - - Total Rank-1 votes = 350. - - Voices are sorted from most to least 1st-place votes. - - The top 3 voices with the most Rank-1 votes are colored blue. - - This can differ from the points-based winners (3–2–1 totals), because a voice may receive many 2nd/3rd places but fewer 1st places. + {mo.ui.plotly(plot_most_ranked_1(top3_voices, title="Most Popular Voice
(Number of Times Ranked 1st)", x_label='Voice', width=1000))} """) return @@ -235,6 +278,56 @@ def _(mo): return +@app.cell +def _(data, survey): + ss_or, choice_map_or = survey.get_ss_orange_red(data) + ss_gb, choice_map_gb = survey.get_ss_green_blue(data) + + # Combine the data + ss_all = ss_or.join(ss_gb, on='_recordId') + _d = ss_all.collect() + + choice_map = {**choice_map_or, **choice_map_gb} + # print(_d.head()) + print(choice_map) + return choice_map, ss_all + + +@app.cell +def _(choice_map, ss_all, utl): + ss_long = utl.process_speaking_style_data(ss_all, choice_map) + ss_long + return (ss_long,) + + +@app.cell +def _(pl, ss_long): + target_trait = "Indifferent | Unfocussed | Detached:Attentive | Helpful | Caring | Deliberate" + trait_data = ss_long.filter(pl.col("Description") == target_trait) + trait_data + return target_trait, trait_data + + +@app.cell +def _(plts, target_trait, trait_data): + plts.plot_speaking_style_trait_scores( + trait_data, + title=target_trait.replace(":", " ↔ "), + # trait_description="Attentive vs Indifferent", # simplified title + ) + return + + +app._unparsable_cell( + """ + for trait in ss_long.select(\"Description\").unique().to_series().to_list(): + trait_data = ss_long.filter(pl.col(\"Description\") == trait) + mo.md(f\"\"\" + """, + name="_" +) + + @app.cell(hide_code=True) def _(mo): mo.md(r""" diff --git a/plots.py b/plots.py index ff80272..119de8c 100644 --- a/plots.py +++ b/plots.py @@ -216,22 +216,22 @@ def plot_top3_ranking_distribution( return fig -def plot_character_ranking_distribution( +def plot_ranking_distribution( df: pl.DataFrame, - title: str = "Character Personality Rankings
Distribution of Votes (1st to 4th Place)", - x_label: str = "Character Personality", + title: str = "Rankings Distribution
(1st to 4th Place)", + x_label: str = "Item", y_label: str = "Number of Votes", height: int = 500, width: int = 1000, ) -> go.Figure: """ - Create a stacked bar chart showing the distribution of rankings (1st to 4th) for character personalities. - Sorted by the number of Rank 1 votes to highlight the 'Best' options. + Create a stacked bar chart showing the distribution of rankings (1st to 4th) for characters or voices. + Sorted by the number of Rank 1 votes. Parameters ---------- df : pl.DataFrame - DataFrame containing character ranking columns (prefix 'Character_Ranking'). + DataFrame containing ranking columns. title : str, optional Plot title. x_label : str, optional @@ -249,8 +249,8 @@ def plot_character_ranking_distribution( Plotly figure object. """ stats = [] - # Identify columns related to Character Ranking (excluding ID) - ranking_cols = [c for c in df.columns if 'Character_Ranking' in c] + # Identify ranking columns (assume all columns except _recordId) + ranking_cols = [c for c in df.columns if c != '_recordId'] for col in ranking_cols: # Count occurrences of each rank (1, 2, 3, 4) @@ -280,7 +280,7 @@ def plot_character_ranking_distribution( # Clean up labels: Remove prefix and underscores # e.g. "Character_Ranking_The_Coach" -> "The Coach" labels = [ - col.replace('Character_Ranking_', '').replace('_', ' ').strip() + col.replace('Character_Ranking_', '').replace('Top_3_Voices_ranking__', '').replace('_', ' ').strip() for col in stats_df['column'] ] @@ -354,21 +354,22 @@ def plot_character_ranking_distribution( return fig -def plot_most_ranked_1_character( +def plot_most_ranked_1( df: pl.DataFrame, - title: str = "Most Popular Character Personality
(Number of Times Ranked 1st)", - x_label: str = "Character Personality", + title: str = "Most Popular Choice
(Number of Times Ranked 1st)", + x_label: str = "Item", y_label: str = "Count of 1st Place Rankings", height: int = 500, width: int = 1000, ) -> go.Figure: """ - Create a bar chart showing which character personality was ranked #1 the most. + Create a bar chart showing which item (character/voice) was ranked #1 the most. + Top 3 items are highlighted. Parameters ---------- df : pl.DataFrame - DataFrame containing character ranking columns. + DataFrame containing ranking columns. title : str, optional Plot title. x_label : str, optional @@ -386,8 +387,8 @@ def plot_most_ranked_1_character( Plotly figure object. """ stats = [] - # Identify columns related to Character Ranking - ranking_cols = [c for c in df.columns if 'Character_Ranking' in c] + # Identify ranking columns (assume all columns except _recordId) + ranking_cols = [c for c in df.columns if c != '_recordId'] for col in ranking_cols: # Count occurrences of rank 1 @@ -403,19 +404,25 @@ def plot_most_ranked_1_character( # Clean up labels labels = [ - col.replace('Character_Ranking_', '').replace('_', ' ').strip() + col.replace('Character_Ranking_', '').replace('Top_3_Voices_ranking__', '').replace('_', ' ').strip() for col in stats_df['column'] ] - fig = go.Figure() + # Assign colors: Top 3 get PRIMARY (Blue), others get NEUTRAL (Grey) + colors = [ + ColorPalette.PRIMARY if i < 3 else ColorPalette.NEUTRAL + for i in range(len(stats_df)) + ] + fig = go.Figure() + fig.add_trace(go.Bar( x=labels, y=stats_df['count'], text=stats_df['count'], textposition='inside', textfont=dict(size=10, color='white'), - marker_color=ColorPalette.PRIMARY, + marker_color=colors, hovertemplate='%{x}
1st Place Votes: %{y}' )) @@ -444,7 +451,7 @@ def plot_most_ranked_1_character( def plot_weighted_ranking_score( weighted_df: pl.DataFrame, - title: str = "Character Popularity Score
(Weighted: 1st=3pts, 2nd=2pts, 3rd=1pt)", + title: str = "Weighted Popularity Score
def plot_voice_selection_counts(
    df: pl.DataFrame,
    target_column: str = "8_Combined",
    title: str = "Most Frequently Chosen Voices<br>(Top 8 Highlighted)",
    x_label: str = "Voice",
    y_label: str = "Number of Times Chosen",
    height: int = 500,
    width: int = 1000,
    top_n: int = 8,
) -> go.Figure:
    """
    Create a bar plot showing how often each voice was selected.

    The target column holds comma-separated selections (e.g. "Voice 1, Voice 2").
    Each selection is counted once per occurrence; the `top_n` most frequent
    voices are drawn in ColorPalette.PRIMARY, the rest in ColorPalette.NEUTRAL.

    Parameters
    ----------
    df : pl.DataFrame
        DataFrame containing the selection column.
    target_column : str, optional
        Name of the column containing comma-separated voice selections.
        Defaults to "8_Combined".
    title : str, optional
        Plot title. "<br>" is Plotly's line-break markup.
    x_label : str, optional
        X-axis label.
    y_label : str, optional
        Y-axis label.
    height : int, optional
        Plot height in pixels.
    width : int, optional
        Plot width in pixels.
    top_n : int, optional
        Number of leading bars to highlight. Defaults to 8.

    Returns
    -------
    go.Figure
        Plotly figure object. An empty figure is returned when
        `target_column` is not present in `df`.
    """
    if target_column not in df.columns:
        # Best-effort behaviour: a missing column yields an empty figure
        return go.Figure()

    selection = pl.col(target_column)

    # 1. Keep only the selection column and drop null rows
    # 2. Split each comma-separated string into a list and explode it
    # 3. Strip whitespace so "Voice 1" and " Voice 1" are counted together
    # 4. Drop empty fragments (e.g. from trailing commas) and count
    stats_df = (
        df.select(selection)
        .drop_nulls()
        .with_columns(selection.str.split(","))
        .explode(target_column)
        .with_columns(selection.str.strip_chars())
        .filter(selection != "")
        .group_by(target_column)
        .agg(pl.len().alias("count"))
        # group_by does not guarantee row order; the secondary key makes the
        # bar order (and therefore the highlighted set) reproducible on ties.
        .sort(["count", target_column], descending=[True, False])
    )

    # Rows are already sorted, so the first `top_n` positions are the leaders
    colors = [
        ColorPalette.PRIMARY if position < top_n else ColorPalette.NEUTRAL
        for position in range(stats_df.height)
    ]
    counts = stats_df["count"].to_list()

    fig = go.Figure()

    fig.add_trace(go.Bar(
        x=stats_df[target_column].to_list(),
        y=counts,
        text=counts,
        textposition='outside',
        marker_color=colors,
        # <extra></extra> hides the secondary trace-name box in the hover label
        hovertemplate='%{x}<br>Selections: %{y}<extra></extra>'
    ))

    fig.update_layout(
        title=title,
        xaxis_title=x_label,
        yaxis_title=y_label,
        height=height,
        width=width,
        plot_bgcolor=ColorPalette.BACKGROUND,
        xaxis=dict(
            showgrid=True,
            gridcolor=ColorPalette.GRID,
            tickangle=-45
        ),
        yaxis=dict(
            showgrid=True,
            gridcolor=ColorPalette.GRID
        ),
        font=dict(size=11),
    )

    return fig
def plot_top3_selection_counts(
    df: pl.DataFrame,
    target_column: str = "3_Ranked",
    title: str = "Most Frequently Chosen Top 3 Voices<br>(Top 3 Highlighted)",
    x_label: str = "Voice",
    y_label: str = "Count of Mentions in Top 3",
    height: int = 500,
    width: int = 1000,
    top_n: int = 3,
) -> go.Figure:
    """
    Create a bar plot showing how often each voice ends up in a Top 3.

    Question: Which 3 voices are chosen the most out of 18?

    This is based on the survey question where participants choose 3 out of
    the 8 voices they selected earlier, i.e. how often each of the 18 stimuli
    ended up in participants' Top 3 after they first selected 8 out of 18.

    Parameters
    ----------
    df : pl.DataFrame
        DataFrame containing the ranking column (comma-separated strings).
    target_column : str, optional
        Name of the column containing comma-separated Top 3 voice selections.
        Defaults to "3_Ranked".
    title : str, optional
        Plot title. "<br>" is Plotly's line-break markup.
    x_label : str, optional
        X-axis label.
    y_label : str, optional
        Y-axis label.
    height : int, optional
        Plot height in pixels.
    width : int, optional
        Plot width in pixels.
    top_n : int, optional
        Number of leading bars to highlight. Defaults to 3.

    Returns
    -------
    go.Figure
        Plotly figure object. An empty figure is returned when
        `target_column` is not present in `df`.
    """
    if target_column not in df.columns:
        # Best-effort behaviour: a missing column yields an empty figure
        return go.Figure()

    selection = pl.col(target_column)

    # Explode the comma-separated strings into one row per voice, normalise
    # whitespace, drop empty fragments, then count mentions per voice.
    stats_df = (
        df.select(selection)
        .drop_nulls()
        .with_columns(selection.str.split(","))
        .explode(target_column)
        .with_columns(selection.str.strip_chars())
        .filter(selection != "")
        .group_by(target_column)
        .agg(pl.len().alias("count"))
        # group_by does not guarantee row order; the secondary key makes the
        # bar order (and therefore the highlighted set) reproducible on ties.
        .sort(["count", target_column], descending=[True, False])
    )

    # Rows are already sorted, so the first `top_n` positions are the leaders
    colors = [
        ColorPalette.PRIMARY if position < top_n else ColorPalette.NEUTRAL
        for position in range(stats_df.height)
    ]
    counts = stats_df["count"].to_list()

    fig = go.Figure()

    fig.add_trace(go.Bar(
        x=stats_df[target_column].to_list(),
        y=counts,
        text=counts,
        textposition='outside',
        marker_color=colors,
        # <extra></extra> hides the secondary trace-name box in the hover label
        hovertemplate='%{x}<br>In Top 3: %{y} times<extra></extra>'
    ))

    fig.update_layout(
        title=title,
        xaxis_title=x_label,
        yaxis_title=y_label,
        height=height,
        width=width,
        plot_bgcolor=ColorPalette.BACKGROUND,
        xaxis=dict(
            showgrid=True,
            gridcolor=ColorPalette.GRID,
            tickangle=-45
        ),
        yaxis=dict(
            showgrid=True,
            gridcolor=ColorPalette.GRID
        ),
        font=dict(size=11),
    )

    return fig
def _first_non_null(df: pl.DataFrame, column: str):
    """Return the first non-null value of `column`, or None if absent/all null."""
    if column not in df.columns:
        return None
    values = df.get_column(column).drop_nulls()
    return values[0] if len(values) > 0 else None


def _anchor_label(anchor: str) -> str:
    """Return the first '|'-separated term of an anchor, without padding."""
    return anchor.split('|')[0].strip()


def plot_speaking_style_trait_scores(
    df: pl.DataFrame,
    trait_description: str = None,
    left_anchor: str = None,
    right_anchor: str = None,
    title: str = "Speaking Style Trait Analysis",
    height: int = 500,
    width: int = 1000,
) -> go.Figure:
    """
    Plot scores for a single speaking style trait across multiple voices.

    The plot shows the average score per Voice, sorted from highest to lowest.
    It expects the DataFrame to contain 'Voice' and 'score' columns,
    typically filtered for a single trait/description.

    Parameters
    ----------
    df : pl.DataFrame
        DataFrame containing at least 'Voice' and 'score' columns.
        Produced by utils.process_speaking_style_data and filtered.
    trait_description : str, optional
        Description of the trait being analyzed (e.g. "Indifferent : Attentive").
        If not provided, it is built from the two anchors when both are known,
        otherwise taken from the first non-null 'Description' value, otherwise
        left empty.
    left_anchor : str, optional
        Label for the lower end of the scale (e.g. "Indifferent").
        If not provided, attempts to read 'Left_Anchor' column from df.
    right_anchor : str, optional
        Label for the upper end of the scale (e.g. "Attentive").
        If not provided, attempts to read 'Right_Anchor' column from df.
    title : str, optional
        Plot title.
    height : int, optional
        Plot height.
    width : int, optional
        Plot width.

    Returns
    -------
    go.Figure
        Plotly figure object. An empty figure is returned when `df` is empty
        or lacks the 'Voice' / 'score' columns.
    """
    if df.is_empty():
        return go.Figure()

    if not {"Voice", "score"}.issubset(df.columns):
        return go.Figure()

    # Mean score and number of non-null answers per voice
    stats = (
        df.filter(pl.col("score").is_not_null())
        .group_by("Voice")
        .agg([
            pl.col("score").mean().alias("mean_score"),
            pl.col("score").count().alias("count")
        ])
        # Highest mean first; the Voice tie-breaker keeps the bar order
        # reproducible because group_by does not guarantee row order.
        .sort(["mean_score", "Voice"], descending=[True, False])
    )

    # Resolve each anchor independently so that a frame carrying only one of
    # the two anchor columns does not raise on the missing one.
    if left_anchor is None:
        left_anchor = _first_non_null(df, "Left_Anchor")
    if right_anchor is None:
        right_anchor = _first_non_null(df, "Right_Anchor")

    if trait_description is None:
        if left_anchor and right_anchor:
            trait_description = (
                f"{_anchor_label(left_anchor)} vs. {_anchor_label(right_anchor)}"
            )
        else:
            trait_description = _first_non_null(df, "Description") or ""

    fig = go.Figure()

    fig.add_trace(go.Bar(
        x=stats["Voice"].to_list(),        # X is Voice
        y=stats["mean_score"].to_list(),   # Y is average score
        text=stats["count"].to_list(),
        textposition='inside',
        texttemplate='%{text}',  # Respondent count on bar
        marker_color=ColorPalette.PRIMARY,
        # <extra></extra> hides the secondary trace-name box in the hover label
        hovertemplate='%{x}<br>Average: %{y:.2f}<br>Count: %{text}<extra></extra>'
    ))

    # Scale-end labels, placed just right of the plot area at y=1 and y=5
    annotations = []
    for scale_value, anchor in ((1, left_anchor), (5, right_anchor)):
        if not anchor:
            continue
        annotations.append(dict(
            xref='paper', yref='y',
            x=1.01, y=scale_value,
            xanchor='left', yanchor='middle',
            text=f"{scale_value}: {_anchor_label(anchor)}",
            showarrow=False,
            font=dict(size=10, color='gray')
        ))

    fig.update_layout(
        title=dict(
            text=(
                f"{title}<br>{trait_description}"
                "<br>(Numbers on bars indicate respondent count)"
            ),
            y=0.92
        ),
        xaxis_title="Voice",
        yaxis_title="Average Score (1-5)",
        height=height,
        width=width,
        plot_bgcolor=ColorPalette.BACKGROUND,
        yaxis=dict(
            range=[1, 5],
            showgrid=True,
            gridcolor=ColorPalette.GRID,
            zeroline=False
        ),
        xaxis=dict(
            showgrid=False
        ),
        margin=dict(r=150),  # room for the anchor labels right of the axes
        annotations=annotations,
        font=dict(size=11)
    )
    return fig
def _trait_lookup_rows(trait_map: dict[str, str], pattern: str) -> list[dict]:
    """Build one lookup row per (Style_Group, Choice_ID) found in trait_map keys."""
    rows: dict[tuple[str, str], dict] = {}

    for col_name, desc in trait_map.items():
        match = re.match(pattern, col_name)
        if match is None:
            continue

        key = (match["Style_Group"], match["Choice_ID"])
        if key in rows:
            # The first description seen for a (group, choice) pair wins
            continue

        # Descriptions look like "Left : Right"; without a colon the right
        # anchor stays empty.
        parts = desc.split(':')
        rows[key] = {
            "Style_Group": key[0],
            "Choice_ID": key[1],
            "Description": desc,
            "Left_Anchor": parts[0].strip(),
            "Right_Anchor": parts[1].strip() if len(parts) > 1 else "",
        }

    return list(rows.values())


def process_speaking_style_data(
    df: Union[pl.LazyFrame, pl.DataFrame],
    trait_map: dict[str, str]
) -> pl.DataFrame:
    """
    Process speaking style columns from wide to long format and map trait descriptions.

    Parses columns with format: SS_{StyleGroup}__{Voice}__{ChoiceID}
    Example: SS_Orange_Red__V14__Choice_1

    Parameters
    ----------
    df : pl.LazyFrame or pl.DataFrame
        Input dataframe containing a '_recordId' column and SS_* columns.
    trait_map : dict
        Dictionary mapping column names to trait descriptions.
        Keys should be full column names like "SS_Orange_Red__V14__Choice_1".
        Keys that do not match that format are ignored.

    Returns
    -------
    pl.DataFrame
        Long-format dataframe with columns:
        _recordId, Style_Group, Voice, Choice_ID, score
        plus Description, Left_Anchor, Right_Anchor when at least one key of
        `trait_map` matches the column format. 'score' is cast to Int64 on
        every return path; values that cannot be cast become null.
    """
    # Normalize input to LazyFrame
    lf = df.lazy() if isinstance(df, pl.DataFrame) else df

    # Named groups: Style_Group (e.g. SS_Orange_Red), Voice (e.g. V14),
    # Choice_ID (e.g. Choice_1). The same pattern is used by polars and by `re`.
    pattern = r"^(?P<Style_Group>SS_.+?)__(?P<Voice>.+?)__(?P<Choice_ID>Choice_\d+)$"

    # 1. Melt SS_ columns into (full_col_name, score) pairs
    # NOTE(review): newer polars releases rename `melt` to `unpivot` - confirm
    # against the pinned polars version before switching.
    melted = lf.melt(
        id_vars=["_recordId"],
        value_vars=pl.col("^SS_.*$"),
        variable_name="full_col_name",
        value_name="score"
    )

    # 2. Split the column name into its components and cast the score.
    #    Casting here (not after the join) keeps the dtype identical whether
    #    or not a trait mapping is available.
    processed = melted.with_columns(
        pl.col("full_col_name").str.extract_groups(pattern),
        pl.col("score").cast(pl.Int64, strict=False),
    ).unnest("full_col_name")

    # 3. Create the (Style_Group, Choice_ID) -> Description lookup
    mapping_data = _trait_lookup_rows(trait_map, pattern)
    if not mapping_data:
        return processed.collect()

    # 4. Join data with the lookup; unmatched rows keep null descriptions
    result = processed.join(
        pl.LazyFrame(mapping_data),
        on=["Style_Group", "Choice_ID"],
        how="left"
    )

    return result.collect()
def check_progress(data):
    """
    Check if all responses are complete based on the 'progress' column.

    A response counts as complete when its progress equals 100. Null or
    non-numeric progress values count as incomplete, and a dataset without
    any rows is reported as incomplete.

    Parameters
    ----------
    data : pl.LazyFrame or pl.DataFrame
        Survey responses containing a 'progress' column.

    Returns
    -------
    str
        Markdown snippet stating whether all responses are complete.
    """
    is_complete = (
        pl.col('progress').cast(pl.Float64, strict=False) == 100
    ).fill_null(False)

    # Compare against 100 explicitly: a single shared progress value
    # (e.g. every response at 50) must not be reported as complete.
    total, complete = (
        data.lazy()
        .select(
            pl.len().alias('total'),
            is_complete.sum().alias('complete'),
        )
        .collect()
        .row(0)
    )

    if total > 0 and complete == total:
        return """### Responses Complete: \n\n✅ All responses are complete (progress = 100) """

    return "### Responses Complete: \n\n⚠️ There are incomplete responses (progress < 100) ⚠️"