diff --git a/02_quant_analysis.py b/02_quant_analysis.py
index 6ffa7f4..a7b6e23 100644
--- a/02_quant_analysis.py
+++ b/02_quant_analysis.py
@@ -12,7 +12,10 @@ def _():
from validation import check_progress, duration_validation
from utils import JPMCSurvey, combine_exclusive_columns, calculate_weighted_ranking_scores
- from plots import plot_average_scores_with_counts, plot_top3_ranking_distribution, plot_character_ranking_distribution, plot_most_ranked_1_character, plot_weighted_ranking_score
+ from plots import plot_average_scores_with_counts, plot_top3_ranking_distribution, plot_ranking_distribution, plot_most_ranked_1, plot_weighted_ranking_score, plot_voice_selection_counts, plot_top3_selection_counts
+
+ import plots as plts
+ import utils as utl
return (
JPMCSurvey,
Path,
@@ -20,27 +23,23 @@ def _():
check_progress,
duration_validation,
mo,
+ pl,
plot_average_scores_with_counts,
- plot_character_ranking_distribution,
- plot_most_ranked_1_character,
+ plot_most_ranked_1,
+ plot_ranking_distribution,
plot_top3_ranking_distribution,
+ plot_top3_selection_counts,
+ plot_voice_selection_counts,
plot_weighted_ranking_score,
+ plts,
+ utl,
)
-@app.cell(hide_code=True)
-def _(mo):
- mo.md(r"""
- # Load Data
- """)
- return
-
-
@app.cell
-def _(Path, mo):
+def _():
RESULTS_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase Brand Personality_Quant Round 1_January 21, 2026_Soft Launch_Labels.csv'
QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'
- mo.md(f"**Dataset:** `{Path(RESULTS_FILE).name}`")
return QSF_FILE, RESULTS_FILE
@@ -52,17 +51,30 @@ def _(JPMCSurvey, QSF_FILE, RESULTS_FILE):
return data_all, survey
-@app.cell(hide_code=True)
-def _(mo):
- mo.md(r"""
- ## Data Validation
+@app.cell
+def _(Path, RESULTS_FILE, data_all, mo):
+ mo.md(f"""
+ # Load Data
+
+ **Dataset:** `{Path(RESULTS_FILE).name}`
+
+ {mo.ui.table(data_all.collect())}
""")
return
-@app.cell
-def _(check_progress, data_all):
- check_progress(data_all)
+@app.cell(hide_code=True)
+def _(check_progress, data_all, duration_validation, mo):
+ mo.md(f"""
+ ## Data Validation
+
+ {check_progress(data_all)}
+
+
+
+ {duration_validation(data_all)}
+
+ """)
return
@@ -112,8 +124,6 @@ def _(mo):
def _(mo):
mo.md(r"""
## Character personality ranking
-
- ### 1. Which character personality is ranked best?
""")
return
@@ -126,15 +136,23 @@ def _(data, survey):
@app.cell
-def _(char_rank, plot_character_ranking_distribution):
- plot_character_ranking_distribution(char_rank, x_label='Character Personality', width=1000)
+def _(char_rank, mo, plot_top3_ranking_distribution):
+ mo.md(f"""
+ ### 1. Which character personality is ranked best?
+
+
+ {mo.ui.plotly(plot_top3_ranking_distribution(char_rank, x_label='Character Personality', width=1000))}
+ """)
return
@app.cell
-def _(mo):
- mo.md(r"""
- ### 2. Which character personality is ranked number 1 the most?
+def _(char_rank, mo, plot_most_ranked_1):
+ mo.md(f"""
+ ### 2. Which character personality is ranked 1st the most?
+
+
+ {mo.ui.plotly(plot_most_ranked_1(char_rank, title="Most Popular Character<br>(Number of Times Ranked 1st)", x_label='Character Personality', width=1000))}
""")
return
@@ -143,16 +161,18 @@ def _(mo):
def _(
calculate_weighted_ranking_scores,
char_rank,
+ mo,
plot_weighted_ranking_score,
):
char_rank_weighted = calculate_weighted_ranking_scores(char_rank)
- plot_weighted_ranking_score(char_rank_weighted, x_label='Voice', width=1000)
- return
+ # plot_weighted_ranking_score(char_rank_weighted, x_label='Voice', width=1000)
+
+ mo.md(f"""
+ ### 3. Which character personality is most popular based on weighted scores?
-@app.cell
-def _(char_rank, plot_most_ranked_1_character):
- plot_most_ranked_1_character(char_rank, x_label='Character Personality', width=1000)
+ {mo.ui.plotly(plot_weighted_ranking_score(char_rank_weighted, title="Most Popular Character - Weighted Popularity Score<br>(1st=3pts, 2nd=2pts, 3rd=1pt)", x_label='Voice', width=1000))}
+ """)
return
@@ -167,51 +187,74 @@ def _(mo):
@app.cell
def _(data, survey):
v_18_8_3 = survey.get_18_8_3(data)[0].collect()
- print(v_18_8_3.head())
- return
+ # print(v_18_8_3.head())
+ return (v_18_8_3,)
@app.cell(hide_code=True)
-def _(mo):
- mo.md(r"""
- Which 8 voices are chosen the most out of 18?
+def _(mo, plot_voice_selection_counts, v_18_8_3):
+ mo.md(f"""
+ ### Which 8 voices are chosen the most out of 18?
+
+ {mo.ui.plotly(plot_voice_selection_counts(v_18_8_3, height=500, width=1000))}
""")
return
@app.cell(hide_code=True)
-def _(mo):
- mo.md(r"""
- Which 3 voices are chosen the most out of 18? How many times does each voice end up in the top 3? ( this is based on the survey question where participants need to choose 3 out of the earlier selected 8 voices. So how often each of the 18 stimuli ended up in participants’ Top 3, after they first selected 8 out of 18.
+def _(mo, plot_top3_selection_counts, v_18_8_3):
+ mo.md(f"""
+ ### Which 3 voices are chosen the most out of 18?
+
+ How many times does each voice end up in the top 3? (This is based on the survey question where participants need to choose 3 out of the earlier selected 8 voices, i.e. how often each of the 18 stimuli ended up in participants’ Top 3 after they first selected 8 out of 18.)
+
+ {mo.ui.plotly(plot_top3_selection_counts(v_18_8_3, height=500, width=1000))}
""")
return
@app.cell(hide_code=True)
-def _(mo):
- mo.md(r"""
- Which voice is ranked best in the ranking question for top 3.? (so not best 3 out of 8 question)
- - E.g. 1 point for place 3. 2 points for place 2 and 3 points for place 1. The voice with most points is ranked best.
+def _(
+ calculate_weighted_ranking_scores,
+ data,
+ mo,
+ plot_ranking_distribution,
+ survey,
+):
+ top3_voices = survey.get_top_3_voices(data)[0].collect()
+ top3_voices_weighted = calculate_weighted_ranking_scores(top3_voices)
+
+ mo.md(f"""
+ ### Which voice is ranked best in the ranking question for top 3?
+
+ (not best 3 out of 8 question)
+
+ {mo.ui.plotly(plot_ranking_distribution(top3_voices, x_label='Voice', width=1000))}
+
+ """)
+ return top3_voices, top3_voices_weighted
+
+
+@app.cell
+def _(mo, plot_weighted_ranking_score, top3_voices_weighted):
+ mo.md(f"""
+ ### Most popular **voice** based on weighted scores?
+ - E.g. 1 point for place 3. 2 points for place 2 and 3 points for place 1. The voice with most points is ranked best.
+ Distribution of the rankings for each voice:
+
+ {mo.ui.plotly(plot_weighted_ranking_score(top3_voices_weighted, title="Most Popular Voice - Weighted Popularity Score<br>(1st = 3pts, 2nd = 2pts, 3rd = 1pt)", height=500, width=1000))}
""")
return
@app.cell
-def _(plot_top3_ranking_distribution, top3_voices):
- plot_top3_ranking_distribution(top3_voices, x_label='Voice', width=1000)
- return
+def _(mo, plot_most_ranked_1, top3_voices):
+ mo.md(f"""
+ ### Which voice is ranked number 1 the most?
+ (not always the voice with most points)
-@app.cell(hide_code=True)
-def _(mo):
- mo.md(r"""
- Which voice is ranked number 1 the most? (not always the voice with most points)
-
- - Each of the 350 participants gives exactly one 1st-place vote.
- - Total Rank-1 votes = 350.
- - Voices are sorted from most to least 1st-place votes.
- - The top 3 voices with the most Rank-1 votes are colored blue.
- - This can differ from the points-based winners (3–2–1 totals), because a voice may receive many 2nd/3rd places but fewer 1st places.
+ {mo.ui.plotly(plot_most_ranked_1(top3_voices, title="Most Popular Voice<br>(Number of Times Ranked 1st)", x_label='Voice', width=1000))}
""")
return
@@ -235,6 +278,56 @@ def _(mo):
return
+@app.cell
+def _(data, survey):
+ ss_or, choice_map_or = survey.get_ss_orange_red(data)
+ ss_gb, choice_map_gb = survey.get_ss_green_blue(data)
+
+ # Combine the data
+ ss_all = ss_or.join(ss_gb, on='_recordId')
+ _d = ss_all.collect()
+
+ choice_map = {**choice_map_or, **choice_map_gb}
+ # print(_d.head())
+ print(choice_map)
+ return choice_map, ss_all
+
+
+@app.cell
+def _(choice_map, ss_all, utl):
+ ss_long = utl.process_speaking_style_data(ss_all, choice_map)
+ ss_long
+ return (ss_long,)
+
+
+@app.cell
+def _(pl, ss_long):
+ target_trait = "Indifferent | Unfocussed | Detached:Attentive | Helpful | Caring | Deliberate"
+ trait_data = ss_long.filter(pl.col("Description") == target_trait)
+ trait_data
+ return target_trait, trait_data
+
+
+@app.cell
+def _(plts, target_trait, trait_data):
+ plts.plot_speaking_style_trait_scores(
+ trait_data,
+ title=target_trait.replace(":", " ↔ "),
+ # trait_description="Attentive vs Indifferent", # simplified title
+ )
+ return
+
+
+app._unparsable_cell(
+ """
+ for trait in ss_long.select(\"Description\").unique().to_series().to_list():
+ trait_data = ss_long.filter(pl.col(\"Description\") == trait)
+ mo.md(f\"\"\"
+ """,
+ name="_"
+)
+
+
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
diff --git a/plots.py b/plots.py
index ff80272..119de8c 100644
--- a/plots.py
+++ b/plots.py
@@ -216,22 +216,22 @@ def plot_top3_ranking_distribution(
return fig
-def plot_character_ranking_distribution(
+def plot_ranking_distribution(
df: pl.DataFrame,
- title: str = "Character Personality Rankings<br>Distribution of Votes (1st to 4th Place)",
- x_label: str = "Character Personality",
+ title: str = "Rankings Distribution<br>(1st to 4th Place)",
+ x_label: str = "Item",
y_label: str = "Number of Votes",
height: int = 500,
width: int = 1000,
) -> go.Figure:
"""
- Create a stacked bar chart showing the distribution of rankings (1st to 4th) for character personalities.
- Sorted by the number of Rank 1 votes to highlight the 'Best' options.
+ Create a stacked bar chart showing the distribution of rankings (1st to 4th) for characters or voices.
+ Sorted by the number of Rank 1 votes.
Parameters
----------
df : pl.DataFrame
- DataFrame containing character ranking columns (prefix 'Character_Ranking').
+ DataFrame containing ranking columns.
title : str, optional
Plot title.
x_label : str, optional
@@ -249,8 +249,8 @@ def plot_character_ranking_distribution(
Plotly figure object.
"""
stats = []
- # Identify columns related to Character Ranking (excluding ID)
- ranking_cols = [c for c in df.columns if 'Character_Ranking' in c]
+ # Identify ranking columns (assume all columns except _recordId)
+ ranking_cols = [c for c in df.columns if c != '_recordId']
for col in ranking_cols:
# Count occurrences of each rank (1, 2, 3, 4)
@@ -280,7 +280,7 @@ def plot_character_ranking_distribution(
# Clean up labels: Remove prefix and underscores
# e.g. "Character_Ranking_The_Coach" -> "The Coach"
labels = [
- col.replace('Character_Ranking_', '').replace('_', ' ').strip()
+ col.replace('Character_Ranking_', '').replace('Top_3_Voices_ranking__', '').replace('_', ' ').strip()
for col in stats_df['column']
]
@@ -354,21 +354,22 @@ def plot_character_ranking_distribution(
return fig
-def plot_most_ranked_1_character(
+def plot_most_ranked_1(
df: pl.DataFrame,
- title: str = "Most Popular Character Personality<br>(Number of Times Ranked 1st)",
- x_label: str = "Character Personality",
+ title: str = "Most Popular Choice<br>(Number of Times Ranked 1st)",
+ x_label: str = "Item",
y_label: str = "Count of 1st Place Rankings",
height: int = 500,
width: int = 1000,
) -> go.Figure:
"""
- Create a bar chart showing which character personality was ranked #1 the most.
+ Create a bar chart showing which item (character/voice) was ranked #1 the most.
+ Top 3 items are highlighted.
Parameters
----------
df : pl.DataFrame
- DataFrame containing character ranking columns.
+ DataFrame containing ranking columns.
title : str, optional
Plot title.
x_label : str, optional
@@ -386,8 +387,8 @@ def plot_most_ranked_1_character(
Plotly figure object.
"""
stats = []
- # Identify columns related to Character Ranking
- ranking_cols = [c for c in df.columns if 'Character_Ranking' in c]
+ # Identify ranking columns (assume all columns except _recordId)
+ ranking_cols = [c for c in df.columns if c != '_recordId']
for col in ranking_cols:
# Count occurrences of rank 1
@@ -403,19 +404,25 @@ def plot_most_ranked_1_character(
# Clean up labels
labels = [
- col.replace('Character_Ranking_', '').replace('_', ' ').strip()
+ col.replace('Character_Ranking_', '').replace('Top_3_Voices_ranking__', '').replace('_', ' ').strip()
for col in stats_df['column']
]
- fig = go.Figure()
+ # Assign colors: Top 3 get PRIMARY (Blue), others get NEUTRAL (Grey)
+ colors = [
+ ColorPalette.PRIMARY if i < 3 else ColorPalette.NEUTRAL
+ for i in range(len(stats_df))
+ ]
+ fig = go.Figure()
+
fig.add_trace(go.Bar(
x=labels,
y=stats_df['count'],
text=stats_df['count'],
textposition='inside',
textfont=dict(size=10, color='white'),
- marker_color=ColorPalette.PRIMARY,
+ marker_color=colors,
hovertemplate='%{x}<br>1st Place Votes: %{y}'
))
@@ -444,7 +451,7 @@ def plot_most_ranked_1_character(
def plot_weighted_ranking_score(
weighted_df: pl.DataFrame,
- title: str = "Character Popularity Score<br>(Weighted: 1st=3pts, 2nd=2pts, 3rd=1pt)",
+ title: str = "Weighted Popularity Score<br>(1st=3pts, 2nd=2pts, 3rd=1pt)",
x_label: str = "Character Personality",
y_label: str = "Total Weighted Score",
color: str = ColorPalette.PRIMARY,
@@ -508,4 +515,339 @@ def plot_weighted_ranking_score(
font=dict(size=11)
)
- return fig
\ No newline at end of file
+ return fig
+
+
+def plot_voice_selection_counts(
+ df: pl.DataFrame,
+ target_column: str = "8_Combined",
+ title: str = "Most Frequently Chosen Voices<br>(Top 8 Highlighted)",
+ x_label: str = "Voice",
+ y_label: str = "Number of Times Chosen",
+ height: int = 500,
+ width: int = 1000,
+) -> go.Figure:
+ """
+ Create a bar plot showing the frequency of voice selections.
+ Takes a column containing comma-separated values (e.g. "Voice 1, Voice 2..."),
+ counts occurrences, and highlights the top 8 most frequent voices.
+
+ Parameters
+ ----------
+ df : pl.DataFrame
+ DataFrame containing the selection column.
+ target_column : str, optional
+ Name of the column containing comma-separated voice selections.
+ Defaults to "8_Combined".
+ title : str, optional
+ Plot title.
+ x_label : str, optional
+ X-axis label.
+ y_label : str, optional
+ Y-axis label.
+ height : int, optional
+ Plot height in pixels.
+ width : int, optional
+ Plot width in pixels.
+
+ Returns
+ -------
+ go.Figure
+ Plotly figure object.
+ """
+ if target_column not in df.columns:
+ return go.Figure()
+
+ # Process the data:
+ # 1. Select the relevant column and remove nulls
+ # 2. Split the comma-separated string into a list
+ # 3. Explode the list so each voice gets its own row
+ # 4. Strip whitespace ensuring "Voice 1" and " Voice 1" match
+ # 5. Count occurrences
+ stats_df = (
+ df.select(pl.col(target_column))
+ .drop_nulls()
+ .with_columns(pl.col(target_column).str.split(","))
+ .explode(target_column)
+ .with_columns(pl.col(target_column).str.strip_chars())
+ .filter(pl.col(target_column) != "")
+ .group_by(target_column)
+ .agg(pl.len().alias("count"))
+ .sort("count", descending=True)
+ )
+
+ # Define colors: Top 8 get PRIMARY, rest get NEUTRAL
+ colors = [
+ ColorPalette.PRIMARY if i < 8 else ColorPalette.NEUTRAL
+ for i in range(len(stats_df))
+ ]
+
+ fig = go.Figure()
+
+ fig.add_trace(go.Bar(
+ x=stats_df[target_column],
+ y=stats_df['count'],
+ text=stats_df['count'],
+ textposition='outside',
+ marker_color=colors,
+ hovertemplate='%{x}<br>Selections: %{y}'
+ ))
+
+ fig.update_layout(
+ title=title,
+ xaxis_title=x_label,
+ yaxis_title=y_label,
+ height=height,
+ width=width,
+ plot_bgcolor=ColorPalette.BACKGROUND,
+ xaxis=dict(
+ showgrid=True,
+ gridcolor=ColorPalette.GRID,
+ tickangle=-45
+ ),
+ yaxis=dict(
+ showgrid=True,
+ gridcolor=ColorPalette.GRID
+ ),
+ font=dict(size=11),
+ )
+
+ return fig
+
+
+def plot_top3_selection_counts(
+ df: pl.DataFrame,
+ target_column: str = "3_Ranked",
+ title: str = "Most Frequently Chosen Top 3 Voices<br>(Top 3 Highlighted)",
+ x_label: str = "Voice",
+ y_label: str = "Count of Mentions in Top 3",
+ height: int = 500,
+ width: int = 1000,
+) -> go.Figure:
+ """
+ Question: Which 3 voices are chosen the most out of 18?
+
+ How many times does each voice end up in the top 3?
+ (this is based on the survey question where participants need to choose 3 out
+ of the earlier selected 8 voices). So how often each of the 18 stimuli ended
+ up in participants' Top 3, after they first selected 8 out of 18.
+
+ Parameters
+ ----------
+ df : pl.DataFrame
+ DataFrame containing the ranking column (comma-separated strings).
+ target_column : str, optional
+ Name of the column containing comma-separated Top 3 voice selections.
+ Defaults to "3_Ranked".
+ title : str, optional
+ Plot title.
+ x_label : str, optional
+ X-axis label.
+ y_label : str, optional
+ Y-axis label.
+ height : int, optional
+ Plot height in pixels.
+ width : int, optional
+ Plot width in pixels.
+
+ Returns
+ -------
+ go.Figure
+ Plotly figure object.
+ """
+ if target_column not in df.columns:
+ return go.Figure()
+
+ # Process the data:
+ # Same logic as plot_voice_selection_counts: explode comma-separated string
+ stats_df = (
+ df.select(pl.col(target_column))
+ .drop_nulls()
+ .with_columns(pl.col(target_column).str.split(","))
+ .explode(target_column)
+ .with_columns(pl.col(target_column).str.strip_chars())
+ .filter(pl.col(target_column) != "")
+ .group_by(target_column)
+ .agg(pl.len().alias("count"))
+ .sort("count", descending=True)
+ )
+
+ # Define colors: Top 3 get PRIMARY, rest get NEUTRAL
+ colors = [
+ ColorPalette.PRIMARY if i < 3 else ColorPalette.NEUTRAL
+ for i in range(len(stats_df))
+ ]
+
+ fig = go.Figure()
+
+ fig.add_trace(go.Bar(
+ x=stats_df[target_column],
+ y=stats_df['count'],
+ text=stats_df['count'],
+ textposition='outside',
+ marker_color=colors,
+ hovertemplate='%{x}<br>In Top 3: %{y} times'
+ ))
+
+ fig.update_layout(
+ title=title,
+ xaxis_title=x_label,
+ yaxis_title=y_label,
+ height=height,
+ width=width,
+ plot_bgcolor=ColorPalette.BACKGROUND,
+ xaxis=dict(
+ showgrid=True,
+ gridcolor=ColorPalette.GRID,
+ tickangle=-45
+ ),
+ yaxis=dict(
+ showgrid=True,
+ gridcolor=ColorPalette.GRID
+ ),
+ font=dict(size=11),
+ )
+
+ return fig
+
+
+def plot_speaking_style_trait_scores(
+ df: pl.DataFrame,
+ trait_description: str = None,
+ left_anchor: str = None,
+ right_anchor: str = None,
+ title: str = "Speaking Style Trait Analysis",
+ height: int = 500,
+ width: int = 1000,
+) -> go.Figure:
+ """
+ Plot scores for a single speaking style trait across multiple voices.
+
+ The plot shows the average score per Voice, sorted by score.
+ It expects the DataFrame to contain 'Voice' and 'score' columns,
+ typically filtered for a single trait/description.
+
+ Parameters
+ ----------
+ df : pl.DataFrame
+ DataFrame containing at least 'Voice' and 'score' columns.
+ Produced by utils.process_speaking_style_data and filtered.
+ trait_description : str, optional
+ Description of the trait being analyzed (e.g. "Indifferent : Attentive").
+ If not provided, it will be constructed from annotations.
+ left_anchor : str, optional
+ Label for the lower end of the scale (e.g. "Indifferent").
+ If not provided, attempts to read 'Left_Anchor' column from df.
+ right_anchor : str, optional
+ Label for the upper end of the scale (e.g. "Attentive").
+ If not provided, attempts to read 'Right_Anchor' column from df.
+ title : str, optional
+ Plot title.
+ height : int, optional
+ Plot height.
+ width : int, optional
+ Plot width.
+
+ Returns
+ -------
+ go.Figure
+ Plotly figure object.
+ """
+ if df.is_empty():
+ return go.Figure()
+
+ required_cols = ["Voice", "score"]
+ if not all(col in df.columns for col in required_cols):
+ return go.Figure()
+
+ # Calculate stats: Mean, Count
+ stats = (
+ df.filter(pl.col("score").is_not_null())
+ .group_by("Voice")
+ .agg([
+ pl.col("score").mean().alias("mean_score"),
+ pl.col("score").count().alias("count")
+ ])
+ .sort("mean_score", descending=True) # Descending for Left-to-Right
+ )
+
+ # Attempt to extract anchors from DF if not provided
+ if (left_anchor is None or right_anchor is None) and "Left_Anchor" in df.columns:
+ head = df.filter(pl.col("Left_Anchor").is_not_null()).head(1)
+ if not head.is_empty():
+ if left_anchor is None: left_anchor = head["Left_Anchor"][0]
+ if right_anchor is None: right_anchor = head["Right_Anchor"][0]
+
+ if trait_description is None:
+ if left_anchor and right_anchor:
+ trait_description = f"{left_anchor.split('|')[0]} vs. {right_anchor.split('|')[0]}"
+ else:
+ # Try getting from Description column
+ if "Description" in df.columns:
+ head = df.filter(pl.col("Description").is_not_null()).head(1)
+ if not head.is_empty():
+ trait_description = head["Description"][0]
+ else:
+ trait_description = ""
+ else:
+ trait_description = ""
+
+ fig = go.Figure()
+
+ fig.add_trace(go.Bar(
+ x=stats["Voice"], # X is Voice
+ y=stats["mean_score"], # Y is Score
+ text=stats["count"],
+ textposition='inside',
+ texttemplate='%{text}', # Count on bar
+ marker_color=ColorPalette.PRIMARY,
+ hovertemplate='%{x}<br>Average: %{y:.2f}<br>Count: %{text}'
+ ))
+
+ # Add annotations for anchors
+ annotations = []
+
+ # Place anchors on the right side
+ if left_anchor:
+ annotations.append(dict(
+ xref='paper', yref='y',
+ x=1.01, y=1,
+ xanchor='left', yanchor='middle',
+ text=f"1: {left_anchor.split('|')[0]}",
+ showarrow=False,
+ font=dict(size=10, color='gray')
+ ))
+ if right_anchor:
+ annotations.append(dict(
+ xref='paper', yref='y',
+ x=1.01, y=5,
+ xanchor='left', yanchor='middle',
+ text=f"5: {right_anchor.split('|')[0]}",
+ showarrow=False,
+ font=dict(size=10, color='gray')
+ ))
+
+ fig.update_layout(
+ title=dict(
+ text=f"{title}<br>{trait_description}<br>(Numbers on bars indicate respondent count)",
+ y=0.92
+ ),
+ xaxis_title="Voice",
+ yaxis_title="Average Score (1-5)",
+ height=height,
+ width=width,
+ plot_bgcolor=ColorPalette.BACKGROUND,
+ yaxis=dict(
+ range=[1, 5],
+ showgrid=True,
+ gridcolor=ColorPalette.GRID,
+ zeroline=False
+ ),
+ xaxis=dict(
+ showgrid=False
+ ),
+ margin=dict(r=150),
+ annotations=annotations,
+ font=dict(size=11)
+ )
+ return fig
diff --git a/theme.py b/theme.py
index 589292f..fac1325 100644
--- a/theme.py
+++ b/theme.py
@@ -16,6 +16,9 @@ class ColorPalette:
RANK_3 = "#5AAE95" # Sea Green (3rd Choice)
RANK_4 = "#9E9E9E" # Grey (4th Choice / Worst)
+ # Neutral color for unhighlighted comparison items
+ NEUTRAL = "#D3D3D3" # Light Grey
+
# General UI elements
TEXT = "black"
GRID = "lightgray"
diff --git a/utils.py b/utils.py
index c8435b5..b5fcd97 100644
--- a/utils.py
+++ b/utils.py
@@ -3,7 +3,6 @@ from pathlib import Path
import pandas as pd
from typing import Union
import json
-
import re
def extract_voice_label(html_str: str) -> str:
@@ -57,13 +56,13 @@ def combine_exclusive_columns(df: pl.DataFrame, id_col: str = "_recordId", targe
def calculate_weighted_ranking_scores(df: pl.DataFrame) -> pl.DataFrame:
"""
- Calculate weighted scores for character rankings.
+ Calculate weighted scores for character or voice rankings.
Points system: 1st place = 3 pts, 2nd place = 2 pts, 3rd place = 1 pt.
Parameters
----------
df : pl.DataFrame
- DataFrame containing character ranking columns.
+ DataFrame containing character/voice ranking columns.
Returns
-------
@@ -71,8 +70,8 @@ def calculate_weighted_ranking_scores(df: pl.DataFrame) -> pl.DataFrame:
DataFrame with columns 'Character' and 'Weighted Score', sorted by score.
"""
scores = []
- # Identify columns related to Character Ranking
- ranking_cols = [c for c in df.columns if 'Character_Ranking' in c]
+ # Identify ranking columns (assume all columns except _recordId)
+ ranking_cols = [c for c in df.columns if c != '_recordId']
for col in ranking_cols:
# Calculate score:
@@ -84,7 +83,7 @@ def calculate_weighted_ranking_scores(df: pl.DataFrame) -> pl.DataFrame:
weighted_score = (r1_count * 3) + (r2_count * 2) + (r3_count * 1)
# Clean name
- clean_name = col.replace('Character_Ranking_', '').replace('_', ' ').strip()
+ clean_name = col.replace('Character_Ranking_', '').replace('Top_3_Voices_ranking__', '').replace('_', ' ').strip()
scores.append({
'Character': clean_name,
@@ -413,6 +412,95 @@ class JPMCSurvey:
QIDs = ['QID44', 'QID97', 'QID95', 'QID96']
return self._get_subset(q, QIDs, rename_cols=True), None
+
+
+def process_speaking_style_data(
+ df: Union[pl.LazyFrame, pl.DataFrame],
+ trait_map: dict[str, str]
+) -> pl.DataFrame:
+ """
+ Process speaking style columns from wide to long format and map trait descriptions.
+
+ Parses columns with format: SS_{StyleGroup}__{Voice}__{ChoiceID}
+ Example: SS_Orange_Red__V14__Choice_1
+
+ Parameters
+ ----------
+ df : pl.LazyFrame or pl.DataFrame
+ Input dataframe containing SS_* columns.
+ trait_map : dict
+ Dictionary mapping column names to trait descriptions.
+ Keys should be full column names like "SS_Orange_Red__V14__Choice_1".
+
+ Returns
+ -------
+ pl.DataFrame
+ Long-format dataframe with columns:
+ _recordId, Style_Group, Voice, Choice_ID, score, Description, Left_Anchor, Right_Anchor
+ """
+ # Normalize input to LazyFrame
+ lf = df.lazy() if isinstance(df, pl.DataFrame) else df
+
+ # 1. Melt SS_ columns
+ melted = lf.melt(
+ id_vars=["_recordId"],
+ value_vars=pl.col("^SS_.*$"),
+ variable_name="full_col_name",
+ value_name="score"
+ )
+
+ # 2. Extract components from column name
+ # Regex captures: Style_Group (e.g. SS_Orange_Red), Voice (e.g. V14), Choice_ID (e.g. Choice_1)
+ pattern = r"^(?P<Style_Group>SS_.+?)__(?P<Voice>.+?)__(?P<Choice_ID>Choice_\d+)$"
+
+ processed = melted.with_columns(
+ pl.col("full_col_name").str.extract_groups(pattern)
+ ).unnest("full_col_name")
+
+ # 3. Create Mapping Lookup from the provided dictionary
+ # We map (Style_Group, Choice_ID) -> Description
+ mapping_data = []
+ seen = set()
+
+ for col_name, desc in trait_map.items():
+ match = re.match(pattern, col_name)
+ if match:
+ groups = match.groupdict()
+ key = (groups["Style_Group"], groups["Choice_ID"])
+
+ if key not in seen:
+ # Parse description into anchors if possible (Left : Right)
+ parts = desc.split(':')
+ left_anchor = parts[0].strip() if len(parts) > 0 else ""
+ right_anchor = parts[1].strip() if len(parts) > 1 else ""
+
+ mapping_data.append({
+ "Style_Group": groups["Style_Group"],
+ "Choice_ID": groups["Choice_ID"],
+ "Description": desc,
+ "Left_Anchor": left_anchor,
+ "Right_Anchor": right_anchor
+ })
+ seen.add(key)
+
+ if not mapping_data:
+ return processed.collect()
+
+ mapping_lf = pl.LazyFrame(mapping_data)
+
+ # 4. Join Data with Mapping
+ result = processed.join(
+ mapping_lf,
+ on=["Style_Group", "Choice_ID"],
+ how="left"
+ )
+
+ # 5. Cast score to Int
+ result = result.with_columns(
+ pl.col("score").cast(pl.Int64, strict=False)
+ )
+
+ return result.collect()
diff --git a/validation.py b/validation.py
index a332848..9828fff 100644
--- a/validation.py
+++ b/validation.py
@@ -5,9 +5,9 @@ import polars as pl
def check_progress(data):
"""Check if all responses are complete based on 'progress' column."""
if data.collect().select(pl.col('progress').unique()).shape[0] == 1:
- return mo.md("""### Responses Complete: \n\n✅ All responses are complete (progress = 100) """)
+ return """### Responses Complete: \n\n✅ All responses are complete (progress = 100) """
- return mo.md("### Responses Complete: \n\n⚠️ There are incomplete responses (progress < 100) ⚠️")
+ return "### Responses Complete: \n\n⚠️ There are incomplete responses (progress < 100) ⚠️"
def duration_validation(data):
@@ -30,10 +30,9 @@ def duration_validation(data):
outlier_data = _d.filter(pl.col('outlier_duration') == True).collect()
if outlier_data.shape[0] == 0:
- return mo.md("### Duration Outliers: \n\n✅ No duration outliers detected")
+ return "### Duration Outliers: \n\n✅ No duration outliers detected"
- return mo.md(f"""
- ### Duration Outliers:
+ return f"""### Duration Outliers:
**⚠️ Potential outliers detected based on response duration ⚠️**
@@ -50,5 +49,5 @@ def duration_validation(data):
**⚠️ NOTE: These have not been removed from the dataset ⚠️**
- """)
+ """
\ No newline at end of file