speaking style trait scores vertical

This commit is contained in:
2026-01-23 12:26:47 +01:00
parent 424355f4a1
commit 84a0f8052e
5 changed files with 615 additions and 90 deletions

View File

@@ -12,7 +12,10 @@ def _():
from validation import check_progress, duration_validation
from utils import JPMCSurvey, combine_exclusive_columns, calculate_weighted_ranking_scores
from plots import plot_average_scores_with_counts, plot_top3_ranking_distribution, plot_character_ranking_distribution, plot_most_ranked_1_character, plot_weighted_ranking_score
from plots import plot_average_scores_with_counts, plot_top3_ranking_distribution, plot_ranking_distribution, plot_most_ranked_1, plot_weighted_ranking_score, plot_voice_selection_counts, plot_top3_selection_counts
import plots as plts
import utils as utl
return (
JPMCSurvey,
Path,
@@ -20,27 +23,23 @@ def _():
check_progress,
duration_validation,
mo,
pl,
plot_average_scores_with_counts,
plot_character_ranking_distribution,
plot_most_ranked_1_character,
plot_most_ranked_1,
plot_ranking_distribution,
plot_top3_ranking_distribution,
plot_top3_selection_counts,
plot_voice_selection_counts,
plot_weighted_ranking_score,
plts,
utl,
)
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
# Load Data
""")
return
@app.cell
def _(Path, mo):
def _():
RESULTS_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase Brand Personality_Quant Round 1_January 21, 2026_Soft Launch_Labels.csv'
QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'
mo.md(f"**Dataset:** `{Path(RESULTS_FILE).name}`")
return QSF_FILE, RESULTS_FILE
@@ -52,17 +51,30 @@ def _(JPMCSurvey, QSF_FILE, RESULTS_FILE):
return data_all, survey
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
## Data Validation
@app.cell
def _(Path, RESULTS_FILE, data_all, mo):
mo.md(f"""
# Load Data
**Dataset:** `{Path(RESULTS_FILE).name}`
{mo.ui.table(data_all.collect())}
""")
return
@app.cell
def _(check_progress, data_all):
check_progress(data_all)
@app.cell(hide_code=True)
def _(check_progress, data_all, duration_validation, mo):
mo.md(f"""
## Data Validation
{check_progress(data_all)}
{duration_validation(data_all)}
""")
return
@@ -112,8 +124,6 @@ def _(mo):
def _(mo):
mo.md(r"""
## Character personality ranking
### 1. Which character personality is ranked best?
""")
return
@@ -126,15 +136,23 @@ def _(data, survey):
@app.cell
def _(char_rank, plot_character_ranking_distribution):
plot_character_ranking_distribution(char_rank, x_label='Character Personality', width=1000)
def _(char_rank, mo, plot_top3_ranking_distribution):
mo.md(f"""
### 1. Which character personality is ranked best?
{mo.ui.plotly(plot_top3_ranking_distribution(char_rank, x_label='Character Personality', width=1000))}
""")
return
@app.cell
def _(mo):
mo.md(r"""
### 2. Which character personality is ranked number 1 the most?
def _(char_rank, mo, plot_most_ranked_1):
mo.md(f"""
### 2. Which character personality is ranked 1st the most?
{mo.ui.plotly(plot_most_ranked_1(char_rank, title="Most Popular Character<br>(Number of Times Ranked 1st)", x_label='Character Personality', width=1000))}
""")
return
@@ -143,16 +161,18 @@ def _(mo):
def _(
calculate_weighted_ranking_scores,
char_rank,
mo,
plot_weighted_ranking_score,
):
char_rank_weighted = calculate_weighted_ranking_scores(char_rank)
plot_weighted_ranking_score(char_rank_weighted, x_label='Voice', width=1000)
return
# plot_weighted_ranking_score(char_rank_weighted, x_label='Voice', width=1000)
mo.md(f"""
### 3. Which character personality most popular based on weighted scores?
@app.cell
def _(char_rank, plot_most_ranked_1_character):
plot_most_ranked_1_character(char_rank, x_label='Character Personality', width=1000)
{mo.ui.plotly(plot_weighted_ranking_score(char_rank_weighted, title="Most Popular Character - Weighted Popularity Score<br>(1st=3pts, 2nd=2pts, 3rd=1pt)", x_label='Voice', width=1000))}
""")
return
@@ -167,51 +187,74 @@ def _(mo):
@app.cell
def _(data, survey):
v_18_8_3 = survey.get_18_8_3(data)[0].collect()
print(v_18_8_3.head())
return
# print(v_18_8_3.head())
return (v_18_8_3,)
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
Which 8 voices are chosen the most out of 18?
def _(mo, plot_voice_selection_counts, v_18_8_3):
mo.md(f"""
### Which 8 voices are chosen the most out of 18?
{mo.ui.plotly(plot_voice_selection_counts(v_18_8_3, height=500, width=1000))}
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
Which 3 voices are chosen the most out of 18? How many times does each voice end up in the top 3? ( this is based on the survey question where participants need to choose 3 out of the earlier selected 8 voices. So how often each of the 18 stimuli ended up in participants Top 3, after they first selected 8 out of 18.
def _(mo, plot_top3_selection_counts, v_18_8_3):
mo.md(f"""
### Which 3 voices are chosen the most out of 18?
How many times does each voice end up in the top 3? ( this is based on the survey question where participants need to choose 3 out of the earlier selected 8 voices. So how often each of the 18 stimuli ended up in participants Top 3, after they first selected 8 out of 18.
{mo.ui.plotly(plot_top3_selection_counts(v_18_8_3, height=500, width=1000))}
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
Which voice is ranked best in the ranking question for top 3.? (so not best 3 out of 8 question)
- E.g. 1 point for place 3. 2 points for place 2 and 3 points for place 1. The voice with most points is ranked best.
def _(
calculate_weighted_ranking_scores,
data,
mo,
plot_ranking_distribution,
survey,
):
top3_voices = survey.get_top_3_voices(data)[0].collect()
top3_voices_weighted = calculate_weighted_ranking_scores(top3_voices)
mo.md(f"""
### Which voice is ranked best in the ranking question for top 3?
(not best 3 out of 8 question)
{mo.ui.plotly(plot_ranking_distribution(top3_voices, x_label='Voice', width=1000))}
""")
return top3_voices, top3_voices_weighted
@app.cell
def _(mo, plot_weighted_ranking_score, top3_voices_weighted):
mo.md(f"""
### Most popular **voice** based on weighted scores?
- E.g. 1 point for place 3. 2 points for place 2 and 3 points for place 1. The voice with most points is ranked best.
Distribution of the rankings for each voice:
{mo.ui.plotly(plot_weighted_ranking_score(top3_voices_weighted, title="Most Popular Voice - Weighted Popularity Score<br>(1st = 3pts, 2nd = 2pts, 3rd = 1pt)", height=500, width=1000))}
""")
return
@app.cell
def _(plot_top3_ranking_distribution, top3_voices):
plot_top3_ranking_distribution(top3_voices, x_label='Voice', width=1000)
return
def _(mo, plot_most_ranked_1, top3_voices):
mo.md(f"""
### Which voice is ranked number 1 the most?
(not always the voice with most points)
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
Which voice is ranked number 1 the most? (not always the voice with most points)
- Each of the 350 participants gives exactly one 1st-place vote.
- Total Rank-1 votes = 350.
- Voices are sorted from most to least 1st-place votes.
- The top 3 voices with the most Rank-1 votes are colored blue.
- This can differ from the points-based winners (321 totals), because a voice may receive many 2nd/3rd places but fewer 1st places.
{mo.ui.plotly(plot_most_ranked_1(top3_voices, title="Most Popular Voice<br>(Number of Times Ranked 1st)", x_label='Voice', width=1000))}
""")
return
@@ -235,6 +278,56 @@ def _(mo):
return
@app.cell
def _(data, survey):
    # Load the two speaking-style question groups. Each getter is unpacked as
    # (frame, choice_map); the frames are presumably lazy (verify in JPMCSurvey).
    ss_or, choice_map_or = survey.get_ss_orange_red(data)
    ss_gb, choice_map_gb = survey.get_ss_green_blue(data)

    # Combine both groups into one frame keyed on the respondent id. The frame
    # is deliberately not collected here: nothing in this cell needs the
    # materialised data, and downstream cells do their own processing.
    ss_all = ss_or.join(ss_gb, on='_recordId')

    # Merge the per-group choice maps; on duplicate keys the green/blue entry wins.
    choice_map = {**choice_map_or, **choice_map_gb}
    print(choice_map)
    return choice_map, ss_all
@app.cell
def _(choice_map, ss_all, utl):
    # Reshape the combined speaking-style frame with
    # utl.process_speaking_style_data, passing choice_map as the trait mapping.
    ss_long = utl.process_speaking_style_data(ss_all, choice_map)
    # Bare expression, presumably left as the cell's displayed output.
    ss_long
    return (ss_long,)
@app.cell
def _(pl, ss_long):
    # Full description string of the single trait inspected in this cell.
    target_trait = "Indifferent | Unfocussed | Detached:Attentive | Helpful | Caring | Deliberate"
    # Keep only the rows whose "Description" equals that string exactly.
    trait_data = ss_long.filter(pl.col("Description") == target_trait)
    # Bare expression, presumably left as the cell's displayed output.
    trait_data
    return target_trait, trait_data
@app.cell
def _(plts, target_trait, trait_data):
    # Plot the per-voice scores for the selected trait. The plot title is the
    # trait description with every ':' removed.
    plts.plot_speaking_style_trait_scores(
        trait_data,
        title=target_trait.replace(":", ""),
        # trait_description="Attentive vs Indifferent", # simplified title
    )
    return
# NOTE(review): this cell is stored as raw text rather than as a function. The
# source below is incomplete -- the f-string opened by `mo.md(f"""` is never
# closed -- so the per-trait loop is unfinished work and presumably does not run.
app._unparsable_cell(
    """
for trait in ss_long.select(\"Description\").unique().to_series().to_list():
trait_data = ss_long.filter(pl.col(\"Description\") == trait)
mo.md(f\"\"\"
""",
    name="_"
)
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""

384
plots.py
View File

@@ -216,22 +216,22 @@ def plot_top3_ranking_distribution(
return fig
def plot_character_ranking_distribution(
def plot_ranking_distribution(
df: pl.DataFrame,
title: str = "Character Personality Rankings<br>Distribution of Votes (1st to 4th Place)",
x_label: str = "Character Personality",
title: str = "Rankings Distribution<br>(1st to 4th Place)",
x_label: str = "Item",
y_label: str = "Number of Votes",
height: int = 500,
width: int = 1000,
) -> go.Figure:
"""
Create a stacked bar chart showing the distribution of rankings (1st to 4th) for character personalities.
Sorted by the number of Rank 1 votes to highlight the 'Best' options.
Create a stacked bar chart showing the distribution of rankings (1st to 4th) for characters or voices.
Sorted by the number of Rank 1 votes.
Parameters
----------
df : pl.DataFrame
DataFrame containing character ranking columns (prefix 'Character_Ranking').
DataFrame containing ranking columns.
title : str, optional
Plot title.
x_label : str, optional
@@ -249,8 +249,8 @@ def plot_character_ranking_distribution(
Plotly figure object.
"""
stats = []
# Identify columns related to Character Ranking (excluding ID)
ranking_cols = [c for c in df.columns if 'Character_Ranking' in c]
# Identify ranking columns (assume all columns except _recordId)
ranking_cols = [c for c in df.columns if c != '_recordId']
for col in ranking_cols:
# Count occurrences of each rank (1, 2, 3, 4)
@@ -280,7 +280,7 @@ def plot_character_ranking_distribution(
# Clean up labels: Remove prefix and underscores
# e.g. "Character_Ranking_The_Coach" -> "The Coach"
labels = [
col.replace('Character_Ranking_', '').replace('_', ' ').strip()
col.replace('Character_Ranking_', '').replace('Top_3_Voices_ranking__', '').replace('_', ' ').strip()
for col in stats_df['column']
]
@@ -354,21 +354,22 @@ def plot_character_ranking_distribution(
return fig
def plot_most_ranked_1_character(
def plot_most_ranked_1(
df: pl.DataFrame,
title: str = "Most Popular Character Personality<br>(Number of Times Ranked 1st)",
x_label: str = "Character Personality",
title: str = "Most Popular Choice<br>(Number of Times Ranked 1st)",
x_label: str = "Item",
y_label: str = "Count of 1st Place Rankings",
height: int = 500,
width: int = 1000,
) -> go.Figure:
"""
Create a bar chart showing which character personality was ranked #1 the most.
Create a bar chart showing which item (character/voice) was ranked #1 the most.
Top 3 items are highlighted.
Parameters
----------
df : pl.DataFrame
DataFrame containing character ranking columns.
DataFrame containing ranking columns.
title : str, optional
Plot title.
x_label : str, optional
@@ -386,8 +387,8 @@ def plot_most_ranked_1_character(
Plotly figure object.
"""
stats = []
# Identify columns related to Character Ranking
ranking_cols = [c for c in df.columns if 'Character_Ranking' in c]
# Identify ranking columns (assume all columns except _recordId)
ranking_cols = [c for c in df.columns if c != '_recordId']
for col in ranking_cols:
# Count occurrences of rank 1
@@ -403,19 +404,25 @@ def plot_most_ranked_1_character(
# Clean up labels
labels = [
col.replace('Character_Ranking_', '').replace('_', ' ').strip()
col.replace('Character_Ranking_', '').replace('Top_3_Voices_ranking__', '').replace('_', ' ').strip()
for col in stats_df['column']
]
fig = go.Figure()
# Assign colors: Top 3 get PRIMARY (Blue), others get NEUTRAL (Grey)
colors = [
ColorPalette.PRIMARY if i < 3 else ColorPalette.NEUTRAL
for i in range(len(stats_df))
]
fig = go.Figure()
fig.add_trace(go.Bar(
x=labels,
y=stats_df['count'],
text=stats_df['count'],
textposition='inside',
textfont=dict(size=10, color='white'),
marker_color=ColorPalette.PRIMARY,
marker_color=colors,
hovertemplate='<b>%{x}</b><br>1st Place Votes: %{y}<extra></extra>'
))
@@ -444,7 +451,7 @@ def plot_most_ranked_1_character(
def plot_weighted_ranking_score(
weighted_df: pl.DataFrame,
title: str = "Character Popularity Score<br>(Weighted: 1st=3pts, 2nd=2pts, 3rd=1pt)",
title: str = "Weighted Popularity Score<br>(1st=3pts, 2nd=2pts, 3rd=1pt)",
x_label: str = "Character Personality",
y_label: str = "Total Weighted Score",
color: str = ColorPalette.PRIMARY,
@@ -508,4 +515,339 @@ def plot_weighted_ranking_score(
font=dict(size=11)
)
return fig
return fig
def plot_voice_selection_counts(
    df: pl.DataFrame,
    target_column: str = "8_Combined",
    title: str = "Most Frequently Chosen Voices<br>(Top 8 Highlighted)",
    x_label: str = "Voice",
    y_label: str = "Number of Times Chosen",
    height: int = 500,
    width: int = 1000,
) -> go.Figure:
    """
    Create a bar plot showing how often each voice was selected.

    Takes a column containing comma-separated values (e.g. "Voice 1, Voice 2..."),
    counts how often each value occurs, and highlights the 8 most frequent ones.
    Bars are ordered by count (highest first); voices with equal counts are
    ordered by name so the chart and the highlighted set are reproducible.

    Parameters
    ----------
    df : pl.DataFrame
        DataFrame containing the selection column.
    target_column : str, optional
        Name of the column containing comma-separated voice selections.
        Defaults to "8_Combined".
    title : str, optional
        Plot title.
    x_label : str, optional
        X-axis label.
    y_label : str, optional
        Y-axis label.
    height : int, optional
        Plot height in pixels.
    width : int, optional
        Plot width in pixels.

    Returns
    -------
    go.Figure
        Plotly figure object. An empty figure is returned when
        `target_column` is not present in `df`.
    """
    if target_column not in df.columns:
        return go.Figure()

    # Process the data:
    # 1. Select the relevant column and remove nulls
    # 2. Split the comma-separated string into a list
    # 3. Explode the list so each voice gets its own row
    # 4. Strip whitespace so "Voice 1" and " Voice 1" match
    # 5. Drop empty entries (e.g. from trailing commas) and count occurrences
    stats_df = (
        df.select(pl.col(target_column))
        .drop_nulls()
        .with_columns(pl.col(target_column).str.split(","))
        .explode(target_column)
        .with_columns(pl.col(target_column).str.strip_chars())
        .filter(pl.col(target_column) != "")
        .group_by(target_column)
        .agg(pl.len().alias("count"))
        # group_by does not guarantee row order, so a count-only sort would
        # leave tied voices in arbitrary order; break ties by voice name.
        .sort(["count", target_column], descending=[True, False])
    )

    # Define colors: Top 8 get PRIMARY, rest get NEUTRAL
    colors = [
        ColorPalette.PRIMARY if position < 8 else ColorPalette.NEUTRAL
        for position in range(stats_df.height)
    ]

    fig = go.Figure()

    fig.add_trace(go.Bar(
        x=stats_df[target_column],
        y=stats_df['count'],
        text=stats_df['count'],
        textposition='outside',
        marker_color=colors,
        hovertemplate='<b>%{x}</b><br>Selections: %{y}<extra></extra>'
    ))

    fig.update_layout(
        title=title,
        xaxis_title=x_label,
        yaxis_title=y_label,
        height=height,
        width=width,
        plot_bgcolor=ColorPalette.BACKGROUND,
        xaxis=dict(
            showgrid=True,
            gridcolor=ColorPalette.GRID,
            tickangle=-45
        ),
        yaxis=dict(
            showgrid=True,
            gridcolor=ColorPalette.GRID
        ),
        font=dict(size=11),
    )

    return fig
def plot_top3_selection_counts(
    df: pl.DataFrame,
    target_column: str = "3_Ranked",
    title: str = "Most Frequently Chosen Top 3 Voices<br>(Top 3 Highlighted)",
    x_label: str = "Voice",
    y_label: str = "Count of Mentions in Top 3",
    height: int = 500,
    width: int = 1000,
) -> go.Figure:
    """
    Create a bar plot showing how often each voice ended up in a Top 3.

    Question: Which 3 voices are chosen the most out of 18?
    How many times does each voice end up in the top 3?
    (this is based on the survey question where participants need to choose 3 out
    of the earlier selected 8 voices). So how often each of the 18 stimuli ended
    up in participants' Top 3, after they first selected 8 out of 18.

    Bars are ordered by count (highest first); voices with equal counts are
    ordered by name so the chart and the highlighted set are reproducible.

    Parameters
    ----------
    df : pl.DataFrame
        DataFrame containing the ranking column (comma-separated strings).
    target_column : str, optional
        Name of the column containing comma-separated Top 3 voice selections.
        Defaults to "3_Ranked".
    title : str, optional
        Plot title.
    x_label : str, optional
        X-axis label.
    y_label : str, optional
        Y-axis label.
    height : int, optional
        Plot height in pixels.
    width : int, optional
        Plot width in pixels.

    Returns
    -------
    go.Figure
        Plotly figure object. An empty figure is returned when
        `target_column` is not present in `df`.
    """
    if target_column not in df.columns:
        return go.Figure()

    # Process the data: drop nulls, split the comma-separated string, give each
    # voice its own row, trim whitespace, drop empty entries, then count.
    stats_df = (
        df.select(pl.col(target_column))
        .drop_nulls()
        .with_columns(pl.col(target_column).str.split(","))
        .explode(target_column)
        .with_columns(pl.col(target_column).str.strip_chars())
        .filter(pl.col(target_column) != "")
        .group_by(target_column)
        .agg(pl.len().alias("count"))
        # group_by does not guarantee row order, so a count-only sort would
        # leave tied voices in arbitrary order; break ties by voice name.
        .sort(["count", target_column], descending=[True, False])
    )

    # Define colors: Top 3 get PRIMARY, rest get NEUTRAL
    colors = [
        ColorPalette.PRIMARY if position < 3 else ColorPalette.NEUTRAL
        for position in range(stats_df.height)
    ]

    fig = go.Figure()

    fig.add_trace(go.Bar(
        x=stats_df[target_column],
        y=stats_df['count'],
        text=stats_df['count'],
        textposition='outside',
        marker_color=colors,
        hovertemplate='<b>%{x}</b><br>In Top 3: %{y} times<extra></extra>'
    ))

    fig.update_layout(
        title=title,
        xaxis_title=x_label,
        yaxis_title=y_label,
        height=height,
        width=width,
        plot_bgcolor=ColorPalette.BACKGROUND,
        xaxis=dict(
            showgrid=True,
            gridcolor=ColorPalette.GRID,
            tickangle=-45
        ),
        yaxis=dict(
            showgrid=True,
            gridcolor=ColorPalette.GRID
        ),
        font=dict(size=11),
    )

    return fig
def _first_non_null_value(df: pl.DataFrame, column: str):
    """Return the first non-null value of `column`, or None if the column is missing or all-null."""
    if column not in df.columns:
        return None
    values = df.get_column(column).drop_nulls()
    return values[0] if len(values) > 0 else None


def _anchor_label(anchor):
    """Return the first '|'-separated term of an anchor, trimmed; None for an empty anchor."""
    if not anchor:
        return None
    return anchor.split('|')[0].strip()


def plot_speaking_style_trait_scores(
    df: pl.DataFrame,
    trait_description: str = None,
    left_anchor: str = None,
    right_anchor: str = None,
    title: str = "Speaking Style Trait Analysis",
    height: int = 500,
    width: int = 1000,
) -> go.Figure:
    """
    Plot scores for a single speaking style trait across multiple voices.

    The plot shows the average score per Voice as vertical bars, sorted from
    highest to lowest average (ties ordered by voice name). The number drawn
    on each bar is the count of non-null scores for that voice.
    It expects the DataFrame to contain 'Voice' and 'score' columns,
    typically filtered for a single trait/description.

    Parameters
    ----------
    df : pl.DataFrame
        DataFrame containing at least 'Voice' and 'score' columns.
        Produced by utils.process_speaking_style_data and filtered.
    trait_description : str, optional
        Description of the trait being analyzed (e.g. "Indifferent : Attentive").
        If not provided, it is built as "<left> vs. <right>" from the anchors,
        falling back to the first non-null 'Description' value in `df`.
    left_anchor : str, optional
        Label for the lower end of the scale (e.g. "Indifferent").
        If not provided, read from the 'Left_Anchor' column of `df` when present.
    right_anchor : str, optional
        Label for the upper end of the scale (e.g. "Attentive").
        If not provided, read from the 'Right_Anchor' column of `df` when present.
    title : str, optional
        Plot title.
    height : int, optional
        Plot height.
    width : int, optional
        Plot width.

    Returns
    -------
    go.Figure
        Plotly figure object. An empty figure is returned when `df` is empty
        or lacks the 'Voice' / 'score' columns.
    """
    if df.is_empty():
        return go.Figure()

    required_cols = ["Voice", "score"]
    if not all(col in df.columns for col in required_cols):
        return go.Figure()

    # Calculate stats: Mean, Count
    stats = (
        df.filter(pl.col("score").is_not_null())
        .group_by("Voice")
        .agg([
            pl.col("score").mean().alias("mean_score"),
            pl.col("score").count().alias("count")
        ])
        # Highest average first; voice name breaks ties so the bar order is
        # reproducible (group_by does not guarantee row order).
        .sort(["mean_score", "Voice"], descending=[True, False])
    )

    # Fill in missing anchors from the data. Each column is looked up on its
    # own, so a frame that has only one of the two anchor columns still works.
    if left_anchor is None:
        left_anchor = _first_non_null_value(df, "Left_Anchor")
    if right_anchor is None:
        right_anchor = _first_non_null_value(df, "Right_Anchor")

    # Anchors may list several synonyms separated by '|'; only the first term
    # is displayed, without the whitespace that surrounds the separator.
    left_label = _anchor_label(left_anchor)
    right_label = _anchor_label(right_anchor)

    if trait_description is None:
        if left_label and right_label:
            trait_description = f"{left_label} vs. {right_label}"
        else:
            trait_description = _first_non_null_value(df, "Description") or ""

    fig = go.Figure()

    fig.add_trace(go.Bar(
        x=stats["Voice"],  # X is Voice
        y=stats["mean_score"],  # Y is Score
        text=stats["count"],
        textposition='inside',
        texttemplate='%{text}',  # Count on bar
        marker_color=ColorPalette.PRIMARY,
        hovertemplate='<b>%{x}</b><br>Average: %{y:.2f}<br>Count: %{text}<extra></extra>'
    ))

    # Anchor annotations sit just right of the plot area (paper x = 1.01) at
    # the y positions of the scale ends: 1 for the left anchor, 5 for the right.
    annotations = []
    if left_label:
        annotations.append(dict(
            xref='paper', yref='y',
            x=1.01, y=1,
            xanchor='left', yanchor='middle',
            text=f"<b>1: {left_label}</b>",
            showarrow=False,
            font=dict(size=10, color='gray')
        ))
    if right_label:
        annotations.append(dict(
            xref='paper', yref='y',
            x=1.01, y=5,
            xanchor='left', yanchor='middle',
            text=f"<b>5: {right_label}</b>",
            showarrow=False,
            font=dict(size=10, color='gray')
        ))

    fig.update_layout(
        title=dict(
            text=f"{title}<br><sub>{trait_description}</sub><br><sub>(Numbers on bars indicate respondent count)</sub>",
            y=0.92
        ),
        xaxis_title="Voice",
        yaxis_title="Average Score (1-5)",
        height=height,
        width=width,
        plot_bgcolor=ColorPalette.BACKGROUND,
        yaxis=dict(
            range=[1, 5],
            showgrid=True,
            gridcolor=ColorPalette.GRID,
            zeroline=False
        ),
        xaxis=dict(
            showgrid=False
        ),
        # Extra right margin leaves room for the anchor annotations.
        margin=dict(r=150),
        annotations=annotations,
        font=dict(size=11)
    )

    return fig

View File

@@ -16,6 +16,9 @@ class ColorPalette:
RANK_3 = "#5AAE95" # Sea Green (3rd Choice)
RANK_4 = "#9E9E9E" # Grey (4th Choice / Worst)
# Neutral color for unhighlighted comparison items
NEUTRAL = "#D3D3D3" # Light Grey
# General UI elements
TEXT = "black"
GRID = "lightgray"

100
utils.py
View File

@@ -3,7 +3,6 @@ from pathlib import Path
import pandas as pd
from typing import Union
import json
import re
def extract_voice_label(html_str: str) -> str:
@@ -57,13 +56,13 @@ def combine_exclusive_columns(df: pl.DataFrame, id_col: str = "_recordId", targe
def calculate_weighted_ranking_scores(df: pl.DataFrame) -> pl.DataFrame:
"""
Calculate weighted scores for character rankings.
Calculate weighted scores for character or voice rankings.
Points system: 1st place = 3 pts, 2nd place = 2 pts, 3rd place = 1 pt.
Parameters
----------
df : pl.DataFrame
DataFrame containing character ranking columns.
DataFrame containing character/ voice ranking columns.
Returns
-------
@@ -71,8 +70,8 @@ def calculate_weighted_ranking_scores(df: pl.DataFrame) -> pl.DataFrame:
DataFrame with columns 'Character' and 'Weighted Score', sorted by score.
"""
scores = []
# Identify columns related to Character Ranking
ranking_cols = [c for c in df.columns if 'Character_Ranking' in c]
# Identify ranking columns (assume all columns except _recordId)
ranking_cols = [c for c in df.columns if c != '_recordId']
for col in ranking_cols:
# Calculate score:
@@ -84,7 +83,7 @@ def calculate_weighted_ranking_scores(df: pl.DataFrame) -> pl.DataFrame:
weighted_score = (r1_count * 3) + (r2_count * 2) + (r3_count * 1)
# Clean name
clean_name = col.replace('Character_Ranking_', '').replace('_', ' ').strip()
clean_name = col.replace('Character_Ranking_', '').replace('Top_3_Voices_ranking__', '').replace('_', ' ').strip()
scores.append({
'Character': clean_name,
@@ -413,6 +412,95 @@ class JPMCSurvey:
QIDs = ['QID44', 'QID97', 'QID95', 'QID96']
return self._get_subset(q, QIDs, rename_cols=True), None
def process_speaking_style_data(
    df: Union[pl.LazyFrame, pl.DataFrame],
    trait_map: dict[str, str]
) -> pl.DataFrame:
    """
    Process speaking style columns from wide to long format and map trait descriptions.

    Parses columns with format: SS_{StyleGroup}__{Voice}__{ChoiceID}
    Example: SS_Orange_Red__V14__Choice_1

    Parameters
    ----------
    df : pl.LazyFrame or pl.DataFrame
        Input dataframe containing a '_recordId' column and SS_* columns.
    trait_map : dict
        Dictionary mapping column names to trait descriptions.
        Keys should be full column names like "SS_Orange_Red__V14__Choice_1".
        A description of the form "Left : Right" is split on ':' into anchors.
        Only the first description seen for each (Style_Group, Choice_ID)
        pair is used.

    Returns
    -------
    pl.DataFrame
        Long-format dataframe with columns:
        _recordId, Style_Group, Voice, Choice_ID, score, Description,
        Left_Anchor, Right_Anchor.
        'score' is cast to Int64; values that cannot be cast become null.
        When no key of `trait_map` matches the column-name pattern, the
        Description / Left_Anchor / Right_Anchor columns are not added.
    """
    # Imported locally so the function does not rely on a module-level
    # `import re` being present.
    import re

    # Normalize input to LazyFrame
    lf = df.lazy() if isinstance(df, pl.DataFrame) else df

    # 1. Melt SS_ columns
    melted = lf.melt(
        id_vars=["_recordId"],
        value_vars=pl.col("^SS_.*$"),
        variable_name="full_col_name",
        value_name="score"
    )

    # 2. Extract components from column name
    # Regex captures: Style_Group (e.g. SS_Orange_Red), Voice (e.g. V14), Choice_ID (e.g. Choice_1)
    pattern = r"^(?P<Style_Group>SS_.+?)__(?P<Voice>.+?)__(?P<Choice_ID>Choice_\d+)$"

    processed = (
        melted.with_columns(pl.col("full_col_name").str.extract_groups(pattern))
        .unnest("full_col_name")
        # Cast here rather than after the join so that the early return
        # below yields the same 'score' dtype as the joined result.
        .with_columns(pl.col("score").cast(pl.Int64, strict=False))
    )

    # 3. Create Mapping Lookup from the provided dictionary
    # We map (Style_Group, Choice_ID) -> Description; the first entry per key wins.
    column_name_re = re.compile(pattern)
    mapping = {}
    for col_name, desc in trait_map.items():
        match = column_name_re.match(col_name)
        if match is None:
            continue

        key = (match["Style_Group"], match["Choice_ID"])
        if key in mapping:
            continue

        # Parse description into anchors if possible (Left : Right)
        parts = desc.split(':')
        mapping[key] = {
            "Style_Group": key[0],
            "Choice_ID": key[1],
            "Description": desc,
            "Left_Anchor": parts[0].strip(),
            "Right_Anchor": parts[1].strip() if len(parts) > 1 else "",
        }

    if not mapping:
        return processed.collect()

    # 4. Join Data with Mapping
    result = processed.join(
        pl.LazyFrame(list(mapping.values())),
        on=["Style_Group", "Choice_ID"],
        how="left"
    )

    return result.collect()

View File

@@ -5,9 +5,9 @@ import polars as pl
def check_progress(data):
"""Check if all responses are complete based on 'progress' column."""
if data.collect().select(pl.col('progress').unique()).shape[0] == 1:
return mo.md("""### Responses Complete: \n\n✅ All responses are complete (progress = 100) """)
return """### Responses Complete: \n\n✅ All responses are complete (progress = 100) """
return mo.md("### Responses Complete: \n\n⚠️ There are incomplete responses (progress < 100) ⚠️")
return "### Responses Complete: \n\n⚠️ There are incomplete responses (progress < 100) ⚠️"
def duration_validation(data):
@@ -30,10 +30,9 @@ def duration_validation(data):
outlier_data = _d.filter(pl.col('outlier_duration') == True).collect()
if outlier_data.shape[0] == 0:
return mo.md("### Duration Outliers: \n\n✅ No duration outliers detected")
return "### Duration Outliers: \n\n✅ No duration outliers detected"
return mo.md(f"""
### Duration Outliers:
return f"""### Duration Outliers:
**⚠️ Potential outliers detected based on response duration ⚠️**
@@ -50,5 +49,5 @@ def duration_validation(data):
**⚠️ NOTE: These have not been removed from the dataset ⚠️**
""")
"""