correlation start

This commit is contained in:
2026-01-27 17:22:16 +01:00
parent 393c527656
commit fd4cb4b596
9 changed files with 5375 additions and 24 deletions

View File

@@ -12,15 +12,24 @@ def _():
import plotly as plt
from pathlib import Path
from utils import extract_qid_descr_map
return Path, extract_qid_descr_map, mo, pd
import utils
return Path, mo, pd, utils
@app.cell
def _(Path):
# results_file = Path('data/exports/OneDrive_1_1-16-2026/JPMC_Chase Brand Personality_Quant Round 1_TestData_Labels.csv')
results_file = Path('data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase Brand Personality_Quant Round 1_January 21, 2026_Soft Launch_Labels.csv')
return (results_file,)
# results_file = Path('data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase Brand Personality_Quant Round 1_January 21, 2026_Soft Launch_Labels.csv')
results_file = Path('data/exports/1-23-26/JPMC_Chase Brand Personality_Quant Round 1_January 23, 2026_Labels.csv')
qsf_file = 'data/exports/OneDrive_1_1-16-2026/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'
return qsf_file, results_file
@app.cell
def _(qsf_file, results_file, utils):
survey = utils.JPMCSurvey(results_file, qsf_file)
data_all = survey.load_data()
return (survey,)
@app.cell
@@ -33,8 +42,8 @@ def _(mo):
@app.cell
def _(extract_qid_descr_map, results_file):
qid_descr_map = extract_qid_descr_map(results_file)
def _(survey):
qid_descr_map = survey.qid_descr_map
qid_descr_map
return (qid_descr_map,)

View File

@@ -14,25 +14,27 @@ def _():
from utils import JPMCSurvey, combine_exclusive_columns, calculate_weighted_ranking_scores
from plots import plot_average_scores_with_counts, plot_top3_ranking_distribution, plot_ranking_distribution, plot_most_ranked_1, plot_weighted_ranking_score, plot_voice_selection_counts, plot_top3_selection_counts
import plots as plts
import utils as utl
import plots
import utils
from speaking_styles import SPEAKING_STYLES
return (
JPMCSurvey,
Path,
SPEAKING_STYLES,
calculate_weighted_ranking_scores,
check_progress,
duration_validation,
mo,
pl,
plot_average_scores_with_counts,
plot_most_ranked_1,
plot_ranking_distribution,
plot_top3_ranking_distribution,
plot_top3_selection_counts,
plot_voice_selection_counts,
plot_weighted_ranking_score,
plts,
utl,
plots,
utils,
)
@@ -47,7 +49,7 @@ def _():
def _(JPMCSurvey, QSF_FILE, RESULTS_FILE):
survey = JPMCSurvey(RESULTS_FILE, QSF_FILE)
data_all = survey.load_data()
data_all.collect()
# data_all.collect()
return data_all, survey
@@ -298,7 +300,7 @@ def _(mo):
@app.cell
def _(data, survey, utl):
def _(data, survey, utils):
ss_or, choice_map_or = survey.get_ss_orange_red(data)
ss_gb, choice_map_gb = survey.get_ss_green_blue(data)
@@ -309,12 +311,12 @@ def _(data, survey, utl):
choice_map = {**choice_map_or, **choice_map_gb}
# print(_d.head())
# print(choice_map)
ss_long = utl.process_speaking_style_data(ss_all, choice_map)
return (ss_long,)
ss_long = utils.process_speaking_style_data(ss_all, choice_map)
return choice_map, ss_all, ss_long
@app.cell
def _(mo, pl, plts, ss_long):
def _(mo, pl, plots, ss_long):
content = """### How does each voice score for each “speaking style labeled trait”?"""
for i, trait in enumerate(ss_long.select("Description").unique().to_series().to_list()):
@@ -323,7 +325,7 @@ def _(mo, pl, plts, ss_long):
content += f"""
### {i+1}) {trait.replace(":", "")}
{mo.ui.plotly(plts.plot_speaking_style_trait_scores(trait_d, title=trait.replace(":", ""), height=550))}
{mo.ui.plotly(plots.plot_speaking_style_trait_scores(trait_d, title=trait.replace(":", ""), height=550))}
"""
mo.md(content)
@@ -339,17 +341,17 @@ def _(mo):
@app.cell
def _(data, mo, plot_average_scores_with_counts, survey):
def _(data, mo, plots, survey):
vscales = survey.get_voice_scale_1_10(data)[0].collect()
plot_average_scores_with_counts(vscales, x_label='Voice', width=1000)
# plot_average_scores_with_counts(vscales, x_label='Voice', width=1000)
mo.md(f"""
### How does each voice score on a scale from 1-10?
{mo.ui.plotly(plot_average_scores_with_counts(vscales, x_label='Voice', width=1000))}
{mo.ui.plotly(plots.plot_average_scores_with_counts(vscales, x_label='Voice', width=1000))}
""")
return
return (vscales,)
@app.cell(hide_code=True)
@@ -373,16 +375,57 @@ def _(mo):
return
@app.cell
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
### Total Results
### How to Interpret These Correlation Results
Each bar represents the Pearson correlation coefficient (r) between a speaking style trait rating (1-5 scale) and the overall Voice Scale rating (1-10).
- [ ] 4 correlation diagrams
**Reading the Chart**
| Correlation Value | Interpretation |
|-----------|----------|
| r > 0 (Green bars)| Positive correlation — voices rated higher on this trait tend to receive higher Voice Scale scores|
| r < 0 (Red bars)| Negative correlation — voices rated higher on this trait tend to receive lower Voice Scale scores|
| r ≈ 0| No relationship — this trait doesn't predict Voice Scale ratings|
""")
return
@app.cell
def _(choice_map, ss_all, utils, vscales):
    # Reshape both sources to long format so each speaking-style rating can be
    # matched to the voice-scale score for the same `_recordId` and `Voice`.
    df_style = utils.process_speaking_style_data(ss_all.collect(), choice_map)
    df_voice_long = utils.process_voice_scale_data(vscales)

    # Reuse the shared helper rather than repeating the join keys here; it
    # performs the same inner join on ["_recordId", "Voice"].
    joined_df = utils.join_voice_and_style_data(df_style, df_voice_long)
    return df_style, joined_df
@app.cell
def _(SPEAKING_STYLES, joined_df, mo, plots):
    # Build one markdown document holding a correlation chart per speaking style.
    # Loop variables are underscore-prefixed so marimo keeps them local to this
    # cell; bare names such as `fig` or `style` would become notebook-wide
    # definitions and conflict with any other cell that defines the same name.
    _content = """### Total Results
"""

    for _style, _traits in SPEAKING_STYLES.items():
        _fig = plots.plot_speaking_style_correlation(
            df=joined_df,
            style_color=_style,
            style_traits=_traits,
            title=f"Correlation: Speaking Style {_style} and Voice Scale 1-10"
        )
        _content += f"""
#### Speaking Style **{_style}**:
{mo.ui.plotly(_fig)}
"""
    mo.md(_content)
    return
@app.cell
def _(mo):
mo.md(r"""
@@ -425,6 +468,30 @@ def _(mo):
return
@app.cell
def _(SPEAKING_STYLES, df_style, mo, plots, top3_voices, utils):
    # Pair every speaking-style rating with the ranking points the same
    # respondent gave the same voice.
    df_ranking = utils.process_voice_ranking_data(top3_voices)
    joined = df_style.join(df_ranking, how='inner', on=['_recordId', 'Voice'])

    # Collect the heading plus one section per style, then render them together.
    _sections = ["""## Correlations Voice Speaking Styles <-> Voice Ranking Points
"""]
    for _style, _traits in SPEAKING_STYLES.items():
        _fig = plots.plot_speaking_style_ranking_correlation(joined, _style, _traits)
        _sections.append(f"""
#### Speaking Style **{_style}**:
{mo.ui.plotly(_fig)}
""")
    _content = "".join(_sections)
    mo.md(_content)
    return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""

View File

@@ -0,0 +1,60 @@
import polars as pl
from utils import JPMCSurvey, process_speaking_style_data, process_voice_scale_data, join_voice_and_style_data
from plots import plot_speaking_style_correlation
from speaking_styles import SPEAKING_STYLES
# 1. Initialize the survey and load the data.
# Both paths point into the 2026-01-21 Soft Launch export: the "Values" CSV
# with the responses and the QSF survey definition.
RESULTS_FILE = "data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase Brand Personality_Quant Round 1_January 21, 2026_Soft Launch_Values.csv"
QSF_FILE = "data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf"

# Deliberately not wrapped in try/except: a handler that only prints and
# re-raises adds nothing, and any failure here (wrong path, changed constructor
# signature) is best reported with its original traceback.
survey = JPMCSurvey(RESULTS_FILE, QSF_FILE)
data = survey.load_data()

# 2. Extract the wide-format question groups.
# Speaking styles come in two groups, each with its own choice map.
ss_gb, map_gb = survey.get_ss_green_blue(data)
ss_or, map_or = survey.get_ss_orange_red(data)

# Voice Scale 1-10 (the second return value is not needed here).
voice_scale, _ = survey.get_voice_scale_1_10(data)

# 3. Reshape wide -> long.
# NOTE(review): pl.concat below assumes process_speaking_style_data returns
# eager DataFrames with identical schemas for both groups - confirm in utils.py.
df_style_gb = process_speaking_style_data(ss_gb, map_gb)
df_style_or = process_speaking_style_data(ss_or, map_or)

# Combine both style dataframes into one long table.
df_style_all = pl.concat([df_style_gb, df_style_or])

# Voice scale scores, long format (process_voice_scale_data collects eagerly).
df_voice_long = process_voice_scale_data(voice_scale)

# 4. Join style + voice data on _recordId and Voice.
joined_df = join_voice_and_style_data(df_style_all, df_voice_long)

# 5. Generate one correlation plot per style colour.
for style, traits in SPEAKING_STYLES.items():
    print(f"Generating plot for {style}...")
    fig = plot_speaking_style_correlation(
        df=joined_df,
        style_color=style,
        style_traits=traits
    )
    # In Marimo/Jupyter, evaluate `fig` or `mo.ui.plotly(fig)` instead.
    fig.show()

220
plots.py
View File

@@ -854,3 +854,223 @@ def plot_speaking_style_trait_scores(
font=dict(size=11)
)
return fig
def _plot_trait_correlation_bars(
    df: pl.DataFrame,
    style_color: str,
    style_traits: list[str],
    value_col: str,
    title: str,
) -> go.Figure:
    """
    Build a bar chart of Pearson correlations between 'score' and `value_col`.

    Shared implementation behind `plot_speaking_style_correlation` and
    `plot_speaking_style_ranking_correlation`, which differ only in the
    column they correlate against and in their default title.

    Parameters
    ----------
    df : pl.DataFrame
        Joined dataframe containing 'Right_Anchor', 'score' and `value_col`.
    style_color : str
        Name of the style; only used in the title of the empty "no data" figure.
    style_traits : list[str]
        Trait descriptions, compared by exact equality with 'Right_Anchor'.
    value_col : str
        Name of the column to correlate 'score' with.
    title : str
        Title of the resulting figure.

    Returns
    -------
    go.Figure
        One bar per trait that has at least two complete rows; an empty
        figure titled "No data for <style_color> Style" if no trait qualifies.
    """
    # Imported locally so this block does not depend on the module's import list.
    import math

    trait_names = []
    correlations = []

    # 1. Calculate one correlation per trait.
    for trait in style_traits:
        valid_data = (
            df.filter(pl.col("Right_Anchor") == trait)
            .select(["score", value_col])
            .drop_nulls()
        )

        # A correlation needs at least two observations.
        if valid_data.height <= 1:
            continue

        corr_val = valid_data.select(pl.corr("score", value_col)).item()

        # An undefined correlation (e.g. one column is constant) can come back
        # as NaN rather than None; show both as 0.0 instead of a "nan" label.
        if corr_val is None or math.isnan(corr_val):
            corr_val = 0.0

        trait_names.append(trait)
        correlations.append(corr_val)

    # 2. Nothing to plot: return an empty figure that says so.
    if not trait_names:
        fig = go.Figure()
        fig.update_layout(title=f"No data for {style_color} Style")
        return fig

    # Green for non-negative correlations, red for negative ones.
    colors = ["green" if value >= 0 else "red" for value in correlations]

    # Trait texts look like "Friendly | Conversational | Down-to-earth";
    # break them at each "|" so the axis labels stay narrow.
    x_labels = [
        "<br>".join(part.strip() for part in trait.split("|"))
        for trait in trait_names
    ]

    # 3. Build the figure.
    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=x_labels,
        y=correlations,
        text=[f"{value:.2f}" for value in correlations],
        textposition='outside',
        marker_color=colors,
        hovertemplate="<b>%{customdata}</b><br>Correlation: %{y:.2f}<extra></extra>",
        customdata=trait_names  # Full, unwrapped trait text on hover
    ))

    fig.update_layout(
        title=title,
        yaxis_title="Correlation",
        yaxis=dict(range=[-1, 1], zeroline=True, zerolinecolor="black"),
        xaxis=dict(tickangle=0),  # Keep labels horizontal
        height=400,
        width=1000,
        template="plotly_white",
        showlegend=False
    )
    return fig


def plot_speaking_style_correlation(
    df: pl.DataFrame,
    style_color: str,
    style_traits: list[str],
    title="Speaking style and voice scale 1-10 correlations"
) -> go.Figure:
    """
    Plots the correlation between Speaking Style Trait Scores (1-5) and Voice Scale (1-10) using a Bar Chart.

    Each bar represents one trait.

    Parameters
    ----------
    df : pl.DataFrame
        Joined dataframe containing 'Right_Anchor', 'score' (Trait Score), and 'Voice_Scale_Score'.
    style_color : str
        The name of the style (e.g., 'Green', 'Blue'); used in the title of
        the empty figure returned when no trait has enough data.
    style_traits : list[str]
        List of trait descriptions (positive side) to include in the plot.
        These must match the 'Right_Anchor' column values exactly.
    title : str, optional
        Title for the plot.

    Returns
    -------
    go.Figure
    """
    return _plot_trait_correlation_bars(
        df=df,
        style_color=style_color,
        style_traits=style_traits,
        value_col="Voice_Scale_Score",
        title=title,
    )


def plot_speaking_style_ranking_correlation(
    df: pl.DataFrame,
    style_color: str,
    style_traits: list[str],
    title: str = None
) -> go.Figure:
    """
    Plots the correlation between Speaking Style Trait Scores (1-5) and Voice Ranking Points (0-3).

    Each bar represents one trait.

    Parameters
    ----------
    df : pl.DataFrame
        Joined dataframe containing 'Right_Anchor', 'score' (Trait Score), and 'Ranking_Points'.
    style_color : str
        The name of the style (e.g., 'Green', 'Blue'); used in the default
        title and in the title of the empty "no data" figure.
    style_traits : list[str]
        List of trait descriptions (positive side) to include in the plot.
        These must match the 'Right_Anchor' column values exactly.
    title : str, optional
        Custom title for the plot. If None, a default mentioning `style_color` is used.

    Returns
    -------
    go.Figure
    """
    if title is None:
        title = f"Speaking style {style_color} and voice ranking points correlations"

    return _plot_trait_correlation_bars(
        df=df,
        style_color=style_color,
        style_traits=style_traits,
        value_col="Ranking_Points",
        title=title,
    )

33
speaking_styles.py Normal file
View File

@@ -0,0 +1,33 @@
"""
Mapping of Speaking Styles (Colors) to their constituent Traits (Positive side).
Derived from "Speaking Style Traits Quantitative test design.pdf".
"""
# Maps each speaking-style colour to its list of trait descriptions.
# The plotting helpers (e.g. plots.plot_speaking_style_correlation) compare
# these strings with the 'Right_Anchor' column by exact equality, so any change
# to spelling, spacing or the "|" separators stops that trait from matching.
# The plots also split each string on "|" to wrap the axis labels.
SPEAKING_STYLES = {
"Green": [
"Friendly | Conversational | Down-to-earth",
"Approachable | Familiar | Warm",
"Optimistic | Benevolent | Positive | Appreciative"
],
"Blue": [
"Proactive | Cooperative",
# NOTE(review): "Knowledgable" (no "e") - presumably copied verbatim from the
# survey; confirm against the survey's anchor text before "correcting" it.
"Knowledgable | Resourceful | Savvy",
"Clear | Straightforward | Direct",
"Confident | Competent",
"Respectable | Respectful"
],
"Orange": [
"Attentive | Helpful | Caring | Deliberate",
"Reassuring | Empowering",
"Progressive | Guiding | Intentional",
"Patient | Open-minded"
],
"Red": [
"Trustworthy | Reliable | Dependable",
"Calm | Steady/Stable | Controlled",
"Transparent | Upright | Altruistic",
"Adaptive | Flexible"
]
}

115
utils.py
View File

@@ -506,3 +506,118 @@ def process_speaking_style_data(
def process_voice_scale_data(
    df: Union[pl.LazyFrame, pl.DataFrame]
) -> pl.DataFrame:
    """
    Reshape the Voice Scale 1-10 columns from wide to long format.

    Every column named like ``Voice_Scale_1_10__V{Voice}`` (for example
    ``Voice_Scale_1_10__V14``) becomes rows keyed by respondent and voice.

    Parameters
    ----------
    df : pl.LazyFrame or pl.DataFrame
        Wide frame containing '_recordId' and the voice scale columns.

    Returns
    -------
    pl.DataFrame
        Long-format dataframe with columns:
        _recordId, Voice, Voice_Scale_Score
    """
    # Work lazily regardless of what the caller passed in.
    if isinstance(df, pl.DataFrame):
        frame = df.lazy()
    else:
        frame = df

    # One row per (_recordId, source column).
    long_frame = frame.melt(
        id_vars=["_recordId"],
        value_vars=pl.col("^Voice_Scale_1_10__V.*$"),
        variable_name="full_col_name",
        value_name="Voice_Scale_Score",
    )

    # Pull the digits that follow "V" out of the column name and rebuild the
    # voice label from them, e.g. "Voice_Scale_1_10__V14" -> "V14".
    voice_label = ("V" + pl.col("full_col_name").str.extract(r"V(\d+)", 1)).alias("Voice")

    # Non-strict cast: values that cannot be read as floats become null.
    score = pl.col("Voice_Scale_Score").cast(pl.Float64, strict=False)

    return (
        long_frame
        .with_columns(voice_label)
        .select(["_recordId", "Voice", score])
        .collect()
    )
def join_voice_and_style_data(
    processed_style_data: pl.DataFrame,
    processed_voice_data: pl.DataFrame
) -> pl.DataFrame:
    """
    Inner-join long-format Speaking Style data with Voice Scale 1-10 data.

    Parameters
    ----------
    processed_style_data : pl.DataFrame
        Result of process_speaking_style_data
    processed_voice_data : pl.DataFrame
        Result of process_voice_scale_data

    Returns
    -------
    pl.DataFrame
        Rows present in both inputs, matched on _recordId and Voice, carrying
        the columns of both frames.
    """
    join_keys = ["_recordId", "Voice"]
    merged = processed_style_data.join(processed_voice_data, on=join_keys, how="inner")
    return merged
def process_voice_ranking_data(
    df: Union[pl.LazyFrame, pl.DataFrame]
) -> pl.DataFrame:
    """
    Reshape the Top-3 voice ranking columns to long format and score them.

    Every column named like ``Top_3_Voices_ranking__V{Voice}`` becomes rows
    keyed by respondent and voice. Ranks are converted to points:
    rank 1 -> 3 pts, rank 2 -> 2 pts, rank 3 -> 1 pt, anything else
    (including null, i.e. not ranked) -> 0 pts.

    Parameters
    ----------
    df : pl.LazyFrame or pl.DataFrame
        Wide frame containing '_recordId' and the ranking columns.

    Returns
    -------
    pl.DataFrame
        Long-format dataframe with columns:
        _recordId, Voice, Ranking_Points
    """
    # Work lazily regardless of what the caller passed in.
    if isinstance(df, pl.DataFrame):
        frame = df.lazy()
    else:
        frame = df

    # One row per (_recordId, source column).
    long_frame = frame.melt(
        id_vars=["_recordId"],
        value_vars=pl.col("^Top_3_Voices_ranking__V.*$"),
        variable_name="full_col_name",
        value_name="rank",
    )

    # Rebuild the voice label from the digits after "V" in the column name.
    voice_label = ("V" + pl.col("full_col_name").str.extract(r"V(\d+)", 1)).alias("Voice")

    # Better rank -> more points; the fallback branch yields 0.
    rank = pl.col("rank")
    ranking_points = (
        pl.when(rank == 1).then(3)
        .when(rank == 2).then(2)
        .when(rank == 3).then(1)
        .otherwise(0)
        .alias("Ranking_Points")
    )

    return (
        long_frame
        .with_columns(voice_label, ranking_points)
        .select(["_recordId", "Voice", "Ranking_Points"])
        .collect()
    )

File diff suppressed because it is too large Load Diff

20
voices.py Normal file
View File

@@ -0,0 +1,20 @@
Voice Reference Gender
Voice 14 Female
Voice 04 Female
Voice 08 Female
Voice 77 Female
Voice 48 Female
Voice 82 Female
Voice 89 Female
Voice 91 Emily (Current IVR Voice) Female
Voice 34 Male
Voice 69 Male
Voice 45 Male
Voice 46 Male
Voice 54 Male
Voice 74 Male
Voice 81 Male
Voice 86 Male
Voice 88 Male
Voice 16 Male