base correlations

This commit is contained in:
2026-02-03 01:32:06 +01:00
parent 1dce4db909
commit 2408d06098
4 changed files with 257 additions and 0 deletions

View File

@@ -664,5 +664,140 @@ def _():
return
# Markdown cell: renders the section heading that opens the speaking-style
# correlation analysis. The cell defines no variables (bare `return`).
@app.cell(hide_code=True)
def _():
    mo.md(r"""
## Correlation Speaking Styles
""")
    return
@app.cell
def _(S, data, top3_voices):
    # Build the two joined frames consumed by the correlation cells:
    # speaking-style data paired with the voice scale 1-10 data, and with the
    # top-3 voice ranking data.
    ss_or, choice_map_or = S.get_ss_orange_red(data)
    ss_gb, choice_map_gb = S.get_ss_green_blue(data)

    # Combine the orange/red and green/blue halves on the record id, and merge
    # their choice maps (green/blue entries win on duplicate keys).
    ss_all = ss_or.join(ss_gb, on='_recordId')
    choice_map = {**choice_map_or, **choice_map_gb}

    # Process the speaking-style data once and reuse it for both joins; no
    # eager collect of the combined frame is needed for that.
    df_style = utils.process_speaking_style_data(ss_all, choice_map)

    # Only the first element returned by get_voice_scale_1_10 is used here.
    vscales = S.get_voice_scale_1_10(data)[0]
    df_scale_long = utils.process_voice_scale_data(vscales)
    joined_scale = df_style.join(df_scale_long, on=["_recordId", "Voice"], how="inner")

    df_ranking = utils.process_voice_ranking_data(top3_voices)
    joined_ranking = df_style.join(df_ranking, on=['_recordId', 'Voice'], how='inner')
    return joined_ranking, joined_scale
# Markdown cell: sub-heading for the color-level correlations against the
# voice scale 1-10 data. The cell defines no variables.
@app.cell
def _():
    mo.md(r"""
### Colors vs Scale 1-10
""")
    return
@app.cell
def _(S, joined_scale):
    # Collapse the joined scale data to one averaged correlation per style
    # color. The helper returns a 2-tuple; only its first element (the frame)
    # is needed here. The default target column of the helper is used.
    color_corr_scale = utils.transform_speaking_style_color_correlation(
        joined_scale,
        SPEAKING_STYLES,
    )[0]

    # The plot call is the last expression of the cell.
    S.plot_speaking_style_color_correlation(
        title="Correlation: Speaking Style Colors and Voice Scale 1-10",
        data=color_corr_scale,
    )
    return
# Markdown cell: sub-heading for the color-level correlations against the
# voice ranking points. The cell defines no variables.
@app.cell
def _():
    mo.md(r"""
### Colors vs Ranking Points
""")
    return
@app.cell
def _(S, joined_ranking):
    # Same color-level aggregation as for the scale data, but correlating the
    # style scores against the "Ranking_Points" column. Only the first element
    # of the returned 2-tuple (the frame) is used.
    color_corr_ranking = utils.transform_speaking_style_color_correlation(
        joined_ranking,
        SPEAKING_STYLES,
        target_column="Ranking_Points",
    )[0]

    # The plot call is the last expression of the cell.
    S.plot_speaking_style_color_correlation(
        title="Correlation: Speaking Style Colors and Voice Ranking Points",
        data=color_corr_ranking,
    )
    return
# Markdown cell: sub-heading for the per-trait correlations against the
# voice scale 1-10 data. The cell defines no variables.
@app.cell
def _():
    mo.md(r"""
### Individual Traits vs Scale 1-10
""")
    return
@app.cell
def _(S, joined_scale):
    # Render one per-trait correlation chart for every speaking-style color,
    # using the joined voice scale 1-10 data, and stack them in one markdown
    # output with a sub-heading per style.
    _sections = []
    for _style, _traits in SPEAKING_STYLES.items():
        _fig = S.plot_speaking_style_correlation(
            data=joined_scale,
            style_color=_style,
            style_traits=_traits,
            # The title names the scale target so these charts are not
            # labelled identically to the ranking-points charts.
            # NOTE(review): the plot helpers also pass the title to
            # _save_plot, so identical titles may collide on save - confirm.
            title=f"Correlation: Speaking Style {_style} and Voice Scale 1-10",
        )
        _sections.append(f"""
#### Speaking Style **{_style}**:
{mo.ui.altair_chart(_fig)}
""")
    mo.md("".join(_sections))
    return
# Markdown cell: sub-heading for the per-trait correlations against the
# voice ranking points. The cell defines no variables.
@app.cell(hide_code=True)
def _():
    mo.md(r"""
### Individual Traits vs Ranking Points
""")
    return
@app.cell
def _(S, joined_ranking):
    # Render one per-trait correlation chart for every speaking-style color,
    # using the joined ranking data, and stack them in one markdown output
    # with a sub-heading per style.
    _sections = []
    for _style, _traits in SPEAKING_STYLES.items():
        _chart = S.plot_speaking_style_ranking_correlation(
            data=joined_ranking,
            style_color=_style,
            style_traits=_traits,
            title=f"Correlation: Speaking Style {_style} and Voice Ranking Points",
        )
        _sections.append(f"""
#### Speaking Style **{_style}**:
{mo.ui.altair_chart(_chart)}
""")
    mo.md("".join(_sections))
    return
# Run the notebook app only when this file is executed directly as a script,
# not when it is imported.
if __name__ == "__main__":
    app.run()

View File

@@ -1048,6 +1048,59 @@ class QualtricsPlotsMixin:
chart = self._save_plot(chart, title)
return chart
def plot_speaking_style_color_correlation(
    self,
    data: pl.LazyFrame | pl.DataFrame | None = None,
    title: str = "Speaking Style and Voice Scale 1-10 Correlations<br>(Average by Color)",
    width: int | str | None = None,
    height: int | None = None,
) -> alt.Chart:
    """Draw a bar chart with one bar per speaking style color.

    Each bar shows the value of the ``correlation`` column for one ``Color``.
    Bars are ordered Green, Blue, Orange, Red, the y-axis is fixed to
    [-1, 1], and bars are filled green when the correlation is >= 0 and
    red otherwise.

    Args:
        data: Frame with the columns ``Color``, ``correlation`` and
            ``n_traits`` (the output shape of
            ``utils.transform_speaking_style_color_correlation``). It is
            passed through ``self._ensure_dataframe`` before plotting.
        title: Chart title. It is run through ``self._process_title`` for
            display (the original docs state ``<br>`` marks a line break)
            and handed unprocessed to ``self._save_plot``.
        width: Chart width; falls back to 400 when falsy.
        height: Chart height; falls back to 350 when falsy.

    Returns:
        Whatever ``self._save_plot`` returns for the finished chart.
    """
    frame = self._ensure_dataframe(data)

    # Encoding channels, declared up front so the chart assembly stays short.
    color_axis = alt.X(
        'Color:N',
        title=None,
        axis=alt.Axis(labelAngle=0),
        sort=["Green", "Blue", "Orange", "Red"],
    )
    correlation_axis = alt.Y(
        'correlation:Q',
        title='Average Correlation',
        scale=alt.Scale(domain=[-1, 1]),
    )
    # Fill depends on the sign of the correlation, not on the style color.
    sign_fill = alt.condition(
        alt.datum.correlation >= 0,
        alt.value('green'),
        alt.value('red'),
    )
    hover_fields = [
        alt.Tooltip('Color:N', title='Speaking Style'),
        alt.Tooltip('correlation:Q', format='.3f', title='Avg Correlation'),
        alt.Tooltip('n_traits:Q', title='# Traits'),
    ]

    bars = alt.Chart(frame.to_pandas()).mark_bar().encode(
        x=color_axis,
        y=correlation_axis,
        color=sign_fill,
        tooltip=hover_fields,
    )
    sized = bars.properties(
        title=self._process_title(title),
        width=width or 400,
        height=height or 350,
    )

    return self._save_plot(sized, title)
def plot_demographic_distribution(
self,
column: str,

View File

@@ -77,6 +77,12 @@ class ColorPalette:
GENDER_MALE_NEUTRAL = "#B8C9D9" # Grey-Blue
GENDER_FEMALE_NEUTRAL = "#D9B8C9" # Grey-Pink
# Speaking Style Colors (named after the style quadrant colors)
STYLE_GREEN = "#2E7D32" # Forest Green
STYLE_BLUE = "#1565C0" # Strong Blue
STYLE_ORANGE = "#E07A00" # Burnt Orange
STYLE_RED = "#C62828" # Deep Red
def jpmc_altair_theme():
"""JPMC brand theme for Altair charts."""

View File

@@ -1676,6 +1676,69 @@ def join_voice_and_style_data(
how="inner"
)
def transform_speaking_style_color_correlation(
    joined_df: pl.LazyFrame | pl.DataFrame,
    speaking_styles: dict[str, list[str]],
    target_column: str = "Voice_Scale_Score"
) -> tuple[pl.DataFrame, dict | None]:
    """Aggregate speaking style correlation by color (Green, Blue, Orange, Red).

    Original use-case: "I want to create high-level correlation plots between
    'green, blue, orange, red' speaking styles and the 'voice scale scores'.
    I want to go to one plot with one bar for each color."

    For every trait of a color, the correlation between ``score`` and
    ``target_column`` is computed over the rows whose ``Right_Anchor`` equals
    that trait. The color's value is the plain mean of its usable trait
    correlations.

    A trait is skipped when it has fewer than two non-null rows or when its
    correlation is undefined (``None`` or NaN, e.g. when one of the two
    columns is constant), so a single degenerate trait cannot turn the whole
    color average into NaN. A color with no usable trait is omitted.

    Parameters
    ----------
    joined_df : pl.LazyFrame or pl.DataFrame
        Data from joining speaking style data with target data. Must have
        the columns ``Right_Anchor``, ``score`` and ``target_column``.
        A LazyFrame is collected first.
    speaking_styles : dict
        Dictionary mapping color names to their constituent traits.
        Typically imported from speaking_styles.SPEAKING_STYLES
    target_column : str
        The column to correlate against speaking style scores.
        Default: "Voice_Scale_Score" (for voice scale 1-10)
        Alternative: "Ranking_Points" (for top 3 voice ranking)

    Returns
    -------
    tuple[pl.DataFrame, dict | None]
        (DataFrame with columns [Color, correlation, n_traits], None).
        The columns are present even when no color produced a correlation,
        so the result can always be handed to the plotting code.
    """
    import math

    if isinstance(joined_df, pl.LazyFrame):
        joined_df = joined_df.collect()

    color_correlations = []
    for color, traits in speaking_styles.items():
        trait_corrs = []
        for trait in traits:
            # Rows for this specific trait with both values present.
            valid_data = (
                joined_df
                .filter(pl.col("Right_Anchor") == trait)
                .select(["score", target_column])
                .drop_nulls()
            )
            # A correlation needs at least two observations.
            if valid_data.height < 2:
                continue

            corr_val = valid_data.select(pl.corr("score", target_column)).item()
            # Drop undefined correlations; NaN would otherwise propagate
            # through the mean below.
            if corr_val is None or math.isnan(corr_val):
                continue
            trait_corrs.append(corr_val)

        # Average across all usable traits for this color.
        if trait_corrs:
            color_correlations.append({
                "Color": color,
                "correlation": sum(trait_corrs) / len(trait_corrs),
                "n_traits": len(trait_corrs),
            })

    # An explicit schema keeps the three columns present for an empty result.
    result_df = pl.DataFrame(
        color_correlations,
        schema={
            "Color": pl.Utf8,
            "correlation": pl.Float64,
            "n_traits": pl.Int64,
        },
    )
    return result_df, None
def process_voice_ranking_data(
df: Union[pl.LazyFrame, pl.DataFrame]
) -> pl.DataFrame: