voice gender split correlation plots

This commit is contained in:
2026-02-04 13:44:51 +01:00
parent ab78276a97
commit fc76bb0ab5
3 changed files with 360 additions and 81 deletions

View File

@@ -573,41 +573,24 @@ joined_scale_female = joined_scale.filter(pl.col("Voice").is_in(FEMALE_VOICES))
joined_ranking_male = joined_ranking.filter(pl.col("Voice").is_in(MALE_VOICES))
joined_ranking_female = joined_ranking.filter(pl.col("Voice").is_in(FEMALE_VOICES))
# Colors vs Scale 1-10 (Male voices only)
color_corr_scale_male, _ = utils.transform_speaking_style_color_correlation(joined_scale_male, SPEAKING_STYLES)
S.plot_speaking_style_color_correlation(
data=color_corr_scale_male,
title="Correlation: Speaking Style Colors and Voice Scale 1-10 (Male Voices Only)"
# Colors vs Scale 1-10 (grouped by voice gender)
S.plot_speaking_style_color_correlation_by_gender(
data_male=joined_scale_male,
data_female=joined_scale_female,
speaking_styles=SPEAKING_STYLES,
target_column="Voice_Scale_Score",
title="Correlation: Speaking Style Colors and Voice Scale 1-10 (by Voice Gender)",
filename="correlation_speaking_style_and_voice_scale_1-10_by_voice_gender_color",
)
# Colors vs Scale 1-10 (Female voices only)
color_corr_scale_female, _ = utils.transform_speaking_style_color_correlation(joined_scale_female, SPEAKING_STYLES)
S.plot_speaking_style_color_correlation(
data=color_corr_scale_female,
title="Correlation: Speaking Style Colors and Voice Scale 1-10 (Female Voices Only)"
)
# %%
# Colors vs Ranking Points (Male voices only)
color_corr_ranking_male, _ = utils.transform_speaking_style_color_correlation(
joined_ranking_male,
SPEAKING_STYLES,
target_column="Ranking_Points"
)
S.plot_speaking_style_color_correlation(
data=color_corr_ranking_male,
title="Correlation: Speaking Style Colors and Voice Ranking Points (Male Voices Only)"
)
# Colors vs Ranking Points (Female voices only)
color_corr_ranking_female, _ = utils.transform_speaking_style_color_correlation(
joined_ranking_female,
SPEAKING_STYLES,
target_column="Ranking_Points"
)
S.plot_speaking_style_color_correlation(
data=color_corr_ranking_female,
title="Correlation: Speaking Style Colors and Voice Ranking Points (Female Voices Only)"
# Colors vs Ranking Points (grouped by voice gender)
S.plot_speaking_style_color_correlation_by_gender(
data_male=joined_ranking_male,
data_female=joined_ranking_female,
speaking_styles=SPEAKING_STYLES,
target_column="Ranking_Points",
title="Correlation: Speaking Style Colors and Voice Ranking Points (by Voice Gender)",
filename="correlation_speaking_style_and_voice_ranking_points_by_voice_gender_color",
)
# %%
@@ -659,15 +642,17 @@ for _style, _traits in SPEAKING_STYLES.items():
mo.md(_content)
# %%
# Individual Traits vs Scale 1-10 (Male voices only)
_content = """### Individual Traits vs Scale 1-10 (Male Voices Only)\n\n"""
# Individual Traits vs Scale 1-10 (grouped by voice gender)
_content = """### Individual Traits vs Scale 1-10 (by Voice Gender)\n\n"""
for _style, _traits in SPEAKING_STYLES.items():
_fig = S.plot_speaking_style_scale_correlation(
data=joined_scale_male,
_fig = S.plot_speaking_style_scale_correlation_by_gender(
data_male=joined_scale_male,
data_female=joined_scale_female,
style_color=_style,
style_traits=_traits,
title=f"Correlation: Speaking Style {_style} and Voice Scale 1-10 (Male Voices Only)",
title=f"Correlation: Speaking Style {_style} and Voice Scale 1-10 (by Voice Gender)",
filename=f"correlation_speaking_style_and_voice_scale_1-10_by_voice_gender_{_style.lower()}",
)
_content += f"""
#### Speaking Style **{_style}**:
@@ -678,53 +663,17 @@ for _style, _traits in SPEAKING_STYLES.items():
mo.md(_content)
# %%
# Individual Traits vs Scale 1-10 (Female voices only)
_content = """### Individual Traits vs Scale 1-10 (Female Voices Only)\n\n"""
# Individual Traits vs Ranking Points (grouped by voice gender)
_content = """### Individual Traits vs Ranking Points (by Voice Gender)\n\n"""
for _style, _traits in SPEAKING_STYLES.items():
_fig = S.plot_speaking_style_scale_correlation(
data=joined_scale_female,
_fig = S.plot_speaking_style_ranking_correlation_by_gender(
data_male=joined_ranking_male,
data_female=joined_ranking_female,
style_color=_style,
style_traits=_traits,
title=f"Correlation: Speaking Style {_style} and Voice Scale 1-10 (Female Voices Only)",
)
_content += f"""
#### Speaking Style **{_style}**:
{mo.ui.altair_chart(_fig)}
"""
mo.md(_content)
# %%
# Individual Traits vs Ranking Points (Male voices only)
_content = """### Individual Traits vs Ranking Points (Male Voices Only)\n\n"""
for _style, _traits in SPEAKING_STYLES.items():
_fig = S.plot_speaking_style_ranking_correlation(
data=joined_ranking_male,
style_color=_style,
style_traits=_traits,
title=f"Correlation: Speaking Style {_style} and Voice Ranking Points (Male Voices Only)",
)
_content += f"""
#### Speaking Style **{_style}**:
{mo.ui.altair_chart(_fig)}
"""
mo.md(_content)
# %%
# Individual Traits vs Ranking Points (Female voices only)
_content = """### Individual Traits vs Ranking Points (Female Voices Only)\n\n"""
for _style, _traits in SPEAKING_STYLES.items():
_fig = S.plot_speaking_style_ranking_correlation(
data=joined_ranking_female,
style_color=_style,
style_traits=_traits,
title=f"Correlation: Speaking Style {_style} and Voice Ranking Points (Female Voices Only)",
title=f"Correlation: Speaking Style {_style} and Voice Ranking Points (by Voice Gender)",
filename=f"correlation_speaking_style_and_voice_ranking_points_by_voice_gender_{_style.lower()}",
)
_content += f"""
#### Speaking Style **{_style}**:

323
plots.py
View File

@@ -1256,6 +1256,237 @@ class QualtricsPlotsMixin:
chart = self._save_plot(chart, title, filename=filename)
return chart
def _create_gender_correlation_legend(self) -> alt.Chart:
    """Build the shared legend strip for the male/female correlation charts.

    The strip is a layered chart of four 12x12 colour swatches followed by
    two text labels, intended to be stacked under a main chart:

        [■][■] Male     [■][■] Female

    Swatch colours come straight from ``ColorPalette.CORR_*`` and are used
    as literal colour values (the colour scale is disabled).

    Returns:
        Layered chart (swatches + labels) sized 200x20.
    """
    def _legend_x() -> alt.X:
        # Both layers use the same hidden quantitative axis over [0, 9] so
        # that swatches and labels line up horizontally.
        return alt.X('x:Q', axis=None, scale=alt.Scale(domain=[0, 9]))

    # Male swatches sit at x=0 and x=1, female swatches at x=5 and x=6;
    # the gap in between leaves room for the "Male" label.
    swatch_frame = pd.DataFrame({
        "x": [0, 1, 5, 6],
        "color": [
            ColorPalette.CORR_MALE_POSITIVE,
            ColorPalette.CORR_MALE_NEGATIVE,
            ColorPalette.CORR_FEMALE_POSITIVE,
            ColorPalette.CORR_FEMALE_NEGATIVE,
        ],
    })
    swatches = (
        alt.Chart(swatch_frame)
        .mark_rect(width=12, height=12)
        .encode(
            x=_legend_x(),
            y=alt.value(6),
            color=alt.Color('color:N', scale=None),
        )
    )

    # Each label starts just to the right of its pair of swatches.
    caption_frame = pd.DataFrame({
        "x": [2.3, 7.3],
        "label": ["Male", "Female"],
    })
    captions = (
        alt.Chart(caption_frame)
        .mark_text(align='left', baseline='middle', fontSize=11)
        .encode(
            x=_legend_x(),
            y=alt.value(6),
            text='label:N',
        )
    )

    return (swatches + captions).properties(width=200, height=20)
def plot_speaking_style_scale_correlation_by_gender(
    self,
    style_color: str,
    style_traits: list[str],
    data_male: pl.LazyFrame | pl.DataFrame,
    data_female: pl.LazyFrame | pl.DataFrame,
    title: str | None = None,
    filename: str | None = None,
    width: int | str | None = None,
    height: int | None = None,
) -> alt.Chart:
    """Plot per-trait correlation of trait score vs. voice scale score, split by voice gender.

    For every trait in ``style_traits`` and for each gender, the rows whose
    ``Right_Anchor`` equals the trait are selected and the correlation
    between ``score`` and ``Voice_Scale_Score`` is computed on the non-null
    pairs. Trait/gender combinations with fewer than two valid pairs are
    omitted. An undefined correlation (``None`` or NaN, e.g. when one of the
    columns is constant) is plotted as 0.0.

    Args:
        style_color: The speaking style color (e.g., "Green", "Blue"); used
            in the default title and in the "no data" placeholder.
        style_traits: Traits belonging to this style. A ``|`` in a trait
            name is replaced by a newline for display.
        data_male: Data filtered to male voices only.
        data_female: Data filtered to female voices only.
        title: Chart title; a default mentioning ``style_color`` is used
            when ``None``.
        filename: Optional explicit filename forwarded to ``_save_plot``.
        width: Chart width; falls back to 800 when not given.
        height: Chart height; falls back to 350 when not given.

    Returns:
        The grouped bar chart (male/female bars per trait) stacked above the
        custom gender legend, as returned by ``_save_plot``. If no
        trait/gender combination has enough data, a text-only placeholder
        chart is returned instead and nothing is saved.
    """
    import math

    # Order matters: for each trait the male row is emitted before the female row.
    frames_by_gender = (
        ("Male", self._ensure_dataframe(data_male)),
        ("Female", self._ensure_dataframe(data_female)),
    )

    if title is None:
        title = f"Speaking style {style_color} and voice scale 1-10 correlations (by Voice Gender)"

    trait_correlations = []
    for trait in style_traits:
        trait_display = trait.replace('|', '\n')
        for gender, frame in frames_by_gender:
            valid = (
                frame.filter(pl.col("Right_Anchor") == trait)
                .select(["score", "Voice_Scale_Score"])
                .drop_nulls()
            )
            # A correlation needs at least two observations.
            if valid.height <= 1:
                continue

            corr_val = valid.select(pl.corr("score", "Voice_Scale_Score")).item()
            # Treat NaN like None: NaN fails `>= 0`, so it would otherwise be
            # tagged as a negative correlation and plotted as a null value.
            if corr_val is None or math.isnan(corr_val):
                corr_val = 0.0

            trait_correlations.append({
                "trait_display": trait_display,
                "Gender": gender,
                "correlation": corr_val,
                "color_key": f"{gender}_Pos" if corr_val >= 0 else f"{gender}_Neg",
            })

    if not trait_correlations:
        return alt.Chart(pd.DataFrame({'text': [f"No data for {style_color} Style"]})).mark_text().encode(text='text:N')

    plot_df = pl.DataFrame(trait_correlations).to_pandas()

    main_chart = alt.Chart(plot_df).mark_bar().encode(
        x=alt.X('trait_display:N', title=None, axis=alt.Axis(labelAngle=0, grid=False)),
        # xOffset places the male and female bars side by side within each trait.
        xOffset='Gender:N',
        y=alt.Y('correlation:Q', title='Correlation', scale=alt.Scale(domain=[-1, 1]), axis=alt.Axis(grid=True)),
        color=alt.Color(
            'color_key:N',
            scale=alt.Scale(
                domain=['Male_Pos', 'Female_Pos', 'Male_Neg', 'Female_Neg'],
                range=[
                    ColorPalette.CORR_MALE_POSITIVE,
                    ColorPalette.CORR_FEMALE_POSITIVE,
                    ColorPalette.CORR_MALE_NEGATIVE,
                    ColorPalette.CORR_FEMALE_NEGATIVE,
                ],
            ),
            # The built-in legend is replaced by the custom one added below.
            legend=None,
        ),
        tooltip=[
            alt.Tooltip('trait_display:N', title='Trait'),
            alt.Tooltip('Gender:N'),
            alt.Tooltip('correlation:Q', format='.3f'),
        ],
    ).properties(
        title=self._process_title(title),
        width=width or 800,
        height=height or 350,
    )

    # Add custom legend below the chart; independent colour scales keep the
    # legend's literal swatch colours from merging with the bar colour scale.
    legend = self._create_gender_correlation_legend()
    chart = alt.vconcat(main_chart, legend, spacing=10).resolve_scale(color='independent')

    chart = self._save_plot(chart, title, filename=filename)
    return chart
def plot_speaking_style_ranking_correlation_by_gender(
    self,
    style_color: str,
    style_traits: list[str],
    data_male: pl.LazyFrame | pl.DataFrame,
    data_female: pl.LazyFrame | pl.DataFrame,
    title: str | None = None,
    filename: str | None = None,
    width: int | str | None = None,
    height: int | None = None,
) -> alt.Chart:
    """Plot per-trait correlation of trait score vs. voice ranking points, split by voice gender.

    For every trait in ``style_traits`` and for each gender, the rows whose
    ``Right_Anchor`` equals the trait are selected and the correlation
    between ``score`` and ``Ranking_Points`` is computed on the non-null
    pairs. Trait/gender combinations with fewer than two valid pairs are
    omitted. An undefined correlation (``None`` or NaN, e.g. when one of the
    columns is constant) is plotted as 0.0.

    Args:
        style_color: The speaking style color (e.g., "Green", "Blue"); used
            in the default title and in the "no data" placeholder.
        style_traits: Traits belonging to this style. A ``|`` in a trait
            name is replaced by a newline for display.
        data_male: Data filtered to male voices only.
        data_female: Data filtered to female voices only.
        title: Chart title; a default mentioning ``style_color`` is used
            when ``None``.
        filename: Optional explicit filename forwarded to ``_save_plot``.
        width: Chart width; falls back to 800 when not given.
        height: Chart height; falls back to 350 when not given.

    Returns:
        The grouped bar chart (male/female bars per trait) stacked above the
        custom gender legend, as returned by ``_save_plot``. If no
        trait/gender combination has enough data, a text-only placeholder
        chart is returned instead and nothing is saved.
    """
    import math

    # Order matters: for each trait the male row is emitted before the female row.
    frames_by_gender = (
        ("Male", self._ensure_dataframe(data_male)),
        ("Female", self._ensure_dataframe(data_female)),
    )

    if title is None:
        title = f"Speaking style {style_color} and voice ranking points correlations (by Voice Gender)"

    trait_correlations = []
    for trait in style_traits:
        trait_display = trait.replace('|', '\n')
        for gender, frame in frames_by_gender:
            valid = (
                frame.filter(pl.col("Right_Anchor") == trait)
                .select(["score", "Ranking_Points"])
                .drop_nulls()
            )
            # A correlation needs at least two observations.
            if valid.height <= 1:
                continue

            corr_val = valid.select(pl.corr("score", "Ranking_Points")).item()
            # Treat NaN like None: NaN fails `>= 0`, so it would otherwise be
            # tagged as a negative correlation and plotted as a null value.
            if corr_val is None or math.isnan(corr_val):
                corr_val = 0.0

            trait_correlations.append({
                "trait_display": trait_display,
                "Gender": gender,
                "correlation": corr_val,
                "color_key": f"{gender}_Pos" if corr_val >= 0 else f"{gender}_Neg",
            })

    if not trait_correlations:
        return alt.Chart(pd.DataFrame({'text': [f"No data for {style_color} Style"]})).mark_text().encode(text='text:N')

    plot_df = pl.DataFrame(trait_correlations).to_pandas()

    main_chart = alt.Chart(plot_df).mark_bar().encode(
        x=alt.X('trait_display:N', title='Speaking Style Trait', axis=alt.Axis(labelAngle=0, grid=False)),
        # xOffset places the male and female bars side by side within each trait.
        xOffset='Gender:N',
        y=alt.Y('correlation:Q', title='Correlation', scale=alt.Scale(domain=[-1, 1]), axis=alt.Axis(grid=True)),
        color=alt.Color(
            'color_key:N',
            scale=alt.Scale(
                domain=['Male_Pos', 'Female_Pos', 'Male_Neg', 'Female_Neg'],
                range=[
                    ColorPalette.CORR_MALE_POSITIVE,
                    ColorPalette.CORR_FEMALE_POSITIVE,
                    ColorPalette.CORR_MALE_NEGATIVE,
                    ColorPalette.CORR_FEMALE_NEGATIVE,
                ],
            ),
            # The built-in legend is replaced by the custom one added below.
            legend=None,
        ),
        tooltip=[
            alt.Tooltip('trait_display:N', title='Trait'),
            alt.Tooltip('Gender:N'),
            alt.Tooltip('correlation:Q', format='.3f'),
        ],
    ).properties(
        title=self._process_title(title),
        width=width or 800,
        height=height or 350,
    )

    # Add custom legend below the chart; independent colour scales keep the
    # legend's literal swatch colours from merging with the bar colour scale.
    legend = self._create_gender_correlation_legend()
    chart = alt.vconcat(main_chart, legend, spacing=10).resolve_scale(color='independent')

    chart = self._save_plot(chart, title, filename=filename)
    return chart
def plot_speaking_style_color_correlation(
self,
data: pl.LazyFrame | pl.DataFrame | None = None,
@@ -1313,6 +1544,98 @@ class QualtricsPlotsMixin:
chart = self._save_plot(chart, title, filename=filename)
return chart
def plot_speaking_style_color_correlation_by_gender(
    self,
    data_male: pl.LazyFrame | pl.DataFrame,
    data_female: pl.LazyFrame | pl.DataFrame,
    speaking_styles: dict[str, list[str]],
    target_column: str = "Voice_Scale_Score",
    title: str = "Speaking Style Colors Correlation (by Voice Gender)",
    filename: str | None = None,
    width: int | str | None = None,
    height: int | None = None,
) -> alt.Chart:
    """Plot the per-color correlation as grouped bars, one bar per voice gender.

    The correlations themselves are produced by
    ``utils.transform_speaking_style_color_correlation``, called once for the
    male data and once for the female data. Each result is tagged with a
    ``Gender`` column and a ``color_key`` column (``<Gender>_Pos`` when the
    correlation is >= 0, otherwise ``<Gender>_Neg``), and the two tagged
    frames are concatenated for plotting.

    Args:
        data_male: Data filtered to male voices only.
        data_female: Data filtered to female voices only.
        speaking_styles: Mapping of color name to its constituent traits;
            forwarded unchanged to the transform helper.
        target_column: Column to correlate against ("Voice_Scale_Score" or
            "Ranking_Points").
        title: Chart title.
        filename: Optional explicit filename forwarded to ``_save_plot``.
        width: Chart width; falls back to 400 when not given.
        height: Chart height; falls back to 350 when not given.

    Returns:
        The grouped bar chart stacked above the custom gender legend, as
        returned by ``_save_plot``.
    """
    import utils

    def _tag_with_gender(corr_frame, gender):
        # Adds the gender label plus the sign-dependent colour key.
        sign_key = (
            pl.when(pl.col("correlation") >= 0)
            .then(pl.lit(f"{gender}_Pos"))
            .otherwise(pl.lit(f"{gender}_Neg"))
            .alias("color_key")
        )
        return corr_frame.with_columns([pl.lit(gender).alias("Gender"), sign_key])

    male_frame = self._ensure_dataframe(data_male)
    female_frame = self._ensure_dataframe(data_female)

    # The transform returns a pair; only its first element is plotted here.
    male_corr, _unused_male = utils.transform_speaking_style_color_correlation(
        male_frame, speaking_styles, target_column=target_column
    )
    female_corr, _unused_female = utils.transform_speaking_style_color_correlation(
        female_frame, speaking_styles, target_column=target_column
    )

    combined = pl.concat([
        _tag_with_gender(male_corr, "Male"),
        _tag_with_gender(female_corr, "Female"),
    ])

    x_channel = alt.X(
        'Color:N',
        title='Speaking Style Color',
        axis=alt.Axis(labelAngle=0, grid=False),
        sort=["Green", "Blue", "Orange", "Red"],
    )
    y_channel = alt.Y(
        'correlation:Q',
        title='Average Correlation',
        scale=alt.Scale(domain=[-1, 1]),
        axis=alt.Axis(grid=True),
    )
    color_channel = alt.Color(
        'color_key:N',
        scale=alt.Scale(
            domain=['Male_Pos', 'Female_Pos', 'Male_Neg', 'Female_Neg'],
            range=[
                ColorPalette.CORR_MALE_POSITIVE,
                ColorPalette.CORR_FEMALE_POSITIVE,
                ColorPalette.CORR_MALE_NEGATIVE,
                ColorPalette.CORR_FEMALE_NEGATIVE,
            ],
        ),
        # The built-in legend is suppressed in favour of the custom strip below.
        legend=None,
    )
    tooltip_fields = [
        alt.Tooltip('Color:N', title='Speaking Style'),
        alt.Tooltip('Gender:N'),
        alt.Tooltip('correlation:Q', format='.3f', title='Avg Correlation'),
        alt.Tooltip('n_traits:Q', title='# Traits'),
    ]

    bars = (
        alt.Chart(combined.to_pandas())
        .mark_bar()
        .encode(
            x=x_channel,
            xOffset='Gender:N',
            y=y_channel,
            color=color_channel,
            tooltip=tooltip_fields,
        )
        .properties(
            title=self._process_title(title),
            width=width or 400,
            height=height or 350,
        )
    )

    # Stack the custom legend under the bars with independent colour scales.
    stacked = alt.vconcat(
        bars, self._create_gender_correlation_legend(), spacing=10
    ).resolve_scale(color='independent')

    return self._save_plot(stacked, title, filename=filename)
def plot_demographic_distribution(
self,
column: str,

View File

@@ -77,6 +77,13 @@ class ColorPalette:
GENDER_MALE_NEUTRAL = "#B8C9D9" # Grey-Blue
GENDER_FEMALE_NEUTRAL = "#D9B8C9" # Grey-Pink
# Gender colors for correlation plots (green/red indicate +/- correlation)
# Male = darker shade, Female = lighter shade
CORR_MALE_POSITIVE = "#1B5E20" # Dark Green
CORR_FEMALE_POSITIVE = "#81C784" # Light Green
CORR_MALE_NEGATIVE = "#B71C1C" # Dark Red
CORR_FEMALE_NEGATIVE = "#E57373" # Light Red
# Speaking Style Colors (named after the style quadrant colors)
STYLE_GREEN = "#2E7D32" # Forest Green
STYLE_BLUE = "#1565C0" # Strong Blue