From fc76bb0ab5ed09439fb72817f9ee8b93c570d371 Mon Sep 17 00:00:00 2001 From: Luigi Maiorano Date: Wed, 4 Feb 2026 13:44:51 +0100 Subject: [PATCH] voice gender split correlation plots --- 03_quant_report.script.py | 111 ++++--------- plots.py | 323 ++++++++++++++++++++++++++++++++++++++ theme.py | 7 + 3 files changed, 360 insertions(+), 81 deletions(-) diff --git a/03_quant_report.script.py b/03_quant_report.script.py index 1f7c607..e2200fb 100644 --- a/03_quant_report.script.py +++ b/03_quant_report.script.py @@ -573,41 +573,24 @@ joined_scale_female = joined_scale.filter(pl.col("Voice").is_in(FEMALE_VOICES)) joined_ranking_male = joined_ranking.filter(pl.col("Voice").is_in(MALE_VOICES)) joined_ranking_female = joined_ranking.filter(pl.col("Voice").is_in(FEMALE_VOICES)) -# Colors vs Scale 1-10 (Male voices only) -color_corr_scale_male, _ = utils.transform_speaking_style_color_correlation(joined_scale_male, SPEAKING_STYLES) -S.plot_speaking_style_color_correlation( - data=color_corr_scale_male, - title="Correlation: Speaking Style Colors and Voice Scale 1-10 (Male Voices Only)" +# Colors vs Scale 1-10 (grouped by voice gender) +S.plot_speaking_style_color_correlation_by_gender( + data_male=joined_scale_male, + data_female=joined_scale_female, + speaking_styles=SPEAKING_STYLES, + target_column="Voice_Scale_Score", + title="Correlation: Speaking Style Colors and Voice Scale 1-10 (by Voice Gender)", + filename="correlation_speaking_style_and_voice_scale_1-10_by_voice_gender_color", ) -# Colors vs Scale 1-10 (Female voices only) -color_corr_scale_female, _ = utils.transform_speaking_style_color_correlation(joined_scale_female, SPEAKING_STYLES) -S.plot_speaking_style_color_correlation( - data=color_corr_scale_female, - title="Correlation: Speaking Style Colors and Voice Scale 1-10 (Female Voices Only)" -) - -# %% -# Colors vs Ranking Points (Male voices only) -color_corr_ranking_male, _ = utils.transform_speaking_style_color_correlation( - joined_ranking_male, - SPEAKING_STYLES, - target_column="Ranking_Points" -) -S.plot_speaking_style_color_correlation( - data=color_corr_ranking_male, - title="Correlation: Speaking Style Colors and Voice Ranking Points (Male Voices Only)" -) - -# Colors vs Ranking Points (Female voices only) -color_corr_ranking_female, _ = utils.transform_speaking_style_color_correlation( - joined_ranking_female, - SPEAKING_STYLES, - target_column="Ranking_Points" -) -S.plot_speaking_style_color_correlation( - data=color_corr_ranking_female, - title="Correlation: Speaking Style Colors and Voice Ranking Points (Female Voices Only)" +# Colors vs Ranking Points (grouped by voice gender) +S.plot_speaking_style_color_correlation_by_gender( + data_male=joined_ranking_male, + data_female=joined_ranking_female, + speaking_styles=SPEAKING_STYLES, + target_column="Ranking_Points", + title="Correlation: Speaking Style Colors and Voice Ranking Points (by Voice Gender)", + filename="correlation_speaking_style_and_voice_ranking_points_by_voice_gender_color", ) # %% @@ -659,15 +642,17 @@ for _style, _traits in SPEAKING_STYLES.items(): mo.md(_content) # %% -# Individual Traits vs Scale 1-10 (Male voices only) -_content = """### Individual Traits vs Scale 1-10 (Male Voices Only)\n\n""" +# Individual Traits vs Scale 1-10 (grouped by voice gender) +_content = """### Individual Traits vs Scale 1-10 (by Voice Gender)\n\n""" for _style, _traits in SPEAKING_STYLES.items(): - _fig = S.plot_speaking_style_scale_correlation( - data=joined_scale_male, + _fig = S.plot_speaking_style_scale_correlation_by_gender( + data_male=joined_scale_male, + data_female=joined_scale_female, style_color=_style, style_traits=_traits, - title=f"Correlation: Speaking Style {_style} and Voice Scale 1-10 (Male Voices Only)", + title=f"Correlation: Speaking Style {_style} and Voice Scale 1-10 (by Voice Gender)", + filename=f"correlation_speaking_style_and_voice_scale_1-10_by_voice_gender_{_style.lower()}", ) _content += f""" #### Speaking Style **{_style}**: @@ -678,53 +663,17 @@ for _style, _traits in SPEAKING_STYLES.items(): mo.md(_content) # %% -# Individual Traits vs Scale 1-10 (Female voices only) -_content = """### Individual Traits vs Scale 1-10 (Female Voices Only)\n\n""" +# Individual Traits vs Ranking Points (grouped by voice gender) +_content = """### Individual Traits vs Ranking Points (by Voice Gender)\n\n""" for _style, _traits in SPEAKING_STYLES.items(): - _fig = S.plot_speaking_style_scale_correlation( - data=joined_scale_female, + _fig = S.plot_speaking_style_ranking_correlation_by_gender( + data_male=joined_ranking_male, + data_female=joined_ranking_female, style_color=_style, style_traits=_traits, - title=f"Correlation: Speaking Style {_style} and Voice Scale 1-10 (Female Voices Only)", - ) - _content += f""" -#### Speaking Style **{_style}**: - -{mo.ui.altair_chart(_fig)} - -""" -mo.md(_content) - -# %% -# Individual Traits vs Ranking Points (Male voices only) -_content = """### Individual Traits vs Ranking Points (Male Voices Only)\n\n""" - -for _style, _traits in SPEAKING_STYLES.items(): - _fig = S.plot_speaking_style_ranking_correlation( - data=joined_ranking_male, - style_color=_style, - style_traits=_traits, - title=f"Correlation: Speaking Style {_style} and Voice Ranking Points (Male Voices Only)", - ) - _content += f""" -#### Speaking Style **{_style}**: - -{mo.ui.altair_chart(_fig)} - -""" -mo.md(_content) - -# %% -# Individual Traits vs Ranking Points (Female voices only) -_content = """### Individual Traits vs Ranking Points (Female Voices Only)\n\n""" - -for _style, _traits in SPEAKING_STYLES.items(): - _fig = S.plot_speaking_style_ranking_correlation( - data=joined_ranking_female, - style_color=_style, - style_traits=_traits, - title=f"Correlation: Speaking Style {_style} and Voice Ranking Points (Female Voices Only)", + title=f"Correlation: Speaking Style {_style} and Voice Ranking Points (by Voice Gender)", + filename=f"correlation_speaking_style_and_voice_ranking_points_by_voice_gender_{_style.lower()}", ) _content += f""" #### Speaking Style **{_style}**: diff --git a/plots.py b/plots.py index bf03a9a..a8d19a0 100644 --- a/plots.py +++ b/plots.py @@ -1256,6 +1256,237 @@ class QualtricsPlotsMixin: chart = self._save_plot(chart, title, filename=filename) return chart + def _create_gender_correlation_legend(self) -> alt.Chart: + """Create a custom legend for gender correlation plots with dual-color swatches. + + Horizontal layout below the chart: + [■][■] Male [■][■] Female + """ + # Horizontal layout: Male at x=0-2, Female at x=5-7 (gap for whitespace) + legend_data = pd.DataFrame([ + {"x": 0, "color": ColorPalette.CORR_MALE_POSITIVE}, + {"x": 1, "color": ColorPalette.CORR_MALE_NEGATIVE}, + {"x": 5, "color": ColorPalette.CORR_FEMALE_POSITIVE}, + {"x": 6, "color": ColorPalette.CORR_FEMALE_NEGATIVE}, + ]) + + # Color blocks + blocks = alt.Chart(legend_data).mark_rect(width=12, height=12).encode( + x=alt.X('x:Q', axis=None, scale=alt.Scale(domain=[0, 9])), + y=alt.value(6), + color=alt.Color('color:N', scale=None), + ) + + # Labels positioned after each pair of blocks + label_data = pd.DataFrame([ + {"x": 2.3, "label": "Male"}, + {"x": 7.3, "label": "Female"}, + ]) + labels = alt.Chart(label_data).mark_text(align='left', baseline='middle', fontSize=11).encode( + x=alt.X('x:Q', axis=None, scale=alt.Scale(domain=[0, 9])), + y=alt.value(6), + text='label:N' + ) + + legend = (blocks + labels).properties(width=200, height=20) + return legend + + def plot_speaking_style_scale_correlation_by_gender( + self, + style_color: str, + style_traits: list[str], + data_male: pl.LazyFrame | pl.DataFrame, + data_female: pl.LazyFrame | pl.DataFrame, + title: str | None = None, + filename: str | None = None, + width: int | str | None = None, + height: int | None = None, + ) -> alt.Chart: + """Plots correlation between Speaking Style Trait Scores and Voice Scale, + with grouped bars comparing male vs female voices. + + Args: + style_color: The speaking style color (e.g., "Green", "Blue") + style_traits: List of traits for this style + data_male: DataFrame filtered to male voices only + data_female: DataFrame filtered to female voices only + title: Chart title + filename: Optional explicit filename for saving + width: Chart width in pixels + height: Chart height in pixels + + Returns: + Altair chart with grouped bars (male/female) per trait + """ + df_male = self._ensure_dataframe(data_male) + df_female = self._ensure_dataframe(data_female) + + if title is None: + title = f"Speaking style {style_color} and voice scale 1-10 correlations (by Voice Gender)" + + trait_correlations = [] + + for i, trait in enumerate(style_traits): + trait_display = trait.replace('|', '\n') + + # Male correlation + subset_m = df_male.filter(pl.col("Right_Anchor") == trait) + valid_m = subset_m.select(["score", "Voice_Scale_Score"]).drop_nulls() + if valid_m.height > 1: + corr_m = valid_m.select(pl.corr("score", "Voice_Scale_Score")).item() + corr_val = corr_m if corr_m is not None else 0.0 + trait_correlations.append({ + "trait_display": trait_display, + "Gender": "Male", + "correlation": corr_val, + "color_key": "Male_Pos" if corr_val >= 0 else "Male_Neg" + }) + + # Female correlation + subset_f = df_female.filter(pl.col("Right_Anchor") == trait) + valid_f = subset_f.select(["score", "Voice_Scale_Score"]).drop_nulls() + if valid_f.height > 1: + corr_f = valid_f.select(pl.corr("score", "Voice_Scale_Score")).item() + corr_val = corr_f if corr_f is not None else 0.0 + trait_correlations.append({ + "trait_display": trait_display, + "Gender": "Female", + "correlation": corr_val, + "color_key": "Female_Pos" if corr_val >= 0 else "Female_Neg" + }) + + if not trait_correlations: + return alt.Chart(pd.DataFrame({'text': [f"No data for {style_color} Style"]})).mark_text().encode(text='text:N') + + plot_df = pl.DataFrame(trait_correlations).to_pandas() + + main_chart = alt.Chart(plot_df).mark_bar().encode( + x=alt.X('trait_display:N', title=None, axis=alt.Axis(labelAngle=0, grid=False)), + xOffset='Gender:N', + y=alt.Y('correlation:Q', title='Correlation', scale=alt.Scale(domain=[-1, 1]), axis=alt.Axis(grid=True)), + color=alt.Color('color_key:N', + scale=alt.Scale( + domain=['Male_Pos', 'Female_Pos', 'Male_Neg', 'Female_Neg'], + range=[ColorPalette.CORR_MALE_POSITIVE, ColorPalette.CORR_FEMALE_POSITIVE, + ColorPalette.CORR_MALE_NEGATIVE, ColorPalette.CORR_FEMALE_NEGATIVE] + ), + legend=None), + tooltip=[ + alt.Tooltip('trait_display:N', title='Trait'), + alt.Tooltip('Gender:N'), + alt.Tooltip('correlation:Q', format='.3f') + ] + ).properties( + title=self._process_title(title), + width=width or 800, + height=height or 350 + ) + + # Add custom legend below the chart + legend = self._create_gender_correlation_legend() + chart = alt.vconcat(main_chart, legend, spacing=10).resolve_scale(color='independent') + + chart = self._save_plot(chart, title, filename=filename) + return chart + + def plot_speaking_style_ranking_correlation_by_gender( + self, + style_color: str, + style_traits: list[str], + data_male: pl.LazyFrame | pl.DataFrame, + data_female: pl.LazyFrame | pl.DataFrame, + title: str | None = None, + filename: str | None = None, + width: int | str | None = None, + height: int | None = None, + ) -> alt.Chart: + """Plots correlation between Speaking Style Trait Scores and Voice Ranking Points, + with grouped bars comparing male vs female voices. + + Args: + style_color: The speaking style color (e.g., "Green", "Blue") + style_traits: List of traits for this style + data_male: DataFrame filtered to male voices only + data_female: DataFrame filtered to female voices only + title: Chart title + filename: Optional explicit filename for saving + width: Chart width in pixels + height: Chart height in pixels + + Returns: + Altair chart with grouped bars (male/female) per trait + """ + df_male = self._ensure_dataframe(data_male) + df_female = self._ensure_dataframe(data_female) + + if title is None: + title = f"Speaking style {style_color} and voice ranking points correlations (by Voice Gender)" + + trait_correlations = [] + + for i, trait in enumerate(style_traits): + trait_display = trait.replace('|', '\n') + + # Male correlation + subset_m = df_male.filter(pl.col("Right_Anchor") == trait) + valid_m = subset_m.select(["score", "Ranking_Points"]).drop_nulls() + if valid_m.height > 1: + corr_m = valid_m.select(pl.corr("score", "Ranking_Points")).item() + corr_val = corr_m if corr_m is not None else 0.0 + trait_correlations.append({ + "trait_display": trait_display, + "Gender": "Male", + "correlation": corr_val, + "color_key": "Male_Pos" if corr_val >= 0 else "Male_Neg" + }) + + # Female correlation + subset_f = df_female.filter(pl.col("Right_Anchor") == trait) + valid_f = subset_f.select(["score", "Ranking_Points"]).drop_nulls() + if valid_f.height > 1: + corr_f = valid_f.select(pl.corr("score", "Ranking_Points")).item() + corr_val = corr_f if corr_f is not None else 0.0 + trait_correlations.append({ + "trait_display": trait_display, + "Gender": "Female", + "correlation": corr_val, + "color_key": "Female_Pos" if corr_val >= 0 else "Female_Neg" + }) + + if not trait_correlations: + return alt.Chart(pd.DataFrame({'text': [f"No data for {style_color} Style"]})).mark_text().encode(text='text:N') + + plot_df = pl.DataFrame(trait_correlations).to_pandas() + + main_chart = alt.Chart(plot_df).mark_bar().encode( + x=alt.X('trait_display:N', title='Speaking Style Trait', axis=alt.Axis(labelAngle=0, grid=False)), + xOffset='Gender:N', + y=alt.Y('correlation:Q', title='Correlation', scale=alt.Scale(domain=[-1, 1]), axis=alt.Axis(grid=True)), + color=alt.Color('color_key:N', + scale=alt.Scale( + domain=['Male_Pos', 'Female_Pos', 'Male_Neg', 'Female_Neg'], + range=[ColorPalette.CORR_MALE_POSITIVE, ColorPalette.CORR_FEMALE_POSITIVE, + ColorPalette.CORR_MALE_NEGATIVE, ColorPalette.CORR_FEMALE_NEGATIVE] + ), + legend=None), + tooltip=[ + alt.Tooltip('trait_display:N', title='Trait'), + alt.Tooltip('Gender:N'), + alt.Tooltip('correlation:Q', format='.3f') + ] + ).properties( + title=self._process_title(title), + width=width or 800, + height=height or 350 + ) + + # Add custom legend below the chart + legend = self._create_gender_correlation_legend() + chart = alt.vconcat(main_chart, legend, spacing=10).resolve_scale(color='independent') + + chart = self._save_plot(chart, title, filename=filename) + return chart + def plot_speaking_style_color_correlation( self, data: pl.LazyFrame | pl.DataFrame | None = None, @@ -1313,6 +1544,98 @@ class QualtricsPlotsMixin: chart = self._save_plot(chart, title, filename=filename) return chart + def plot_speaking_style_color_correlation_by_gender( + self, + data_male: pl.LazyFrame | pl.DataFrame, + data_female: pl.LazyFrame | pl.DataFrame, + speaking_styles: dict[str, list[str]], + target_column: str = "Voice_Scale_Score", + title: str = "Speaking Style Colors Correlation (by Voice Gender)", + filename: str | None = None, + width: int | str | None = None, + height: int | None = None, + ) -> alt.Chart: + """Plot correlation by speaking style color with grouped bars for male vs female voices. + + Args: + data_male: DataFrame filtered to male voices only + data_female: DataFrame filtered to female voices only + speaking_styles: Dictionary mapping color names to their constituent traits + target_column: The column to correlate against ("Voice_Scale_Score" or "Ranking_Points") + title: Chart title + filename: Optional explicit filename for saving + width: Chart width in pixels + height: Chart height in pixels + + Returns: + Altair chart with grouped bars (male/female) per color + """ + import utils + + df_male = self._ensure_dataframe(data_male) + df_female = self._ensure_dataframe(data_female) + + # Get correlations for each gender + color_corr_male, _ = utils.transform_speaking_style_color_correlation( + df_male, speaking_styles, target_column=target_column + ) + color_corr_female, _ = utils.transform_speaking_style_color_correlation( + df_female, speaking_styles, target_column=target_column + ) + + # Add gender column and color_key based on correlation sign + color_corr_male = color_corr_male.with_columns([ + pl.lit("Male").alias("Gender"), + pl.when(pl.col("correlation") >= 0) + .then(pl.lit("Male_Pos")) + .otherwise(pl.lit("Male_Neg")) + .alias("color_key") + ]) + color_corr_female = color_corr_female.with_columns([ + pl.lit("Female").alias("Gender"), + pl.when(pl.col("correlation") >= 0) + .then(pl.lit("Female_Pos")) + .otherwise(pl.lit("Female_Neg")) + .alias("color_key") + ]) + combined = pl.concat([color_corr_male, color_corr_female]) + + main_chart = alt.Chart(combined.to_pandas()).mark_bar().encode( + x=alt.X('Color:N', + title='Speaking Style Color', + axis=alt.Axis(labelAngle=0, grid=False), + sort=["Green", "Blue", "Orange", "Red"]), + xOffset='Gender:N', + y=alt.Y('correlation:Q', + title='Average Correlation', + scale=alt.Scale(domain=[-1, 1]), + axis=alt.Axis(grid=True)), + color=alt.Color('color_key:N', + scale=alt.Scale( + domain=['Male_Pos', 'Female_Pos', 'Male_Neg', 'Female_Neg'], + range=[ColorPalette.CORR_MALE_POSITIVE, ColorPalette.CORR_FEMALE_POSITIVE, + ColorPalette.CORR_MALE_NEGATIVE, ColorPalette.CORR_FEMALE_NEGATIVE] + ), + legend=None), + tooltip=[ + alt.Tooltip('Color:N', title='Speaking Style'), + alt.Tooltip('Gender:N'), + alt.Tooltip('correlation:Q', format='.3f', title='Avg Correlation'), + alt.Tooltip('n_traits:Q', title='# Traits') + ] + ).properties( + title=self._process_title(title), + width=width or 400, + height=height or 350 + ) + + # Add custom legend below the chart + legend = self._create_gender_correlation_legend() + chart = alt.vconcat(main_chart, legend, spacing=10).resolve_scale(color='independent') + + chart = self._save_plot(chart, title, filename=filename) + return chart + def plot_demographic_distribution( self, column: str, diff --git a/theme.py b/theme.py index 9ad9914..0324b23 100644 --- a/theme.py +++ b/theme.py @@ -77,6 +77,13 @@ class ColorPalette: GENDER_MALE_NEUTRAL = "#B8C9D9" # Grey-Blue GENDER_FEMALE_NEUTRAL = "#D9B8C9" # Grey-Pink + # Gender colors for correlation plots (green/red indicate +/- correlation) + # Male = darker shade, Female = lighter shade + CORR_MALE_POSITIVE = "#1B5E20" # Dark Green + CORR_FEMALE_POSITIVE = "#81C784" # Light Green + CORR_MALE_NEGATIVE = "#B71C1C" # Dark Red + CORR_FEMALE_NEGATIVE = "#E57373" # Light Red + # Speaking Style Colors (named after the style quadrant colors) STYLE_GREEN = "#2E7D32" # Forest Green STYLE_BLUE = "#1565C0" # Strong Blue