From fc76bb0ab5ed09439fb72817f9ee8b93c570d371 Mon Sep 17 00:00:00 2001
From: Luigi Maiorano <luigi.maiorano@qumo.io>
Date: Wed, 4 Feb 2026 13:44:51 +0100
Subject: [PATCH] voice gender split correlation plots

---
 03_quant_report.script.py | 111 ++++---------
 plots.py                  | 323 ++++++++++++++++++++++++++++++++++++++
 theme.py                  |   7 +
 3 files changed, 360 insertions(+), 81 deletions(-)

diff --git a/03_quant_report.script.py b/03_quant_report.script.py
index 1f7c607..e2200fb 100644
--- a/03_quant_report.script.py
+++ b/03_quant_report.script.py
@@ -573,41 +573,24 @@ joined_scale_female = joined_scale.filter(pl.col("Voice").is_in(FEMALE_VOICES))
 joined_ranking_male = joined_ranking.filter(pl.col("Voice").is_in(MALE_VOICES))
 joined_ranking_female = joined_ranking.filter(pl.col("Voice").is_in(FEMALE_VOICES))
 
-# Colors vs Scale 1-10 (Male voices only)
-color_corr_scale_male, _ = utils.transform_speaking_style_color_correlation(joined_scale_male, SPEAKING_STYLES)
-S.plot_speaking_style_color_correlation(
-    data=color_corr_scale_male,
-    title="Correlation: Speaking Style Colors and Voice Scale 1-10 (Male Voices Only)"
+# Colors vs Scale 1-10 (grouped by voice gender)
+S.plot_speaking_style_color_correlation_by_gender(
+    data_male=joined_scale_male,
+    data_female=joined_scale_female,
+    speaking_styles=SPEAKING_STYLES,
+    target_column="Voice_Scale_Score",
+    title="Correlation: Speaking Style Colors and Voice Scale 1-10 (by Voice Gender)",
+    filename="correlation_speaking_style_and_voice_scale_1-10_by_voice_gender_color",
 )
 
-# Colors vs Scale 1-10 (Female voices only)
-color_corr_scale_female, _ = utils.transform_speaking_style_color_correlation(joined_scale_female, SPEAKING_STYLES)
-S.plot_speaking_style_color_correlation(
-    data=color_corr_scale_female,
-    title="Correlation: Speaking Style Colors and Voice Scale 1-10 (Female Voices Only)"
-)
-
-# %%
-# Colors vs Ranking Points (Male voices only)
-color_corr_ranking_male, _ = utils.transform_speaking_style_color_correlation(
-    joined_ranking_male, 
-    SPEAKING_STYLES, 
-    target_column="Ranking_Points"
-)
-S.plot_speaking_style_color_correlation(
-    data=color_corr_ranking_male,
-    title="Correlation: Speaking Style Colors and Voice Ranking Points (Male Voices Only)"
-)
-
-# Colors vs Ranking Points (Female voices only)
-color_corr_ranking_female, _ = utils.transform_speaking_style_color_correlation(
-    joined_ranking_female, 
-    SPEAKING_STYLES, 
-    target_column="Ranking_Points"
-)
-S.plot_speaking_style_color_correlation(
-    data=color_corr_ranking_female,
-    title="Correlation: Speaking Style Colors and Voice Ranking Points (Female Voices Only)"
+# Colors vs Ranking Points (grouped by voice gender)
+S.plot_speaking_style_color_correlation_by_gender(
+    data_male=joined_ranking_male,
+    data_female=joined_ranking_female,
+    speaking_styles=SPEAKING_STYLES,
+    target_column="Ranking_Points",
+    title="Correlation: Speaking Style Colors and Voice Ranking Points (by Voice Gender)",
+    filename="correlation_speaking_style_and_voice_ranking_points_by_voice_gender_color",
 )
 
 # %%
@@ -659,15 +642,17 @@ for _style, _traits in SPEAKING_STYLES.items():
 mo.md(_content)
 
 # %%
-# Individual Traits vs Scale 1-10 (Male voices only)
-_content = """### Individual Traits vs Scale 1-10 (Male Voices Only)\n\n"""
+# Individual Traits vs Scale 1-10 (grouped by voice gender)
+_content = """### Individual Traits vs Scale 1-10 (by Voice Gender)\n\n"""
 
 for _style, _traits in SPEAKING_STYLES.items():
-    _fig = S.plot_speaking_style_scale_correlation(
-        data=joined_scale_male,
+    _fig = S.plot_speaking_style_scale_correlation_by_gender(
+        data_male=joined_scale_male,
+        data_female=joined_scale_female,
         style_color=_style,
         style_traits=_traits,
-        title=f"Correlation: Speaking Style {_style} and Voice Scale 1-10 (Male Voices Only)",
+        title=f"Correlation: Speaking Style {_style} and Voice Scale 1-10 (by Voice Gender)",
+        filename=f"correlation_speaking_style_and_voice_scale_1-10_by_voice_gender_{_style.lower()}",
     )
     _content += f"""
 #### Speaking Style **{_style}**:
@@ -678,53 +663,17 @@ for _style, _traits in SPEAKING_STYLES.items():
 mo.md(_content)
 
 # %%
-# Individual Traits vs Scale 1-10 (Female voices only)
-_content = """### Individual Traits vs Scale 1-10 (Female Voices Only)\n\n"""
+# Individual Traits vs Ranking Points (grouped by voice gender)
+_content = """### Individual Traits vs Ranking Points (by Voice Gender)\n\n"""
 
 for _style, _traits in SPEAKING_STYLES.items():
-    _fig = S.plot_speaking_style_scale_correlation(
-        data=joined_scale_female,
+    _fig = S.plot_speaking_style_ranking_correlation_by_gender(
+        data_male=joined_ranking_male,
+        data_female=joined_ranking_female,
         style_color=_style,
         style_traits=_traits,
-        title=f"Correlation: Speaking Style {_style} and Voice Scale 1-10 (Female Voices Only)",
-    )
-    _content += f"""
-#### Speaking Style **{_style}**:
-
-{mo.ui.altair_chart(_fig)}
-
-"""
-mo.md(_content)
-
-# %%
-# Individual Traits vs Ranking Points (Male voices only)
-_content = """### Individual Traits vs Ranking Points (Male Voices Only)\n\n"""
-
-for _style, _traits in SPEAKING_STYLES.items():
-    _fig = S.plot_speaking_style_ranking_correlation(
-        data=joined_ranking_male,
-        style_color=_style,
-        style_traits=_traits,
-        title=f"Correlation: Speaking Style {_style} and Voice Ranking Points (Male Voices Only)",
-    )
-    _content += f"""
-#### Speaking Style **{_style}**:
-
-{mo.ui.altair_chart(_fig)}
-
-"""
-mo.md(_content)
-
-# %%
-# Individual Traits vs Ranking Points (Female voices only)
-_content = """### Individual Traits vs Ranking Points (Female Voices Only)\n\n"""
-
-for _style, _traits in SPEAKING_STYLES.items():
-    _fig = S.plot_speaking_style_ranking_correlation(
-        data=joined_ranking_female,
-        style_color=_style,
-        style_traits=_traits,
-        title=f"Correlation: Speaking Style {_style} and Voice Ranking Points (Female Voices Only)",
+        title=f"Correlation: Speaking Style {_style} and Voice Ranking Points (by Voice Gender)",
+        filename=f"correlation_speaking_style_and_voice_ranking_points_by_voice_gender_{_style.lower()}",
     )
     _content += f"""
 #### Speaking Style **{_style}**:
diff --git a/plots.py b/plots.py
index bf03a9a..a8d19a0 100644
--- a/plots.py
+++ b/plots.py
@@ -1256,6 +1256,237 @@ class QualtricsPlotsMixin:
         chart = self._save_plot(chart, title, filename=filename)
         return chart
 
+    def _create_gender_correlation_legend(self) -> alt.Chart:
+        """Build the male/female legend strip used by the gender correlation plots.
+        
+        Horizontal layout (each gender shows its positive swatch, then its negative one):
+        [■][■] Male          [■][■] Female
+        """
+        # Swatch x positions: Male at 0 and 1, Female at 5 and 6 (the gap separates the groups)
+        legend_data = pd.DataFrame([
+            {"x": 0, "color": ColorPalette.CORR_MALE_POSITIVE},
+            {"x": 1, "color": ColorPalette.CORR_MALE_NEGATIVE},
+            {"x": 5, "color": ColorPalette.CORR_FEMALE_POSITIVE},
+            {"x": 6, "color": ColorPalette.CORR_FEMALE_NEGATIVE},
+        ])
+        
+        # Color blocks; scale=None makes Altair use the values in "color" as literal colors
+        blocks = alt.Chart(legend_data).mark_rect(width=12, height=12).encode(
+            x=alt.X('x:Q', axis=None, scale=alt.Scale(domain=[0, 9])),
+            y=alt.value(6),
+            color=alt.Color('color:N', scale=None),
+        )
+        
+        # Labels sit just right of each swatch pair and share the same [0, 9] x domain
+        label_data = pd.DataFrame([
+            {"x": 2.3, "label": "Male"},
+            {"x": 7.3, "label": "Female"},
+        ])
+        labels = alt.Chart(label_data).mark_text(align='left', baseline='middle', fontSize=11).encode(
+            x=alt.X('x:Q', axis=None, scale=alt.Scale(domain=[0, 9])),
+            y=alt.value(6),
+            text='label:N'
+        )
+        
+        legend = (blocks + labels).properties(width=200, height=20)  # swatches layered with text
+        return legend
+
+    def plot_speaking_style_scale_correlation_by_gender(
+        self,
+        style_color: str,
+        style_traits: list[str],
+        data_male: pl.LazyFrame | pl.DataFrame,
+        data_female: pl.LazyFrame | pl.DataFrame,
+        title: str | None = None,
+        filename: str | None = None,
+        width: int | str | None = None,
+        height: int | None = None,
+    ) -> alt.Chart:
+        """Plot per-trait correlation between Speaking Style Trait Scores and Voice Scale,
+        with grouped bars comparing male vs female voices.
+        
+        Args:
+            style_color: The speaking style color (e.g., "Green", "Blue"); used in the default title
+            style_traits: Traits to plot; each one is matched against the "Right_Anchor" column
+            data_male: Male-voice rows with "Right_Anchor", "score" and "Voice_Scale_Score" columns
+            data_female: Female-voice rows with the same columns as data_male
+            title: Chart title (a default mentioning style_color is used when None)
+            filename: Optional explicit filename, forwarded to _save_plot
+            width: Chart width (800 when not given)
+            height: Chart height (350 when not given)
+            
+        Returns:
+            Chart with grouped male/female bars per trait, or a text-only chart when there is no data
+        """
+        df_male = self._ensure_dataframe(data_male)
+        df_female = self._ensure_dataframe(data_female)
+
+        if title is None:
+            title = f"Speaking style {style_color} and voice scale 1-10 correlations (by Voice Gender)"
+
+        trait_correlations = []
+        
+        for trait in style_traits:
+            trait_display = trait.replace('|', '\n')  # "|" in a trait name becomes a line break
+            
+            # Male correlation (skipped unless at least two complete rows exist)
+            subset_m = df_male.filter(pl.col("Right_Anchor") == trait)
+            valid_m = subset_m.select(["score", "Voice_Scale_Score"]).drop_nulls()
+            if valid_m.height > 1:
+                corr_m = valid_m.select(pl.corr("score", "Voice_Scale_Score")).item()
+                corr_val = corr_m if corr_m is not None else 0.0  # NOTE(review): NaN is not caught here - confirm
+                trait_correlations.append({
+                    "trait_display": trait_display,
+                    "Gender": "Male",
+                    "correlation": corr_val,
+                    "color_key": "Male_Pos" if corr_val >= 0 else "Male_Neg"
+                })
+            
+            # Female correlation (same rules as the male branch)
+            subset_f = df_female.filter(pl.col("Right_Anchor") == trait)
+            valid_f = subset_f.select(["score", "Voice_Scale_Score"]).drop_nulls()
+            if valid_f.height > 1:
+                corr_f = valid_f.select(pl.corr("score", "Voice_Scale_Score")).item()
+                corr_val = corr_f if corr_f is not None else 0.0
+                trait_correlations.append({
+                    "trait_display": trait_display,
+                    "Gender": "Female",
+                    "correlation": corr_val,
+                    "color_key": "Female_Pos" if corr_val >= 0 else "Female_Neg"
+                })
+        
+        if not trait_correlations:  # nothing to plot: return a text placeholder (not saved)
+            return alt.Chart(pd.DataFrame({'text': [f"No data for {style_color} Style"]})).mark_text().encode(text='text:N')
+            
+        plot_df = pl.DataFrame(trait_correlations).to_pandas()
+
+        main_chart = alt.Chart(plot_df).mark_bar().encode(
+            x=alt.X('trait_display:N', title=None, axis=alt.Axis(labelAngle=0, grid=False)),
+            xOffset='Gender:N',
+            y=alt.Y('correlation:Q', title='Correlation', scale=alt.Scale(domain=[-1, 1]), axis=alt.Axis(grid=True)),
+            color=alt.Color('color_key:N', 
+                           scale=alt.Scale(
+                               domain=['Male_Pos', 'Female_Pos', 'Male_Neg', 'Female_Neg'],
+                               range=[ColorPalette.CORR_MALE_POSITIVE, ColorPalette.CORR_FEMALE_POSITIVE,
+                                      ColorPalette.CORR_MALE_NEGATIVE, ColorPalette.CORR_FEMALE_NEGATIVE]
+                           ),
+                           legend=None),
+            tooltip=[
+                alt.Tooltip('trait_display:N', title='Trait'),
+                alt.Tooltip('Gender:N'),
+                alt.Tooltip('correlation:Q', format='.3f')
+            ]
+        ).properties(
+            title=self._process_title(title),
+            width=width or 800,
+            height=height or 350
+        )
+        
+        # Add custom legend below the chart; the two color scales are resolved independently
+        legend = self._create_gender_correlation_legend()
+        chart = alt.vconcat(main_chart, legend, spacing=10).resolve_scale(color='independent')
+
+        chart = self._save_plot(chart, title, filename=filename)
+        return chart
+
+    def plot_speaking_style_ranking_correlation_by_gender(
+        self,
+        style_color: str,
+        style_traits: list[str],
+        data_male: pl.LazyFrame | pl.DataFrame,
+        data_female: pl.LazyFrame | pl.DataFrame,
+        title: str | None = None,
+        filename: str | None = None,
+        width: int | str | None = None,
+        height: int | None = None,
+    ) -> alt.Chart:
+        """Plot per-trait correlation between Speaking Style Trait Scores and Voice Ranking Points,
+        with grouped bars comparing male vs female voices.
+        
+        Args:
+            style_color: The speaking style color (e.g., "Green", "Blue"); used in the default title
+            style_traits: Traits to plot; each one is matched against the "Right_Anchor" column
+            data_male: Male-voice rows with "Right_Anchor", "score" and "Ranking_Points" columns
+            data_female: Female-voice rows with the same columns as data_male
+            title: Chart title (a default mentioning style_color is used when None)
+            filename: Optional explicit filename, forwarded to _save_plot
+            width: Chart width (800 when not given)
+            height: Chart height (350 when not given)
+            
+        Returns:
+            Chart with grouped male/female bars per trait, or a text-only chart when there is no data
+        """
+        df_male = self._ensure_dataframe(data_male)
+        df_female = self._ensure_dataframe(data_female)
+
+        if title is None:
+            title = f"Speaking style {style_color} and voice ranking points correlations (by Voice Gender)"
+
+        trait_correlations = []
+        
+        for trait in style_traits:
+            trait_display = trait.replace('|', '\n')  # "|" in a trait name becomes a line break
+            
+            # Male correlation (skipped unless at least two complete rows exist)
+            subset_m = df_male.filter(pl.col("Right_Anchor") == trait)
+            valid_m = subset_m.select(["score", "Ranking_Points"]).drop_nulls()
+            if valid_m.height > 1:
+                corr_m = valid_m.select(pl.corr("score", "Ranking_Points")).item()
+                corr_val = corr_m if corr_m is not None else 0.0  # NOTE(review): NaN is not caught here - confirm
+                trait_correlations.append({
+                    "trait_display": trait_display,
+                    "Gender": "Male",
+                    "correlation": corr_val,
+                    "color_key": "Male_Pos" if corr_val >= 0 else "Male_Neg"
+                })
+            
+            # Female correlation (same rules as the male branch)
+            subset_f = df_female.filter(pl.col("Right_Anchor") == trait)
+            valid_f = subset_f.select(["score", "Ranking_Points"]).drop_nulls()
+            if valid_f.height > 1:
+                corr_f = valid_f.select(pl.corr("score", "Ranking_Points")).item()
+                corr_val = corr_f if corr_f is not None else 0.0
+                trait_correlations.append({
+                    "trait_display": trait_display,
+                    "Gender": "Female",
+                    "correlation": corr_val,
+                    "color_key": "Female_Pos" if corr_val >= 0 else "Female_Neg"
+                })
+        
+        if not trait_correlations:  # nothing to plot: return a text placeholder (not saved)
+            return alt.Chart(pd.DataFrame({'text': [f"No data for {style_color} Style"]})).mark_text().encode(text='text:N')
+            
+        plot_df = pl.DataFrame(trait_correlations).to_pandas()
+
+        main_chart = alt.Chart(plot_df).mark_bar().encode(
+            x=alt.X('trait_display:N', title='Speaking Style Trait', axis=alt.Axis(labelAngle=0, grid=False)),
+            xOffset='Gender:N',
+            y=alt.Y('correlation:Q', title='Correlation', scale=alt.Scale(domain=[-1, 1]), axis=alt.Axis(grid=True)),
+            color=alt.Color('color_key:N', 
+                           scale=alt.Scale(
+                               domain=['Male_Pos', 'Female_Pos', 'Male_Neg', 'Female_Neg'],
+                               range=[ColorPalette.CORR_MALE_POSITIVE, ColorPalette.CORR_FEMALE_POSITIVE,
+                                      ColorPalette.CORR_MALE_NEGATIVE, ColorPalette.CORR_FEMALE_NEGATIVE]
+                           ),
+                           legend=None),
+            tooltip=[
+                alt.Tooltip('trait_display:N', title='Trait'),
+                alt.Tooltip('Gender:N'),
+                alt.Tooltip('correlation:Q', format='.3f')
+            ]
+        ).properties(
+            title=self._process_title(title),
+            width=width or 800,
+            height=height or 350
+        )
+        
+        # Add custom legend below the chart; the two color scales are resolved independently
+        legend = self._create_gender_correlation_legend()
+        chart = alt.vconcat(main_chart, legend, spacing=10).resolve_scale(color='independent')
+
+        chart = self._save_plot(chart, title, filename=filename)
+        return chart
+
     def plot_speaking_style_color_correlation(
         self,
         data: pl.LazyFrame | pl.DataFrame | None = None,
@@ -1313,6 +1544,98 @@ class QualtricsPlotsMixin:
         chart = self._save_plot(chart, title, filename=filename)
         return chart
 
+    def plot_speaking_style_color_correlation_by_gender(
+        self,
+        data_male: pl.LazyFrame | pl.DataFrame,
+        data_female: pl.LazyFrame | pl.DataFrame,
+        speaking_styles: dict[str, list[str]],
+        target_column: str = "Voice_Scale_Score",
+        title: str = "Speaking Style Colors Correlation (by Voice Gender)",
+        filename: str | None = None,
+        width: int | str | None = None,
+        height: int | None = None,
+    ) -> alt.Chart:
+        """Plot correlation by speaking style color with grouped bars for male vs female voices.
+        
+        Args:
+            data_male: DataFrame filtered to male voices only
+            data_female: DataFrame filtered to female voices only
+            speaking_styles: Dictionary mapping color names to their constituent traits
+            target_column: The column to correlate against ("Voice_Scale_Score" or "Ranking_Points")
+            title: Chart title
+            filename: Optional explicit filename, forwarded to _save_plot
+            width: Chart width (400 when not given)
+            height: Chart height (350 when not given)
+            
+        Returns:
+            Altair chart with grouped bars (male/female) per color, with the gender legend below
+        """
+        import utils  # function-local import; NOTE(review): presumably avoids an import cycle - confirm
+        
+        df_male = self._ensure_dataframe(data_male)
+        df_female = self._ensure_dataframe(data_female)
+        
+        # Get correlations for each gender (only the first element of the helper's result is used)
+        color_corr_male, _ = utils.transform_speaking_style_color_correlation(
+            df_male, speaking_styles, target_column=target_column
+        )
+        color_corr_female, _ = utils.transform_speaking_style_color_correlation(
+            df_female, speaking_styles, target_column=target_column
+        )
+        
+        # Add gender column and color_key based on correlation sign (>= 0 counts as positive)
+        color_corr_male = color_corr_male.with_columns([
+            pl.lit("Male").alias("Gender"),
+            pl.when(pl.col("correlation") >= 0)
+              .then(pl.lit("Male_Pos"))
+              .otherwise(pl.lit("Male_Neg"))
+              .alias("color_key")
+        ])
+        color_corr_female = color_corr_female.with_columns([
+            pl.lit("Female").alias("Gender"),
+            pl.when(pl.col("correlation") >= 0)
+              .then(pl.lit("Female_Pos"))
+              .otherwise(pl.lit("Female_Neg"))
+              .alias("color_key")
+        ])
+        combined = pl.concat([color_corr_male, color_corr_female])  # one frame, male rows then female
+        
+        main_chart = alt.Chart(combined.to_pandas()).mark_bar().encode(
+            x=alt.X('Color:N', 
+                    title='Speaking Style Color', 
+                    axis=alt.Axis(labelAngle=0, grid=False),
+                    sort=["Green", "Blue", "Orange", "Red"]),  # fixed display order of the colors
+            xOffset='Gender:N',
+            y=alt.Y('correlation:Q', 
+                    title='Average Correlation',
+                    scale=alt.Scale(domain=[-1, 1]),
+                    axis=alt.Axis(grid=True)),
+            color=alt.Color('color_key:N', 
+                           scale=alt.Scale(
+                               domain=['Male_Pos', 'Female_Pos', 'Male_Neg', 'Female_Neg'],
+                               range=[ColorPalette.CORR_MALE_POSITIVE, ColorPalette.CORR_FEMALE_POSITIVE,
+                                      ColorPalette.CORR_MALE_NEGATIVE, ColorPalette.CORR_FEMALE_NEGATIVE]
+                           ),
+                           legend=None),
+            tooltip=[
+                alt.Tooltip('Color:N', title='Speaking Style'),
+                alt.Tooltip('Gender:N'),
+                alt.Tooltip('correlation:Q', format='.3f', title='Avg Correlation'),
+                alt.Tooltip('n_traits:Q', title='# Traits')
+            ]
+        ).properties(
+            title=self._process_title(title),
+            width=width or 400,
+            height=height or 350
+        )
+        
+        # Add custom legend below the chart; the two color scales are resolved independently
+        legend = self._create_gender_correlation_legend()
+        chart = alt.vconcat(main_chart, legend, spacing=10).resolve_scale(color='independent')
+        
+        chart = self._save_plot(chart, title, filename=filename)
+        return chart
+
     def plot_demographic_distribution(
         self,
         column: str,
diff --git a/theme.py b/theme.py
index 9ad9914..0324b23 100644
--- a/theme.py
+++ b/theme.py
@@ -77,6 +77,13 @@ class ColorPalette:
     GENDER_MALE_NEUTRAL = "#B8C9D9"   # Grey-Blue
     GENDER_FEMALE_NEUTRAL = "#D9B8C9" # Grey-Pink
 
+    # Gender colors for correlation plots (green/red indicate +/- correlation)
+    # Male = darker shade, Female = lighter shade
+    CORR_MALE_POSITIVE = "#1B5E20"     # Dark Green
+    CORR_FEMALE_POSITIVE = "#81C784"   # Light Green
+    CORR_MALE_NEGATIVE = "#B71C1C"     # Dark Red
+    CORR_FEMALE_NEGATIVE = "#E57373"   # Light Red
+
     # Speaking Style Colors (named after the style quadrant colors)
     STYLE_GREEN = "#2E7D32"   # Forest Green
     STYLE_BLUE = "#1565C0"    # Strong Blue