voice gender split correlation plots

2026-02-04 13:44:51 +01:00
parent ab78276a97
commit fc76bb0ab5
3 changed files with 360 additions and 81 deletions
--- a/03_quant_report.script.py
+++ b/03_quant_report.script.py
@@ -573,41 +573,24 @@ joined_scale_female = joined_scale.filter(pl.col("Voice").is_in(FEMALE_VOICES))
 joined_ranking_male = joined_ranking.filter(pl.col("Voice").is_in(MALE_VOICES))
 joined_ranking_female = joined_ranking.filter(pl.col("Voice").is_in(FEMALE_VOICES))
-# Colors vs Scale 1-10 (Male voices only)
+# Colors vs Scale 1-10 (grouped by voice gender)
-color_corr_scale_male, _ = utils.transform_speaking_style_color_correlation(joined_scale_male, SPEAKING_STYLES)
+S.plot_speaking_style_color_correlation_by_gender(
-S.plot_speaking_style_color_correlation(
+    data_male=joined_scale_male,
-    data=color_corr_scale_male,
+    data_female=joined_scale_female,
-    title="Correlation: Speaking Style Colors and Voice Scale 1-10 (Male Voices Only)"
+    speaking_styles=SPEAKING_STYLES,
    target_column="Voice_Scale_Score",
    title="Correlation: Speaking Style Colors and Voice Scale 1-10 (by Voice Gender)",
    filename="correlation_speaking_style_and_voice_scale_1-10_by_voice_gender_color",
 )
-# Colors vs Scale 1-10 (Female voices only)
+# Colors vs Ranking Points (grouped by voice gender)
-color_corr_scale_female, _ = utils.transform_speaking_style_color_correlation(joined_scale_female, SPEAKING_STYLES)
+S.plot_speaking_style_color_correlation_by_gender(
-S.plot_speaking_style_color_correlation(
+    data_male=joined_ranking_male,
-    data=color_corr_scale_female,
+    data_female=joined_ranking_female,
-    title="Correlation: Speaking Style Colors and Voice Scale 1-10 (Female Voices Only)"
+    speaking_styles=SPEAKING_STYLES,
-)
+    target_column="Ranking_Points",
-
+    title="Correlation: Speaking Style Colors and Voice Ranking Points (by Voice Gender)",
-# %%
+    filename="correlation_speaking_style_and_voice_ranking_points_by_voice_gender_color",
 # Colors vs Ranking Points (Male voices only)
 color_corr_ranking_male, _ = utils.transform_speaking_style_color_correlation(
    joined_ranking_male, 
    SPEAKING_STYLES, 
    target_column="Ranking_Points"
 )
 S.plot_speaking_style_color_correlation(
    data=color_corr_ranking_male,
    title="Correlation: Speaking Style Colors and Voice Ranking Points (Male Voices Only)"
 )
 # Colors vs Ranking Points (Female voices only)
 color_corr_ranking_female, _ = utils.transform_speaking_style_color_correlation(
    joined_ranking_female, 
    SPEAKING_STYLES, 
    target_column="Ranking_Points"
 )
 S.plot_speaking_style_color_correlation(
    data=color_corr_ranking_female,
    title="Correlation: Speaking Style Colors and Voice Ranking Points (Female Voices Only)"
 )
 # %%
@@ -659,15 +642,17 @@ for _style, _traits in SPEAKING_STYLES.items():
 mo.md(_content)
 # %%
-# Individual Traits vs Scale 1-10 (Male voices only)
+# Individual Traits vs Scale 1-10 (grouped by voice gender)
-_content = """### Individual Traits vs Scale 1-10 (Male Voices Only)\n\n"""
+_content = """### Individual Traits vs Scale 1-10 (by Voice Gender)\n\n"""
 for _style, _traits in SPEAKING_STYLES.items():
-    _fig = S.plot_speaking_style_scale_correlation(
+    _fig = S.plot_speaking_style_scale_correlation_by_gender(
-        data=joined_scale_male,
+        data_male=joined_scale_male,
        data_female=joined_scale_female,
        style_color=_style,
        style_traits=_traits,
-        title=f"Correlation: Speaking Style {_style} and Voice Scale 1-10 (Male Voices Only)",
+        title=f"Correlation: Speaking Style {_style} and Voice Scale 1-10 (by Voice Gender)",
        filename=f"correlation_speaking_style_and_voice_scale_1-10_by_voice_gender_{_style.lower()}",
    )
    _content += f"""
 #### Speaking Style **{_style}**:
@@ -678,53 +663,17 @@ for _style, _traits in SPEAKING_STYLES.items():
 mo.md(_content)
 # %%
-# Individual Traits vs Scale 1-10 (Female voices only)
+# Individual Traits vs Ranking Points (grouped by voice gender)
-_content = """### Individual Traits vs Scale 1-10 (Female Voices Only)\n\n"""
+_content = """### Individual Traits vs Ranking Points (by Voice Gender)\n\n"""
 for _style, _traits in SPEAKING_STYLES.items():
-    _fig = S.plot_speaking_style_scale_correlation(
+    _fig = S.plot_speaking_style_ranking_correlation_by_gender(
-        data=joined_scale_female,
+        data_male=joined_ranking_male,
        data_female=joined_ranking_female,
        style_color=_style,
        style_traits=_traits,
-        title=f"Correlation: Speaking Style {_style} and Voice Scale 1-10 (Female Voices Only)",
+        title=f"Correlation: Speaking Style {_style} and Voice Ranking Points (by Voice Gender)",
-    )
+        filename=f"correlation_speaking_style_and_voice_ranking_points_by_voice_gender_{_style.lower()}",
    _content += f"""
 #### Speaking Style **{_style}**:
 {mo.ui.altair_chart(_fig)}
 """
 mo.md(_content)
 # %%
 # Individual Traits vs Ranking Points (Male voices only)
 _content = """### Individual Traits vs Ranking Points (Male Voices Only)\n\n"""
 for _style, _traits in SPEAKING_STYLES.items():
    _fig = S.plot_speaking_style_ranking_correlation(
        data=joined_ranking_male,
        style_color=_style,
        style_traits=_traits,
        title=f"Correlation: Speaking Style {_style} and Voice Ranking Points (Male Voices Only)",
    )
    _content += f"""
 #### Speaking Style **{_style}**:
 {mo.ui.altair_chart(_fig)}
 """
 mo.md(_content)
 # %%
 # Individual Traits vs Ranking Points (Female voices only)
 _content = """### Individual Traits vs Ranking Points (Female Voices Only)\n\n"""
 for _style, _traits in SPEAKING_STYLES.items():
    _fig = S.plot_speaking_style_ranking_correlation(
        data=joined_ranking_female,
        style_color=_style,
        style_traits=_traits,
        title=f"Correlation: Speaking Style {_style} and Voice Ranking Points (Female Voices Only)",
    )
    _content += f"""
 #### Speaking Style **{_style}**:
--- a/plots.py
+++ b/plots.py
@@ -1256,6 +1256,237 @@ class QualtricsPlotsMixin:
        chart = self._save_plot(chart, title, filename=filename)
        return chart
    def _create_gender_correlation_legend(self) -> alt.Chart:
        """Build the swatch legend used by the gender correlation plots.

        The legend is a single row: two colour swatches (positive, negative)
        followed by a text label, once per gender:
        [■][■] Male          [■][■] Female

        Returns:
            The swatch layer and the label layer overlaid, sized 200x20.
        """
        def _x_position() -> alt.X:
            # Both layers use the same fixed 0-9 domain so labels line up
            # with their swatches.
            return alt.X('x:Q', axis=None, scale=alt.Scale(domain=[0, 9]))

        # Male swatches sit at x=0/1, female swatches at x=5/6; the gap
        # between them leaves room for the "Male" label.
        swatch_rows = [
            (0, ColorPalette.CORR_MALE_POSITIVE),
            (1, ColorPalette.CORR_MALE_NEGATIVE),
            (5, ColorPalette.CORR_FEMALE_POSITIVE),
            (6, ColorPalette.CORR_FEMALE_NEGATIVE),
        ]
        swatch_frame = pd.DataFrame(
            [{"x": position, "color": hex_code} for position, hex_code in swatch_rows]
        )
        swatches = (
            alt.Chart(swatch_frame)
            .mark_rect(width=12, height=12)
            .encode(
                x=_x_position(),
                y=alt.value(6),
                # scale=None: the column already holds literal colour values.
                color=alt.Color('color:N', scale=None),
            )
        )

        # Each label starts just to the right of its swatch pair.
        text_rows = [(2.3, "Male"), (7.3, "Female")]
        text_frame = pd.DataFrame(
            [{"x": position, "label": caption} for position, caption in text_rows]
        )
        captions = (
            alt.Chart(text_frame)
            .mark_text(align='left', baseline='middle', fontSize=11)
            .encode(
                x=_x_position(),
                y=alt.value(6),
                text='label:N',
            )
        )

        return (swatches + captions).properties(width=200, height=20)
    def plot_speaking_style_scale_correlation_by_gender(
        self,
        style_color: str,
        style_traits: list[str],
        data_male: pl.LazyFrame | pl.DataFrame,
        data_female: pl.LazyFrame | pl.DataFrame,
        title: str | None = None,
        filename: str | None = None,
        width: int | str | None = None,
        height: int | None = None,
    ) -> alt.Chart:
        """Plot trait-score vs. Voice Scale correlations, grouped by voice gender.

        For every trait in ``style_traits`` the rows whose ``Right_Anchor``
        equals the trait are selected, and the correlation between ``score``
        and ``Voice_Scale_Score`` is computed separately for the male and the
        female data. Each trait gets one bar per gender, coloured by gender
        and by the sign of the correlation.

        Args:
            style_color: The speaking style color (e.g., "Green", "Blue");
                used in the default title and the "no data" message.
            style_traits: List of traits for this style. ``|`` in a trait
                name is shown as a line break in the axis label.
            data_male: Data filtered to male voices only.
            data_female: Data filtered to female voices only.
            title: Chart title; a default is derived from ``style_color``
                when omitted.
            filename: Optional explicit filename, forwarded to ``_save_plot``.
            width: Chart width (defaults to 800).
            height: Chart height (defaults to 350).

        Returns:
            Altair chart with grouped bars (male/female) per trait and the
            custom gender legend underneath. If no trait has at least two
            complete observations for either gender, a text-only placeholder
            chart is returned instead (and nothing is saved).
        """
        import math

        target_column = "Voice_Scale_Score"
        frames_by_gender = (
            ("Male", self._ensure_dataframe(data_male)),
            ("Female", self._ensure_dataframe(data_female)),
        )

        if title is None:
            title = f"Speaking style {style_color} and voice scale 1-10 correlations (by Voice Gender)"

        trait_correlations = []
        for trait in style_traits:
            trait_display = trait.replace('|', '\n')
            for gender, frame in frames_by_gender:
                valid = (
                    frame.filter(pl.col("Right_Anchor") == trait)
                    .select(["score", target_column])
                    .drop_nulls()
                )
                # A correlation needs at least two complete observations.
                if valid.height <= 1:
                    continue

                corr_val = valid.select(pl.corr("score", target_column)).item()
                # An undefined correlation can surface as NaN rather than
                # None (e.g. a column with zero variance); treat both as 0.0
                # so the bar is not mis-tagged as negative.
                if corr_val is None or math.isnan(corr_val):
                    corr_val = 0.0

                trait_correlations.append({
                    "trait_display": trait_display,
                    "Gender": gender,
                    "correlation": corr_val,
                    "color_key": f"{gender}_Pos" if corr_val >= 0 else f"{gender}_Neg",
                })

        if not trait_correlations:
            return alt.Chart(pd.DataFrame({'text': [f"No data for {style_color} Style"]})).mark_text().encode(text='text:N')

        plot_df = pl.DataFrame(trait_correlations).to_pandas()

        main_chart = alt.Chart(plot_df).mark_bar().encode(
            x=alt.X('trait_display:N', title=None, axis=alt.Axis(labelAngle=0, grid=False)),
            # xOffset places the male and female bars side by side per trait.
            xOffset='Gender:N',
            y=alt.Y('correlation:Q', title='Correlation', scale=alt.Scale(domain=[-1, 1]), axis=alt.Axis(grid=True)),
            color=alt.Color(
                'color_key:N',
                scale=alt.Scale(
                    domain=['Male_Pos', 'Female_Pos', 'Male_Neg', 'Female_Neg'],
                    range=[
                        ColorPalette.CORR_MALE_POSITIVE,
                        ColorPalette.CORR_FEMALE_POSITIVE,
                        ColorPalette.CORR_MALE_NEGATIVE,
                        ColorPalette.CORR_FEMALE_NEGATIVE,
                    ],
                ),
                # The built-in legend is replaced by the custom one below.
                legend=None,
            ),
            tooltip=[
                alt.Tooltip('trait_display:N', title='Trait'),
                alt.Tooltip('Gender:N'),
                alt.Tooltip('correlation:Q', format='.3f'),
            ],
        ).properties(
            title=self._process_title(title),
            width=width or 800,
            height=height or 350,
        )

        # Add custom legend below the chart
        legend = self._create_gender_correlation_legend()
        chart = alt.vconcat(main_chart, legend, spacing=10).resolve_scale(color='independent')

        chart = self._save_plot(chart, title, filename=filename)
        return chart
    def plot_speaking_style_ranking_correlation_by_gender(
        self,
        style_color: str,
        style_traits: list[str],
        data_male: pl.LazyFrame | pl.DataFrame,
        data_female: pl.LazyFrame | pl.DataFrame,
        title: str | None = None,
        filename: str | None = None,
        width: int | str | None = None,
        height: int | None = None,
    ) -> alt.Chart:
        """Plot trait-score vs. Ranking Points correlations, grouped by voice gender.

        For every trait in ``style_traits`` the rows whose ``Right_Anchor``
        equals the trait are selected, and the correlation between ``score``
        and ``Ranking_Points`` is computed separately for the male and the
        female data. Each trait gets one bar per gender, coloured by gender
        and by the sign of the correlation.

        Args:
            style_color: The speaking style color (e.g., "Green", "Blue");
                used in the default title and the "no data" message.
            style_traits: List of traits for this style. ``|`` in a trait
                name is shown as a line break in the axis label.
            data_male: Data filtered to male voices only.
            data_female: Data filtered to female voices only.
            title: Chart title; a default is derived from ``style_color``
                when omitted.
            filename: Optional explicit filename, forwarded to ``_save_plot``.
            width: Chart width (defaults to 800).
            height: Chart height (defaults to 350).

        Returns:
            Altair chart with grouped bars (male/female) per trait and the
            custom gender legend underneath. If no trait has at least two
            complete observations for either gender, a text-only placeholder
            chart is returned instead (and nothing is saved).
        """
        import math

        target_column = "Ranking_Points"
        frames_by_gender = (
            ("Male", self._ensure_dataframe(data_male)),
            ("Female", self._ensure_dataframe(data_female)),
        )

        if title is None:
            title = f"Speaking style {style_color} and voice ranking points correlations (by Voice Gender)"

        trait_correlations = []
        for trait in style_traits:
            trait_display = trait.replace('|', '\n')
            for gender, frame in frames_by_gender:
                valid = (
                    frame.filter(pl.col("Right_Anchor") == trait)
                    .select(["score", target_column])
                    .drop_nulls()
                )
                # A correlation needs at least two complete observations.
                if valid.height <= 1:
                    continue

                corr_val = valid.select(pl.corr("score", target_column)).item()
                # An undefined correlation can surface as NaN rather than
                # None (e.g. a column with zero variance); treat both as 0.0
                # so the bar is not mis-tagged as negative.
                if corr_val is None or math.isnan(corr_val):
                    corr_val = 0.0

                trait_correlations.append({
                    "trait_display": trait_display,
                    "Gender": gender,
                    "correlation": corr_val,
                    "color_key": f"{gender}_Pos" if corr_val >= 0 else f"{gender}_Neg",
                })

        if not trait_correlations:
            return alt.Chart(pd.DataFrame({'text': [f"No data for {style_color} Style"]})).mark_text().encode(text='text:N')

        plot_df = pl.DataFrame(trait_correlations).to_pandas()

        main_chart = alt.Chart(plot_df).mark_bar().encode(
            x=alt.X('trait_display:N', title='Speaking Style Trait', axis=alt.Axis(labelAngle=0, grid=False)),
            # xOffset places the male and female bars side by side per trait.
            xOffset='Gender:N',
            y=alt.Y('correlation:Q', title='Correlation', scale=alt.Scale(domain=[-1, 1]), axis=alt.Axis(grid=True)),
            color=alt.Color(
                'color_key:N',
                scale=alt.Scale(
                    domain=['Male_Pos', 'Female_Pos', 'Male_Neg', 'Female_Neg'],
                    range=[
                        ColorPalette.CORR_MALE_POSITIVE,
                        ColorPalette.CORR_FEMALE_POSITIVE,
                        ColorPalette.CORR_MALE_NEGATIVE,
                        ColorPalette.CORR_FEMALE_NEGATIVE,
                    ],
                ),
                # The built-in legend is replaced by the custom one below.
                legend=None,
            ),
            tooltip=[
                alt.Tooltip('trait_display:N', title='Trait'),
                alt.Tooltip('Gender:N'),
                alt.Tooltip('correlation:Q', format='.3f'),
            ],
        ).properties(
            title=self._process_title(title),
            width=width or 800,
            height=height or 350,
        )

        # Add custom legend below the chart
        legend = self._create_gender_correlation_legend()
        chart = alt.vconcat(main_chart, legend, spacing=10).resolve_scale(color='independent')

        chart = self._save_plot(chart, title, filename=filename)
        return chart
    def plot_speaking_style_color_correlation(
        self,
        data: pl.LazyFrame | pl.DataFrame | None = None,
@@ -1313,6 +1544,98 @@ class QualtricsPlotsMixin:
        chart = self._save_plot(chart, title, filename=filename)
        return chart
    def plot_speaking_style_color_correlation_by_gender(
        self,
        data_male: pl.LazyFrame | pl.DataFrame,
        data_female: pl.LazyFrame | pl.DataFrame,
        speaking_styles: dict[str, list[str]],
        target_column: str = "Voice_Scale_Score",
        title: str = "Speaking Style Colors Correlation (by Voice Gender)",
        filename: str | None = None,
        width: int | str | None = None,
        height: int | None = None,
    ) -> alt.Chart:
        """Draw per-color correlations as grouped bars, one bar per voice gender.

        The per-color correlation table is obtained from
        ``utils.transform_speaking_style_color_correlation`` for each gender,
        tagged with the gender and the sign of the correlation, and the two
        tables are stacked into one dataset for plotting.

        Args:
            data_male: Data filtered to male voices only.
            data_female: Data filtered to female voices only.
            speaking_styles: Dictionary mapping color names to their
                constituent traits.
            target_column: The column to correlate against
                ("Voice_Scale_Score" or "Ranking_Points").
            title: Chart title.
            filename: Optional explicit filename, forwarded to ``_save_plot``.
            width: Chart width (defaults to 400).
            height: Chart height (defaults to 350).

        Returns:
            Altair chart with grouped bars (male/female) per color and the
            custom gender legend underneath.
        """
        import utils

        def _tag_gender(table, gender, positive_key, negative_key):
            # Attach the gender label plus a colour key derived from the
            # sign of the correlation (zero counts as positive).
            sign_key = (
                pl.when(pl.col("correlation") >= 0)
                .then(pl.lit(positive_key))
                .otherwise(pl.lit(negative_key))
                .alias("color_key")
            )
            return table.with_columns([pl.lit(gender).alias("Gender"), sign_key])

        male_frame = self._ensure_dataframe(data_male)
        female_frame = self._ensure_dataframe(data_female)

        # Only the first element of the transform's result is used here.
        male_table, _unused_male = utils.transform_speaking_style_color_correlation(
            male_frame, speaking_styles, target_column=target_column
        )
        female_table, _unused_female = utils.transform_speaking_style_color_correlation(
            female_frame, speaking_styles, target_column=target_column
        )

        male_table = _tag_gender(male_table, "Male", "Male_Pos", "Male_Neg")
        female_table = _tag_gender(female_table, "Female", "Female_Pos", "Female_Neg")
        stacked = pl.concat([male_table, female_table])

        x_encoding = alt.X(
            'Color:N',
            title='Speaking Style Color',
            axis=alt.Axis(labelAngle=0, grid=False),
            sort=["Green", "Blue", "Orange", "Red"],
        )
        y_encoding = alt.Y(
            'correlation:Q',
            title='Average Correlation',
            scale=alt.Scale(domain=[-1, 1]),
            axis=alt.Axis(grid=True),
        )
        color_encoding = alt.Color(
            'color_key:N',
            scale=alt.Scale(
                domain=['Male_Pos', 'Female_Pos', 'Male_Neg', 'Female_Neg'],
                range=[
                    ColorPalette.CORR_MALE_POSITIVE,
                    ColorPalette.CORR_FEMALE_POSITIVE,
                    ColorPalette.CORR_MALE_NEGATIVE,
                    ColorPalette.CORR_FEMALE_NEGATIVE,
                ],
            ),
            # The built-in legend is replaced by the custom one below.
            legend=None,
        )
        hover_fields = [
            alt.Tooltip('Color:N', title='Speaking Style'),
            alt.Tooltip('Gender:N'),
            alt.Tooltip('correlation:Q', format='.3f', title='Avg Correlation'),
            alt.Tooltip('n_traits:Q', title='# Traits'),
        ]

        bars = (
            alt.Chart(stacked.to_pandas())
            .mark_bar()
            .encode(
                x=x_encoding,
                # xOffset places the male and female bars side by side per color.
                xOffset='Gender:N',
                y=y_encoding,
                color=color_encoding,
                tooltip=hover_fields,
            )
            .properties(
                title=self._process_title(title),
                width=width or 400,
                height=height or 350,
            )
        )

        # Stack the custom legend under the bars; the two parts keep
        # independent colour scales.
        gender_legend = self._create_gender_correlation_legend()
        combined_chart = alt.vconcat(bars, gender_legend, spacing=10).resolve_scale(color='independent')

        return self._save_plot(combined_chart, title, filename=filename)
    def plot_demographic_distribution(
        self,
        column: str,
--- a/theme.py
+++ b/theme.py
@@ -77,6 +77,13 @@ class ColorPalette:
    GENDER_MALE_NEUTRAL = "#B8C9D9"   # Grey-Blue
    GENDER_FEMALE_NEUTRAL = "#D9B8C9" # Grey-Pink
    # Gender colors for correlation plots (green/red indicate +/- correlation)
    # Male = darker shade, Female = lighter shade
    CORR_MALE_POSITIVE = "#1B5E20"     # Dark Green
    CORR_FEMALE_POSITIVE = "#81C784"   # Light Green
    CORR_MALE_NEGATIVE = "#B71C1C"     # Dark Red
    CORR_FEMALE_NEGATIVE = "#E57373"   # Light Red
    # Speaking Style Colors (named after the style quadrant colors)
    STYLE_GREEN = "#2E7D32"   # Forest Green
    STYLE_BLUE = "#1565C0"    # Strong Blue