base correlations

2026-02-03 01:32:06 +01:00
parent 1dce4db909
commit 2408d06098
4 changed files with 257 additions and 0 deletions
--- a/03_quant_report.py
+++ b/03_quant_report.py
@@ -664,5 +664,140 @@ def _():
    return


+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    ## Correlation Speaking Styles
+    """)
+    return
+
+
+@app.cell
+def _(S, data, top3_voices):
+    # Speaking-style answers are fetched in two halves (orange/red, green/blue),
+    # each with its own choice map.
+    ss_or, choice_map_or = S.get_ss_orange_red(data)
+    ss_gb, choice_map_gb = S.get_ss_green_blue(data)
+
+    # Join both halves on _recordId; the frame stays lazy because nothing in
+    # this cell needs it materialized.
+    ss_all = ss_or.join(ss_gb, on='_recordId')
+    choice_map = {**choice_map_or, **choice_map_gb}
+
+    # Processed once and reused for both joins below.
+    df_style = utils.process_speaking_style_data(ss_all, choice_map)
+
+    # Speaking styles joined with the 1-10 voice scale scores.
+    vscales = S.get_voice_scale_1_10(data)[0]
+    df_scale_long = utils.process_voice_scale_data(vscales)
+    joined_scale = df_style.join(df_scale_long, on=["_recordId", "Voice"], how="inner")
+
+    # Speaking styles joined with the top-3 voice ranking data.
+    df_ranking = utils.process_voice_ranking_data(top3_voices)
+    joined_ranking = df_style.join(df_ranking, on=['_recordId', 'Voice'], how='inner')
+    return joined_ranking, joined_scale
+
+
+@app.cell
+def _():
+    mo.md(r"""
+    ### Colors vs Scale 1-10
+    """)
+    return
+
+
+@app.cell
+def _(S, joined_scale):
+    # One row per color holding the mean of its per-trait correlations; the second tuple element is always None
+    color_corr_scale, _ = utils.transform_speaking_style_color_correlation(joined_scale, SPEAKING_STYLES)
+    S.plot_speaking_style_color_correlation(
+        data=color_corr_scale,
+        title="Correlation: Speaking Style Colors and Voice Scale 1-10"
+    )
+    return
+
+
+@app.cell
+def _():
+    mo.md(r"""
+    ### Colors vs Ranking Points
+    """)
+    return
+
+
+@app.cell
+def _(S, joined_ranking):
+    color_corr_ranking, _ = utils.transform_speaking_style_color_correlation(
+        joined_ranking, 
+        SPEAKING_STYLES, 
+        target_column="Ranking_Points"  # overrides the default "Voice_Scale_Score"
+    )
+    S.plot_speaking_style_color_correlation(
+        data=color_corr_ranking,
+        title="Correlation: Speaking Style Colors and Voice Ranking Points"
+    )
+    return
+
+
+@app.cell
+def _():
+    mo.md(r"""
+    ### Individual Traits vs Scale 1-10
+    """)
+    return
+
+
+@app.cell
+def _(S, joined_scale):
+    _content = """"""
+
+    for _style, _traits in SPEAKING_STYLES.items():
+        # Title names the scale data plotted here, so it no longer duplicates the ranking cell's title
+        _fig = S.plot_speaking_style_correlation(
+            data=joined_scale,
+            style_color=_style,
+            style_traits=_traits,
+            title=f"Correlation: Speaking Style {_style} and Voice Scale 1-10",
+        )
+        _content += f"""
+    #### Speaking Style **{_style}**:
+
+    {mo.ui.altair_chart(_fig)}
+
+    """
+    mo.md(_content)
+    return
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    ### Individual Traits vs Ranking Points
+    """)
+    return
+
+
+@app.cell
+def _(S, joined_ranking):
+    _content = """"""
+
+    for _style, _traits in SPEAKING_STYLES.items():
+        # One chart per speaking style, embedded under its own sub-heading
+        _fig = S.plot_speaking_style_ranking_correlation(
+            data=joined_ranking,
+            style_color=_style,
+            style_traits=_traits,
+            title=f"Correlation: Speaking Style {_style} and Voice Ranking Points",
+        )
+        _content += f"""
+    #### Speaking Style **{_style}**:
+
+    {mo.ui.altair_chart(_fig)}
+
+    """
+    mo.md(_content)
+    return
+
+
 if __name__ == "__main__":
    app.run()
--- a/plots.py
+++ b/plots.py
@@ -1048,6 +1048,59 @@ class QualtricsPlotsMixin:
        chart = self._save_plot(chart, title)
        return chart

+    def plot_speaking_style_color_correlation(
+        self,
+        data: pl.LazyFrame | pl.DataFrame | None = None,
+        title: str = "Speaking Style and Voice Scale 1-10 Correlations<br>(Average by Color)",
+        width: int | str | None = None,
+        height: int | None = None,
+    ) -> alt.Chart:
+        """Plot one bar per speaking style color showing its average correlation.
+        
+        Original use-case: "I want to create high-level correlation plots between
+        'green, blue, orange, red' speaking styles and the 'voice scale scores'.
+        I want to go to one plot with one bar for each color."
+        
+        Args:
+            data: Frame with columns [Color, correlation, n_traits], e.g. the first
+                  element returned by utils.transform_speaking_style_color_correlation
+            title: Chart title (supports <br> for line breaks); also passed to _save_plot
+            width: Chart width in pixels (falls back to 400 when not given)
+            height: Chart height in pixels (falls back to 350 when not given)
+        
+        Returns:
+            Altair bar chart; bars are green for correlation >= 0, red otherwise
+        """
+        df = self._ensure_dataframe(data)
+        
+        # Bar color encodes the sign of the correlation, not the speaking style color
+        chart = alt.Chart(df.to_pandas()).mark_bar().encode(
+            x=alt.X('Color:N', 
+                    title=None, 
+                    axis=alt.Axis(labelAngle=0),
+                    sort=["Green", "Blue", "Orange", "Red"]),
+            y=alt.Y('correlation:Q', 
+                    title='Average Correlation',
+                    scale=alt.Scale(domain=[-1, 1])),
+            color=alt.condition(
+                alt.datum.correlation >= 0,
+                alt.value('green'),
+                alt.value('red')
+            ),
+            tooltip=[
+                alt.Tooltip('Color:N', title='Speaking Style'),
+                alt.Tooltip('correlation:Q', format='.3f', title='Avg Correlation'),
+                alt.Tooltip('n_traits:Q', title='# Traits')
+            ]
+        ).properties(
+            title=self._process_title(title),
+            width=width or 400,
+            height=height or 350
+        )
+        
+        chart = self._save_plot(chart, title)
+        return chart
+
    def plot_demographic_distribution(
        self,
        column: str,
--- a/theme.py
+++ b/theme.py
@@ -77,6 +77,12 @@ class ColorPalette:
    GENDER_MALE_NEUTRAL = "#B8C9D9"   # Grey-Blue
    GENDER_FEMALE_NEUTRAL = "#D9B8C9" # Grey-Pink

+    # Speaking Style Colors (named after the style quadrant colors)
+    STYLE_GREEN = "#2E7D32"   # Forest Green
+    STYLE_BLUE = "#1565C0"    # Strong Blue
+    STYLE_ORANGE = "#E07A00"  # Burnt Orange
+    STYLE_RED = "#C62828"     # Deep Red
+

 def jpmc_altair_theme():
    """JPMC brand theme for Altair charts."""
--- a/utils.py
+++ b/utils.py
@@ -1676,6 +1676,69 @@ def join_voice_and_style_data(
        how="inner"
    )

+
+def transform_speaking_style_color_correlation(
+    joined_df: pl.LazyFrame | pl.DataFrame,
+    speaking_styles: dict[str, list[str]],
+    target_column: str = "Voice_Scale_Score"
+) -> tuple[pl.DataFrame, dict | None]:
+    """Aggregate speaking style correlation by color (Green, Blue, Orange, Red).
+
+    Backs the high-level plot with one bar per speaking style color: for every
+    trait of a color, ``score`` is correlated with ``target_column`` on the rows
+    whose ``Right_Anchor`` equals that trait, and the color's value is the mean
+    of those per-trait correlations. Traits with fewer than two complete rows or
+    an undefined correlation (null/NaN, e.g. a constant column) are skipped;
+    colors without any usable trait are omitted.
+
+    Parameters
+    ----------
+    joined_df : pl.LazyFrame or pl.DataFrame
+        Speaking style data joined with the target data (collected if lazy).
+        Must have columns: Right_Anchor, score, and the target_column
+    speaking_styles : dict
+        Dictionary mapping color names to their constituent traits.
+        Typically imported from speaking_styles.SPEAKING_STYLES
+    target_column : str
+        The column to correlate against speaking style scores.
+        Default: "Voice_Scale_Score" (for voice scale 1-10)
+        Alternative: "Ranking_Points" (for top 3 voice ranking)
+
+    Returns
+    -------
+    tuple[pl.DataFrame, dict | None]
+        (DataFrame with columns [Color, correlation, n_traits], None).
+        The three columns are present even when no color produced a correlation.
+    """
+    if isinstance(joined_df, pl.LazyFrame):
+        joined_df = joined_df.collect()
+
+    color_correlations = []
+    for color, traits in speaking_styles.items():
+        trait_corrs = []
+        for trait in traits:
+            valid_data = (
+                joined_df.filter(pl.col("Right_Anchor") == trait)
+                .select(["score", target_column])
+                .drop_nulls()
+            )
+            if valid_data.height < 2:
+                continue
+            # NaN (e.g. zero variance) becomes null so it is skipped instead of poisoning the average
+            corr_expr = pl.corr("score", target_column).fill_nan(None)
+            corr_val = valid_data.select(corr_expr).item()
+            if corr_val is not None:
+                trait_corrs.append(corr_val)
+        if trait_corrs:
+            color_correlations.append({
+                "Color": color,
+                "correlation": sum(trait_corrs) / len(trait_corrs),
+                "n_traits": len(trait_corrs),
+            })
+    schema = {"Color": pl.Utf8, "correlation": pl.Float64, "n_traits": pl.Int64}
+    return pl.DataFrame(color_correlations, schema=schema), None
+
+
 def process_voice_ranking_data(
    df: Union[pl.LazyFrame, pl.DataFrame]
 ) -> pl.DataFrame: