base correlations

2026-02-03 01:32:06 +01:00
parent 1dce4db909
commit 2408d06098
4 changed files with 257 additions and 0 deletions
--- a/03_quant_report.py
+++ b/03_quant_report.py
@@ -664,5 +664,140 @@ def _():
    return
@app.cell(hide_code=True)
 def _():
    # Markdown-only cell: level-2 heading that opens the speaking-style
    # correlation part of the report.
    mo.md(r"""
    ## Correlation Speaking Styles
    """)
    return
@app.cell
 def _(S, data, top3_voices):
    ss_or, choice_map_or = S.get_ss_orange_red(data)
    ss_gb, choice_map_gb = S.get_ss_green_blue(data)
    # Combine the data
    ss_all = ss_or.join(ss_gb, on='_recordId')
    _d = ss_all.collect()
    choice_map = {**choice_map_or, **choice_map_gb}
    # print(_d.head())
    # print(choice_map)
    ss_long = utils.process_speaking_style_data(ss_all, choice_map)
    df_style = utils.process_speaking_style_data(ss_all, choice_map)
    vscales = S.get_voice_scale_1_10(data)[0]
    df_scale_long = utils.process_voice_scale_data(vscales)
    joined_scale = df_style.join(df_scale_long, on=["_recordId", "Voice"], how="inner")
    df_ranking = utils.process_voice_ranking_data(top3_voices)
    joined_ranking = df_style.join(df_ranking, on=['_recordId', 'Voice'], how='inner')
    return joined_ranking, joined_scale
@app.cell
 def _():
    # Markdown-only cell: sub-heading for the color-level correlation
    # against the 1-10 voice scale.
    mo.md(r"""
    ### Colors vs Scale 1-10
    """)
    return
@app.cell
 def _(S, joined_scale):
    # Transform to get one row per color with average correlation
    color_corr_scale, _ = utils.transform_speaking_style_color_correlation(joined_scale, SPEAKING_STYLES)
    S.plot_speaking_style_color_correlation(
        data=color_corr_scale,
        title="Correlation: Speaking Style Colors and Voice Scale 1-10"
    )
    return
@app.cell
 def _():
    # Markdown-only cell: sub-heading for the color-level correlation
    # against ranking points.
    mo.md(r"""
    ### Colors vs Ranking Points
    """)
    return
@app.cell
 def _(S, joined_ranking):
    color_corr_ranking, _ = utils.transform_speaking_style_color_correlation(
        joined_ranking, 
        SPEAKING_STYLES, 
        target_column="Ranking_Points"
    )
    S.plot_speaking_style_color_correlation(
        data=color_corr_ranking,
        title="Correlation: Speaking Style Colors and Voice Ranking Points"
    )
    return
@app.cell
 def _():
    # Markdown-only cell: sub-heading for the per-trait correlation
    # against the 1-10 voice scale.
    mo.md(r"""
    ### Individual Traits vs Scale 1-10
    """)
    return
@app.cell
 def _(S, joined_scale):
    _content = """"""
    for _style, _traits in SPEAKING_STYLES.items():
        # print(f"Correlation plot for {style}...")
        _fig = S.plot_speaking_style_correlation(
            data=joined_scale,
            style_color=_style,
            style_traits=_traits,
            title=f"Correlation: Speaking Style {_style} and Voice Ranking Points",
        )
        _content += f"""
    #### Speaking Style **{_style}**:
    {mo.ui.altair_chart(_fig)}
    """
    mo.md(_content)
    return
@app.cell(hide_code=True)
 def _():
    # Markdown-only cell: sub-heading for the per-trait correlation
    # against ranking points.
    mo.md(r"""
    ### Individual Traits vs Ranking Points
    """)
    return
@app.cell
 def _(S, joined_ranking):
    _content = """"""
    for _style, _traits in SPEAKING_STYLES.items():
        # print(f"Correlation plot for {style}...")
        _fig = S.plot_speaking_style_ranking_correlation(
        data=joined_ranking,
        style_color=_style,
        style_traits=_traits,
        title=f"Correlation: Speaking Style {_style} and Voice Ranking Points",
    )
        _content += f"""
    #### Speaking Style **{_style}**:
    {mo.ui.altair_chart(_fig)}
    """
    mo.md(_content)
    return
 # Script entry point: run the app only when this file is executed directly,
 # not when it is imported.
 if __name__ == "__main__":
    app.run()
--- a/plots.py
+++ b/plots.py
@@ -1048,6 +1048,59 @@ class QualtricsPlotsMixin:
        chart = self._save_plot(chart, title)
        return chart
    def plot_speaking_style_color_correlation(
        self,
        data: pl.LazyFrame | pl.DataFrame | None = None,
        title: str = "Speaking Style and Voice Scale 1-10 Correlations<br>(Average by Color)",
        width: int | str | None = None,
        height: int | None = None,
    ) -> alt.Chart:
        """Draw a bar chart with one bar per speaking style color.

        High-level companion to the per-trait correlation plots: each bar is
        the average correlation for one color (Green, Blue, Orange, Red),
        drawn on a fixed [-1, 1] y-axis.

        Args:
            data: Frame with columns [Color, correlation, n_traits], as
                  produced by utils.transform_speaking_style_color_correlation.
                  It is passed through self._ensure_dataframe first.
            title: Chart title. It goes through self._process_title for
                   display (the original docs note <br> line-break support)
                   and is handed unprocessed to self._save_plot.
            width: Chart width; falls back to 400 when not given.
            height: Chart height; falls back to 350 when not given.

        Returns:
            Whatever self._save_plot returns for the finished chart.
        """
        frame = self._ensure_dataframe(data)

        # Fixed category order so the colors always appear in the same sequence.
        x_axis = alt.X(
            'Color:N',
            title=None,
            axis=alt.Axis(labelAngle=0),
            sort=["Green", "Blue", "Orange", "Red"],
        )
        y_axis = alt.Y(
            'correlation:Q',
            title='Average Correlation',
            scale=alt.Scale(domain=[-1, 1]),
        )
        # Bars are colored by sign, not by style color
        # (matches plot_speaking_style_correlation).
        sign_color = alt.condition(
            alt.datum.correlation >= 0,
            alt.value('green'),
            alt.value('red'),
        )
        hover = [
            alt.Tooltip('Color:N', title='Speaking Style'),
            alt.Tooltip('correlation:Q', format='.3f', title='Avg Correlation'),
            alt.Tooltip('n_traits:Q', title='# Traits'),
        ]

        bars = alt.Chart(frame.to_pandas()).mark_bar().encode(
            x=x_axis,
            y=y_axis,
            color=sign_color,
            tooltip=hover,
        )
        chart = bars.properties(
            title=self._process_title(title),
            width=width or 400,
            height=height or 350,
        )
        return self._save_plot(chart, title)
    def plot_demographic_distribution(
        self,
        column: str,
--- a/theme.py
+++ b/theme.py
@@ -77,6 +77,12 @@ class ColorPalette:
    GENDER_MALE_NEUTRAL = "#B8C9D9"   # Grey-Blue
    GENDER_FEMALE_NEUTRAL = "#D9B8C9" # Grey-Pink
    # Speaking Style Colors (named after the style quadrant colors)
    STYLE_GREEN = "#2E7D32"   # Forest Green
    STYLE_BLUE = "#1565C0"    # Strong Blue
    STYLE_ORANGE = "#E07A00"  # Burnt Orange
    STYLE_RED = "#C62828"     # Deep Red
 def jpmc_altair_theme():
    """JPMC brand theme for Altair charts."""
--- a/utils.py
+++ b/utils.py
@@ -1676,6 +1676,69 @@ def join_voice_and_style_data(
        how="inner"
    )
 def transform_speaking_style_color_correlation(
    joined_df: pl.LazyFrame | pl.DataFrame,
    speaking_styles: dict[str, list[str]],
    target_column: str = "Voice_Scale_Score"
 ) -> tuple[pl.DataFrame, dict | None]:
    """Aggregate speaking style correlation by color (Green, Blue, Orange, Red).

    Original use-case: "I want to create high-level correlation plots between
    'green, blue, orange, red' speaking styles and the 'voice scale scores'.
    I want to go to one plot with one bar for each color."

    For every trait of a color, the correlation between ``score`` and
    ``target_column`` is computed on the rows whose ``Right_Anchor`` equals
    that trait (rows with nulls in either column are dropped). The color's
    value is the plain mean of its usable trait correlations.

    A trait is skipped when it has fewer than two valid rows or when its
    correlation is undefined (null or NaN, e.g. when one of the two columns
    has zero variance), so a single degenerate trait cannot turn the whole
    color average into NaN. Colors without any usable trait are omitted.

    Parameters
    ----------
    joined_df : pl.LazyFrame or pl.DataFrame
        Pre-fetched data from joining speaking style data with target data.
        Must have columns: Right_Anchor, score, and the target_column.
        A LazyFrame is collected once up front.
    speaking_styles : dict
        Dictionary mapping color names to their constituent traits.
        Typically imported from speaking_styles.SPEAKING_STYLES
    target_column : str
        The column to correlate against speaking style scores.
        Default: "Voice_Scale_Score" (for voice scale 1-10)
        Alternative: "Ranking_Points" (for top 3 voice ranking)

    Returns
    -------
    tuple[pl.DataFrame, dict | None]
        (DataFrame with columns [Color, correlation, n_traits], None).
        The three columns are present even when no color produced a value.
    """
    import math  # function-scope import keeps this block self-contained

    if isinstance(joined_df, pl.LazyFrame):
        joined_df = joined_df.collect()

    color_correlations = []
    for color, traits in speaking_styles.items():
        trait_corrs = []
        for trait in traits:
            # Rows for this specific trait with both values present
            valid_data = (
                joined_df.filter(pl.col("Right_Anchor") == trait)
                .select(["score", target_column])
                .drop_nulls()
            )
            # A correlation needs at least two observations
            if valid_data.height < 2:
                continue
            corr_val = valid_data.select(pl.corr("score", target_column)).item()
            # Undefined correlations must not leak into the average
            if corr_val is None or math.isnan(corr_val):
                continue
            trait_corrs.append(corr_val)

        # Average across all usable traits for this color
        if trait_corrs:
            color_correlations.append({
                "Color": color,
                "correlation": sum(trait_corrs) / len(trait_corrs),
                "n_traits": len(trait_corrs),
            })

    # Explicit schema so an empty result still exposes the documented columns
    result_df = pl.DataFrame(
        color_correlations,
        schema={
            "Color": pl.Utf8,
            "correlation": pl.Float64,
            "n_traits": pl.Int64,
        },
    )
    return result_df, None
 def process_voice_ranking_data(
    df: Union[pl.LazyFrame, pl.DataFrame]
 ) -> pl.DataFrame: