diff --git a/03_quant_report.py b/03_quant_report.py
index 5ef8436..a902439 100644
--- a/03_quant_report.py
+++ b/03_quant_report.py
@@ -664,5 +664,136 @@ def _():
     return
 
 
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    ## Correlation Speaking Styles
+    """)
+    return
+
+
+@app.cell
+def _(S, data, top3_voices):
+    ss_or, choice_map_or = S.get_ss_orange_red(data)
+    ss_gb, choice_map_gb = S.get_ss_green_blue(data)
+
+    # Combine the orange/red and green/blue halves and their choice maps
+    ss_all = ss_or.join(ss_gb, on='_recordId')
+    choice_map = {**choice_map_or, **choice_map_gb}
+
+    # Processed speaking-style scores, built once and reused by both joins below
+    df_style = utils.process_speaking_style_data(ss_all, choice_map)
+
+    vscales = S.get_voice_scale_1_10(data)[0]
+    df_scale_long = utils.process_voice_scale_data(vscales)
+
+    joined_scale = df_style.join(df_scale_long, on=["_recordId", "Voice"], how="inner")
+
+    df_ranking = utils.process_voice_ranking_data(top3_voices)
+    joined_ranking = df_style.join(df_ranking, on=['_recordId', 'Voice'], how='inner')
+    return joined_ranking, joined_scale
+
+
+@app.cell
+def _():
+    mo.md(r"""
+    ### Colors vs Scale 1-10
+    """)
+    return
+
+
+@app.cell
+def _(S, joined_scale):
+    # Transform to get one row per color with average correlation
+    color_corr_scale, _ = utils.transform_speaking_style_color_correlation(joined_scale, SPEAKING_STYLES)
+    S.plot_speaking_style_color_correlation(
+        data=color_corr_scale,
+        title="Correlation: Speaking Style Colors and Voice Scale 1-10"
+    )
+    return
+
+
+@app.cell
+def _():
+    mo.md(r"""
+    ### Colors vs Ranking Points
+    """)
+    return
+
+
+@app.cell
+def _(S, joined_ranking):
+    color_corr_ranking, _ = utils.transform_speaking_style_color_correlation(
+        joined_ranking,
+        SPEAKING_STYLES,
+        target_column="Ranking_Points"
+    )
+    S.plot_speaking_style_color_correlation(
+        data=color_corr_ranking,
+        title="Correlation: Speaking Style Colors and Voice Ranking Points"
+    )
+    return
+
+
+@app.cell
+def _():
+    mo.md(r"""
+    ### Individual Traits vs Scale 1-10
+    """)
+    return
+
+
+@app.cell
+def _(S, joined_scale):
+    _content = """"""
+
+    for _style, _traits in SPEAKING_STYLES.items():
+        # print(f"Correlation plot for {style}...")
+        _fig = S.plot_speaking_style_correlation(
+            data=joined_scale,
+            style_color=_style,
+            style_traits=_traits,
+            title=f"Correlation: Speaking Style {_style} and Voice Scale 1-10",
+        )
+        _content += f"""
+    #### Speaking Style **{_style}**:
+
+    {mo.ui.altair_chart(_fig)}
+
+    """
+    mo.md(_content)
+    return
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    ### Individual Traits vs Ranking Points
+    """)
+    return
+
+
+@app.cell
+def _(S, joined_ranking):
+    _content = """"""
+
+    for _style, _traits in SPEAKING_STYLES.items():
+        # print(f"Correlation plot for {style}...")
+        _fig = S.plot_speaking_style_ranking_correlation(
+            data=joined_ranking,
+            style_color=_style,
+            style_traits=_traits,
+            title=f"Correlation: Speaking Style {_style} and Voice Ranking Points",
+        )
+        _content += f"""
+    #### Speaking Style **{_style}**:
+
+    {mo.ui.altair_chart(_fig)}
+
+    """
+    mo.md(_content)
+    return
+
+
 if __name__ == "__main__":
     app.run()
diff --git a/plots.py b/plots.py
index ca81be4..0bee376 100644
--- a/plots.py
+++ b/plots.py
@@ -1048,6 +1048,59 @@ class QualtricsPlotsMixin:
         chart = self._save_plot(chart, title)
         return chart
 
+    def plot_speaking_style_color_correlation(
+        self,
+        data: pl.LazyFrame | pl.DataFrame | None = None,
+        title: str = "Speaking Style and Voice Scale 1-10 Correlations<br>(Average by Color)",
+        width: int | str | None = None,
+        height: int | None = None,
+    ) -> alt.Chart:
+        """Plot high-level correlation showing one bar per speaking style color.
+
+        Original use-case: "I want to create high-level correlation plots between
+        'green, blue, orange, red' speaking styles and the 'voice scale scores'.
+        I want to go to one plot with one bar for each color."
+
+        Args:
+            data: DataFrame with columns [Color, correlation, n_traits] from
+                utils.transform_speaking_style_color_correlation
+            title: Chart title (supports <br> for line breaks)
+            width: Chart width in pixels
+            height: Chart height in pixels
+
+        Returns:
+            Altair chart with one bar per speaking style color
+        """
+        df = self._ensure_dataframe(data)
+
+        # Conditional color based on sign (matches plot_speaking_style_correlation)
+        chart = alt.Chart(df.to_pandas()).mark_bar().encode(
+            x=alt.X('Color:N',
+                    title=None,
+                    axis=alt.Axis(labelAngle=0),
+                    sort=["Green", "Blue", "Orange", "Red"]),
+            y=alt.Y('correlation:Q',
+                    title='Average Correlation',
+                    scale=alt.Scale(domain=[-1, 1])),
+            color=alt.condition(
+                alt.datum.correlation >= 0,
+                alt.value('green'),
+                alt.value('red')
+            ),
+            tooltip=[
+                alt.Tooltip('Color:N', title='Speaking Style'),
+                alt.Tooltip('correlation:Q', format='.3f', title='Avg Correlation'),
+                alt.Tooltip('n_traits:Q', title='# Traits')
+            ]
+        ).properties(
+            title=self._process_title(title),
+            width=width or 400,
+            height=height or 350
+        )
+
+        chart = self._save_plot(chart, title)
+        return chart
+
     def plot_demographic_distribution(
         self,
         column: str,
diff --git a/theme.py b/theme.py
index 9164769..9ad9914 100644
--- a/theme.py
+++ b/theme.py
@@ -77,6 +77,12 @@ class ColorPalette:
     GENDER_MALE_NEUTRAL = "#B8C9D9"  # Grey-Blue
     GENDER_FEMALE_NEUTRAL = "#D9B8C9"  # Grey-Pink
 
+    # Speaking Style Colors (named after the style quadrant colors)
+    STYLE_GREEN = "#2E7D32"  # Forest Green
+    STYLE_BLUE = "#1565C0"  # Strong Blue
+    STYLE_ORANGE = "#E07A00"  # Burnt Orange
+    STYLE_RED = "#C62828"  # Deep Red
+
 
 def jpmc_altair_theme():
     """JPMC brand theme for Altair charts."""
diff --git a/utils.py b/utils.py
index 2f0d34f..f784f3d 100644
--- a/utils.py
+++ b/utils.py
@@ -1676,6 +1676,84 @@ def join_voice_and_style_data(
         how="inner"
     )
 
+
+def transform_speaking_style_color_correlation(
+    joined_df: pl.LazyFrame | pl.DataFrame,
+    speaking_styles: dict[str, list[str]],
+    target_column: str = "Voice_Scale_Score"
+) -> tuple[pl.DataFrame, dict | None]:
+    """Aggregate speaking style correlation by color (Green, Blue, Orange, Red).
+
+    Original use-case: "I want to create high-level correlation plots between
+    'green, blue, orange, red' speaking styles and the 'voice scale scores'.
+    I want to go to one plot with one bar for each color."
+
+    This function calculates the mean correlation per speaking style color by
+    averaging the correlations of all traits within each color.
+    Traits with fewer than two paired observations, or whose correlation is
+    undefined (NaN, e.g. a constant column), are left out of the average.
+
+    Parameters
+    ----------
+    joined_df : pl.LazyFrame or pl.DataFrame
+        Pre-fetched data from joining speaking style data with target data.
+        Must have columns: Right_Anchor, score, and the target_column
+    speaking_styles : dict
+        Dictionary mapping color names to their constituent traits.
+        Typically imported from speaking_styles.SPEAKING_STYLES
+    target_column : str
+        The column to correlate against speaking style scores.
+        Default: "Voice_Scale_Score" (for voice scale 1-10)
+        Alternative: "Ranking_Points" (for top 3 voice ranking)
+
+    Returns
+    -------
+    tuple[pl.DataFrame, dict | None]
+        (DataFrame with columns [Color, correlation, n_traits], None)
+        Colors without any usable trait correlation are omitted.
+    """
+    if isinstance(joined_df, pl.LazyFrame):
+        joined_df = joined_df.collect()
+
+    color_correlations = []
+
+    for color, traits in speaking_styles.items():
+        trait_corrs = []
+        for trait in traits:
+            # Filter to this specific trait
+            subset = joined_df.filter(pl.col("Right_Anchor") == trait)
+            valid_data = subset.select(["score", target_column]).drop_nulls()
+
+            if valid_data.height > 1:
+                # A zero-variance column yields NaN; map it to null so it is
+                # skipped instead of poisoning the color average
+                corr_val = valid_data.select(
+                    pl.corr("score", target_column).fill_nan(None)
+                ).item()
+                if corr_val is not None:
+                    trait_corrs.append(corr_val)
+
+        # Average across all traits for this color
+        if trait_corrs:
+            avg_corr = sum(trait_corrs) / len(trait_corrs)
+            color_correlations.append({
+                "Color": color,
+                "correlation": avg_corr,
+                "n_traits": len(trait_corrs)
+            })
+
+    # Explicit schema keeps the documented columns even with zero rows
+    result_df = pl.DataFrame(
+        color_correlations,
+        schema={
+            "Color": pl.Utf8,
+            "correlation": pl.Float64,
+            "n_traits": pl.Int64,
+        },
+    )
+    return result_df, None
+
+
 def process_voice_ranking_data(
     df: Union[pl.LazyFrame, pl.DataFrame]
 ) -> pl.DataFrame: