correlation plots for best bc

2026-02-04 10:46:31 +01:00
parent ad1d8c6e58
commit e17646eb70
2 changed files with 164 additions and 78 deletions
--- a/03_quant_report.script.py
+++ b/03_quant_report.script.py
@@ -570,7 +570,7 @@ _content = """"""

 for _style, _traits in SPEAKING_STYLES.items():
    # print(f"Correlation plot for {style}...")
-    _fig = S.plot_speaking_style_correlation(
+    _fig = S.plot_speaking_style_scale_correlation(
        data=joined_scale,
        style_color=_style,
        style_traits=_traits,
@@ -609,86 +609,145 @@ for _style, _traits in SPEAKING_STYLES.items():
 mo.md(_content)

 # %%
-mo.md(r"""
-## Correlations when "Best Brand Character" is chosen
-
-Select only the traits that fit with that character
-""")
+# ## Correlations when "Best Brand Character" is chosen
+# For each of the 4 brand characters, filter the dataset to only those respondents
+# who ranked that character #1.

 # %%
-from reference import ORIGINAL_CHARACTER_TRAITS
-chosen_bc_traits = ORIGINAL_CHARACTER_TRAITS[BEST_CHOSEN_CHARACTER]
+# Prepare character-filtered data subsets
+char_rank_for_filter = S.get_character_ranking(data)[0].collect()
+
+CHARACTER_FILTER_MAP = {
+    'Familiar Friend': 'Character_Ranking_Familiar_Friend',
+    'The Coach': 'Character_Ranking_The_Coach',
+    'Personal Assistant': 'Character_Ranking_The_Personal_Assistant',
+    'Bank Teller': 'Character_Ranking_The_Bank_Teller',
+}
+
+def get_filtered_data_for_character(char_name: str) -> tuple[pl.DataFrame, pl.DataFrame, int]:
+    """Return (filtered joined_scale, filtered joined_ranking, n) for the n respondents who ranked char_name #1."""
+    col = CHARACTER_FILTER_MAP[char_name]
+    respondents = char_rank_for_filter.filter(pl.col(col) == 1).select('_recordId')
+    n = respondents.height
+    filtered_scale = joined_scale.join(respondents, on='_recordId', how='inner')
+    filtered_ranking = joined_ranking.join(respondents, on='_recordId', how='inner')
+    return filtered_scale, filtered_ranking, n
+
+def _char_filename(char_name: str, suffix: str) -> str:
+    """Generate filename for character-filtered plots (without n-value).
+
+    Format: bc_ranked_1_{suffix}__{char_slug}; char_slug is char_name lowercased, spaces -> underscores.
+    The suffix comes first so each plot type's files sort together in directory listings.
+    """
+    char_slug = char_name.lower().replace(' ', '_')
+    return f"bc_ranked_1_{suffix}__{char_slug}"
+
+

 # %%
-STYLES_SUBSET = utils.filter_speaking_styles(SPEAKING_STYLES, chosen_bc_traits)
-
-# %%
-mo.md(r"""
-### Individual Traits vs Ranking Points
-""")
-
-# %%
-_content = ""
-for _style, _traits in STYLES_SUBSET.items():
-    _fig = S.plot_speaking_style_ranking_correlation(
-        data=joined_ranking,
-        style_color=_style,
-        style_traits=_traits,
-        title=f"""Brand Character "{BEST_CHOSEN_CHARACTER.replace('_', ' ').title()}" - Correlation: Speaking Style {_style} and Voice Ranking Points"""
+# ### Voice Weighted Ranking Score (by Best Character)
+for char_name in CHARACTER_FILTER_MAP:
+    _, _, n = get_filtered_data_for_character(char_name)
+    # Get top3 voices for this character subset using _recordIds
+    respondents = char_rank_for_filter.filter(
+        pl.col(CHARACTER_FILTER_MAP[char_name]) == 1
+    ).select('_recordId')
+    # Collect top3_voices if it's a LazyFrame, then join
+    top3_df = top3_voices.collect() if isinstance(top3_voices, pl.LazyFrame) else top3_voices
+    filtered_top3 = top3_df.join(respondents, on='_recordId', how='inner')
+    weighted = calculate_weighted_ranking_scores(filtered_top3)
+    S.plot_weighted_ranking_score(
+        data=weighted,
+        title=f'"{char_name}" Ranked #1 (n={n})<br>Most Popular Voice - Weighted Score (1st=3pts, 2nd=2pts, 3rd=1pt)',
+        filename=_char_filename(char_name, "voice_weighted_ranking_score"),
+        color_gender=COLOR_GENDER,
    )
-    _content += f"""
-{mo.ui.altair_chart(_fig)}
-
-"""
-mo.md(_content)

 # %%
-mo.md(r"""
-### Individual Traits vs Scale 1-10
-""")
-
-# %%
-_content = """"""
-
-for _style, _traits in STYLES_SUBSET.items():
-    # print(f"Correlation plot for {style}...")
-    _fig = S.plot_speaking_style_correlation(
-        data=joined_scale,
-        style_color=_style,
-        style_traits=_traits,
-        title=f"""Brand Character "{BEST_CHOSEN_CHARACTER.replace('_', ' ').title()}" - Correlation: Speaking Style {_style} and Voice Scale 1-10""",
+# ### Voice Scale 1-10 Average Scores (by Best Character)
+for char_name in CHARACTER_FILTER_MAP:
+    _, _, n = get_filtered_data_for_character(char_name)
+    # Get voice scale data for this character subset using _recordIds
+    respondents = char_rank_for_filter.filter(
+        pl.col(CHARACTER_FILTER_MAP[char_name]) == 1
+    ).select('_recordId')
+    # Collect voice_1_10 if it's a LazyFrame, then join
+    voice_1_10_df = voice_1_10.collect() if isinstance(voice_1_10, pl.LazyFrame) else voice_1_10
+    filtered_voice_1_10 = voice_1_10_df.join(respondents, on='_recordId', how='inner')
+    S.plot_average_scores_with_counts(
+        data=filtered_voice_1_10,
+        title=f'"{char_name}" Ranked #1 (n={n})<br>Voice General Impression (Scale 1-10)',
+        filename=_char_filename(char_name, "voice_scale_1-10"),
+        x_label='Voice',
+        domain=[1, 10],
+        color_gender=COLOR_GENDER,
    )
-    _content += f"""
-{mo.ui.altair_chart(_fig)}

-"""
-mo.md(_content)
+

 # %%
-mo.md(r"""
-### Colors vs Scale 1-10 (Best Character)
-""")
+# ### Speaking Style Colors vs Scale 1-10 (only for Best Character)
+for char_name in CHARACTER_FILTER_MAP:
+    if char_name.lower().replace(' ', '_') != BEST_CHOSEN_CHARACTER:
+        continue
+    
+    filtered_scale, _, n = get_filtered_data_for_character(char_name)
+    color_corr, _ = utils.transform_speaking_style_color_correlation(filtered_scale, SPEAKING_STYLES)
+    S.plot_speaking_style_color_correlation(
+        data=color_corr,
+        title=f'"{char_name}" Ranked #1 (n={n})<br>Correlation: Speaking Style Colors vs Voice Scale 1-10',
+        filename=_char_filename(char_name, "colors_vs_voice_scale_1-10"),
+    )

 # %%
-# Transform to get one row per color with average correlation
-_color_corr_scale, _ = utils.transform_speaking_style_color_correlation(joined_scale, STYLES_SUBSET)
-S.plot_speaking_style_color_correlation(
-    data=_color_corr_scale,
-    title=f"""Brand Character "{BEST_CHOSEN_CHARACTER.replace('_', ' ').title()}" - Correlation: Speaking Style Colors and Voice Scale 1-10"""
-)
+# ### Speaking Style Colors vs Ranking Points (only for Best Character)
+for char_name in CHARACTER_FILTER_MAP:
+    if char_name.lower().replace(' ', '_') != BEST_CHOSEN_CHARACTER:
+        continue
+    
+    _, filtered_ranking, n = get_filtered_data_for_character(char_name)
+    color_corr, _ = utils.transform_speaking_style_color_correlation(
+        filtered_ranking, SPEAKING_STYLES, target_column="Ranking_Points"
+    )
+    S.plot_speaking_style_color_correlation(
+        data=color_corr,
+        title=f'"{char_name}" Ranked #1 (n={n})<br>Correlation: Speaking Style Colors vs Voice Ranking Points',
+        filename=_char_filename(char_name, "colors_vs_voice_ranking_points"),
+    )

 # %%
-mo.md(r"""
-### Colors vs Ranking Points (Best Character)
-""")
+# ### Individual Traits vs Scale 1-10 (only for Best Character)
+for _style, _traits in SPEAKING_STYLES.items():
+    print(f"--- Speaking Style: {_style} ---")
+    for char_name in CHARACTER_FILTER_MAP:
+        if char_name.lower().replace(' ', '_') != BEST_CHOSEN_CHARACTER:
+            continue
+        
+        filtered_scale, _, n = get_filtered_data_for_character(char_name)
+        S.plot_speaking_style_scale_correlation(
+            data=filtered_scale,
+            style_color=_style,
+            style_traits=_traits,
+            title=f'"{char_name}" Ranked #1 (n={n})<br>Correlation: {_style} vs Voice Scale 1-10',
+            filename=_char_filename(char_name, f"{_style.lower()}_vs_voice_scale_1-10"),
+        )
+
+# %%
+# ### Individual Traits vs Ranking Points (only for Best Character)
+for _style, _traits in SPEAKING_STYLES.items():
+    print(f"--- Speaking Style: {_style} ---")
+    for char_name in CHARACTER_FILTER_MAP:
+        if char_name.lower().replace(' ', '_') != BEST_CHOSEN_CHARACTER:
+            continue
+        
+        _, filtered_ranking, n = get_filtered_data_for_character(char_name)
+        S.plot_speaking_style_ranking_correlation(
+            data=filtered_ranking,
+            style_color=_style,
+            style_traits=_traits,
+            title=f'"{char_name}" Ranked #1 (n={n})<br>Correlation: {_style} vs Voice Ranking Points',
+            filename=_char_filename(char_name, f"{_style.lower()}_vs_voice_ranking_points"),
+        )
+

 # %%
-_color_corr_ranking, _ = utils.transform_speaking_style_color_correlation(
-    joined_ranking, 
-    STYLES_SUBSET, 
-    target_column="Ranking_Points"
-)
-S.plot_speaking_style_color_correlation(
-    data=_color_corr_ranking,
-    title=f"""Brand Character "{BEST_CHOSEN_CHARACTER.replace('_', ' ').title()}" - Correlation: Speaking Style Colors and Voice Ranking Points"""
-)