850 lines
28 KiB
Python
850 lines
28 KiB
Python
|
|
__generated_with = "0.19.7"
|
|
|
|
# %%
|
|
import marimo as mo
|
|
import polars as pl
|
|
from pathlib import Path
|
|
import argparse
|
|
import json
|
|
import re
|
|
from validation import check_progress, duration_validation, check_straight_liners
|
|
from utils import QualtricsSurvey, combine_exclusive_columns, calculate_weighted_ranking_scores
|
|
import utils
|
|
|
|
from speaking_styles import SPEAKING_STYLES
|
|
|
|
# %% Fixed Variables
|
|
|
|
RESULTS_FILE = 'data/exports/2-4-26/JPMC_Chase Brand Personality_Quant Round 1_February 4, 2026_Labels.csv'
# RESULTS_FILE = 'data/exports/debug/JPMC_Chase Brand Personality_Quant Round 1_February 2, 2026_Labels.csv'
QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'


# %%
# CLI argument parsing for batch automation
# When run as script: python 03_quant_report.script.py --age '["18 to 21 years"]' --consumer '["Starter"]'
# When run in Jupyter: args will use defaults (all filters = None = all options selected)

# Central filter configuration - add new filters here only
# Format: 'cli_arg_name': 'QualtricsSurvey.options_* attribute name'
FILTER_CONFIG = {
    'age': 'options_age',
    'gender': 'options_gender',
    'ethnicity': 'options_ethnicity',
    'income': 'options_income',
    'consumer': 'options_consumer',
    'business_owner': 'options_business_owner',
    'ai_user': 'options_ai_user',
    'investable_assets': 'options_investable_assets',
    'industry': 'options_industry',
}


def _parse_filter_values(parser, filter_name, raw):
    """Decode one ``--<filter_name>`` value into a list (or None when unset).

    Args:
        parser: The ArgumentParser, used to report bad input via ``parser.error``.
        filter_name: CLI name of the filter (a key of FILTER_CONFIG).
        raw: The raw string from the command line, or None.

    Returns:
        None when the option was not given (or was JSON ``null``), otherwise a list.
    """
    if not raw:
        return None
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError as exc:
        # parser.error prints usage + message and exits with status 2.
        parser.error(f"--{filter_name} must be a JSON list, e.g. '[\"value\"]' ({exc})")
    if parsed is None:
        return None
    if isinstance(parsed, str):
        # A bare JSON string would later be iterated character by character
        # (e.g. by ', '.join(values)), so wrap it into a one-element list.
        return [parsed]
    if not isinstance(parsed, list):
        parser.error(f"--{filter_name} must be a JSON list, got {type(parsed).__name__}")
    return parsed


def parse_cli_args():
    """Build the run configuration from the command line.

    When ``get_ipython`` exists (interactive session) the command line is not
    read: every option keeps its argparse default, except ``figures_dir`` which
    points at ``figures/statistical_significance/<export dir>``.

    Returns:
        argparse.Namespace with one attribute per FILTER_CONFIG key (a list or
        None) plus filter_name, figures_dir, best_character, sl_threshold and
        voice_ranking_filter.
    """
    # Third path component of RESULTS_FILE, e.g. the export date folder.
    export_dir = Path(RESULTS_FILE).parts[2]

    parser = argparse.ArgumentParser(description='Generate quant report with optional filters')

    # Dynamically add filter arguments from config
    for filter_name in FILTER_CONFIG:
        parser.add_argument(f'--{filter_name}', type=str, default=None, help=f'JSON list of {filter_name} values')

    parser.add_argument('--filter-name', type=str, default=None, help='Name for this filter combination (used for .txt description file)')
    parser.add_argument('--figures-dir', type=str, default=f'figures/{export_dir}', help='Override the default figures directory')
    parser.add_argument('--best-character', type=str, default="the_coach", help='Slug of the best chosen character (default: "the_coach")')
    parser.add_argument('--sl-threshold', type=int, default=None, help='Exclude respondents who straight-lined >= N question groups (e.g. 3 removes anyone with 3+ straight-lined groups)')
    parser.add_argument('--voice-ranking-filter', type=str, default=None, choices=['only-missing', 'exclude-missing'], help='Filter by voice ranking completeness: "only-missing" keeps only respondents missing QID98 ranking data, "exclude-missing" removes them')

    # Only parse if running as script (not in Jupyter/interactive)
    try:
        get_ipython()  # noqa: F821  # type: ignore
    except NameError:
        interactive = False
    else:
        interactive = True

    if interactive:
        # Take the defaults from the parser itself so a newly added option
        # never has to be mirrored by hand here.
        args = parser.parse_args([])
        args.figures_dir = f'figures/statistical_significance/{export_dir}'
        return args

    args = parser.parse_args()
    # Parse JSON strings to lists
    for filter_name in FILTER_CONFIG:
        setattr(args, filter_name, _parse_filter_values(parser, filter_name, getattr(args, filter_name)))
    return args
|
|
|
|
# Parse the CLI options once; the rest of the script reads from this namespace.
cli_args = parse_cli_args()
BEST_CHOSEN_CHARACTER = cli_args.best_character


# %%
S = QualtricsSurvey(RESULTS_FILE, QSF_FILE, figures_dir=cli_args.figures_dir)
try:
    data_all = S.load_data()
except NotImplementedError as load_error:
    # Show the loader's message as markdown via mo.stop.
    mo.stop(True, mo.md(f"**⚠️ {load_error}**"))


# %% Build filtered dataset based on CLI args

# CLI args: None means "no filter applied" - filter_data() will skip None filters

# Build filter values dict dynamically from FILTER_CONFIG
_active_filters = {}
for _filter_key in FILTER_CONFIG:
    _active_filters[_filter_key] = getattr(cli_args, _filter_key)

# %% Apply filters
_d = S.filter_data(data_all, **_active_filters)
|
|
|
|
# Write filter description file if filter-name is provided
if cli_args.filter_name and S.fig_save_dir:
    # Get the filter slug (e.g., "All_Respondents", "Cons-Starter", etc.)
    _filter_slug = S._get_filter_slug()
    _filter_slug_dir = S.fig_save_dir / _filter_slug
    _filter_slug_dir.mkdir(parents=True, exist_ok=True)

    # Build filter description
    _filter_desc_lines = [
        f"Filter: {cli_args.filter_name}",
        "",
        "Applied Filters:",
    ]
    _short_desc_parts = []
    for filter_name, options_attr in FILTER_CONFIG.items():
        all_options = getattr(S, options_attr)
        values = _active_filters[filter_name]
        display_name = filter_name.replace('_', ' ').title()
        # None means no filter applied (same as "All")
        if values is not None and values != all_options:
            _entry = f"{display_name}: {', '.join(values)}"
            _short_desc_parts.append(_entry)
            _filter_desc_lines.append(f" {_entry}")
        else:
            _filter_desc_lines.append(f" {display_name}: All")

    # Write detailed description INSIDE the filter-slug directory
    # Sanitize filter name for filename usage (replace / and other chars)
    _safe_filter_name = re.sub(r'[^\w\s-]', '_', cli_args.filter_name)
    _filter_file = _filter_slug_dir / f"{_safe_filter_name}.txt"
    _filter_file.write_text('\n'.join(_filter_desc_lines))

    # Append to summary index file at figures/<export_date>/filter_index.txt
    _summary_file = S.fig_save_dir / "filter_index.txt"
    _short_desc = "; ".join(_short_desc_parts) if _short_desc_parts else "All Respondents"
    _summary_line = f"{_filter_slug} | {cli_args.filter_name} | {_short_desc}\n"

    # Append or create the summary file
    if _summary_file.exists():
        # Avoid duplicate entries for same slug. Compare against the first
        # column only: a substring test over the whole file would also match
        # a slug that merely appears inside another slug, name or description,
        # and the new entry would then never be written.
        _indexed_slugs = {
            _line.split(" | ", 1)[0].strip()
            for _line in _summary_file.read_text().splitlines()
            if " | " in _line
        }
        if _filter_slug not in _indexed_slugs:
            with _summary_file.open('a') as f:
                f.write(_summary_line)
    else:
        _header = "Filter Index\n" + "=" * 80 + "\n\n"
        _header += "Directory | Filter Name | Description\n"
        _header += "-" * 80 + "\n"
        _summary_file.write_text(_header + _summary_line)
|
|
|
|
# %% Apply straight-liner threshold filter (if specified)
# Drops respondents whose number of straight-lined question groups (speaking
# style + voice scale questions) reaches the --sl-threshold value.
if cli_args.sl_threshold is not None:
    _sl_n = cli_args.sl_threshold
    S.sl_threshold = _sl_n  # Store on Survey so filter slug/description include it
    print(f"Applying straight-liner filter: excluding respondents with ≥{_sl_n} straight-lined question groups...")
    _n_before = _d.select(pl.len()).collect().item()

    # Pull the three question groups (renamed columns) and line them up per respondent
    _sl_groups = [
        getter(_d)[0]
        for getter in (S.get_ss_orange_red, S.get_ss_green_blue, S.get_voice_scale_1_10)
    ]
    _sl_all_q = (
        _sl_groups[0]
        .join(_sl_groups[1], on='_recordId')
        .join(_sl_groups[2], on='_recordId')
    )

    _sl_df = check_straight_liners(_sl_all_q, max_score=5)[1]

    if _sl_df is None or _sl_df.is_empty():
        print(" No straight-liners detected — no respondents removed.")
    else:
        # One row per respondent whose straight-lined group count reaches the threshold
        _sl_per_respondent = _sl_df.group_by("Record ID").agg(pl.len().alias("sl_count"))
        _sl_offenders = _sl_per_respondent.filter(pl.col("sl_count") >= _sl_n)
        _sl_offender_ids = _sl_offenders.select(pl.col("Record ID").alias("_recordId"))

        # Anti-join keeps only respondents that are NOT in the offender list
        _d = _d.collect().join(_sl_offender_ids, on="_recordId", how="anti").lazy()
        # Keep the Survey object's filtered data in sync so sample size is correct
        S.data_filtered = _d
        _n_after = _d.select(pl.len()).collect().item()
        print(f" Removed {_n_before - _n_after} respondents ({_n_before} → {_n_after})")
|
|
|
|
# %% Apply voice-ranking completeness filter (if specified)
# Keeps only / excludes respondents who are missing the explicit voice
# ranking question (QID98) despite completing the top-3 selection (QID36).
if cli_args.voice_ranking_filter is not None:
    S.voice_ranking_filter = cli_args.voice_ranking_filter  # Store on Survey so filter slug/description include it
    _vr_missing = S.get_top_3_voices_missing_ranking(_d)
    _vr_missing_ids = _vr_missing.select('_recordId')
    _n_before = _d.select(pl.len()).collect().item()

    # mode -> (progress message, join type applied against the missing-ranking ids)
    _vr_modes = {
        'only-missing': ("Voice ranking filter: keeping ONLY respondents missing QID98 ranking data...", 'inner'),
        'exclude-missing': ("Voice ranking filter: EXCLUDING respondents missing QID98 ranking data...", 'anti'),
    }
    _vr_mode = _vr_modes.get(cli_args.voice_ranking_filter)
    if _vr_mode is not None:
        _vr_message, _vr_how = _vr_mode
        print(_vr_message)
        _d = _d.collect().join(_vr_missing_ids, on='_recordId', how=_vr_how).lazy()

    S.data_filtered = _d
    _n_after = _d.select(pl.len()).collect().item()
    print(f" {_n_before} → {_n_after} respondents ({_vr_missing_ids.height} missing ranking data)")

# Save to logical variable name for further analysis
data = _d
data.collect()
|
|
|
|
|
|
|
|
# %%
|
|
# Check if all business owners are missing a 'Consumer type' in demographics
|
|
# assert all([a is None for a in data_all.filter(pl.col('QID4') == 'Yes').collect()['Consumer'].unique()]) , "Not all business owners are missing 'Consumer type' in demographics."
|
|
|
|
# %%
|
|
mo.md(r"""
# Demographic Distributions
""")

# %%
# Demographic columns to chart; 'Bussiness_Owner' is spelled as the column is
# named and is corrected only in the chart title below.
demo_plot_cols = [
    'Age',
    'Gender',
    # 'Race/Ethnicity',
    'Bussiness_Owner',
    'Consumer'
]

# %%
# The demographics frame does not depend on the column being plotted, so it is
# fetched once instead of once per chart.
_demographics = S.get_demographics(data)[0]

_content = """

"""
for c in demo_plot_cols:
    _fig = S.plot_demographic_distribution(
        data=_demographics,
        column=c,
        title=f"{c.replace('Bussiness', 'Business').replace('_', ' ')} Distribution of Survey Respondents"
    )
    _content += f"""{mo.ui.altair_chart(_fig)}\n\n"""

mo.md(_content)
|
|
|
|
# %%
|
|
mo.md(r"""
|
|
---
|
|
|
|
# Brand Character Results
|
|
""")
|
|
|
|
# %%
|
|
mo.md(r"""
|
|
## Best performing: Original vs Refined frankenstein
|
|
""")
|
|
|
|
# %%
|
|
# Original vs refined character data; the head is printed as a quick sanity check.
char_refine_rank = S.get_character_refine(data)[0]
print(char_refine_rank.collect().head())

# %%
mo.md(r"""
## Character ranking points
""")

# %%
mo.md(r"""
## Character ranking 1-2-3
""")

# %%
char_rank = S.get_character_ranking(data)[0]

# %%
char_rank_weighted = calculate_weighted_ranking_scores(char_rank)
# The x axis shows characters, not voices; the label now matches the other
# character charts ('Character Personality') instead of the copy-pasted 'Voice'.
S.plot_weighted_ranking_score(
    char_rank_weighted,
    title="Most Popular Character - Weighted Popularity Score<br>(1st=3pts, 2nd=2pts, 3rd=1pt)",
    x_label='Character Personality',
)

# %%
S.plot_top3_ranking_distribution(char_rank, x_label='Character Personality', title='Character Personality: Rankings Top 3')
|
|
|
|
# %%
|
|
mo.md(r"""
|
|
### Statistical Significance Character Ranking
|
|
""")
|
|
|
|
# %%
|
|
# _pairwise_df, _meta = S.compute_ranking_significance(char_rank)
|
|
|
|
# # print(_pairwise_df.columns)
|
|
|
|
# mo.md(f"""
|
|
|
|
|
|
# {mo.ui.altair_chart(S.plot_significance_heatmap(_pairwise_df, metadata=_meta))}
|
|
|
|
# {mo.ui.altair_chart(S.plot_significance_summary(_pairwise_df, metadata=_meta))}
|
|
# """)
|
|
|
|
# %%
|
|
mo.md(r"""
|
|
## Character Ranking: times 1st place
|
|
""")
|
|
|
|
# %%
|
|
S.plot_most_ranked_1(char_rank, title="Most Popular Character<br>(Number of Times Ranked 1st)", x_label='Character Personality')

# %%
mo.md(r"""
## Prominent predefined personality traits wordcloud
""")

# %%
top8_traits = S.get_top_8_traits(data)[0]
S.plot_traits_wordcloud(
    data=top8_traits,
    column='Top_8_Traits',
    title="Most Prominent Personality Traits",
)

# %%
mo.md(r"""
## Trait frequency per brand character
""")

# %%
char_df = S.get_character_refine(data)[0]

# %%
from theme import ColorPalette

# Characters to chart, each with its (bar colour, highlight colour) pair
characters = ['Bank Teller', 'Familiar Friend', 'The Coach', 'Personal Assistant']
character_colors = {
    'Bank Teller': (ColorPalette.CHARACTER_BANK_TELLER, ColorPalette.CHARACTER_BANK_TELLER_HIGHLIGHT),
    'Familiar Friend': (ColorPalette.CHARACTER_FAMILIAR_FRIEND, ColorPalette.CHARACTER_FAMILIAR_FRIEND_HIGHLIGHT),
    'The Coach': (ColorPalette.CHARACTER_COACH, ColorPalette.CHARACTER_COACH_HIGHLIGHT),
    'Personal Assistant': (ColorPalette.CHARACTER_PERSONAL_ASSISTANT, ColorPalette.CHARACTER_PERSONAL_ASSISTANT_HIGHLIGHT),
}

# Build consistent sort order (by total frequency across all characters).
# Each character's frequency table is computed once here and reused for the
# plots below instead of being recomputed in the second loop.
_freq_by_character = {}
all_trait_counts = {}
for char in characters:
    freq_df, _ = S.transform_character_trait_frequency(char_df, char)
    _freq_by_character[char] = freq_df
    for row in freq_df.iter_rows(named=True):
        all_trait_counts[row['trait']] = all_trait_counts.get(row['trait'], 0) + row['count']

consistent_sort_order = sorted(all_trait_counts, key=lambda x: -all_trait_counts[x])

_content = """"""
# Generate 4 plots (one per character)
for char in characters:
    freq_df = _freq_by_character[char]
    main_color, highlight_color = character_colors[char]
    chart = S.plot_single_character_trait_frequency(
        data=freq_df,
        character_name=char,
        bar_color=main_color,
        highlight_color=highlight_color,
        trait_sort_order=consistent_sort_order,
    )
    _content += f"""
{mo.ui.altair_chart(chart)}


"""

mo.md(_content)
|
|
|
|
# %%
|
|
mo.md(r"""
|
|
## Statistical significance best characters
|
|
|
|
zie chat
|
|
> voorbeeld: als de nr 1 en 2 niet significant verschillen maar wel van de nr 3 bijvoorbeeld is dat ook top. Beetje meedenkend over hoe ik het kan presenteren weetje wat ik bedoel?:)
|
|
>
|
|
""")
|
|
|
|
# %%
|
|
|
|
|
|
# %%
|
|
|
|
|
|
# %%
|
|
mo.md(r"""
|
|
---
|
|
|
|
# Spoken Voice Results
|
|
""")
|
|
|
|
# %%
|
|
# Passed as color_gender= to every voice chart in this section.
COLOR_GENDER = True

# %%
mo.md(r"""
## Top 8 Most Chosen out of 18
""")

# %%
v_18_8_3 = S.get_18_8_3(data)[0]

# %%
S.plot_voice_selection_counts(
    v_18_8_3,
    title="Top 8 Voice Selection from 18 Voices",
    x_label='Voice',
    color_gender=COLOR_GENDER,
)

# %%
mo.md(r"""
## Top 3 most chosen out of 8
""")

# %%
S.plot_top3_selection_counts(
    v_18_8_3,
    title="Top 3 Voice Selection Counts from 8 Voices",
    x_label='Voice',
    color_gender=COLOR_GENDER,
)

# %%
mo.md(r"""
## Voice Ranking Weighted Score
""")

# %%
# Top-3 voice rankings and their weighted scores; top3_voices is reused by
# later sections of the script.
top3_voices = S.get_top_3_voices(data)[0]
top3_voices_weighted = calculate_weighted_ranking_scores(top3_voices)

# %%
S.plot_weighted_ranking_score(
    top3_voices_weighted,
    title="Most Popular Voice - Weighted Popularity Score<br>(1st = 3pts, 2nd = 2pts, 3rd = 1pt)",
    color_gender=COLOR_GENDER,
)

# %%
mo.md(r"""
## Which voice is ranked best in the ranking question for top 3?

(not best 3 out of 8 question)
""")

# %%
S.plot_ranking_distribution(
    top3_voices,
    x_label='Voice',
    title="Distribution of Top 3 Voice Rankings (1st, 2nd, 3rd)",
    color_gender=COLOR_GENDER,
)
|
|
|
|
# %%
|
|
mo.md(r"""
|
|
### Statistical significance for voice ranking
|
|
""")
|
|
|
|
# %%
|
|
# print(top3_voices.collect().head())
|
|
|
|
# %%
|
|
|
|
# _pairwise_df, _metadata = S.compute_ranking_significance(
|
|
# top3_voices,alpha=0.05,correction="none")
|
|
|
|
# # View significant pairs
|
|
# # print(pairwise_df.filter(pl.col('significant') == True))
|
|
|
|
# # Create heatmap visualization
|
|
# _heatmap = S.plot_significance_heatmap(
|
|
# _pairwise_df,
|
|
# metadata=_metadata,
|
|
# title="Weighted Voice Ranking Significance<br>(Pairwise Comparisons)"
|
|
# )
|
|
|
|
# # Create summary bar chart
|
|
# _summary = S.plot_significance_summary(
|
|
# _pairwise_df,
|
|
# metadata=_metadata
|
|
# )
|
|
|
|
# mo.md(f"""
|
|
# {mo.ui.altair_chart(_heatmap)}
|
|
|
|
# {mo.ui.altair_chart(_summary)}
|
|
# """)
|
|
|
|
# %%
|
|
## Voice Ranked 1st the most
|
|
|
|
# %%
|
|
S.plot_most_ranked_1(
    top3_voices,
    title="Most Popular Voice<br>(Number of Times Ranked 1st)",
    x_label='Voice',
    color_gender=COLOR_GENDER,
)

# %%
mo.md(r"""
## Voice Scale 1-10
""")

# %%
# Voice scale data; voice_1_10 is reused by the per-character plots further down.
voice_1_10, _ = S.get_voice_scale_1_10(data)
S.plot_average_scores_with_counts(
    voice_1_10,
    x_label='Voice',
    domain=[1, 10],
    title="Voice General Impression (Scale 1-10)",
    color_gender=COLOR_GENDER,
)
|
|
|
|
# %%
|
|
mo.md(r"""
|
|
### Statistical Significance (Scale 1-10)
|
|
""")
|
|
|
|
# %%
|
|
# Compute pairwise significance tests
|
|
# pairwise_df, metadata = S.compute_pairwise_significance(
|
|
# voice_1_10,
|
|
# test_type="mannwhitney", # or "ttest", "chi2", "auto"
|
|
# alpha=0.05,
|
|
# correction="bonferroni" # or "holm", "none"
|
|
# )
|
|
|
|
# # View significant pairs
|
|
# # print(pairwise_df.filter(pl.col('significant') == True))
|
|
|
|
# # Create heatmap visualization
|
|
# _heatmap = S.plot_significance_heatmap(
|
|
# pairwise_df,
|
|
# metadata=metadata,
|
|
# title="Voice Rating Significance<br>(Pairwise Comparisons)"
|
|
# )
|
|
|
|
# # Create summary bar chart
|
|
# _summary = S.plot_significance_summary(
|
|
# pairwise_df,
|
|
# metadata=metadata
|
|
# )
|
|
|
|
# mo.md(f"""
|
|
# {mo.ui.altair_chart(_heatmap)}
|
|
|
|
# {mo.ui.altair_chart(_summary)}
|
|
# """)
|
|
|
|
# %%
|
|
|
|
|
|
# %%
|
|
mo.md(r"""
|
|
## Ranking points for Voice per Chosen Brand Character
|
|
|
|
**missing mapping**
|
|
""")
|
|
|
|
# %%
|
|
mo.md(r"""
|
|
## Correlation Speaking Styles
|
|
""")
|
|
|
|
# %%
|
|
# Speaking-style answers come in two question groups (orange/red and
# green/blue); fetch both together with their choice maps.
ss_or, choice_map_or = S.get_ss_orange_red(data)
ss_gb, choice_map_gb = S.get_ss_green_blue(data)

# Combine the data
ss_all = ss_or.join(ss_gb, on='_recordId')
choice_map = {**choice_map_or, **choice_map_gb}

# The processed speaking-style frame is built once. The previous code called
# the processor twice with identical arguments and also materialised ss_all
# into `_d` without using the result (overwriting the filtered-data variable).
df_style = utils.process_speaking_style_data(ss_all, choice_map)
ss_long = df_style  # kept as an alias for code that uses the older name

vscales = S.get_voice_scale_1_10(data)[0]
df_scale_long = utils.process_voice_scale_data(vscales)

# Speaking style x voice scale score, matched per respondent and voice
joined_scale = df_style.join(df_scale_long, on=["_recordId", "Voice"], how="inner")

# Speaking style x voice ranking data, matched per respondent and voice
df_ranking = utils.process_voice_ranking_data(top3_voices)
joined_ranking = df_style.join(df_ranking, on=['_recordId', 'Voice'], how='inner')
|
|
|
|
# %%
|
|
# Quick look at the joined ranking data
joined_ranking.head()

# %%
mo.md(r"""
### Colors vs Scale 1-10
""")

# %%
# Transform to get one row per color with average correlation
_scale_corr_result = utils.transform_speaking_style_color_correlation(joined_scale, SPEAKING_STYLES)
color_corr_scale, _ = _scale_corr_result
S.plot_speaking_style_color_correlation(
    data=color_corr_scale,
    title="Correlation: Speaking Style Colors and Voice Scale 1-10",
)

# %%
mo.md(r"""
### Colors vs Ranking Points
""")

# %%
# Same transform, with the ranking points column as the target
_ranking_corr_result = utils.transform_speaking_style_color_correlation(
    joined_ranking,
    SPEAKING_STYLES,
    target_column="Ranking_Points",
)
color_corr_ranking, _ = _ranking_corr_result
S.plot_speaking_style_color_correlation(
    data=color_corr_ranking,
    title="Correlation: Speaking Style Colors and Voice Ranking Points",
)
|
|
|
|
# %%
|
|
# Gender-filtered correlation plots (Male vs Female voices)
from reference import VOICE_GENDER_MAPPING

# Split the voice identifiers by the gender recorded in the reference mapping
MALE_VOICES = [v for v, g in VOICE_GENDER_MAPPING.items() if g == "Male"]
FEMALE_VOICES = [v for v, g in VOICE_GENDER_MAPPING.items() if g == "Female"]

# Row predicates shared by the scale and ranking frames
_is_male_voice = pl.col("Voice").is_in(MALE_VOICES)
_is_female_voice = pl.col("Voice").is_in(FEMALE_VOICES)

# Filter joined data by voice gender
joined_scale_male = joined_scale.filter(_is_male_voice)
joined_scale_female = joined_scale.filter(_is_female_voice)
joined_ranking_male = joined_ranking.filter(_is_male_voice)
joined_ranking_female = joined_ranking.filter(_is_female_voice)

# Colors vs Scale 1-10 (grouped by voice gender)
S.plot_speaking_style_color_correlation_by_gender(
    data_male=joined_scale_male,
    data_female=joined_scale_female,
    speaking_styles=SPEAKING_STYLES,
    target_column="Voice_Scale_Score",
    title="Correlation: Speaking Style Colors and Voice Scale 1-10 (by Voice Gender)",
    filename="correlation_speaking_style_and_voice_scale_1-10_by_voice_gender_color",
)

# Colors vs Ranking Points (grouped by voice gender)
S.plot_speaking_style_color_correlation_by_gender(
    data_male=joined_ranking_male,
    data_female=joined_ranking_female,
    speaking_styles=SPEAKING_STYLES,
    target_column="Ranking_Points",
    title="Correlation: Speaking Style Colors and Voice Ranking Points (by Voice Gender)",
    filename="correlation_speaking_style_and_voice_ranking_points_by_voice_gender_color",
)
|
|
|
|
# %%
|
|
mo.md(r"""
### Individual Traits vs Scale 1-10
""")

# %%
# One chart per speaking style, collected as markdown sections and rendered together
_sections = []
for _style, _traits in SPEAKING_STYLES.items():
    _fig = S.plot_speaking_style_scale_correlation(
        data=joined_scale,
        style_color=_style,
        style_traits=_traits,
        title=f"Correlation: Speaking Style {_style} and Voice Scale 1-10",
    )
    _sections.append(f"\n#### Speaking Style **{_style}**:\n\n{mo.ui.altair_chart(_fig)}\n\n")
_content = "".join(_sections)
mo.md(_content)

# %%
mo.md(r"""
### Individual Traits vs Ranking Points
""")

# %%
_sections = []
for _style, _traits in SPEAKING_STYLES.items():
    _fig = S.plot_speaking_style_ranking_correlation(
        data=joined_ranking,
        style_color=_style,
        style_traits=_traits,
        title=f"Correlation: Speaking Style {_style} and Voice Ranking Points",
    )
    _sections.append(f"\n#### Speaking Style **{_style}**:\n\n{mo.ui.altair_chart(_fig)}\n\n")
_content = "".join(_sections)
mo.md(_content)

# %%
# Individual Traits vs Scale 1-10 (grouped by voice gender)
_sections = []
for _style, _traits in SPEAKING_STYLES.items():
    _fig = S.plot_speaking_style_scale_correlation_by_gender(
        data_male=joined_scale_male,
        data_female=joined_scale_female,
        style_color=_style,
        style_traits=_traits,
        title=f"Correlation: Speaking Style {_style} and Voice Scale 1-10 (by Voice Gender)",
        filename=f"correlation_speaking_style_and_voice_scale_1-10_by_voice_gender_{_style.lower()}",
    )
    _sections.append(f"\n#### Speaking Style **{_style}**:\n\n{mo.ui.altair_chart(_fig)}\n\n")
_content = "### Individual Traits vs Scale 1-10 (by Voice Gender)\n\n" + "".join(_sections)
mo.md(_content)

# %%
# Individual Traits vs Ranking Points (grouped by voice gender)
_sections = []
for _style, _traits in SPEAKING_STYLES.items():
    _fig = S.plot_speaking_style_ranking_correlation_by_gender(
        data_male=joined_ranking_male,
        data_female=joined_ranking_female,
        style_color=_style,
        style_traits=_traits,
        title=f"Correlation: Speaking Style {_style} and Voice Ranking Points (by Voice Gender)",
        filename=f"correlation_speaking_style_and_voice_ranking_points_by_voice_gender_{_style.lower()}",
    )
    _sections.append(f"\n#### Speaking Style **{_style}**:\n\n{mo.ui.altair_chart(_fig)}\n\n")
_content = "### Individual Traits vs Ranking Points (by Voice Gender)\n\n" + "".join(_sections)
mo.md(_content)
|
|
|
|
# %%
|
|
# ## Correlations when "Best Brand Character" is chosen
|
|
# For each of the 4 brand characters, filter the dataset to only those respondents
|
|
# who selected that character as their #1 choice.
|
|
|
|
# %%
|
|
# Prepare character-filtered data subsets
|
|
# Character ranking table, materialised once for the per-character filters below
char_rank_for_filter = S.get_character_ranking(data)[0].collect()

# Display name -> ranking column in char_rank_for_filter
CHARACTER_FILTER_MAP = {
    'Familiar Friend': 'Character_Ranking_Familiar_Friend',
    'The Coach': 'Character_Ranking_The_Coach',
    'Personal Assistant': 'Character_Ranking_The_Personal_Assistant',
    'Bank Teller': 'Character_Ranking_The_Bank_Teller',
}


def get_filtered_data_for_character(char_name: str) -> tuple[pl.DataFrame, pl.DataFrame, int]:
    """Restrict the joined correlation data to respondents who ranked ``char_name`` first.

    Args:
        char_name: A key of CHARACTER_FILTER_MAP.

    Returns:
        (joined_scale subset, joined_ranking subset, number of matching respondents).

    Raises:
        KeyError: If ``char_name`` is not a key of CHARACTER_FILTER_MAP.
    """
    rank_column = CHARACTER_FILTER_MAP[char_name]
    ranked_first = pl.col(rank_column) == 1
    first_choice_ids = char_rank_for_filter.filter(ranked_first).select('_recordId')
    return (
        joined_scale.join(first_choice_ids, on='_recordId', how='inner'),
        joined_ranking.join(first_choice_ids, on='_recordId', how='inner'),
        first_choice_ids.height,
    )
|
|
|
|
def _char_filename(char_name: str, suffix: str) -> str:
    """Build the output filename stem for a character-filtered plot (no n-value).

    The result has the form ``bc_ranked_1_{suffix}__{char_slug}``, where the
    slug is the lower-cased character name with spaces turned into underscores.
    Putting the suffix first groups all plot types together in directory listings.
    """
    slug = "_".join(char_name.lower().split(" "))
    return "bc_ranked_1_" + suffix + "__" + slug
|
|
|
|
|
|
|
|
# %%
|
|
# ### Voice Weighted Ranking Score (by Best Character)
# Materialise the rankings once; the frame does not depend on the character.
top3_df = top3_voices.collect() if isinstance(top3_voices, pl.LazyFrame) else top3_voices
for char_name, _rank_col in CHARACTER_FILTER_MAP.items():
    # Respondents who ranked this character first. n is taken from this frame
    # directly rather than via get_filtered_data_for_character, which also
    # performs two joins whose results were discarded here.
    respondents = char_rank_for_filter.filter(pl.col(_rank_col) == 1).select('_recordId')
    n = respondents.height
    filtered_top3 = top3_df.join(respondents, on='_recordId', how='inner')
    weighted = calculate_weighted_ranking_scores(filtered_top3)
    S.plot_weighted_ranking_score(
        data=weighted,
        title=f'"{char_name}" Ranked #1 (n={n})<br>Most Popular Voice - Weighted Score (1st=3pts, 2nd=2pts, 3rd=1pt)',
        filename=_char_filename(char_name, "voice_weighted_ranking_score"),
        color_gender=COLOR_GENDER,
    )

# %%
# ### Voice Scale 1-10 Average Scores (by Best Character)
# Materialise the scale data once; the frame does not depend on the character.
voice_1_10_df = voice_1_10.collect() if isinstance(voice_1_10, pl.LazyFrame) else voice_1_10
for char_name, _rank_col in CHARACTER_FILTER_MAP.items():
    respondents = char_rank_for_filter.filter(pl.col(_rank_col) == 1).select('_recordId')
    n = respondents.height
    filtered_voice_1_10 = voice_1_10_df.join(respondents, on='_recordId', how='inner')
    S.plot_average_scores_with_counts(
        data=filtered_voice_1_10,
        title=f'"{char_name}" Ranked #1 (n={n})<br>Voice General Impression (Scale 1-10)',
        filename=_char_filename(char_name, "voice_scale_1-10"),
        x_label='Voice',
        domain=[1, 10],
        color_gender=COLOR_GENDER,
    )
|
|
|
|
|
|
|
|
# %%
|
|
# ### Speaking Style Colors vs Scale 1-10 (only for Best Character)
# Resolve the best character once: a character matches when its lower-cased,
# underscore-joined name equals BEST_CHOSEN_CHARACTER. Its filtered frames are
# computed a single time and shared by the four plot groups below instead of
# being rebuilt for every plot.
_best_subsets = {
    name: get_filtered_data_for_character(name)
    for name in CHARACTER_FILTER_MAP
    if name.lower().replace(' ', '_') == BEST_CHOSEN_CHARACTER
}
if not _best_subsets:
    # Previously a mistyped --best-character produced no plots and no message.
    print(f"Warning: best character '{BEST_CHOSEN_CHARACTER}' matches none of {list(CHARACTER_FILTER_MAP)}; skipping best-character plots.")

for char_name, (filtered_scale, _, n) in _best_subsets.items():
    color_corr, _ = utils.transform_speaking_style_color_correlation(filtered_scale, SPEAKING_STYLES)
    S.plot_speaking_style_color_correlation(
        data=color_corr,
        title=f'"{char_name}" Ranked #1 (n={n})<br>Correlation: Speaking Style Colors vs Voice Scale 1-10',
        filename=_char_filename(char_name, "colors_vs_voice_scale_1-10"),
    )

# %%
# ### Speaking Style Colors vs Ranking Points (only for Best Character)
for char_name, (_, filtered_ranking, n) in _best_subsets.items():
    color_corr, _ = utils.transform_speaking_style_color_correlation(
        filtered_ranking, SPEAKING_STYLES, target_column="Ranking_Points"
    )
    S.plot_speaking_style_color_correlation(
        data=color_corr,
        title=f'"{char_name}" Ranked #1 (n={n})<br>Correlation: Speaking Style Colors vs Voice Ranking Points',
        filename=_char_filename(char_name, "colors_vs_voice_ranking_points"),
    )

# %%
# ### Individual Traits vs Scale 1-10 (only for Best Character)
for _style, _traits in SPEAKING_STYLES.items():
    print(f"--- Speaking Style: {_style} ---")
    for char_name, (filtered_scale, _, n) in _best_subsets.items():
        S.plot_speaking_style_scale_correlation(
            data=filtered_scale,
            style_color=_style,
            style_traits=_traits,
            title=f'"{char_name}" Ranked #1 (n={n})<br>Correlation: {_style} vs Voice Scale 1-10',
            filename=_char_filename(char_name, f"{_style.lower()}_vs_voice_scale_1-10"),
        )

# %%
# ### Individual Traits vs Ranking Points (only for Best Character)
for _style, _traits in SPEAKING_STYLES.items():
    print(f"--- Speaking Style: {_style} ---")
    for char_name, (_, filtered_ranking, n) in _best_subsets.items():
        S.plot_speaking_style_ranking_correlation(
            data=filtered_ranking,
            style_color=_style,
            style_traits=_traits,
            title=f'"{char_name}" Ranked #1 (n={n})<br>Correlation: {_style} vs Voice Ranking Points',
            filename=_char_filename(char_name, f"{_style.lower()}_vs_voice_ranking_points"),
        )
|
|
|
|
|
|
# %%
|