From 840bd2940d609439b60f37ee0fe1da383e744c3d Mon Sep 17 00:00:00 2001 From: Luigi Maiorano Date: Thu, 5 Feb 2026 11:50:00 +0100 Subject: [PATCH] other top bc's --- XX_quant_report.script.py | 9 +-- XX_statistical_significance.script.py | 83 +++++++++++++++++++++++++-- plots.py | 4 +- utils.py | 10 +++- 4 files changed, 93 insertions(+), 13 deletions(-) diff --git a/XX_quant_report.script.py b/XX_quant_report.script.py index eaeca7a..5cf7222 100644 --- a/XX_quant_report.script.py +++ b/XX_quant_report.script.py @@ -20,8 +20,6 @@ from speaking_styles import SPEAKING_STYLES RESULTS_FILE = 'data/exports/debug/JPMC_Chase Brand Personality_Quant Round 1_February 2, 2026_Labels.csv' QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf' -BEST_CHOSEN_CHARACTER = "the_coach" - # %% # CLI argument parsing for batch automation @@ -50,6 +48,8 @@ def parse_cli_args(): parser.add_argument(f'--{filter_name}', type=str, default=None, help=f'JSON list of {filter_name} values') parser.add_argument('--filter-name', type=str, default=None, help='Name for this filter combination (used for .txt description file)') + parser.add_argument('--figures-dir', type=str, default=f'figures/statistical_significance/{Path(RESULTS_FILE).parts[2]}', help='Override the default figures directory') + parser.add_argument('--best-character', type=str, default="the_coach", help='Slug of the best chosen character (default: "the_coach")') # Only parse if running as script (not in Jupyter/interactive) try: @@ -57,7 +57,7 @@ def parse_cli_args(): get_ipython() # noqa: F821 # type: ignore # Return namespace with all filters set to None no_filters = {f: None for f in FILTER_CONFIG} - return argparse.Namespace(**no_filters, filter_name=None) + return argparse.Namespace(**no_filters, filter_name=None, figures_dir=f'figures/statistical_significance/{Path(RESULTS_FILE).parts[2]}', best_character="the_coach") except NameError: args = parser.parse_args() # Parse 
JSON strings to lists @@ -67,11 +67,12 @@ def parse_cli_args(): return args cli_args = parse_cli_args() +BEST_CHOSEN_CHARACTER = cli_args.best_character # %% -S = QualtricsSurvey(RESULTS_FILE, QSF_FILE) +S = QualtricsSurvey(RESULTS_FILE, QSF_FILE, figures_dir=cli_args.figures_dir) try: data_all = S.load_data() except NotImplementedError as e: diff --git a/XX_statistical_significance.script.py b/XX_statistical_significance.script.py index 8d921d3..3d2f867 100644 --- a/XX_statistical_significance.script.py +++ b/XX_statistical_significance.script.py @@ -1,12 +1,12 @@ """Extra statistical significance analyses for quant report.""" # %% Imports -from utils import QualtricsSurvey +import utils import polars as pl import argparse import json import re - +from pathlib import Path @@ -39,6 +39,7 @@ def parse_cli_args(): parser.add_argument(f'--{filter_name}', type=str, default=None, help=f'JSON list of {filter_name} values') parser.add_argument('--filter-name', type=str, default=None, help='Name for this filter combination (used for .txt description file)') + parser.add_argument('--figures-dir', type=str, default=f'figures/statistical_significance/{Path(RESULTS_FILE).parts[2]}', help='Override the default figures directory') # Only parse if running as script (not in Jupyter/interactive) try: @@ -46,7 +47,9 @@ def parse_cli_args(): get_ipython() # noqa: F821 # type: ignore # Return namespace with all filters set to None no_filters = {f: None for f in FILTER_CONFIG} - return argparse.Namespace(**no_filters, filter_name=None) + # Use the same default as argparse + default_fig_dir = f'figures/statistical_significance/{Path(RESULTS_FILE).parts[2]}' + return argparse.Namespace(**no_filters, filter_name=None, figures_dir=default_fig_dir) except NameError: args = parser.parse_args() # Parse JSON strings to lists @@ -59,7 +62,7 @@ cli_args = parse_cli_args() # %% -S = QualtricsSurvey(RESULTS_FILE, QSF_FILE) +S = utils.QualtricsSurvey(RESULTS_FILE, QSF_FILE, 
figures_dir=cli_args.figures_dir) data_all = S.load_data() @@ -125,4 +128,76 @@ if cli_args.filter_name and S.fig_save_dir: data = _d data.collect() +# %% Character coach significantly higher than others + + +char_rank = S.get_character_ranking(data)[0] + + + +_pairwise_df, _meta = S.compute_ranking_significance( + char_rank, + alpha=0.05, + correction="none", + ) + +# %% [markdown] +""" +### Methodology Analysis + +**Input Data (`char_rank`)**: +* Generated by `S.get_character_ranking(data)`. +* Contains the ranking values (1st, 2nd, 3rd, 4th) assigned by each respondent to the four options ("The Coach", etc.). +* Columns represent the characters; rows represent individual respondents; values are the numerical rank (1 = Top Choice). + +**Processing**: +* The function `compute_ranking_significance` aggregates these rankings to find the **"Rank 1 Share"** (the percentage of respondents who picked that character as their #1 favorite). +* It builds a contingency table of how many times each character was ranked 1st vs. not 1st (or 1st vs. 2nd vs. 3rd). + +**Statistical Test**: +* **Test Used**: Pairwise Z-test for two proportions (uncorrected). +* **Comparison**: It compares the **Rank 1 Share** of every pair of characters. + * *Example*: "Is the 42% of people who chose 'Coach' significantly different from the 29% who chose 'Familiar Friend'?" +* **Significance**: A result of `p < 0.05` means the difference in popularity (top-choice preference) is statistically significant at the 5% level, i.e. a difference this large would be unlikely to arise from random chance alone if the two characters were equally popular. 
+""" + + +# %% Plot heatmap of pairwise significance +S.plot_significance_heatmap(_pairwise_df, metadata=_meta, title="Statistical Significance: Character Top Choice Preference") + +# %% Plot summary of significant differences (e.g., which characters are significantly higher than others) +# S.plot_significance_summary(_pairwise_df, metadata=_meta) + +# %% [markdown] +""" +# Analysis: Significance of "The Coach" + +**Parameters**: `alpha=0.05`, `correction='none'` +* **Rationale**: No correction was applied to allow for detection of all potential pairwise differences (uncorrected p < 0.05). If strict control for family-wise error rate were required (e.g., Bonferroni), the significance threshold would be lower (p < 0.0083, i.e. 0.05 divided by the 6 pairwise comparisons among the four characters). + +**Results**: +"The Coach" is the top-ranked option (42.0% Rank 1 share) and shows strong separation from the field. + +* **Vs. Bottom Two**: "The Coach" is significantly higher than both "The Bank Teller" (26.9%, p < 0.001) and "Familiar Friend" (29.4%, p < 0.001). +* **Vs. Runner-Up**: "The Coach" is preferred over "The Personal Assistant" (33.4%). The difference of **8.6 percentage points** is statistically significant (p = 0.017) at the standard 0.05 level. + * *Note*: While p=0.017 is significant in isolation, it would not meet the stricter Bonferroni threshold (0.0083). However, the effect size (+8.6 percentage points) is commercially meaningful. + +**Conclusion**: +Yes, "The Coach" is chosen as the top character significantly more often than each of the other options at the uncorrected 0.05 level. It is clearly superior to the bottom two options and holds a statistically significant lead over the runner-up ("Personal Assistant") in direct comparison, although that lead would not survive a Bonferroni correction. 
+""" + + +# %% voices analysis +top3_voices = S.get_top_3_voices(data)[0] + + +_pairwise_df_voice, _metadata = S.compute_ranking_significance( + top3_voices,alpha=0.05,correction="none") + + +S.plot_significance_heatmap( + _pairwise_df_voice, + metadata=_metadata, + title="Statistical Significance: Voice Top Choice Preference" +) # %% diff --git a/plots.py b/plots.py index 29e2180..c633ee7 100644 --- a/plots.py +++ b/plots.py @@ -2308,9 +2308,9 @@ class QualtricsPlotsMixin: # Base heatmap heatmap = alt.Chart(heatmap_df).mark_rect(stroke='white', strokeWidth=1).encode( x=alt.X('col:N', title=None, sort=all_groups, - axis=alt.Axis(labelAngle=-45, labelLimit=150)), + axis=alt.Axis(labelAngle=-45, labelLimit=150, grid=False)), y=alt.Y('row:N', title=None, sort=all_groups, - axis=alt.Axis(labelLimit=150)), + axis=alt.Axis(labelLimit=150, grid=False)), color=alt.Color('sig_category:N', scale=alt.Scale(domain=sig_domain, range=sig_range), legend=alt.Legend( diff --git a/utils.py b/utils.py index 759f1dd..3442f29 100644 --- a/utils.py +++ b/utils.py @@ -762,7 +762,7 @@ def normalize_global_values(df: pl.DataFrame, target_cols: list[str]) -> pl.Data class QualtricsSurvey(QualtricsPlotsMixin): """Class to handle Qualtrics survey data.""" - def __init__(self, data_path: Union[str, Path], qsf_path: Union[str, Path]): + def __init__(self, data_path: Union[str, Path], qsf_path: Union[str, Path], figures_dir: Union[str, Path] = None): if isinstance(data_path, str): data_path = Path(data_path) @@ -774,8 +774,12 @@ class QualtricsSurvey(QualtricsPlotsMixin): self.qid_descr_map = self._extract_qid_descr_map() self.qsf:dict = self._load_qsf() - # get export directory name for saving figures ie if data_path='data/exports/OneDrive_2026-01-21/...' 
should be 'figures/OneDrive_2026-01-21' - self.fig_save_dir = Path('figures') / self.data_filepath.parts[2] + if figures_dir: + self.fig_save_dir = Path(figures_dir) + else: + # get export directory name for saving figures ie if data_path='data/exports/OneDrive_2026-01-21/...' should be 'figures/OneDrive_2026-01-21' + self.fig_save_dir = Path('figures') / self.data_filepath.parts[2] + if not self.fig_save_dir.exists(): self.fig_save_dir.mkdir(parents=True, exist_ok=True)