"""Extra statistical significance analyses for quant report."""

# %% Imports
import argparse
import json
import re
from collections.abc import Callable
from pathlib import Path

import polars as pl

import reference
import utils

# %% Fixed Variables
RESULTS_FILE = 'data/exports/2-4-26/JPMC_Chase Brand Personality_Quant Round 1_February 4, 2026_Labels.csv'
QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'

# Default output directory, keyed by the third path component of RESULTS_FILE
# (the export-date folder). Shared by the argparse default and the
# interactive fallback so the two cannot drift apart.
_DEFAULT_FIGURES_DIR = f'figures/statistical_significance/{Path(RESULTS_FILE).parts[2]}'

# %% CLI argument parsing for batch automation
# When run as script:
#   uv run XX_statistical_significance.script.py --age '<JSON list of age options>'

# Central filter configuration - add new filters here only
# Format: 'cli_arg_name': 'QualtricsSurvey.options_* attribute name'
FILTER_CONFIG = {
    'age': 'options_age',
    'gender': 'options_gender',
    'ethnicity': 'options_ethnicity',
    'income': 'options_income',
    'consumer': 'options_consumer',
    'business_owner': 'options_business_owner',
    'ai_user': 'options_ai_user',
    'investable_assets': 'options_investable_assets',
    'industry': 'options_industry',
}


def parse_cli_args() -> argparse.Namespace:
    """Return the run configuration (filters, filter name, figures dir).

    In an interactive (IPython/Jupyter) session the command line is not
    parsed; a namespace with every filter set to ``None`` and the default
    figures directory is returned instead.

    When run as a script, each ``--<filter>`` argument is expected to be a
    JSON list; it is decoded in place. A missing/empty argument (or JSON
    ``null``) becomes ``None``, meaning "no filter applied".

    Returns:
        Namespace with one attribute per key of ``FILTER_CONFIG`` plus
        ``filter_name`` and ``figures_dir``.

    Raises:
        SystemExit: via ``parser.error`` when a filter argument is not valid
            JSON or does not decode to a list.
    """
    # Only parse if running as script (not in Jupyter/interactive):
    # `get_ipython` exists as a builtin only under IPython kernels.
    try:
        get_ipython()  # noqa: F821  # type: ignore
    except NameError:
        pass
    else:
        return argparse.Namespace(
            **dict.fromkeys(FILTER_CONFIG),
            filter_name=None,
            figures_dir=_DEFAULT_FIGURES_DIR,
        )

    parser = argparse.ArgumentParser(description='Generate quant report with optional filters')

    # Dynamically add filter arguments from config
    for filter_name in FILTER_CONFIG:
        parser.add_argument(
            f'--{filter_name}',
            type=str,
            default=None,
            help=f'JSON list of {filter_name} values',
        )

    parser.add_argument(
        '--filter-name',
        type=str,
        default=None,
        help='Name for this filter combination (used for .txt description file)',
    )
    parser.add_argument(
        '--figures-dir',
        type=str,
        default=_DEFAULT_FIGURES_DIR,
        help='Override the default figures directory',
    )

    args = parser.parse_args()

    # Parse JSON strings to lists
    for filter_name in FILTER_CONFIG:
        raw = getattr(args, filter_name)
        if not raw:
            setattr(args, filter_name, None)
            continue
        try:
            values = json.loads(raw)
        except json.JSONDecodeError as err:
            # Report a usage error instead of a raw traceback.
            parser.error(f'--{filter_name} is not valid JSON ({err.msg}): {raw!r}')
        if values is not None and not isinstance(values, list):
            # A bare string/number would otherwise be silently mis-handled
            # downstream (e.g. a string joined character by character).
            parser.error(f'--{filter_name} must be a JSON list, got: {raw!r}')
        setattr(args, filter_name, values)

    return args


cli_args = parse_cli_args()

# %%
S = utils.QualtricsSurvey(RESULTS_FILE, QSF_FILE, figures_dir=cli_args.figures_dir)
data_all = S.load_data()

# %% Build filtered dataset based on CLI args

# CLI args: None means "no filter applied" - filter_data() will skip None filters
# Build filter values dict dynamically from FILTER_CONFIG
_active_filters = {filter_name: getattr(cli_args, filter_name) for filter_name in FILTER_CONFIG}

_d = S.filter_data(data_all, **_active_filters)

# Write filter description file if filter-name is provided
if cli_args.filter_name and S.fig_save_dir:
    # Get the filter slug (e.g., "All_Respondents", "Cons-Starter", etc.)
    _filter_slug = S._get_filter_slug()
    _filter_slug_dir = S.fig_save_dir / _filter_slug
    _filter_slug_dir.mkdir(parents=True, exist_ok=True)

    # Build filter description
    _filter_desc_lines = [
        f"Filter: {cli_args.filter_name}",
        "",
        "Applied Filters:",
    ]
    _short_desc_parts = []
    for filter_name, options_attr in FILTER_CONFIG.items():
        all_options = getattr(S, options_attr)
        values = _active_filters[filter_name]
        display_name = filter_name.replace('_', ' ').title()
        # None means no filter applied (same as "All")
        if values is not None and values != all_options:
            # str() guards against non-string JSON items (e.g. numbers).
            _joined = ', '.join(map(str, values))
            _short_desc_parts.append(f"{display_name}: {_joined}")
            _filter_desc_lines.append(f"  {display_name}: {_joined}")
        else:
            _filter_desc_lines.append(f"  {display_name}: All")

    # Write detailed description INSIDE the filter-slug directory
    # Sanitize filter name for filename usage (replace / and other chars)
    _safe_filter_name = re.sub(r'[^\w\s-]', '_', cli_args.filter_name)
    _filter_file = _filter_slug_dir / f"{_safe_filter_name}.txt"
    _filter_file.write_text('\n'.join(_filter_desc_lines), encoding='utf-8')

    # Append to summary index file at <figures_dir>/filter_index.txt
    _summary_file = S.fig_save_dir / "filter_index.txt"
    _short_desc = "; ".join(_short_desc_parts) if _short_desc_parts else "All Respondents"
    _summary_line = f"{_filter_slug} | {cli_args.filter_name} | {_short_desc}\n"

    # Append or create the summary file
    if _summary_file.exists():
        _existing = _summary_file.read_text(encoding='utf-8')
        # Avoid duplicate entries for same slug. Compare against the first
        # column only: a plain substring test over the whole file would also
        # match slugs that merely appear inside another slug, filter name or
        # description (e.g. "Cons" inside "Cons-Starter").
        _indexed_slugs = {
            line.split(' | ', 1)[0]
            for line in _existing.splitlines()
            if ' | ' in line
        }
        if _filter_slug not in _indexed_slugs:
            with _summary_file.open('a', encoding='utf-8') as f:
                f.write(_summary_line)
    else:
        _header = "Filter Index\n" + "=" * 80 + "\n\n"
        _header += "Directory | Filter Name | Description\n"
        _header += "-" * 80 + "\n"
        _summary_file.write_text(_header + _summary_line, encoding='utf-8')

# Save to logical variable name for further analysis
data = _d
# Result is intentionally not stored; presumably forces evaluation of the
# (lazy) filtered frame / displays it as the cell output - TODO confirm.
data.collect()

# %% Character coach significantly higher than others

char_rank = S.get_character_ranking(data)[0]

_pairwise_df, _meta = S.compute_ranking_significance(
    char_rank,
    alpha=0.05,
    correction="none",
)

# %% [markdown]
"""
### Methodology Analysis

**Input Data (`char_rank`)**:
* Generated by `S.get_character_ranking(data)`.
* Contains the ranking values (1st, 2nd, 3rd, 4th) assigned by each respondent to the four options ("The Coach", etc.).
* Columns represent the characters; rows represent individual respondents; values are the numerical rank (1 = Top Choice).

**Processing**:
* The function `compute_ranking_significance` aggregates these rankings to find the **"Rank 1 Share"** (the percentage of respondents who picked that character as their #1 favorite).
* It builds a contingency table of how many times each character was ranked 1st vs. not 1st (or 1st v 2nd v 3rd).

**Statistical Test**:
* **Test Used**: Pairwise Z-test for two proportions (uncorrected).
* **Comparison**: It compares the **Rank 1 Share** of every pair of characters.
    * *Example*: "Is the 42% of people who chose 'Coach' significantly different from the 29% who chose 'Familiar Friend'?"
* **Significance**: A result of `p < 0.05` means the difference in popularity (top-choice preference) is statistically significant, i.e. unlikely to be due to random chance alone.
"""

# %% Plot heatmap of pairwise significance
S.plot_significance_heatmap(
    _pairwise_df,
    metadata=_meta,
    title="Statistical Significance: Character Top Choice Preference",
)

# %% Plot summary of significant differences (e.g., which characters are significantly higher than others)
# S.plot_significance_summary(_pairwise_df, metadata=_meta)

# %% [markdown]
"""
# Analysis: Significance of "The Coach"

**Parameters**: `alpha=0.05`, `correction='none'`

* **Rationale**: No correction was applied to allow for detection of all potential pairwise differences (uncorrected p < 0.05). If strict control for family-wise error rate were required (e.g., Bonferroni), the significance threshold would be lower (p < 0.0083).

**Results**:
"The Coach" is the top-ranked option (42.0% Rank 1 share) and shows strong separation from the field.

* **Vs. Bottom Two**: "The Coach" is significantly higher than both "The Bank Teller" (26.9%, p < 0.001) and "Familiar Friend" (29.4%, p < 0.001).
* **Vs. Runner-Up**: "The Coach" is widely preferred over "The Personal Assistant" (33.4%). The difference of **8.6 percentage points** is statistically significant (p = 0.017) at the standard 0.05 level.
    * *Note*: While p=0.017 is significant in isolation, it would not meet the stricter Bonferroni threshold (0.0083). However, the effect size (+8.6 percentage points) is commercially meaningful.

**Conclusion**:
Yes, "The Coach" can be considered significantly preferred over the other options. It is clearly superior to the bottom two options and holds a statistically significant lead over the runner-up ("Personal Assistant") in direct comparison.
"""

# %% Mentions significance analysis

char_pairwise_df_mentions, _meta_mentions = S.compute_mentions_significance(
    char_rank,
    alpha=0.05,
    correction="none",
)

S.plot_significance_heatmap(
    char_pairwise_df_mentions,
    metadata=_meta_mentions,
    title="Statistical Significance: Character Total Mentions (Top 3 Visibility)",
)

# %% voices analysis
top3_voices = S.get_top_3_voices(data)[0]

_pairwise_df_voice, _metadata = S.compute_ranking_significance(
    top3_voices,
    alpha=0.05,
    correction="none",
)

S.plot_significance_heatmap(
    _pairwise_df_voice,
    metadata=_metadata,
    title="Statistical Significance: Voice Top Choice Preference",
)

# %% Total Mentions Significance (Rank 1+2+3 Combined)
# This tests "Quantity" (Visibility) instead of "Quality" (Preference)

_pairwise_df_mentions, _meta_mentions = S.compute_mentions_significance(
    top3_voices,
    alpha=0.05,
    correction="none",
)

S.plot_significance_heatmap(
    _pairwise_df_mentions,
    metadata=_meta_mentions,
    title="Statistical Significance: Voice Total Mentions (Top 3 Visibility)",
)

# %% Male Voices Only Analysis


def _select_voice_columns(df: pl.DataFrame, keep_voice: Callable[[str], bool]) -> pl.DataFrame:
    """Select ``_recordId`` (if present) plus the voice columns accepted by *keep_voice*.

    A column counts as a voice column when its name contains ``'__V'``; the
    voice ID is the second ``'__'``-separated segment of the name
    (e.g. ``"Top_3_Voices_ranking__V14"`` -> ``"V14"``).

    Args:
        df: Ranking data with one column per voice.
        keep_voice: Predicate called with each voice ID; truthy keeps the column.

    Returns:
        ``df`` restricted to the kept columns, in their original order.
    """
    # Always keep identifier if present
    cols_to_keep = ['_recordId'] if '_recordId' in df.columns else []
    for col in df.columns:
        if '__V' not in col:
            continue
        voice_id = col.split('__')[1]
        if keep_voice(voice_id):
            cols_to_keep.append(col)
    return df.select(cols_to_keep)


def filter_voices_by_gender(df: pl.DataFrame, target_gender: str) -> pl.DataFrame:
    """Filter ranking columns to keep only those matching target gender.

    Args:
        df: Ranking data with voice columns named like ``"..._ranking__V14"``.
        target_gender: Gender label as used in ``reference.VOICE_GENDER_MAPPING``.

    Returns:
        ``df`` with ``_recordId`` (if present) and only the voice columns whose
        ID maps to ``target_gender``; voices missing from the mapping are dropped.
    """
    return _select_voice_columns(
        df,
        lambda voice_id: reference.VOICE_GENDER_MAPPING.get(voice_id) == target_gender,
    )


# Get full ranking data as DataFrame
df_voices = top3_voices.collect()

# Filter for Male voices
df_male_voices = filter_voices_by_gender(df_voices, 'Male')

# 1. Male Voices: Top Choice Preference (Rank 1)
_pairwise_male_pref, _meta_male_pref = S.compute_ranking_significance(
    df_male_voices,
    alpha=0.05,
    correction="none",
)

S.plot_significance_heatmap(
    _pairwise_male_pref,
    metadata=_meta_male_pref,
    title="Male Voices Only: Top Choice Preference Significance",
)

# 2. Male Voices: Total Mentions (Visibility)
_pairwise_male_vis, _meta_male_vis = S.compute_mentions_significance(
    df_male_voices,
    alpha=0.05,
    correction="none",
)

S.plot_significance_heatmap(
    _pairwise_male_vis,
    metadata=_meta_male_vis,
    title="Male Voices Only: Total Mentions Significance",
)

# %% Male Voices (Excluding Bottom 3: V88, V86, V81)

# Start with the male voices dataframe from the previous step
voices_to_exclude = ['V88', 'V86', 'V81']


def filter_exclude_voices(df: pl.DataFrame, exclude_list: list[str]) -> pl.DataFrame:
    """Filter ranking columns to exclude specific voices.

    Args:
        df: Ranking data with voice columns named like ``"..._ranking__V14"``.
        exclude_list: Voice IDs (e.g. ``"V88"``) whose columns are dropped.

    Returns:
        ``df`` with ``_recordId`` (if present) and every voice column whose ID
        is not in ``exclude_list``.
    """
    excluded = set(exclude_list)  # build once for O(1) membership tests
    return _select_voice_columns(df, lambda voice_id: voice_id not in excluded)


df_male_top = filter_exclude_voices(df_male_voices, voices_to_exclude)

# 1. Male Top Candidates: Top Choice Preference
_pairwise_male_top_pref, _meta_male_top_pref = S.compute_ranking_significance(
    df_male_top,
    alpha=0.05,
    correction="none",
)

S.plot_significance_heatmap(
    _pairwise_male_top_pref,
    metadata=_meta_male_top_pref,
    title="Male Voices (Excl. Bottom 3): Top Choice Preference Significance",
)

# 2. Male Top Candidates: Total Mentions
_pairwise_male_top_vis, _meta_male_top_vis = S.compute_mentions_significance(
    df_male_top,
    alpha=0.05,
    correction="none",
)

S.plot_significance_heatmap(
    _pairwise_male_top_vis,
    metadata=_meta_male_top_vis,
    title="Male Voices (Excl. Bottom 3): Total Mentions Significance",
)

# %% [markdown]
"""
# Rank 1 Selection Significance (Voice Level)

Similar to the Total Mentions significance analysis above, but counting
only how many times each voice was ranked **1st** (out of all respondents).
This isolates first-choice preference rather than overall top-3 visibility.
"""

# %% Rank 1 Significance: All Voices

_pairwise_df_rank1, _meta_rank1 = S.compute_rank1_significance(
    top3_voices,
    alpha=0.05,
    correction="none",
)

S.plot_significance_heatmap(
    _pairwise_df_rank1,
    metadata=_meta_rank1,
    title="Statistical Significance: Voice Rank 1 Selection",
)

# %% Rank 1 Significance: Male Voices Only

_pairwise_df_rank1_male, _meta_rank1_male = S.compute_rank1_significance(
    df_male_voices,
    alpha=0.05,
    correction="none",
)

S.plot_significance_heatmap(
    _pairwise_df_rank1_male,
    metadata=_meta_rank1_male,
    title="Male Voices Only: Rank 1 Selection Significance",
)

# %%