# JPMC-quant/XX_statistical_significance.script.py

"""Extra statistical significance analyses for quant report."""
# %% Imports

import utils
import polars as pl
import argparse
import json
import re
from pathlib import Path


# %% Fixed Variables
# Survey response export (CSV with labels). The name of its parent directory
# ('2-4-26') is reused further down as the default figures sub-directory.
RESULTS_FILE = (
    'data/exports/2-4-26/'
    'JPMC_Chase Brand Personality_Quant Round 1_February 4, 2026_Labels.csv'
)
# Qualtrics survey definition (.qsf) passed to QualtricsSurvey alongside the results.
QSF_FILE = (
    'data/exports/OneDrive_2026-01-21/Soft Launch Data/'
    'JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'
)


# %% CLI argument parsing for batch automation
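# When run as script: uv run XX_statistical_significance.script.py --age '["<value>", ...]' --filter-name "<name>"
# (each filter option takes a JSON list encoded as a single string)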
# Central filter configuration - add new filters here only
# Format: 'cli_arg_name': 'QualtricsSurvey.options_* attribute name'
# Maps each CLI filter name to the survey-object attribute that lists its
# available options. Every attribute currently follows the 'options_<name>'
# pattern, so the mapping is derived from the ordered tuple of names.
FILTER_CONFIG = {
    cli_name: f'options_{cli_name}'
    for cli_name in (
        'age',
        'gender',
        'ethnicity',
        'income',
        'consumer',
        'business_owner',
        'ai_user',
        'investable_assets',
        'industry',
    )
}

def parse_cli_args() -> argparse.Namespace:
    """Return the filter and output options for this run.

    When executed as a script, the command line is parsed with argparse: one
    ``--<name>`` option per key of ``FILTER_CONFIG`` (each a JSON list encoded
    as a string), plus ``--filter-name`` and ``--figures-dir``. When executed
    inside IPython/Jupyter (detected by ``get_ipython`` being defined), the
    command line is ignored and a namespace holding only defaults is returned.

    Returns:
        A namespace with one attribute per ``FILTER_CONFIG`` key (a list, or
        ``None`` when that filter was not supplied), ``filter_name`` and
        ``figures_dir``.

    Raises:
        SystemExit: Raised through ``parser.error`` when a filter value is not
            valid JSON or does not decode to a list.
    """
    # Computed once so the argparse default and the interactive fallback cannot drift apart.
    default_fig_dir = f'figures/statistical_significance/{Path(RESULTS_FILE).parts[2]}'

    # Only parse if running as script (not in Jupyter/interactive). The try body
    # holds nothing but the probe, so an unrelated NameError is never mistaken
    # for "not running in Jupyter".
    try:
        get_ipython()  # noqa: F821 # type: ignore
    except NameError:
        pass
    else:
        # Interactive session: every filter unset, same default output directory.
        no_filters = {f: None for f in FILTER_CONFIG}
        return argparse.Namespace(**no_filters, filter_name=None, figures_dir=default_fig_dir)

    parser = argparse.ArgumentParser(description='Generate quant report with optional filters')

    # Dynamically add filter arguments from config
    for filter_name in FILTER_CONFIG:
        parser.add_argument(f'--{filter_name}', type=str, default=None, help=f'JSON list of {filter_name} values')

    parser.add_argument('--filter-name', type=str, default=None, help='Name for this filter combination (used for .txt description file)')
    parser.add_argument('--figures-dir', type=str, default=default_fig_dir, help='Override the default figures directory')

    args = parser.parse_args()

    # Parse JSON strings to lists
    for filter_name in FILTER_CONFIG:
        raw = getattr(args, filter_name)
        if not raw:
            # Option omitted (or passed as an empty string): no filter.
            setattr(args, filter_name, None)
            continue
        try:
            parsed = json.loads(raw)
        except json.JSONDecodeError as exc:
            # parser.error() prints usage and exits; it does not return.
            parser.error(f'--{filter_name} is not valid JSON ({exc}); got {raw!r}')
        if not isinstance(parsed, list):
            # A bare JSON string or number would otherwise flow downstream and be
            # treated as a sequence of characters / fail when joined.
            parser.error(f'--{filter_name} expects a JSON list such as ["a", "b"]; got {raw!r}')
        setattr(args, filter_name, parsed)
    return args

cli_args = parse_cli_args()


# %%
S = utils.QualtricsSurvey(
    RESULTS_FILE,
    QSF_FILE,
    figures_dir=cli_args.figures_dir,
)
data_all = S.load_data()


# %% Build filtered dataset based on CLI args

# One entry per configured filter, read straight from the parsed arguments.
# A value of None means that filter was not requested; filter_data() is
# expected to skip such entries (not verifiable from this file).
_active_filters = {name: getattr(cli_args, name) for name in FILTER_CONFIG}

_d = S.filter_data(data_all, **_active_filters)

# Write filter description file if filter-name is provided
if cli_args.filter_name and S.fig_save_dir:
    # Get the filter slug (e.g., "All_Respondents", "Cons-Starter", etc.)
    _filter_slug = S._get_filter_slug()
    _filter_slug_dir = S.fig_save_dir / _filter_slug
    _filter_slug_dir.mkdir(parents=True, exist_ok=True)

    # Build filter description: a detailed multi-line version for the per-slug
    # file and a short one-line version for the shared index.
    _filter_desc_lines = [
        f"Filter: {cli_args.filter_name}",
        "",
        "Applied Filters:",
    ]
    _short_desc_parts = []
    for filter_name, options_attr in FILTER_CONFIG.items():
        all_options = getattr(S, options_attr)
        values = _active_filters[filter_name]
        display_name = filter_name.replace('_', ' ').title()
        # None, or a selection equal to the full option list, means no filter applied (same as "All")
        if values is not None and values != all_options:
            # str() each value so a non-string entry (e.g. a number decoded from JSON) cannot break join()
            _joined_values = ', '.join(str(v) for v in values)
            _short_desc_parts.append(f"{display_name}: {_joined_values}")
            _filter_desc_lines.append(f"  {display_name}: {_joined_values}")
        else:
            _filter_desc_lines.append(f"  {display_name}: All")

    # Write detailed description INSIDE the filter-slug directory
    # Sanitize filter name for filename usage (replace / and other chars)
    _safe_filter_name = re.sub(r'[^\w\s-]', '_', cli_args.filter_name)
    _filter_file = _filter_slug_dir / f"{_safe_filter_name}.txt"
    _filter_file.write_text('\n'.join(_filter_desc_lines))

    # Append to summary index file at <figures dir>/filter_index.txt
    _summary_file = S.fig_save_dir / "filter_index.txt"
    _column_sep = "  |  "
    _short_desc = "; ".join(_short_desc_parts) if _short_desc_parts else "All Respondents"
    _summary_line = f"{_filter_slug}{_column_sep}{cli_args.filter_name}{_column_sep}{_short_desc}\n"

    # Append or create the summary file
    if _summary_file.exists():
        # Avoid duplicate entries for the same slug. Only the first column of each
        # row is compared: a substring test over the whole file would treat slug
        # "Cons" as already listed when only "Cons-Starter" is, or when the text
        # merely appears in a filter name or description.
        _existing_slugs = {
            row.split(_column_sep, 1)[0]
            for row in _summary_file.read_text().splitlines()
            if _column_sep in row
        }
        if _filter_slug not in _existing_slugs:
            with _summary_file.open('a') as f:
                f.write(_summary_line)
    else:
        _header = "Filter Index\n" + "=" * 80 + "\n\n"
        _header += "Directory  |  Filter Name  |  Description\n"
        _header += "-" * 80 + "\n"
        _summary_file.write_text(_header + _summary_line)

# Save to logical variable name for further analysis
data = _d
# NOTE(review): the result of collect() is not assigned, so `data` itself is
# unchanged; as the last expression of this cell it is only displayed in
# interactive runs. Presumably `data` is a lazy frame - confirm whether
# `data = _d.collect()` was intended.
data.collect()

# %% Character coach significantly higher than others


# Only the first item of get_character_ranking()'s result is used here.
char_rank = S.get_character_ranking(data)[0]


# Pairwise comparison at the 5% level with no multiple-comparison correction.
_pairwise_df, _meta = S.compute_ranking_significance(char_rank, alpha=0.05, correction="none")

# %% [markdown]
"""
### Methodology Analysis

**Input Data (`char_rank`)**:
*   Generated by `S.get_character_ranking(data)`.
*   Contains the ranking values (1st, 2nd, 3rd, 4th) assigned by each respondent to the four options ("The Coach", etc.).
*   Columns represent the characters; rows represent individual respondents; values are the numerical rank (1 = Top Choice).

**Processing**:
*   The function `compute_ranking_significance` aggregates these rankings to find the **"Rank 1 Share"** (the percentage of respondents who picked that character as their #1 favorite).
*   It builds a contingency table of how many times each character was ranked 1st vs. not 1st (or 1st v 2nd v 3rd).

**Statistical Test**:
*   **Test Used**: Pairwise Z-test for two proportions (uncorrected).
*   **Comparison**: It compares the **Rank 1 Share** of every pair of characters.
    *   *Example*: "Is the 42% of people who chose 'Coach' significantly different from the 29% who chose 'Familiar Friend'?"
*   **Significance**: A result of `p < 0.05` means the difference in popularity (top-choice preference) is statistically significant, i.e. a gap this large would be unlikely (less than a 5% probability) if the two characters were in fact equally popular.
"""


# %% Plot heatmap of pairwise significance
S.plot_significance_heatmap(
    _pairwise_df,
    metadata=_meta,
    title="Statistical Significance: Character Top Choice Preference",
)

# %% Plot summary of significant differences (e.g., which characters are significantly higher than others)
# S.plot_significance_summary(_pairwise_df, metadata=_meta)

# %% [markdown]
"""
# Analysis: Significance of "The Coach"

**Parameters**: `alpha=0.05`, `correction='none'`
*   **Rationale**: No correction was applied to allow for detection of all potential pairwise differences (uncorrected p < 0.05). If strict control for family-wise error rate were required (e.g., Bonferroni), the significance threshold would be lower (p < 0.0083, i.e. 0.05 divided by the 6 pairwise comparisons among the 4 characters).

**Results**:
"The Coach" is the top-ranked option (42.0% Rank 1 share) and shows strong separation from the field.

*   **Vs. Bottom Two**: "The Coach" is significantly higher than both "The Bank Teller" (26.9%, p < 0.001) and "Familiar Friend" (29.4%, p < 0.001).
*   **Vs. Runner-Up**: "The Coach" is widely preferred over "The Personal Assistant" (33.4%). The difference of **8.6 percentage points** is statistically significant (p = 0.017) at the standard 0.05 level.
    *   *Note*: While p=0.017 is significant in isolation, it would not meet the stricter Bonferroni threshold (0.0083). However, the effect size (+8.6 percentage points) is commercially meaningful.

**Conclusion**:
Yes, "The Coach" is preferred over each of the other options by a statistically significant margin at the uncorrected 0.05 level. It is clearly superior to the bottom two options and holds a statistically significant lead over the runner-up ("Personal Assistant") in direct comparison, although that lead would not survive a Bonferroni correction.
"""


# %% voices analysis
# Same uncorrected pairwise comparison as for the characters, applied to the
# first item returned by get_top_3_voices().
top3_voices = S.get_top_3_voices(data)[0]


_pairwise_df_voice, _metadata = S.compute_ranking_significance(
    top3_voices,
    alpha=0.05,
    correction="none",
)


S.plot_significance_heatmap(
    _pairwise_df_voice,
    title="Statistical Significance: Voice Top Choice Preference",
    metadata=_metadata,
)
# %%