other top bc's

2026-02-05 11:50:00 +01:00
parent af9a15ccb0
commit 840bd2940d
4 changed files with 93 additions and 13 deletions
--- a/XX_statistical_significance.script.py
+++ b/XX_statistical_significance.script.py
@@ -1,12 +1,12 @@
 """Extra statistical significance analyses for quant report."""
 # %% Imports

-from utils import QualtricsSurvey
+import utils
 import polars as pl
 import argparse
 import json
 import re
-
+from pathlib import Path



@@ -39,6 +39,7 @@ def parse_cli_args():
        parser.add_argument(f'--{filter_name}', type=str, default=None, help=f'JSON list of {filter_name} values')
    
    parser.add_argument('--filter-name', type=str, default=None, help='Name for this filter combination (used for .txt description file)')
+    parser.add_argument('--figures-dir', type=str, default=f'figures/statistical_significance/{Path(RESULTS_FILE).parts[2]}', help='Override the default figures directory')
    
    # Only parse if running as script (not in Jupyter/interactive)
    try:
@@ -46,7 +47,9 @@ def parse_cli_args():
        get_ipython()  # noqa: F821 # type: ignore
        # Return namespace with all filters set to None
        no_filters = {f: None for f in FILTER_CONFIG}
-        return argparse.Namespace(**no_filters, filter_name=None)
+        # Use the same default as argparse
+        default_fig_dir = f'figures/statistical_significance/{Path(RESULTS_FILE).parts[2]}'
+        return argparse.Namespace(**no_filters, filter_name=None, figures_dir=default_fig_dir)
    except NameError:
        args = parser.parse_args()
        # Parse JSON strings to lists
@@ -59,7 +62,7 @@ cli_args = parse_cli_args()


 # %%
-S = QualtricsSurvey(RESULTS_FILE, QSF_FILE)
+S = utils.QualtricsSurvey(RESULTS_FILE, QSF_FILE, figures_dir=cli_args.figures_dir)
 data_all = S.load_data()


@@ -125,4 +128,76 @@ if cli_args.filter_name and S.fig_save_dir:
 data = _d
 data.collect()

+# %% Character coach significantly higher than others
+
+
+char_rank = S.get_character_ranking(data)[0]
+
+
+
+_pairwise_df, _meta = S.compute_ranking_significance(
+    char_rank,
+    alpha=0.05,
+    correction="none",
+    )
+
+# %% [markdown]
+"""
+### Methodology Analysis
+
+**Input Data (`char_rank`)**:
+*   Generated by `S.get_character_ranking(data)`.
+*   Contains the ranking values (1st, 2nd, 3rd, 4th) assigned by each respondent to the four options ("The Coach", etc.).
+*   Columns represent the characters; rows represent individual respondents; values are the numerical rank (1 = Top Choice).
+
+**Processing**:
+*   The function `compute_ranking_significance` aggregates these rankings to find the **"Rank 1 Share"** (the percentage of respondents who picked that character as their #1 favorite).
+*   It builds a contingency table of how many times each character was ranked 1st vs. not 1st (or 1st vs. 2nd vs. 3rd).
+
+**Statistical Test**:
+*   **Test Used**: Pairwise Z-test for two proportions (uncorrected).
+*   **Comparison**: It compares the **Rank 1 Share** of every pair of characters.
+    *   *Example*: "Is the 42% of people who chose 'Coach' significantly different from the 29% who chose 'Familiar Friend'?"
+*   **Significance**: A result of `p < 0.05` means the difference in popularity (top-choice preference) is statistically significant at the 5% level, i.e. a difference this large would be unlikely to arise from random chance alone if the true shares were equal.
+"""
+
+
+# %% Plot heatmap of pairwise significance
+S.plot_significance_heatmap(_pairwise_df, metadata=_meta, title="Statistical Significance: Character Top Choice Preference")
+
+# %% Plot summary of significant differences (e.g., which characters are significantly higher than others)
+# S.plot_significance_summary(_pairwise_df, metadata=_meta)
+
+# %% [markdown]
+"""
+# Analysis: Significance of "The Coach"
+
+**Parameters**: `alpha=0.05`, `correction='none'`
+*   **Rationale**: No correction was applied to allow for detection of all potential pairwise differences (uncorrected p < 0.05). If strict control for family-wise error rate were required (e.g., Bonferroni), the significance threshold would be lower (p < 0.0083).
+
+**Results**:
+"The Coach" is the top-ranked option (42.0% Rank 1 share) and shows strong separation from the field.
+
+*   **Vs. Bottom Two**: "The Coach" is significantly higher than both "The Bank Teller" (26.9%, p < 0.001) and "Familiar Friend" (29.4%, p < 0.001).
+*   **Vs. Runner-Up**: "The Coach" is widely preferred over "The Personal Assistant" (33.4%). The difference of **8.6 percentage points** is statistically significant (p = 0.017) at the standard 0.05 level.
+    *   *Note*: While p=0.017 is significant in isolation, it would not meet the stricter Bonferroni threshold (0.0083). However, the effect size (+8.6 percentage points) is commercially meaningful.
+
+**Conclusion**:
+Yes, "The Coach" has a statistically significantly higher Rank 1 share than each of the other options (uncorrected, alpha = 0.05). It is clearly superior to the bottom two options and holds a statistically significant lead over the runner-up ("Personal Assistant") in direct comparison.
+"""
+
+
+# %% voices analysis
+top3_voices = S.get_top_3_voices(data)[0]
+
+
+_pairwise_df_voice, _metadata = S.compute_ranking_significance(
+    top3_voices,alpha=0.05,correction="none")
+
+
+S.plot_significance_heatmap(
+    _pairwise_df_voice, 
+    metadata=_metadata,
+    title="Statistical Significance: Voice Top Choice Preference"
+)
 # %%