Compare commits

...

25 Commits

Author SHA1 Message Date
03a716e8ec correlation matrix speech characteristics vs score 2026-02-10 16:50:47 +01:00
8720bb670d started speech data notebook 2026-02-10 14:58:13 +01:00
9dfab75925 missing data analysis 2026-02-10 14:24:26 +01:00
14e28cf368 stat significance nr times ranked 1st 2026-02-09 18:37:41 +01:00
8e181e193a SL filter 2026-02-09 17:57:04 +01:00
6c16993cb3 straight-liner plot analysis 2026-02-09 17:26:45 +01:00
92c6fc03ab docs datasets 2026-02-09 13:17:59 +01:00
7fb6570190 statistical significance 2026-02-05 19:49:19 +01:00
840bd2940d other top bc's 2026-02-05 11:50:00 +01:00
af9a15ccb0 renamed notebooks and added significance test 2026-02-05 10:14:53 +01:00
a3cf9f103d update plots with final data release 2026-02-04 21:15:03 +01:00
f0eab32c34 update alt-text with full filepaths 2026-02-04 17:48:48 +01:00
d231fc02db fix missing filter descr in correlation plots 2026-02-04 14:48:14 +01:00
fc76bb0ab5 voice gender split correlation plots 2026-02-04 13:44:51 +01:00
ab78276a97 male/female voices in separate plots for correlations 2026-02-04 12:35:24 +01:00
e17646eb70 correlation plots for best bc 2026-02-04 10:46:31 +01:00
ad1d8c6e58 all plots offline update 2026-02-03 22:38:15 +01:00
f5b4c247b8 tidy plots 2026-02-03 22:12:17 +01:00
a35670aa72 fixed missing ai_user category 2026-02-03 21:13:29 +01:00
36280a6ff8 fix sample size 2026-02-03 20:48:34 +01:00
9a587dcc4c add ai-user filter combinations 2026-02-03 19:46:07 +01:00
9a49d1c690 added sample size to filter text 2026-02-03 19:16:39 +01:00
8f505da550 offline update 18-30 2026-02-03 18:43:20 +01:00
495b56307c fixed filter to none 2026-02-03 18:19:06 +01:00
1e76a82f24 fix wordcloud filter values 2026-02-03 17:41:12 +01:00
16 changed files with 5444 additions and 460 deletions

5
.vscode/extensions.json vendored Normal file
View File

@@ -0,0 +1,5 @@
{
"recommendations": [
"wakatime.vscode-wakatime"
]
}

5
.vscode/settings.json vendored Normal file
View File

@@ -0,0 +1,5 @@
{
"chat.tools.terminal.autoApprove": {
"/home/luigi/Documents/VoiceBranding/JPMC/Phase-3/.venv/bin/python": true
}
}

View File

@@ -21,9 +21,14 @@ def _():
@app.cell
def _():
TAG_SOURCE = Path('data/reports/Perception-Research-Report_2-2.pptx')
return
@app.cell
def _():
TAG_SOURCE = Path('data/reports/VOICE_Perception-Research-Report_4-2-26_19-30.pptx')
# TAG_TARGET = Path('data/reports/Perception-Research-Report_2-2_tagged.pptx')
TAG_IMAGE_DIR = Path('figures/2-2-26')
TAG_IMAGE_DIR = Path('figures/debug')
return TAG_IMAGE_DIR, TAG_SOURCE
@@ -47,10 +52,10 @@ def _():
@app.cell
def _():
REPLACE_SOURCE = Path('data/reports/Perception-Research-Report_2-2.pptx')
REPLACE_TARGET = Path('data/reports/Perception-Research-Report_2-2_updated.pptx')
REPLACE_SOURCE = Path('data/reports/VOICE_Perception-Research-Report_4-2-26_19-30.pptx')
# REPLACE_TARGET = Path('data/reports/Perception-Research-Report_2-2_updated.pptx')
NEW_IMAGES_DIR = Path('figures/2-2-26')
NEW_IMAGES_DIR = Path('figures/2-4-26')
return NEW_IMAGES_DIR, REPLACE_SOURCE

View File

@@ -0,0 +1,263 @@
"""Extra analyses of the traits"""
# %% Imports
import utils
import polars as pl
import argparse
import json
import re
from pathlib import Path
from validation import check_straight_liners
# %% Fixed Variables
RESULTS_FILE = 'data/exports/2-4-26/JPMC_Chase Brand Personality_Quant Round 1_February 4, 2026_Labels.csv'
QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'
# %% CLI argument parsing for batch automation
# When run as script: uv run XX_statistical_significance.script.py --age '["18
# Central filter configuration - add new filters here only
# Format: 'cli_arg_name': 'QualtricsSurvey.options_* attribute name'
FILTER_CONFIG = {
'age': 'options_age',
'gender': 'options_gender',
'ethnicity': 'options_ethnicity',
'income': 'options_income',
'consumer': 'options_consumer',
'business_owner': 'options_business_owner',
'ai_user': 'options_ai_user',
'investable_assets': 'options_investable_assets',
'industry': 'options_industry',
}
def parse_cli_args():
    """Parse CLI filter arguments, or return all-default args when interactive.

    Every key in ``FILTER_CONFIG`` becomes a ``--<name>`` option that takes a
    JSON-encoded list of values; after parsing, those JSON strings are decoded
    into Python lists (``None`` means "filter not applied").

    Returns:
        argparse.Namespace with one attribute per FILTER_CONFIG key, plus
        ``filter_name`` and ``figures_dir``.
    """
    parser = argparse.ArgumentParser(description='Generate quant report with optional filters')
    # Dynamically add filter arguments from config
    for filter_name in FILTER_CONFIG:
        parser.add_argument(f'--{filter_name}', type=str, default=None, help=f'JSON list of {filter_name} values')
    parser.add_argument('--filter-name', type=str, default=None, help='Name for this filter combination (used for .txt description file)')
    parser.add_argument('--figures-dir', type=str, default=f'figures/traits-likert-analysis/{Path(RESULTS_FILE).parts[2]}', help='Override the default figures directory')
    # Only parse sys.argv when running as a script (not in Jupyter/interactive)
    try:
        # get_ipython() is only defined inside an IPython/Jupyter kernel;
        # in a plain script it raises NameError and we fall through below.
        get_ipython()  # noqa: F821 # type: ignore
        # Interactive: parse an empty argv so every option takes its argparse
        # default. This replaces a hand-built Namespace that duplicated the
        # defaults above and could silently drift out of sync with them.
        return parser.parse_args([])
    except NameError:
        args = parser.parse_args()
        # Decode JSON strings to lists; options that were not passed stay None
        for filter_name in FILTER_CONFIG:
            val = getattr(args, filter_name)
            setattr(args, filter_name, json.loads(val) if val else None)
        return args
cli_args = parse_cli_args()
# %%
# Load the survey with the (possibly CLI-overridden) figures directory.
S = utils.QualtricsSurvey(RESULTS_FILE, QSF_FILE, figures_dir=cli_args.figures_dir)
data_all = S.load_data()
# %% Build filtered dataset based on CLI args
# CLI args: None means "no filter applied" - filter_data() will skip None filters
# Build filter values dict dynamically from FILTER_CONFIG
_active_filters = {filter_name: getattr(cli_args, filter_name) for filter_name in FILTER_CONFIG}
_d = S.filter_data(data_all, **_active_filters)
# Write filter description file if filter-name is provided
if cli_args.filter_name and S.fig_save_dir:
    # Get the filter slug (e.g., "All_Respondents", "Cons-Starter", etc.)
    _filter_slug = S._get_filter_slug()
    _filter_slug_dir = S.fig_save_dir / _filter_slug
    _filter_slug_dir.mkdir(parents=True, exist_ok=True)
    # Build filter description (one line per configured filter)
    _filter_desc_lines = [
        f"Filter: {cli_args.filter_name}",
        "",
        "Applied Filters:",
    ]
    _short_desc_parts = []
    for filter_name, options_attr in FILTER_CONFIG.items():
        all_options = getattr(S, options_attr)
        values = _active_filters[filter_name]
        display_name = filter_name.replace('_', ' ').title()
        # None means no filter applied (same as "All"); a value equal to the
        # full option list is also treated as "All" (no effective filtering)
        if values is not None and values != all_options:
            _short_desc_parts.append(f"{display_name}: {', '.join(values)}")
            _filter_desc_lines.append(f" {display_name}: {', '.join(values)}")
        else:
            _filter_desc_lines.append(f" {display_name}: All")
    # Write detailed description INSIDE the filter-slug directory
    # Sanitize filter name for filename usage (replace / and other chars)
    _safe_filter_name = re.sub(r'[^\w\s-]', '_', cli_args.filter_name)
    _filter_file = _filter_slug_dir / f"{_safe_filter_name}.txt"
    _filter_file.write_text('\n'.join(_filter_desc_lines))
    # Append to summary index file at figures/<export_date>/filter_index.txt
    _summary_file = S.fig_save_dir / "filter_index.txt"
    _short_desc = "; ".join(_short_desc_parts) if _short_desc_parts else "All Respondents"
    _summary_line = f"{_filter_slug} | {cli_args.filter_name} | {_short_desc}\n"
    # Append or create the summary file
    if _summary_file.exists():
        _existing = _summary_file.read_text()
        # Avoid duplicate entries for same slug
        if _filter_slug not in _existing:
            with _summary_file.open('a') as f:
                f.write(_summary_line)
    else:
        _header = "Filter Index\n" + "=" * 80 + "\n\n"
        _header += "Directory | Filter Name | Description\n"
        _header += "-" * 80 + "\n"
        _summary_file.write_text(_header + _summary_line)
# Save to logical variable name for further analysis
data = _d
data.collect()
# %% Voices per trait
# Extract the two speaking-style question groups and their answer mappings.
ss_or, choice_map_or = S.get_ss_orange_red(data)
ss_gb, choice_map_gb = S.get_ss_green_blue(data)
# Combine the data: one row per respondent with both question groups
ss_all = ss_or.join(ss_gb, on='_recordId')
_d = ss_all.collect()
choice_map = {**choice_map_or, **choice_map_gb}
# print(_d.head())
# print(choice_map)
# Reshape to long format (one row per respondent/trait observation).
ss_long = utils.process_speaking_style_data(ss_all, choice_map)
# %% Create plots
# One score plot per unique trait description.
for i, trait in enumerate(ss_long.select("Description").unique().to_series().to_list()):
    trait_d = ss_long.filter(pl.col("Description") == trait)
    S.plot_speaking_style_trait_scores(trait_d, title=trait.replace(":", ""), height=550, color_gender=True)
# %% Filter out straight-liner (PER TRAIT) and re-plot to see if any changes
# Save with different filename suffix so we can compare with/without straight-liners
print("\n--- Straight-lining Checks on TRAITS ---")
sl_report_traits, sl_traits_df = check_straight_liners(ss_all, max_score=5)
sl_traits_df
# %%
# Remove the specific straight-lined question blocks (not whole respondents)
# and regenerate the trait plots for a with/without comparison.
if sl_traits_df is not None and not sl_traits_df.is_empty():
    sl_ids = sl_traits_df.select(pl.col("Record ID").unique()).to_series().to_list()
    n_sl_groups = sl_traits_df.height
    print(f"\nExcluding {n_sl_groups} straight-lined question blocks from {len(sl_ids)} respondents.")
    # Create key in ss_long to match sl_traits_df for anti-join
    # Question Group key in sl_traits_df is like "SS_Orange_Red__V14"
    # ss_long has "Style_Group" and "Voice"
    ss_long_w_key = ss_long.with_columns(
        (pl.col("Style_Group") + "__" + pl.col("Voice")).alias("Question Group")
    )
    # Prepare filter table: Record ID + Question Group
    sl_filter = sl_traits_df.select([
        pl.col("Record ID").alias("_recordId"),
        pl.col("Question Group")
    ])
    # Anti-join to remove specific question blocks that were straight-lined
    ss_long_clean = ss_long_w_key.join(sl_filter, on=["_recordId", "Question Group"], how="anti").drop("Question Group")
    # Re-plot with suffix in title
    print("Re-plotting traits (Cleaned)...")
    for i, trait in enumerate(ss_long_clean.select("Description").unique().to_series().to_list()):
        trait_d = ss_long_clean.filter(pl.col("Description") == trait)
        # Modify title to create unique filename (and display title)
        title_clean = trait.replace(":", "") + " (Excl. Straight-Liners)"
        S.plot_speaking_style_trait_scores(trait_d, title=title_clean, height=550, color_gender=True)
else:
    print("No straight-liners found on traits.")
# %% Compare All vs Cleaned
if sl_traits_df is not None and not sl_traits_df.is_empty():
    print("Generating Comparison Plots (All vs Cleaned)...")
    # Always apply the per-question-group filtering here to ensure consistency
    # (Matches the logic used in the re-plotting section above)
    print("Applying filter to remove straight-lined question blocks...")
    ss_long_w_key = ss_long.with_columns(
        (pl.col("Style_Group") + "__" + pl.col("Voice")).alias("Question Group")
    )
    sl_filter = sl_traits_df.select([
        pl.col("Record ID").alias("_recordId"),
        pl.col("Question Group")
    ])
    ss_long_clean = ss_long_w_key.join(sl_filter, on=["_recordId", "Question Group"], how="anti").drop("Question Group")
    sl_ids = sl_traits_df.select(pl.col("Record ID").unique()).to_series().to_list()
    # --- Verification Prints ---
    # Sanity-check that the row-count delta matches the number of rows that
    # an inner join on the same key finds (i.e. the anti-join removed exactly
    # the straight-lined blocks and nothing else).
    print(f"\n--- Verification of Filter ---")
    print(f"Original Row Count: {ss_long.height}")
    print(f"Number of Straight-Liner Question Blocks: {sl_traits_df.height}")
    print(f"Sample IDs affected: {sl_ids[:5]}")
    print(f"Cleaned Row Count: {ss_long_clean.height}")
    print(f"Rows Removed: {ss_long.height - ss_long_clean.height}")
    # Verify removal
    # Re-construct key to verify
    ss_long_check = ss_long.with_columns(
        (pl.col("Style_Group") + "__" + pl.col("Voice")).alias("Question Group")
    )
    sl_filter_check = sl_traits_df.select([
        pl.col("Record ID").alias("_recordId"),
        pl.col("Question Group")
    ])
    should_be_removed = ss_long_check.join(sl_filter_check, on=["_recordId", "Question Group"], how="inner").height
    print(f"Discrepancy Check (Should be 0): { (ss_long.height - ss_long_clean.height) - should_be_removed }")
    # Show what was removed (the straight lining behavior)
    print("\nSample of Straight-Liner Data (Values that caused removal):")
    print(sl_traits_df.head(5))
    print("-" * 30 + "\n")
    # ---------------------------
    # Side-by-side (All vs Cleaned) plot per trait.
    for i, trait in enumerate(ss_long.select("Description").unique().to_series().to_list()):
        # Get data for this trait from both datasets
        trait_d_all = ss_long.filter(pl.col("Description") == trait)
        trait_d_clean = ss_long_clean.filter(pl.col("Description") == trait)
        # Plot comparison
        title_comp = trait.replace(":", "") + " (Impact of Straight-Liners)"
        S.plot_speaking_style_trait_scores_comparison(
            trait_d_all,
            trait_d_clean,
            title=title_comp,
            height=600  # Slightly taller for grouped bars
        )

View File

@@ -7,13 +7,20 @@ import polars as pl
from pathlib import Path
import argparse
import json
import re
from validation import check_progress, duration_validation, check_straight_liners
from utils import QualtricsSurvey, combine_exclusive_columns, calculate_weighted_ranking_scores
import utils
from speaking_styles import SPEAKING_STYLES
# %% Fixed Variables
RESULTS_FILE = 'data/exports/2-4-26/JPMC_Chase Brand Personality_Quant Round 1_February 4, 2026_Labels.csv'
# RESULTS_FILE = 'data/exports/debug/JPMC_Chase Brand Personality_Quant Round 1_February 2, 2026_Labels.csv'
QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'
# %%
# CLI argument parsing for batch automation
# When run as script: python 03_quant_report.script.py --age '["18 to 21 years"]' --consumer '["Starter"]'
@@ -41,13 +48,18 @@ def parse_cli_args():
parser.add_argument(f'--{filter_name}', type=str, default=None, help=f'JSON list of {filter_name} values')
parser.add_argument('--filter-name', type=str, default=None, help='Name for this filter combination (used for .txt description file)')
parser.add_argument('--figures-dir', type=str, default=f'figures/{Path(RESULTS_FILE).parts[2]}', help='Override the default figures directory')
parser.add_argument('--best-character', type=str, default="the_coach", help='Slug of the best chosen character (default: "the_coach")')
parser.add_argument('--sl-threshold', type=int, default=None, help='Exclude respondents who straight-lined >= N question groups (e.g. 3 removes anyone with 3+ straight-lined groups)')
parser.add_argument('--voice-ranking-filter', type=str, default=None, choices=['only-missing', 'exclude-missing'], help='Filter by voice ranking completeness: "only-missing" keeps only respondents missing QID98 ranking data, "exclude-missing" removes them')
# Only parse if running as script (not in Jupyter/interactive)
try:
# Check if running in Jupyter by looking for ipykernel
get_ipython() # noqa: F821
get_ipython() # noqa: F821 # type: ignore
# Return namespace with all filters set to None
return argparse.Namespace(**{f: None for f in FILTER_CONFIG}, filter_name=None)
no_filters = {f: None for f in FILTER_CONFIG}
return argparse.Namespace(**no_filters, filter_name=None, figures_dir=f'figures/statistical_significance/{Path(RESULTS_FILE).parts[2]}', best_character="the_coach", sl_threshold=None, voice_ranking_filter=None)
except NameError:
args = parser.parse_args()
# Parse JSON strings to lists
@@ -57,71 +69,26 @@ def parse_cli_args():
return args
cli_args = parse_cli_args()
BEST_CHOSEN_CHARACTER = cli_args.best_character
# %%
# file_browser = mo.ui.file_browser(
# initial_path="./data/exports", multiple=False, restrict_navigation=True, filetypes=[".csv"], label="Select 'Labels' File"
# )
# file_browser
# # %%
# mo.stop(file_browser.path(index=0) is None, mo.md("**⚠️ Please select a `_Labels.csv` file above to proceed**"))
# RESULTS_FILE = Path(file_browser.path(index=0))
RESULTS_FILE = 'data/exports/2-3-26_Copy-2-2-26/JPMC_Chase Brand Personality_Quant Round 1_February 2, 2026_Labels.csv'
QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'
# %%
S = QualtricsSurvey(RESULTS_FILE, QSF_FILE)
S = QualtricsSurvey(RESULTS_FILE, QSF_FILE, figures_dir=cli_args.figures_dir)
try:
data_all = S.load_data()
except NotImplementedError as e:
mo.stop(True, mo.md(f"**⚠️ {str(e)}**"))
# %%
BEST_CHOSEN_CHARACTER = "the_coach"
# # %%
# filter_form = mo.md('''
# %% Build filtered dataset based on CLI args
# CLI args: None means "no filter applied" - filter_data() will skip None filters
# {age}
# {gender}
# {ethnicity}
# {income}
# {consumer}
# '''
# ).batch(
# age=mo.ui.multiselect(options=S.options_age, value=S.options_age, label="Select Age Group(s):"),
# gender=mo.ui.multiselect(options=S.options_gender, value=S.options_gender, label="Select Gender(s):"),
# ethnicity=mo.ui.multiselect(options=S.options_ethnicity, value=S.options_ethnicity, label="Select Ethnicities:"),
# income=mo.ui.multiselect(options=S.options_income, value=S.options_income, label="Select Income Group(s):"),
# consumer=mo.ui.multiselect(options=S.options_consumer, value=S.options_consumer, label="Select Consumer Groups:")
# ).form()
# mo.md(f'''
# ---
# # Data Filter
# {filter_form}
# ''')
# %%
# mo.stop(filter_form.value is None, mo.md("**Please submit filter above to proceed**"))
# CLI args: None means "all options selected" (use S.options_* defaults)
# Build filter values dict dynamically from FILTER_CONFIG
_active_filters = {}
for filter_name, options_attr in FILTER_CONFIG.items():
cli_value = getattr(cli_args, filter_name)
all_options = getattr(S, options_attr)
_active_filters[filter_name] = cli_value if cli_value is not None else all_options
_active_filters = {filter_name: getattr(cli_args, filter_name) for filter_name in FILTER_CONFIG}
# %% Apply filters
_d = S.filter_data(data_all, **_active_filters)
# Write filter description file if filter-name is provided
@@ -142,14 +109,17 @@ if cli_args.filter_name and S.fig_save_dir:
all_options = getattr(S, options_attr)
values = _active_filters[filter_name]
display_name = filter_name.replace('_', ' ').title()
if values != all_options:
# None means no filter applied (same as "All")
if values is not None and values != all_options:
_short_desc_parts.append(f"{display_name}: {', '.join(values)}")
_filter_desc_lines.append(f" {display_name}: {', '.join(values)}")
else:
_filter_desc_lines.append(f" {display_name}: All")
# Write detailed description INSIDE the filter-slug directory
_filter_file = _filter_slug_dir / f"{cli_args.filter_name}.txt"
# Sanitize filter name for filename usage (replace / and other chars)
_safe_filter_name = re.sub(r'[^\w\s-]', '_', cli_args.filter_name)
_filter_file = _filter_slug_dir / f"{_safe_filter_name}.txt"
_filter_file.write_text('\n'.join(_filter_desc_lines))
# Append to summary index file at figures/<export_date>/filter_index.txt
@@ -170,14 +140,65 @@ if cli_args.filter_name and S.fig_save_dir:
_header += "-" * 80 + "\n"
_summary_file.write_text(_header + _summary_line)
# Stop execution and prevent other cells from running if no data is selected
# mo.stop(len(_d.collect()) == 0, mo.md("**No Data available for current filter combination**"))
data = _d
# %% Apply straight-liner threshold filter (if specified)
# Removes respondents who straight-lined >= N question groups across
# speaking style and voice scale questions.
if cli_args.sl_threshold is not None:
_sl_n = cli_args.sl_threshold
S.sl_threshold = _sl_n # Store on Survey so filter slug/description include it
print(f"Applying straight-liner filter: excluding respondents with ≥{_sl_n} straight-lined question groups...")
_n_before = _d.select(pl.len()).collect().item()
# data = data_validated
# Extract question groups with renamed columns for check_straight_liners
_sl_ss_or, _ = S.get_ss_orange_red(_d)
_sl_ss_gb, _ = S.get_ss_green_blue(_d)
_sl_vs, _ = S.get_voice_scale_1_10(_d)
_sl_all_q = _sl_ss_or.join(_sl_ss_gb, on='_recordId').join(_sl_vs, on='_recordId')
_, _sl_df = check_straight_liners(_sl_all_q, max_score=5)
if _sl_df is not None and not _sl_df.is_empty():
# Count straight-lined question groups per respondent
_sl_counts = (
_sl_df
.group_by("Record ID")
.agg(pl.len().alias("sl_count"))
.filter(pl.col("sl_count") >= _sl_n)
.select(pl.col("Record ID").alias("_recordId"))
)
# Anti-join to remove offending respondents
_d = _d.collect().join(_sl_counts, on="_recordId", how="anti").lazy()
# Update filtered data on the Survey object so sample size is correct
S.data_filtered = _d
_n_after = _d.select(pl.len()).collect().item()
print(f" Removed {_n_before - _n_after} respondents ({_n_before}{_n_after})")
else:
print(" No straight-liners detected — no respondents removed.")
# %% Apply voice-ranking completeness filter (if specified)
# Keeps only / excludes respondents who are missing the explicit voice
# ranking question (QID98) despite completing the top-3 selection (QID36).
if cli_args.voice_ranking_filter is not None:
S.voice_ranking_filter = cli_args.voice_ranking_filter # Store on Survey so filter slug/description include it
_vr_missing = S.get_top_3_voices_missing_ranking(_d)
_vr_missing_ids = _vr_missing.select('_recordId')
_n_before = _d.select(pl.len()).collect().item()
if cli_args.voice_ranking_filter == 'only-missing':
print(f"Voice ranking filter: keeping ONLY respondents missing QID98 ranking data...")
_d = _d.collect().join(_vr_missing_ids, on='_recordId', how='inner').lazy()
elif cli_args.voice_ranking_filter == 'exclude-missing':
print(f"Voice ranking filter: EXCLUDING respondents missing QID98 ranking data...")
_d = _d.collect().join(_vr_missing_ids, on='_recordId', how='anti').lazy()
S.data_filtered = _d
_n_after = _d.select(pl.len()).collect().item()
print(f" {_n_before}{_n_after} respondents ({_vr_missing_ids.height} missing ranking data)")
# Save to logical variable name for further analysis
data = _d
data.collect()
# %%
# %%
@@ -560,6 +581,39 @@ S.plot_speaking_style_color_correlation(
title="Correlation: Speaking Style Colors and Voice Ranking Points"
)
# %%
# Gender-filtered correlation plots (Male vs Female voices)
from reference import VOICE_GENDER_MAPPING
MALE_VOICES = [v for v, g in VOICE_GENDER_MAPPING.items() if g == "Male"]
FEMALE_VOICES = [v for v, g in VOICE_GENDER_MAPPING.items() if g == "Female"]
# Filter joined data by voice gender
joined_scale_male = joined_scale.filter(pl.col("Voice").is_in(MALE_VOICES))
joined_scale_female = joined_scale.filter(pl.col("Voice").is_in(FEMALE_VOICES))
joined_ranking_male = joined_ranking.filter(pl.col("Voice").is_in(MALE_VOICES))
joined_ranking_female = joined_ranking.filter(pl.col("Voice").is_in(FEMALE_VOICES))
# Colors vs Scale 1-10 (grouped by voice gender)
S.plot_speaking_style_color_correlation_by_gender(
data_male=joined_scale_male,
data_female=joined_scale_female,
speaking_styles=SPEAKING_STYLES,
target_column="Voice_Scale_Score",
title="Correlation: Speaking Style Colors and Voice Scale 1-10 (by Voice Gender)",
filename="correlation_speaking_style_and_voice_scale_1-10_by_voice_gender_color",
)
# Colors vs Ranking Points (grouped by voice gender)
S.plot_speaking_style_color_correlation_by_gender(
data_male=joined_ranking_male,
data_female=joined_ranking_female,
speaking_styles=SPEAKING_STYLES,
target_column="Ranking_Points",
title="Correlation: Speaking Style Colors and Voice Ranking Points (by Voice Gender)",
filename="correlation_speaking_style_and_voice_ranking_points_by_voice_gender_color",
)
# %%
mo.md(r"""
### Individual Traits vs Scale 1-10
@@ -570,7 +624,7 @@ _content = """"""
for _style, _traits in SPEAKING_STYLES.items():
# print(f"Correlation plot for {style}...")
_fig = S.plot_speaking_style_correlation(
_fig = S.plot_speaking_style_scale_correlation(
data=joined_scale,
style_color=_style,
style_traits=_traits,
@@ -609,86 +663,187 @@ for _style, _traits in SPEAKING_STYLES.items():
mo.md(_content)
# %%
mo.md(r"""
## Correlations when "Best Brand Character" is chosen
# Individual Traits vs Scale 1-10 (grouped by voice gender)
_content = """### Individual Traits vs Scale 1-10 (by Voice Gender)\n\n"""
Select only the traits that fit with that character
""")
# %%
from reference import ORIGINAL_CHARACTER_TRAITS
chosen_bc_traits = ORIGINAL_CHARACTER_TRAITS[BEST_CHOSEN_CHARACTER]
# %%
STYLES_SUBSET = utils.filter_speaking_styles(SPEAKING_STYLES, chosen_bc_traits)
# %%
mo.md(r"""
### Individual Traits vs Ranking Points
""")
# %%
_content = ""
for _style, _traits in STYLES_SUBSET.items():
_fig = S.plot_speaking_style_ranking_correlation(
data=joined_ranking,
for _style, _traits in SPEAKING_STYLES.items():
_fig = S.plot_speaking_style_scale_correlation_by_gender(
data_male=joined_scale_male,
data_female=joined_scale_female,
style_color=_style,
style_traits=_traits,
title=f"""Brand Character "{BEST_CHOSEN_CHARACTER.replace('_', ' ').title()}" - Correlation: Speaking Style {_style} and Voice Ranking Points"""
title=f"Correlation: Speaking Style {_style} and Voice Scale 1-10 (by Voice Gender)",
filename=f"correlation_speaking_style_and_voice_scale_1-10_by_voice_gender_{_style.lower()}",
)
_content += f"""
#### Speaking Style **{_style}**:
{mo.ui.altair_chart(_fig)}
"""
mo.md(_content)
# %%
mo.md(r"""
### Individual Traits vs Scale 1-10
""")
# Individual Traits vs Ranking Points (grouped by voice gender)
_content = """### Individual Traits vs Ranking Points (by Voice Gender)\n\n"""
# %%
_content = """"""
for _style, _traits in STYLES_SUBSET.items():
# print(f"Correlation plot for {style}...")
_fig = S.plot_speaking_style_correlation(
data=joined_scale,
for _style, _traits in SPEAKING_STYLES.items():
_fig = S.plot_speaking_style_ranking_correlation_by_gender(
data_male=joined_ranking_male,
data_female=joined_ranking_female,
style_color=_style,
style_traits=_traits,
title=f"""Brand Character "{BEST_CHOSEN_CHARACTER.replace('_', ' ').title()}" - Correlation: Speaking Style {_style} and Voice Scale 1-10""",
title=f"Correlation: Speaking Style {_style} and Voice Ranking Points (by Voice Gender)",
filename=f"correlation_speaking_style_and_voice_ranking_points_by_voice_gender_{_style.lower()}",
)
_content += f"""
#### Speaking Style **{_style}**:
{mo.ui.altair_chart(_fig)}
"""
mo.md(_content)
# %%
mo.md(r"""
### Colors vs Scale 1-10 (Best Character)
""")
# ## Correlations when "Best Brand Character" is chosen
# For each of the 4 brand characters, filter the dataset to only those respondents
# who selected that character as their #1 choice.
# %%
# Transform to get one row per color with average correlation
_color_corr_scale, _ = utils.transform_speaking_style_color_correlation(joined_scale, STYLES_SUBSET)
S.plot_speaking_style_color_correlation(
data=_color_corr_scale,
title=f"""Brand Character "{BEST_CHOSEN_CHARACTER.replace('_', ' ').title()}" - Correlation: Speaking Style Colors and Voice Scale 1-10"""
)
# Prepare character-filtered data subsets
char_rank_for_filter = S.get_character_ranking(data)[0].collect()
CHARACTER_FILTER_MAP = {
'Familiar Friend': 'Character_Ranking_Familiar_Friend',
'The Coach': 'Character_Ranking_The_Coach',
'Personal Assistant': 'Character_Ranking_The_Personal_Assistant',
'Bank Teller': 'Character_Ranking_The_Bank_Teller',
}
def get_filtered_data_for_character(char_name: str) -> tuple[pl.DataFrame, pl.DataFrame, int]:
"""Filter joined_scale and joined_ranking to respondents who ranked char_name #1."""
col = CHARACTER_FILTER_MAP[char_name]
respondents = char_rank_for_filter.filter(pl.col(col) == 1).select('_recordId')
n = respondents.height
filtered_scale = joined_scale.join(respondents, on='_recordId', how='inner')
filtered_ranking = joined_ranking.join(respondents, on='_recordId', how='inner')
return filtered_scale, filtered_ranking, n
def _char_filename(char_name: str, suffix: str) -> str:
"""Generate filename for character-filtered plots (without n-value).
Format: bc_ranked_1_{suffix}__{char_slug}
This groups all plot types together in directory listings.
"""
char_slug = char_name.lower().replace(' ', '_')
return f"bc_ranked_1_{suffix}__{char_slug}"
# %%
mo.md(r"""
### Colors vs Ranking Points (Best Character)
""")
# ### Voice Weighted Ranking Score (by Best Character)
for char_name in CHARACTER_FILTER_MAP:
_, _, n = get_filtered_data_for_character(char_name)
# Get top3 voices for this character subset using _recordIds
respondents = char_rank_for_filter.filter(
pl.col(CHARACTER_FILTER_MAP[char_name]) == 1
).select('_recordId')
# Collect top3_voices if it's a LazyFrame, then join
top3_df = top3_voices.collect() if isinstance(top3_voices, pl.LazyFrame) else top3_voices
filtered_top3 = top3_df.join(respondents, on='_recordId', how='inner')
weighted = calculate_weighted_ranking_scores(filtered_top3)
S.plot_weighted_ranking_score(
data=weighted,
title=f'"{char_name}" Ranked #1 (n={n})<br>Most Popular Voice - Weighted Score (1st=3pts, 2nd=2pts, 3rd=1pt)',
filename=_char_filename(char_name, "voice_weighted_ranking_score"),
color_gender=COLOR_GENDER,
)
# %%
# ### Voice Scale 1-10 Average Scores (by Best Character)
for char_name in CHARACTER_FILTER_MAP:
_, _, n = get_filtered_data_for_character(char_name)
# Get voice scale data for this character subset using _recordIds
respondents = char_rank_for_filter.filter(
pl.col(CHARACTER_FILTER_MAP[char_name]) == 1
).select('_recordId')
# Collect voice_1_10 if it's a LazyFrame, then join
voice_1_10_df = voice_1_10.collect() if isinstance(voice_1_10, pl.LazyFrame) else voice_1_10
filtered_voice_1_10 = voice_1_10_df.join(respondents, on='_recordId', how='inner')
S.plot_average_scores_with_counts(
data=filtered_voice_1_10,
title=f'"{char_name}" Ranked #1 (n={n})<br>Voice General Impression (Scale 1-10)',
filename=_char_filename(char_name, "voice_scale_1-10"),
x_label='Voice',
domain=[1, 10],
color_gender=COLOR_GENDER,
)
# %%
# ### Speaking Style Colors vs Scale 1-10 (only for Best Character)
for char_name in CHARACTER_FILTER_MAP:
if char_name.lower().replace(' ', '_') != BEST_CHOSEN_CHARACTER:
continue
filtered_scale, _, n = get_filtered_data_for_character(char_name)
color_corr, _ = utils.transform_speaking_style_color_correlation(filtered_scale, SPEAKING_STYLES)
S.plot_speaking_style_color_correlation(
data=color_corr,
title=f'"{char_name}" Ranked #1 (n={n})<br>Correlation: Speaking Style Colors vs Voice Scale 1-10',
filename=_char_filename(char_name, "colors_vs_voice_scale_1-10"),
)
# %%
# ### Speaking Style Colors vs Ranking Points (only for Best Character)
for char_name in CHARACTER_FILTER_MAP:
if char_name.lower().replace(' ', '_') != BEST_CHOSEN_CHARACTER:
continue
_, filtered_ranking, n = get_filtered_data_for_character(char_name)
color_corr, _ = utils.transform_speaking_style_color_correlation(
filtered_ranking, SPEAKING_STYLES, target_column="Ranking_Points"
)
S.plot_speaking_style_color_correlation(
data=color_corr,
title=f'"{char_name}" Ranked #1 (n={n})<br>Correlation: Speaking Style Colors vs Voice Ranking Points',
filename=_char_filename(char_name, "colors_vs_voice_ranking_points"),
)
# %%
# ### Individual Traits vs Scale 1-10 (only for Best Character)
for _style, _traits in SPEAKING_STYLES.items():
print(f"--- Speaking Style: {_style} ---")
for char_name in CHARACTER_FILTER_MAP:
if char_name.lower().replace(' ', '_') != BEST_CHOSEN_CHARACTER:
continue
filtered_scale, _, n = get_filtered_data_for_character(char_name)
S.plot_speaking_style_scale_correlation(
data=filtered_scale,
style_color=_style,
style_traits=_traits,
title=f'"{char_name}" Ranked #1 (n={n})<br>Correlation: {_style} vs Voice Scale 1-10',
filename=_char_filename(char_name, f"{_style.lower()}_vs_voice_scale_1-10"),
)
# %%
# ### Individual Traits vs Ranking Points (only for Best Character)
for _style, _traits in SPEAKING_STYLES.items():
print(f"--- Speaking Style: {_style} ---")
for char_name in CHARACTER_FILTER_MAP:
if char_name.lower().replace(' ', '_') != BEST_CHOSEN_CHARACTER:
continue
_, filtered_ranking, n = get_filtered_data_for_character(char_name)
S.plot_speaking_style_ranking_correlation(
data=filtered_ranking,
style_color=_style,
style_traits=_traits,
title=f'"{char_name}" Ranked #1 (n={n})<br>Correlation: {_style} vs Voice Ranking Points',
filename=_char_filename(char_name, f"{_style.lower()}_vs_voice_ranking_points"),
)
# %%
_color_corr_ranking, _ = utils.transform_speaking_style_color_correlation(
joined_ranking,
STYLES_SUBSET,
target_column="Ranking_Points"
)
S.plot_speaking_style_color_correlation(
data=_color_corr_ranking,
title=f"""Brand Character "{BEST_CHOSEN_CHARACTER.replace('_', ' ').title()}" - Correlation: Speaking Style Colors and Voice Ranking Points"""
)

View File

@@ -0,0 +1,370 @@
"""Extra statistical significance analyses for quant report."""
# %% Imports
import utils
import polars as pl
import argparse
import json
import re
from pathlib import Path
# %% Fixed Variables
# Qualtrics CSV export (labels, not numeric codes) and the QSF survey
# definition it was generated from.
RESULTS_FILE = 'data/exports/2-4-26/JPMC_Chase Brand Personality_Quant Round 1_February 4, 2026_Labels.csv'
QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'
# %% CLI argument parsing for batch automation
# When run as script: uv run XX_statistical_significance.script.py --age '["18 to 21 years"]'
# Central filter configuration - add new filters here only
# Format: 'cli_arg_name': 'QualtricsSurvey.options_* attribute name'
FILTER_CONFIG = {
    'age': 'options_age',
    'gender': 'options_gender',
    'ethnicity': 'options_ethnicity',
    'income': 'options_income',
    'consumer': 'options_consumer',
    'business_owner': 'options_business_owner',
    'ai_user': 'options_ai_user',
    'investable_assets': 'options_investable_assets',
    'industry': 'options_industry',
}
def parse_cli_args():
    """Parse CLI filter arguments, or return an all-None namespace in Jupyter.

    One ``--<filter>`` argument is generated per entry in FILTER_CONFIG; each
    accepts a JSON-encoded list of option values (e.g. '["18 to 21 years"]')
    which is decoded to a Python list. None means "filter not applied".

    Returns:
        argparse.Namespace with one attribute per FILTER_CONFIG key plus
        ``filter_name`` and ``figures_dir``.
    """
    parser = argparse.ArgumentParser(description='Generate quant report with optional filters')
    # Dynamically add filter arguments from config
    for filter_name in FILTER_CONFIG:
        parser.add_argument(f'--{filter_name}', type=str, default=None, help=f'JSON list of {filter_name} values')
    parser.add_argument('--filter-name', type=str, default=None, help='Name for this filter combination (used for .txt description file)')
    parser.add_argument('--figures-dir', type=str, default=f'figures/statistical_significance/{Path(RESULTS_FILE).parts[2]}', help='Override the default figures directory')
    # Only parse argv when running as a script (not in Jupyter/interactive).
    # Keep the try body minimal: get_ipython is only defined inside
    # IPython/Jupyter kernels and raises NameError elsewhere.
    try:
        get_ipython()  # noqa: F821 # type: ignore
    except NameError:
        args = parser.parse_args()
        # Decode JSON strings to lists; unset filters stay None
        for filter_name in FILTER_CONFIG:
            val = getattr(args, filter_name)
            setattr(args, filter_name, json.loads(val) if val else None)
        return args
    # Interactive session: no CLI args; all filters off. Pull figures_dir from
    # the parser instead of re-stating the literal, so the two defaults can
    # never drift apart.
    no_filters = {f: None for f in FILTER_CONFIG}
    return argparse.Namespace(**no_filters, filter_name=None, figures_dir=parser.get_default('figures_dir'))
cli_args = parse_cli_args()
# %%
S = utils.QualtricsSurvey(RESULTS_FILE, QSF_FILE, figures_dir=cli_args.figures_dir)
data_all = S.load_data()
# %% Build filtered dataset based on CLI args
# CLI args: None means "no filter applied" - filter_data() will skip None filters
# Build filter values dict dynamically from FILTER_CONFIG
_active_filters = {filter_name: getattr(cli_args, filter_name) for filter_name in FILTER_CONFIG}
_d = S.filter_data(data_all, **_active_filters)
# Write filter description file if filter-name is provided
# NOTE(review): this description/index section is duplicated in
# XX_straight_liners.py - consider extracting a shared helper.
if cli_args.filter_name and S.fig_save_dir:
    # Get the filter slug (e.g., "All_Respondents", "Cons-Starter", etc.)
    _filter_slug = S._get_filter_slug()
    _filter_slug_dir = S.fig_save_dir / _filter_slug
    _filter_slug_dir.mkdir(parents=True, exist_ok=True)
    # Build filter description
    _filter_desc_lines = [
        f"Filter: {cli_args.filter_name}",
        "",
        "Applied Filters:",
    ]
    _short_desc_parts = []
    for filter_name, options_attr in FILTER_CONFIG.items():
        all_options = getattr(S, options_attr)
        values = _active_filters[filter_name]
        display_name = filter_name.replace('_', ' ').title()
        # None means no filter applied (same as "All");
        # selecting every option is also treated as "All".
        if values is not None and values != all_options:
            _short_desc_parts.append(f"{display_name}: {', '.join(values)}")
            _filter_desc_lines.append(f" {display_name}: {', '.join(values)}")
        else:
            _filter_desc_lines.append(f" {display_name}: All")
    # Write detailed description INSIDE the filter-slug directory
    # Sanitize filter name for filename usage (replace / and other chars)
    _safe_filter_name = re.sub(r'[^\w\s-]', '_', cli_args.filter_name)
    _filter_file = _filter_slug_dir / f"{_safe_filter_name}.txt"
    _filter_file.write_text('\n'.join(_filter_desc_lines))
    # Append to summary index file at figures/<export_date>/filter_index.txt
    _summary_file = S.fig_save_dir / "filter_index.txt"
    _short_desc = "; ".join(_short_desc_parts) if _short_desc_parts else "All Respondents"
    _summary_line = f"{_filter_slug} | {cli_args.filter_name} | {_short_desc}\n"
    # Append or create the summary file
    if _summary_file.exists():
        _existing = _summary_file.read_text()
        # Avoid duplicate entries for same slug
        if _filter_slug not in _existing:
            with _summary_file.open('a') as f:
                f.write(_summary_line)
    else:
        _header = "Filter Index\n" + "=" * 80 + "\n\n"
        _header += "Directory | Filter Name | Description\n"
        _header += "-" * 80 + "\n"
        _summary_file.write_text(_header + _summary_line)
# Save to logical variable name for further analysis
data = _d
# NOTE(review): result of collect() is discarded - presumably to materialise
# the lazy frame so filter errors surface here; confirm intent.
data.collect()
# %% Character coach significantly higher than others
# char_rank: per-respondent rank (1-4) for each of the four characters.
char_rank = S.get_character_ranking(data)[0]
_pairwise_df, _meta = S.compute_ranking_significance(
    char_rank,
    alpha=0.05,
    correction="none",  # uncorrected pairwise tests; see methodology note below
)
# %% [markdown]
"""
### Methodology Analysis
**Input Data (`char_rank`)**:
* Generated by `S.get_character_ranking(data)`.
* Contains the ranking values (1st, 2nd, 3rd, 4th) assigned by each respondent to the four options ("The Coach", etc.).
* Columns represent the characters; rows represent individual respondents; values are the numerical rank (1 = Top Choice).
**Processing**:
* The function `compute_ranking_significance` aggregates these rankings to find the **"Rank 1 Share"** (the percentage of respondents who picked that character as their #1 favorite).
* It builds a contingency table of how many times each character was ranked 1st vs. not 1st (or 1st vs. 2nd vs. 3rd).
**Statistical Test**:
* **Test Used**: Pairwise Z-test for two proportions (uncorrected).
* **Comparison**: It compares the **Rank 1 Share** of every pair of characters.
* *Example*: "Is the 42% of people who chose 'Coach' significantly different from the 29% who chose 'Familiar Friend'?"
* **Significance**: A result of `p < 0.05` means the difference in popularity (top-choice preference) is statistically significant and not due to random chance.
"""
# %% Plot heatmap of pairwise significance
# One cell per character pair; see methodology note above for the test used.
S.plot_significance_heatmap(_pairwise_df, metadata=_meta, title="Statistical Significance: Character Top Choice Preference")
# %% Plot summary of significant differences (e.g., which characters are significantly higher than others)
# S.plot_significance_summary(_pairwise_df, metadata=_meta)
# %% [markdown]
"""
# Analysis: Significance of "The Coach"
**Parameters**: `alpha=0.05`, `correction='none'`
* **Rationale**: No correction was applied to allow for detection of all potential pairwise differences (uncorrected p < 0.05). If strict control for family-wise error rate were required (e.g., Bonferroni), the significance threshold would be lower (p < 0.0083).
**Results**:
"The Coach" is the top-ranked option (42.0% Rank 1 share) and shows strong separation from the field.
* **Vs. Bottom Two**: "The Coach" is significantly higher than both "The Bank Teller" (26.9%, p < 0.001) and "Familiar Friend" (29.4%, p < 0.001).
* **Vs. Runner-Up**: "The Coach" is widely preferred over "The Personal Assistant" (33.4%). The difference of **8.6 percentage points** is statistically significant (p = 0.017) at the standard 0.05 level.
* *Note*: While p=0.017 is significant in isolation, it would not meet the stricter Bonferroni threshold (0.0083). However, the effect size (+8.6%) is commercially meaningful.
**Conclusion**:
Yes, "The Coach" can be considered statistically more significant than the other options. It is clearly superior to the bottom two options and holds a statistically significant lead over the runner-up ("Personal Assistant") in direct comparison.
"""
# %% Mentions significance analysis
# "Mentions" = how often a character appears anywhere in the top-3 ranking
# (visibility), as opposed to rank-1-only preference above.
char_pairwise_df_mentions, _meta_mentions = S.compute_mentions_significance(
    char_rank,
    alpha=0.05,
    correction="none",
)
S.plot_significance_heatmap(
    char_pairwise_df_mentions,
    metadata=_meta_mentions,
    title="Statistical Significance: Character Total Mentions (Top 3 Visibility)"
)
# %% voices analysis
# Same pairwise tests, now at the voice level (top-3 voice ranking).
top3_voices = S.get_top_3_voices(data)[0]
_pairwise_df_voice, _metadata = S.compute_ranking_significance(
    top3_voices, alpha=0.05, correction="none")
S.plot_significance_heatmap(
    _pairwise_df_voice,
    metadata=_metadata,
    title="Statistical Significance: Voice Top Choice Preference"
)
# %% Total Mentions Significance (Rank 1+2+3 Combined)
# This tests "Quantity" (Visibility) instead of "Quality" (Preference)
# NOTE(review): _meta_mentions is reassigned here, shadowing the character-level
# metadata above - rename one of the two if both are needed later.
_pairwise_df_mentions, _meta_mentions = S.compute_mentions_significance(
    top3_voices,
    alpha=0.05,
    correction="none"
)
S.plot_significance_heatmap(
    _pairwise_df_mentions,
    metadata=_meta_mentions,
    title="Statistical Significance: Voice Total Mentions (Top 3 Visibility)"
)
# %% Male Voices Only Analysis
import reference
def filter_voices_by_gender(df: pl.DataFrame, target_gender: str) -> pl.DataFrame:
    """Keep only the ranking columns whose voice matches *target_gender*.

    Voice columns follow the pattern ``<prefix>__Vxx`` (e.g.
    ``Top_3_Voices_ranking__V14``); the ``Vxx`` id is looked up in
    ``reference.VOICE_GENDER_MAPPING``. A ``_recordId`` identifier column
    is always retained when present.
    """
    selected = ['_recordId'] if '_recordId' in df.columns else []
    selected.extend(
        col for col in df.columns
        if '__V' in col
        and reference.VOICE_GENDER_MAPPING.get(col.split('__')[1]) == target_gender
    )
    return df.select(selected)
# Get full ranking data as DataFrame
df_voices = top3_voices.collect()
# Filter for Male voices
df_male_voices = filter_voices_by_gender(df_voices, 'Male')
# 1. Male Voices: Top Choice Preference (Rank 1)
_pairwise_male_pref, _meta_male_pref = S.compute_ranking_significance(
    df_male_voices,
    alpha=0.05,
    correction="none"
)
S.plot_significance_heatmap(
    _pairwise_male_pref,
    metadata=_meta_male_pref,
    title="Male Voices Only: Top Choice Preference Significance"
)
# 2. Male Voices: Total Mentions (Visibility)
_pairwise_male_vis, _meta_male_vis = S.compute_mentions_significance(
    df_male_voices,
    alpha=0.05,
    correction="none"
)
S.plot_significance_heatmap(
    _pairwise_male_vis,
    metadata=_meta_male_vis,
    title="Male Voices Only: Total Mentions Significance"
)
# %% Male Voices (Excluding Bottom 3: V88, V86, V81)
# Start with the male voices dataframe from the previous step
# NOTE(review): the "bottom 3" voice ids are hard-coded - confirm they still
# match the latest data export before re-running.
voices_to_exclude = ['V88', 'V86', 'V81']
def filter_exclude_voices(df: pl.DataFrame, exclude_list: list[str]) -> pl.DataFrame:
    """Drop the ranking columns of the voices named in *exclude_list*.

    Columns matching ``<prefix>__Vxx`` are kept unless their ``Vxx`` id is in
    *exclude_list*; a ``_recordId`` identifier column is always retained when
    present. Non-voice columns (no ``__V`` marker) are dropped.
    """
    keep = ['_recordId'] if '_recordId' in df.columns else []
    for column in df.columns:
        if '__V' not in column:
            continue
        if column.split('__')[1] not in exclude_list:
            keep.append(column)
    return df.select(keep)
# Re-run both significance tests on the reduced male-voice candidate set.
df_male_top = filter_exclude_voices(df_male_voices, voices_to_exclude)
# 1. Male Top Candidates: Top Choice Preference
_pairwise_male_top_pref, _meta_male_top_pref = S.compute_ranking_significance(
    df_male_top,
    alpha=0.05,
    correction="none"
)
S.plot_significance_heatmap(
    _pairwise_male_top_pref,
    metadata=_meta_male_top_pref,
    title="Male Voices (Excl. Bottom 3): Top Choice Preference Significance"
)
# 2. Male Top Candidates: Total Mentions
_pairwise_male_top_vis, _meta_male_top_vis = S.compute_mentions_significance(
    df_male_top,
    alpha=0.05,
    correction="none"
)
S.plot_significance_heatmap(
    _pairwise_male_top_vis,
    metadata=_meta_male_top_vis,
    title="Male Voices (Excl. Bottom 3): Total Mentions Significance"
)
# %% [markdown]
"""
# Rank 1 Selection Significance (Voice Level)
Similar to the Total Mentions significance analysis above, but counting
only how many times each voice was ranked **1st** (out of all respondents).
This isolates first-choice preference rather than overall top-3 visibility.
"""
# %% Rank 1 Significance: All Voices
# Counts only first-choice selections (see markdown cell above).
_pairwise_df_rank1, _meta_rank1 = S.compute_rank1_significance(
    top3_voices,
    alpha=0.05,
    correction="none",
)
S.plot_significance_heatmap(
    _pairwise_df_rank1,
    metadata=_meta_rank1,
    title="Statistical Significance: Voice Rank 1 Selection"
)
# %% Rank 1 Significance: Male Voices Only
# Same test restricted to the male-voice columns selected earlier.
_pairwise_df_rank1_male, _meta_rank1_male = S.compute_rank1_significance(
    df_male_voices,
    alpha=0.05,
    correction="none",
)
S.plot_significance_heatmap(
    _pairwise_df_rank1_male,
    metadata=_meta_rank1_male,
    title="Male Voices Only: Rank 1 Selection Significance"
)
# %%

267
XX_straight_liners.py Normal file
View File

@@ -0,0 +1,267 @@
"""Extra analyses of the straight-liners"""
# %% Imports
import utils
import polars as pl
import argparse
import json
import re
from pathlib import Path
from validation import check_straight_liners
# %% Fixed Variables
# Qualtrics CSV export (labels, not numeric codes) and the QSF survey
# definition it was generated from.
RESULTS_FILE = 'data/exports/2-4-26/JPMC_Chase Brand Personality_Quant Round 1_February 4, 2026_Labels.csv'
QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'
# %% CLI argument parsing for batch automation
# When run as script: uv run XX_straight_liners.py --age '["18 to 21 years"]'
# Central filter configuration - add new filters here only
# Format: 'cli_arg_name': 'QualtricsSurvey.options_* attribute name'
# NOTE(review): identical to FILTER_CONFIG in XX_statistical_significance -
# consider moving it to a shared module.
FILTER_CONFIG = {
    'age': 'options_age',
    'gender': 'options_gender',
    'ethnicity': 'options_ethnicity',
    'income': 'options_income',
    'consumer': 'options_consumer',
    'business_owner': 'options_business_owner',
    'ai_user': 'options_ai_user',
    'investable_assets': 'options_investable_assets',
    'industry': 'options_industry',
}
def parse_cli_args():
    """Parse CLI filter arguments, or return an all-None namespace in Jupyter.

    One ``--<filter>`` argument is generated per entry in FILTER_CONFIG; each
    accepts a JSON-encoded list of option values (e.g. '["18 to 21 years"]')
    which is decoded to a Python list. None means "filter not applied".

    Returns:
        argparse.Namespace with one attribute per FILTER_CONFIG key plus
        ``filter_name`` and ``figures_dir``.
    """
    parser = argparse.ArgumentParser(description='Generate quant report with optional filters')
    # Dynamically add filter arguments from config
    for filter_name in FILTER_CONFIG:
        parser.add_argument(f'--{filter_name}', type=str, default=None, help=f'JSON list of {filter_name} values')
    parser.add_argument('--filter-name', type=str, default=None, help='Name for this filter combination (used for .txt description file)')
    parser.add_argument('--figures-dir', type=str, default=f'figures/straight-liner-analysis/{Path(RESULTS_FILE).parts[2]}', help='Override the default figures directory')
    # Only parse argv when running as a script (not in Jupyter/interactive).
    # Keep the try body minimal: get_ipython is only defined inside
    # IPython/Jupyter kernels and raises NameError elsewhere.
    try:
        get_ipython()  # noqa: F821 # type: ignore
    except NameError:
        args = parser.parse_args()
        # Decode JSON strings to lists; unset filters stay None
        for filter_name in FILTER_CONFIG:
            val = getattr(args, filter_name)
            setattr(args, filter_name, json.loads(val) if val else None)
        return args
    # Interactive session: no CLI args; all filters off. Pull figures_dir from
    # the parser instead of re-stating the literal, so the two defaults can
    # never drift apart.
    no_filters = {f: None for f in FILTER_CONFIG}
    return argparse.Namespace(**no_filters, filter_name=None, figures_dir=parser.get_default('figures_dir'))
cli_args = parse_cli_args()
# %%
S = utils.QualtricsSurvey(RESULTS_FILE, QSF_FILE, figures_dir=cli_args.figures_dir)
data_all = S.load_data()
# %% Build filtered dataset based on CLI args
# CLI args: None means "no filter applied" - filter_data() will skip None filters
# Build filter values dict dynamically from FILTER_CONFIG
_active_filters = {filter_name: getattr(cli_args, filter_name) for filter_name in FILTER_CONFIG}
_d = S.filter_data(data_all, **_active_filters)
# Write filter description file if filter-name is provided
# NOTE(review): this description/index section is duplicated in
# XX_statistical_significance - consider extracting a shared helper.
if cli_args.filter_name and S.fig_save_dir:
    # Get the filter slug (e.g., "All_Respondents", "Cons-Starter", etc.)
    _filter_slug = S._get_filter_slug()
    _filter_slug_dir = S.fig_save_dir / _filter_slug
    _filter_slug_dir.mkdir(parents=True, exist_ok=True)
    # Build filter description
    _filter_desc_lines = [
        f"Filter: {cli_args.filter_name}",
        "",
        "Applied Filters:",
    ]
    _short_desc_parts = []
    for filter_name, options_attr in FILTER_CONFIG.items():
        all_options = getattr(S, options_attr)
        values = _active_filters[filter_name]
        display_name = filter_name.replace('_', ' ').title()
        # None means no filter applied (same as "All");
        # selecting every option is also treated as "All".
        if values is not None and values != all_options:
            _short_desc_parts.append(f"{display_name}: {', '.join(values)}")
            _filter_desc_lines.append(f" {display_name}: {', '.join(values)}")
        else:
            _filter_desc_lines.append(f" {display_name}: All")
    # Write detailed description INSIDE the filter-slug directory
    # Sanitize filter name for filename usage (replace / and other chars)
    _safe_filter_name = re.sub(r'[^\w\s-]', '_', cli_args.filter_name)
    _filter_file = _filter_slug_dir / f"{_safe_filter_name}.txt"
    _filter_file.write_text('\n'.join(_filter_desc_lines))
    # Append to summary index file at figures/<export_date>/filter_index.txt
    _summary_file = S.fig_save_dir / "filter_index.txt"
    _short_desc = "; ".join(_short_desc_parts) if _short_desc_parts else "All Respondents"
    _summary_line = f"{_filter_slug} | {cli_args.filter_name} | {_short_desc}\n"
    # Append or create the summary file
    if _summary_file.exists():
        _existing = _summary_file.read_text()
        # Avoid duplicate entries for same slug
        if _filter_slug not in _existing:
            with _summary_file.open('a') as f:
                f.write(_summary_line)
    else:
        _header = "Filter Index\n" + "=" * 80 + "\n\n"
        _header += "Directory | Filter Name | Description\n"
        _header += "-" * 80 + "\n"
        _summary_file.write_text(_header + _summary_line)
# Save to logical variable name for further analysis
data = _d
# NOTE(review): result of collect() is discarded - presumably to materialise
# the lazy frame so filter errors surface here; confirm intent.
data.collect()
# %% Determine straight-liner repeat offenders
# Extract question groups with renamed columns that check_straight_liners expects.
# The raw `data` has QID-based column names; the getter methods rename them to
# patterns like SS_Green_Blue__V14__Choice_1, Voice_Scale_1_10__V48, etc.
ss_or, _ = S.get_ss_orange_red(data)
ss_gb, _ = S.get_ss_green_blue(data)
vs, _ = S.get_voice_scale_1_10(data)
# Combine all question groups into one wide LazyFrame (joined on _recordId)
# NOTE(review): default joins - respondents missing from any one group would
# be dropped from all; confirm every respondent appears in all three frames.
all_questions = ss_or.join(ss_gb, on='_recordId').join(vs, on='_recordId')
# Run straight-liner detection across all question groups
# max_score=5 catches all speaking-style straight-lining (1-5 scale)
# and voice-scale values ≤5 on the 1-10 scale
# Note: sl_threshold is NOT set on S here — this script analyses straight-liners,
# it doesn't filter them out of the dataset.
print("Running straight-liner detection across all question groups...")
sl_report, sl_df = check_straight_liners(all_questions, max_score=5)
# %% Quantify repeat offenders
# sl_df has one row per (Record ID, Question Group) that was straight-lined.
# Group by Record ID to count how many question groups each person SL'd.
if sl_df is not None and not sl_df.is_empty():
    total_respondents = data.select(pl.len()).collect().item()
    # Per-respondent count of straight-lined question groups
    respondent_sl_counts = (
        sl_df
        .group_by("Record ID")
        .agg(pl.len().alias("sl_count"))
        .sort("sl_count", descending=True)
    )
    max_sl = respondent_sl_counts["sl_count"].max()
    print(f"\nTotal respondents: {total_respondents}")
    print(f"Respondents who straight-lined at least 1 question group: "
          f"{respondent_sl_counts.height}")
    print(f"Maximum question groups straight-lined by one person: {max_sl}")
    print()
    # Build cumulative distribution: for each threshold N, count respondents
    # who straight-lined >= N question groups
    cumulative_rows = []
    for threshold in range(1, max_sl + 1):
        count = respondent_sl_counts.filter(
            pl.col("sl_count") >= threshold
        ).height
        pct = (count / total_respondents) * 100
        cumulative_rows.append({
            "threshold": threshold,
            "count": count,
            "pct": pct,
        })
        print(
            f"{threshold} question groups straight-lined: "
            f"{count} respondents ({pct:.1f}%)"
        )
    cumulative_df = pl.DataFrame(cumulative_rows)
    print(f"\n{cumulative_df}")
    # %% Save cumulative data to CSV
    _filter_slug = S._get_filter_slug()
    _csv_dir = Path(S.fig_save_dir) / _filter_slug
    _csv_dir.mkdir(parents=True, exist_ok=True)
    _csv_path = _csv_dir / "straight_liner_repeat_offenders.csv"
    cumulative_df.write_csv(_csv_path)
    print(f"Saved cumulative data to {_csv_path}")
    # %% Plot the cumulative distribution
    S.plot_straight_liner_repeat_offenders(
        cumulative_df,
        total_respondents=total_respondents,
    )
# %% Per-question straight-lining frequency
# Build human-readable question group names from the raw keys
def _humanise_question_group(key: str) -> str:
"""Convert internal question group key to a readable label.
Examples:
SS_Green_Blue__V14 → Green/Blue V14
SS_Orange_Red__V48 → Orange/Red V48
Voice_Scale_1_10 → Voice Scale (1-10)
"""
if key.startswith("SS_Green_Blue__"):
voice = key.split("__")[1]
return f"Green/Blue {voice}"
if key.startswith("SS_Orange_Red__"):
voice = key.split("__")[1]
return f"Orange/Red {voice}"
if key == "Voice_Scale_1_10":
return "Voice Scale (1-10)"
# Fallback: replace underscores
return key.replace("_", " ")
# Count UNIQUE respondents who straight-lined each question group,
# plus their share of the total sample.
per_question_counts = (
    sl_df
    .group_by("Question Group")
    .agg(pl.col("Record ID").n_unique().alias("count"))
    .sort("count", descending=True)
    .with_columns(
        (pl.col("count") / total_respondents * 100).alias("pct")
    )
)
# Add human-readable names
per_question_counts = per_question_counts.with_columns(
    pl.col("Question Group").map_elements(
        _humanise_question_group, return_dtype=pl.Utf8
    ).alias("question")
)
print("\n--- Per-Question Straight-Lining Frequency ---")
print(per_question_counts)
# Save per-question data to CSV
_csv_path_pq = _csv_dir / "straight_liner_per_question.csv"
per_question_counts.write_csv(_csv_path_pq)
print(f"Saved per-question data to {_csv_path_pq}")
# Plot
S.plot_straight_liner_per_question(
    per_question_counts,
    total_respondents=total_respondents,
)
# %% Show the top repeat offenders (respondents with most SL'd groups)
print("\n--- Top Repeat Offenders ---")
print(respondent_sl_counts.head(20))
else:
    print("No straight-liners detected in the dataset.")

File diff suppressed because one or more lines are too long

BIN
docs/README.pdf Normal file

Binary file not shown.

View File

@@ -0,0 +1,104 @@
# Appendix: Quantitative Analysis Plots - Folder Structure Manual
This folder contains all the quantitative analysis plots, sorted by the filters applied to the dataset. Each folder corresponds to a specific demographic cut.
## Folder Overview
* `All_Respondents/`: Analysis of the full dataset (no filters).
* `filter_index.txt`: A master list of every folder code and its corresponding demographic filter.
* **Filter Folders**: All other folders represent specific demographic cuts (e.g., `Age-18to21years`, `Gen-Woman`).
## How to Navigate
Each folder contains the same set of charts generated for that specific filter.
## Directory Reference Table
Below is the complete list of folder names. These names are encodings of the filters applied to the dataset, which we use to maintain consistency across our analysis.
| Directory Code | Filter Description |
| :--- | :--- |
| All_Respondents | All Respondents |
| Age-18to21years | Age: 18 to 21 years |
| Age-22to24years | Age: 22 to 24 years |
| Age-25to34years | Age: 25 to 34 years |
| Age-35to40years | Age: 35 to 40 years |
| Age-41to50years | Age: 41 to 50 years |
| Age-51to59years | Age: 51 to 59 years |
| Age-60to70years | Age: 60 to 70 years |
| Age-70yearsormore | Age: 70 years or more |
| Gen-Man | Gender: Man |
| Gen-Prefernottosay | Gender: Prefer not to say |
| Gen-Woman | Gender: Woman |
| Eth-6_grps_c64411 | Ethnicity: All options containing 'Alaska Native or Indigenous American' |
| Eth-6_grps_8f145b | Ethnicity: All options containing 'Asian or Asian American' |
| Eth-8_grps_71ac47 | Ethnicity: All options containing 'Black or African American' |
| Eth-7_grps_c5b3ce | Ethnicity: All options containing 'Hispanic or Latinx' |
| Eth-BlackorAfricanAmerican<br>MiddleEasternorNorthAfrican<br>WhiteorCaucasian+<br>MiddleEasternorNorthAfrican | Ethnicity: Middle Eastern or North African |
| Eth-AsianorAsianAmericanBlackorAfricanAmerican<br>NativeHawaiianorOtherPacificIslander+<br>NativeHawaiianorOtherPacificIslander | Ethnicity: Native Hawaiian or Other Pacific Islander |
| Eth-10_grps_cef760 | Ethnicity: All options containing 'White or Caucasian' |
| Inc-100000to149999 | Income: $100,000 to $149,999 |
| Inc-150000to199999 | Income: $150,000 to $199,999 |
| Inc-200000ormore | Income: $200,000 or more |
| Inc-25000to34999 | Income: $25,000 to $34,999 |
| Inc-35000to54999 | Income: $35,000 to $54,999 |
| Inc-55000to79999 | Income: $55,000 to $79,999 |
| Inc-80000to99999 | Income: $80,000 to $99,999 |
| Inc-Lessthan25000 | Income: Less than $25,000 |
| Cons-Lower_Mass_A+Lower_Mass_B | Consumer: Lower_Mass_A, Lower_Mass_B |
| Cons-MassAffluent_A+MassAffluent_B | Consumer: MassAffluent_A, MassAffluent_B |
| Cons-Mass_A+Mass_B | Consumer: Mass_A, Mass_B |
| Cons-Mix_of_Affluent_Wealth__<br>High_Net_Woth_A+<br>Mix_of_Affluent_Wealth__<br>High_Net_Woth_B | Consumer: Mix_of_Affluent_Wealth_&_High_Net_Woth_A, Mix_of_Affluent_Wealth_&_High_Net_Woth_B |
| Cons-Early_Professional | Consumer: Early_Professional |
| Cons-Lower_Mass_B | Consumer: Lower_Mass_B |
| Cons-MassAffluent_B | Consumer: MassAffluent_B |
| Cons-Mass_B | Consumer: Mass_B |
| Cons-Mix_of_Affluent_Wealth__<br>High_Net_Woth_B | Consumer: Mix_of_Affluent_Wealth_&_High_Net_Woth_B |
| Cons-Starter | Consumer: Starter |
| BizOwn-No | Business Owner: No |
| BizOwn-Yes | Business Owner: Yes |
| AI-Daily | Ai User: Daily |
| AI-Lessthanonceamonth | Ai User: Less than once a month |
| AI-Morethanoncedaily | Ai User: More than once daily |
| AI-Multipletimesperweek | Ai User: Multiple times per week |
| AI-Onceamonth | Ai User: Once a month |
| AI-Onceaweek | Ai User: Once a week |
| AI-RarelyNever | Ai User: Rarely/Never |
| AI-Daily+<br>Morethanoncedaily+<br>Multipletimesperweek | Ai User: Daily, More than once daily, Multiple times per week |
| AI-4_grps_d4f57a | Ai User: Once a week, Once a month, Less than once a month, Rarely/Never |
| InvAsts-0to24999 | Investable Assets: $0 to $24,999 |
| InvAsts-150000to249999 | Investable Assets: $150,000 to $249,999 |
| InvAsts-1Mto4.9M | Investable Assets: $1M to $4.9M |
| InvAsts-25000to49999 | Investable Assets: $25,000 to $49,999 |
| InvAsts-250000to499999 | Investable Assets: $250,000 to $499,999 |
| InvAsts-50000to149999 | Investable Assets: $50,000 to $149,999 |
| InvAsts-500000to999999 | Investable Assets: $500,000 to $999,999 |
| InvAsts-5Mormore | Investable Assets: $5M or more |
| InvAsts-Prefernottoanswer | Investable Assets: Prefer not to answer |
| Ind-Agricultureforestryfishingorhunting | Industry: Agriculture, forestry, fishing, or hunting |
| Ind-Artsentertainmentorrecreation | Industry: Arts, entertainment, or recreation |
| Ind-Broadcasting | Industry: Broadcasting |
| Ind-Construction | Industry: Construction |
| Ind-EducationCollegeuniversityoradult | Industry: Education College, university, or adult |
| Ind-EducationOther | Industry: Education Other |
| Ind-EducationPrimarysecondaryK-12 | Industry: Education Primary/secondary (K-12) |
| Ind-Governmentandpublicadministration | Industry: Government and public administration |
| Ind-Hotelandfoodservices | Industry: Hotel and food services |
| Ind-InformationOther | Industry: Information Other |
| Ind-InformationServicesanddata | Industry: Information Services and data |
| Ind-Legalservices | Industry: Legal services |
| Ind-ManufacturingComputerandelectronics | Industry: Manufacturing Computer and electronics |
| Ind-ManufacturingOther | Industry: Manufacturing Other |
| Ind-Notemployed | Industry: Not employed |
| Ind-Otherindustrypleasespecify | Industry: Other industry (please specify) |
| Ind-Processing | Industry: Processing |
| Ind-Publishing | Industry: Publishing |
| Ind-Realestaterentalorleasing | Industry: Real estate, rental, or leasing |
| Ind-Retired | Industry: Retired |
| Ind-Scientificortechnicalservices | Industry: Scientific or technical services |
| Ind-Software | Industry: Software |
| Ind-Telecommunications | Industry: Telecommunications |
| Ind-Transportationandwarehousing | Industry: Transportation and warehousing |
| Ind-Utilities | Industry: Utilities |
| Ind-Wholesale | Industry: Wholesale |

1336
plots.py

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,3 @@
- V46 not in scale 1-10. Qualtrics
- Straightliners
- V45 goed in qual maar slecht in quant

View File

@@ -12,6 +12,8 @@ Runs 03_quant_report.script.py for each single-filter combination:
Usage:
uv run python run_filter_combinations.py
uv run python run_filter_combinations.py --dry-run # Preview combinations without running
uv run python run_filter_combinations.py --category age # Only run age combinations
uv run python run_filter_combinations.py --category consumer # Only run consumer segment combinations
"""
import subprocess
@@ -31,123 +33,171 @@ QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_P
REPORT_SCRIPT = Path(__file__).parent / '03_quant_report.script.py'
def get_filter_combinations(survey: QualtricsSurvey) -> list[dict]:
def get_filter_combinations(survey: QualtricsSurvey, category: str = None) -> list[dict]:
"""
Generate all single-filter combinations.
Each combination isolates ONE filter value while keeping all others at "all selected".
Returns list of dicts with filter kwargs for each run.
Args:
survey: QualtricsSurvey instance with loaded data
category: Optional filter category to limit combinations to.
Valid values: 'all', 'age', 'gender', 'ethnicity', 'income', 'consumer',
'business_owner', 'ai_user', 'investable_assets', 'industry'
If None or 'all', generates all combinations.
Returns:
List of dicts with filter kwargs for each run.
"""
combinations = []
# Add "All Respondents" run (no filters = all options selected)
combinations.append({
'name': 'All_Respondents',
'filters': {} # Empty = use defaults (all selected)
})
if not category or category in ['all_filters', 'all']:
combinations.append({
'name': 'All_Respondents',
'filters': {} # Empty = use defaults (all selected)
})
# Age groups - one at a time
for age in survey.options_age:
combinations.append({
'name': f'Age-{age}',
'filters': {'age': [age]}
})
# Gender - one at a time
for gender in survey.options_gender:
combinations.append({
'name': f'Gender-{gender}',
'filters': {'gender': [gender]}
})
# Ethnicity - grouped by individual values
# Ethnicity options are comma-separated (e.g., "White or Caucasian, Hispanic or Latino")
# Create filters that include ALL options containing each individual ethnicity value
ethnicity_values = set()
for ethnicity_option in survey.options_ethnicity:
# Split by comma and strip whitespace
values = [v.strip() for v in ethnicity_option.split(',')]
ethnicity_values.update(values)
for ethnicity_value in sorted(ethnicity_values):
# Find all options that contain this value
matching_options = [
opt for opt in survey.options_ethnicity
if ethnicity_value in [v.strip() for v in opt.split(',')]
]
combinations.append({
'name': f'Ethnicity-{ethnicity_value}',
'filters': {'ethnicity': matching_options}
})
# Income - one at a time
for income in survey.options_income:
combinations.append({
'name': f'Income-{income}',
'filters': {'income': [income]}
})
# Consumer segments - combine _A and _B options, and also include standalone
# Group options by base name (removing _A/_B suffix)
consumer_groups = {}
for consumer in survey.options_consumer:
# Check if ends with _A or _B
if consumer.endswith('_A') or consumer.endswith('_B'):
base_name = consumer[:-2] # Remove last 2 chars (_A or _B)
if base_name not in consumer_groups:
consumer_groups[base_name] = []
consumer_groups[base_name].append(consumer)
else:
# Not an _A/_B option, keep as-is
consumer_groups[consumer] = [consumer]
# Add combined _A+_B options
for base_name, options in consumer_groups.items():
if len(options) > 1: # Only combine if there are multiple (_A and _B)
if not category or category in ['all_filters', 'age']:
for age in survey.options_age:
combinations.append({
'name': f'Consumer-{base_name}',
'filters': {'consumer': options}
'name': f'Age-{age}',
'filters': {'age': [age]}
})
# Add standalone options (including individual _A and _B)
for consumer in survey.options_consumer:
combinations.append({
'name': f'Consumer-{consumer}',
'filters': {'consumer': [consumer]}
})
# Gender - one at a time
if not category or category in ['all_filters', 'gender']:
for gender in survey.options_gender:
combinations.append({
'name': f'Gender-{gender}',
'filters': {'gender': [gender]}
})
# Ethnicity - grouped by individual values
if not category or category in ['all_filters', 'ethnicity']:
# Ethnicity options are comma-separated (e.g., "White or Caucasian, Hispanic or Latino")
# Create filters that include ALL options containing each individual ethnicity value
ethnicity_values = set()
for ethnicity_option in survey.options_ethnicity:
# Split by comma and strip whitespace
values = [v.strip() for v in ethnicity_option.split(',')]
ethnicity_values.update(values)
for ethnicity_value in sorted(ethnicity_values):
# Find all options that contain this value
matching_options = [
opt for opt in survey.options_ethnicity
if ethnicity_value in [v.strip() for v in opt.split(',')]
]
combinations.append({
'name': f'Ethnicity-{ethnicity_value}',
'filters': {'ethnicity': matching_options}
})
# Income - one at a time
if not category or category in ['all_filters', 'income']:
for income in survey.options_income:
combinations.append({
'name': f'Income-{income}',
'filters': {'income': [income]}
})
# Consumer segments - combine _A and _B options, and also include standalone
if not category or category in ['all_filters', 'consumer']:
# Group options by base name (removing _A/_B suffix)
consumer_groups = {}
for consumer in survey.options_consumer:
# Check if ends with _A or _B
if consumer.endswith('_A') or consumer.endswith('_B'):
base_name = consumer[:-2] # Remove last 2 chars (_A or _B)
if base_name not in consumer_groups:
consumer_groups[base_name] = []
consumer_groups[base_name].append(consumer)
else:
# Not an _A/_B option, keep as-is
consumer_groups[consumer] = [consumer]
# Add combined _A+_B options
for base_name, options in consumer_groups.items():
if len(options) > 1: # Only combine if there are multiple (_A and _B)
combinations.append({
'name': f'Consumer-{base_name}',
'filters': {'consumer': options}
})
# Add standalone options (including individual _A and _B)
for consumer in survey.options_consumer:
combinations.append({
'name': f'Consumer-{consumer}',
'filters': {'consumer': [consumer]}
})
# Business Owner - one at a time
for business_owner in survey.options_business_owner:
combinations.append({
'name': f'BusinessOwner-{business_owner}',
'filters': {'business_owner': [business_owner]}
})
if not category or category in ['all_filters', 'business_owner']:
for business_owner in survey.options_business_owner:
combinations.append({
'name': f'BusinessOwner-{business_owner}',
'filters': {'business_owner': [business_owner]}
})
# AI User - one at a time
for ai_user in survey.options_ai_user:
if not category or category in ['all_filters', 'ai_user']:
for ai_user in survey.options_ai_user:
combinations.append({
'name': f'AIUser-{ai_user}',
'filters': {'ai_user': [ai_user]}
})
# AI user daily, more than once daily, en multiple times a week = frequent
combinations.append({
'name': f'AIUser-{ai_user}',
'filters': {'ai_user': [ai_user]}
'name': 'AIUser-Frequent',
'filters': {'ai_user': [
'Daily', 'More than once daily', 'Multiple times per week'
]}
})
combinations.append({
'name': 'AIUser-RarelyNever',
'filters': {'ai_user': [
'Once a month', 'Less than once a month', 'Once a week', 'Rarely/Never'
]}
})
# Investable Assets - one at a time
for investable_assets in survey.options_investable_assets:
combinations.append({
'name': f'Assets-{investable_assets}',
'filters': {'investable_assets': [investable_assets]}
})
if not category or category in ['all_filters', 'investable_assets']:
for investable_assets in survey.options_investable_assets:
combinations.append({
'name': f'Assets-{investable_assets}',
'filters': {'investable_assets': [investable_assets]}
})
# Industry - one at a time
for industry in survey.options_industry:
if not category or category in ['all_filters', 'industry']:
for industry in survey.options_industry:
combinations.append({
'name': f'Industry-{industry}',
'filters': {'industry': [industry]}
})
# Voice ranking completeness filter
# These use a special flag rather than demographic filters, so we store
# the mode in a dedicated key that run_report passes as --voice-ranking-filter.
if not category or category in ['all_filters', 'voice_ranking']:
combinations.append({
'name': f'Industry-{industry}',
'filters': {'industry': [industry]}
'name': 'VoiceRanking-OnlyMissing',
'filters': {},
'voice_ranking_filter': 'only-missing',
})
combinations.append({
'name': 'VoiceRanking-ExcludeMissing',
'filters': {},
'voice_ranking_filter': 'exclude-missing',
})
return combinations
def run_report(filters: dict, name: str = None, dry_run: bool = False) -> bool:
def run_report(filters: dict, name: str = None, dry_run: bool = False, sl_threshold: int = None, voice_ranking_filter: str = None) -> bool:
"""
Run the report script with given filters.
@@ -155,6 +205,10 @@ def run_report(filters: dict, name: str = None, dry_run: bool = False) -> bool:
filters: Dict of filter_name -> list of values
name: Name for this filter combination (used for .txt description file)
dry_run: If True, just print command without running
sl_threshold: If set, exclude respondents with >= N straight-lined question groups
voice_ranking_filter: If set, filter by voice ranking completeness.
'only-missing' keeps only respondents missing QID98 data,
'exclude-missing' removes them.
Returns:
True if successful, False otherwise
@@ -165,6 +219,14 @@ def run_report(filters: dict, name: str = None, dry_run: bool = False) -> bool:
if name:
cmd.extend(['--filter-name', name])
# Pass straight-liner threshold if specified
if sl_threshold is not None:
cmd.extend(['--sl-threshold', str(sl_threshold)])
# Pass voice ranking filter if specified
if voice_ranking_filter is not None:
cmd.extend(['--voice-ranking-filter', voice_ranking_filter])
for filter_name, values in filters.items():
if values:
cmd.extend([f'--{filter_name}', json.dumps(values)])
@@ -193,6 +255,13 @@ def main():
import argparse
parser = argparse.ArgumentParser(description='Run quant report for all filter combinations')
parser.add_argument('--dry-run', action='store_true', help='Preview combinations without running')
parser.add_argument(
'--category',
choices=['all_filters', 'all', 'age', 'gender', 'ethnicity', 'income', 'consumer', 'business_owner', 'ai_user', 'investable_assets', 'industry', 'voice_ranking'],
default='all_filters',
help='Filter category to run combinations for (default: all_filters)'
)
parser.add_argument('--sl-threshold', type=int, default=None, help='Exclude respondents who straight-lined >= N question groups (passed to report script)')
args = parser.parse_args()
# Load survey to get available filter options
@@ -200,15 +269,19 @@ def main():
survey = QualtricsSurvey(RESULTS_FILE, QSF_FILE)
survey.load_data() # Populates options_* attributes
# Generate all combinations
combinations = get_filter_combinations(survey)
print(f"Generated {len(combinations)} filter combinations")
# Generate combinations for specified category
combinations = get_filter_combinations(survey, category=args.category)
category_desc = f" for category '{args.category}'" if args.category != 'all' else ''
print(f"Generated {len(combinations)} filter combinations{category_desc}")
if args.sl_threshold is not None:
print(f"Straight-liner threshold: excluding respondents with ≥{args.sl_threshold} straight-lined question groups")
if args.dry_run:
print("\nDRY RUN - Commands that would be executed:")
for combo in combinations:
print(f"\n{combo['name']}:")
run_report(combo['filters'], name=combo['name'], dry_run=True)
run_report(combo['filters'], name=combo['name'], dry_run=True, sl_threshold=args.sl_threshold, voice_ranking_filter=combo.get('voice_ranking_filter'))
return
# Run each combination with progress bar
@@ -217,7 +290,7 @@ def main():
for combo in tqdm(combinations, desc="Running reports", unit="filter"):
tqdm.write(f"Running: {combo['name']}")
if run_report(combo['filters'], name=combo['name']):
if run_report(combo['filters'], name=combo['name'], sl_threshold=args.sl_threshold, voice_ranking_filter=combo.get('voice_ranking_filter')):
successful += 1
else:
failed.append(combo['name'])

File diff suppressed because one or more lines are too long

View File

@@ -77,6 +77,13 @@ class ColorPalette:
GENDER_MALE_NEUTRAL = "#B8C9D9" # Grey-Blue
GENDER_FEMALE_NEUTRAL = "#D9B8C9" # Grey-Pink
# Gender colors for correlation plots (green/red indicate +/- correlation)
# Male = darker shade, Female = lighter shade
CORR_MALE_POSITIVE = "#1B5E20" # Dark Green
CORR_FEMALE_POSITIVE = "#81C784" # Light Green
CORR_MALE_NEGATIVE = "#B71C1C" # Dark Red
CORR_FEMALE_NEGATIVE = "#E57373" # Light Red
# Speaking Style Colors (named after the style quadrant colors)
STYLE_GREEN = "#2E7D32" # Forest Green
STYLE_BLUE = "#1565C0" # Strong Blue

526
utils.py
View File

@@ -413,10 +413,30 @@ def _iter_picture_shapes(shapes):
yield shape
def _set_shape_alt_text(shape, alt_text: str):
"""
Set alt text (descr attribute) for a PowerPoint shape.
"""
nvPr = None
# Check for common property names used by python-pptx elements
for attr in ['nvPicPr', 'nvSpPr', 'nvGrpSpPr', 'nvGraphicFramePr', 'nvCxnSpPr']:
if hasattr(shape._element, attr):
nvPr = getattr(shape._element, attr)
break
if nvPr and hasattr(nvPr, 'cNvPr'):
nvPr.cNvPr.set("descr", alt_text)
def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str, Path], output_path: Union[str, Path] = None, use_perceptual_hash: bool = True):
"""
Updates the alt text of images in a PowerPoint presentation by matching
their content with images in a source directory.
Updates the alt text of images in a PowerPoint presentation.
1. First pass: Validates existing alt-text format (<filter>/<filename>).
- Fixes full paths by keeping only the last two parts.
- Clears invalid alt-text.
2. Second pass: If images are missing alt-text, matches them against source directory
using perceptual hash or SHA1.
Args:
ppt_path (str/Path): Path to the PowerPoint file.
@@ -430,10 +450,7 @@ def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str,
if output_path is None:
output_path = ppt_path
# 1. Build lookup map of {hash: file_path} from the source directory
image_hash_map = _build_image_hash_map(image_source_dir, use_perceptual_hash=use_perceptual_hash)
# 2. Open Presentation
# Open Presentation
try:
prs = Presentation(ppt_path)
except Exception as e:
@@ -441,110 +458,132 @@ def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str,
return
updates_count = 0
unmatched_images = [] # Collect unmatched images to report at the end
images_needing_match = []
slides = list(prs.slides)
total_slides = len(slides)
print(f"Processing {total_slides} slides...")
print(f"Scanning {total_slides} slides for existing alt-text...")
# Pass 1: Scan and clean existing alt-text
for i, slide in enumerate(slides):
# Use recursive iterator to find all pictures including those in groups/placeholders
picture_shapes = list(_iter_picture_shapes(slide.shapes))
for shape in picture_shapes:
try:
# Get image hash based on selected method
if use_perceptual_hash:
# Use perceptual hash of the image blob for visual content matching
current_hash = _calculate_perceptual_hash(shape.image.blob)
else:
# Use SHA1 hash from python-pptx (exact byte match)
current_hash = shape.image.sha1
alt_text = _get_shape_alt_text(shape)
has_valid_alt = False
if alt_text:
# Handle potential path separators and whitespace
clean_alt = alt_text.strip().replace('\\', '/')
parts = clean_alt.split('/')
if current_hash in image_hash_map:
original_path = image_hash_map[current_hash]
# Check if it looks like a path/file reference (at least 2 parts like dir/file)
if len(parts) >= 2:
# Enforce format: keep last 2 parts (e.g. filter/image.png)
new_alt = '/'.join(parts[-2:])
# Generate Alt Text
try:
# Prepare path for generator.
# Try to relativize to CWD if capable
pass_path = original_path
try:
pass_path = original_path.relative_to(Path.cwd())
except ValueError:
pass
if new_alt != alt_text:
print(f"Slide {i+1}: Fixing alt-text format: '{alt_text}' -> '{new_alt}'")
_set_shape_alt_text(shape, new_alt)
updates_count += 1
new_alt_text = image_alt_text_generator(pass_path)
# Check existing alt text to avoid redundant updates/log them
# Accessing alt text via cNvPr
# Note: Different shape types might store non-visual props differently
# Picture: nvPicPr.cNvPr
# GraphicFrame: nvGraphicFramePr.cNvPr
# Group: nvGrpSpPr.cNvPr
# Shape/Placeholder: nvSpPr.cNvPr
nvPr = None
for attr in ['nvPicPr', 'nvSpPr', 'nvGrpSpPr', 'nvGraphicFramePr', 'nvCxnSpPr']:
if hasattr(shape._element, attr):
nvPr = getattr(shape._element, attr)
break
if nvPr and hasattr(nvPr, 'cNvPr'):
cNvPr = nvPr.cNvPr
existing_alt_text = cNvPr.get("descr", "")
if existing_alt_text != new_alt_text:
print(f"Slide {i+1}: Updating alt text for image matches '{pass_path}'")
print(f" Old: '{existing_alt_text}' -> New: '{new_alt_text}'")
cNvPr.set("descr", new_alt_text)
updates_count += 1
else:
print(f"Could not find cNvPr for shape on slide {i+1}")
except AssertionError as e:
print(f"Skipping match for {original_path} due to generator error: {e}")
except Exception as e:
print(f"Error updating alt text for {original_path}: {e}")
has_valid_alt = True
else:
# Check if image already has alt text set - if so, skip reporting as unmatched
existing_alt = _get_shape_alt_text(shape)
if existing_alt:
# Image already has alt text, no need to report as unmatched
continue
shape_id = getattr(shape, 'shape_id', getattr(shape, 'id', 'Unknown ID'))
shape_name = shape.name if shape.name else f"Unnamed Shape (ID: {shape_id})"
hash_type = "pHash" if use_perceptual_hash else "SHA1"
unmatched_images.append({
'slide': i+1,
'shape_name': shape_name,
'hash_type': hash_type,
'hash': current_hash
})
except AttributeError:
continue
except Exception as e:
print(f"Error processing shape on slide {i+1}: {e}")
# User requested deleting other cases that do not meet format
# If it's single word or doesn't look like our path format
pass # logic below handles this
if not has_valid_alt:
if alt_text:
print(f"Slide {i+1}: Invalid/Legacy alt-text '{alt_text}'. Clearing for re-matching.")
_set_shape_alt_text(shape, "")
updates_count += 1
# Queue for hash matching
shape_id = getattr(shape, 'shape_id', getattr(shape, 'id', 'Unknown ID'))
shape_name = shape.name if shape.name else f"Unnamed Shape (ID: {shape_id})"
images_needing_match.append({
'slide_idx': i, # 0-based
'slide_num': i+1,
'shape': shape,
'shape_name': shape_name
})
# Print summary
if not images_needing_match:
print("\nAll images have valid alt-text format. No hash matching needed.")
if updates_count > 0:
prs.save(output_path)
print(f"✓ Saved updated presentation to {output_path} with {updates_count} updates.")
else:
print("Presentation is up to date.")
return
# Pass 2: Hash Matching
print(f"\n{len(images_needing_match)} images missing proper alt-text. Proceeding with hash matching...")
# Build lookup map of {hash: file_path} only if needed
image_hash_map = _build_image_hash_map(image_source_dir, use_perceptual_hash=use_perceptual_hash)
unmatched_images = []
for item in images_needing_match:
shape = item['shape']
slide_num = item['slide_num']
try:
# Get image hash
if use_perceptual_hash:
current_hash = _calculate_perceptual_hash(shape.image.blob)
else:
current_hash = shape.image.sha1
if current_hash in image_hash_map:
original_path = image_hash_map[current_hash]
# Generate Alt Text
try:
# Try to relativize to CWD if capable
pass_path = original_path
try:
pass_path = original_path.relative_to(Path.cwd())
except ValueError:
pass
new_alt_text = image_alt_text_generator(pass_path)
print(f"Slide {slide_num}: Match found! Assigning alt-text '{new_alt_text}'")
_set_shape_alt_text(shape, new_alt_text)
updates_count += 1
except Exception as e:
print(f"Error generating alt text for {original_path}: {e}")
else:
hash_type = "pHash" if use_perceptual_hash else "SHA1"
unmatched_images.append({
'slide': slide_num,
'shape_name': item['shape_name'],
'hash_type': hash_type,
'hash': current_hash
})
except Exception as e:
print(f"Error processing shape on slide {slide_num}: {e}")
# Save and Print Summary
print("\n" + "="*80)
if updates_count > 0:
prs.save(output_path)
print(f"✓ Saved updated presentation to {output_path} with {updates_count} updates.")
else:
print("No images matched or required updates.")
print("No matches found for missing images.")
# List unmatched images at the end
if unmatched_images:
print(f"\n{len(unmatched_images)} image(s) not found in source directory:")
print(f"\n{len(unmatched_images)} image(s) could not be matched:")
for img in unmatched_images:
print(f" • Slide {img['slide']}: '{img['shape_name']}' ({img['hash_type']}: {img['hash']})")
else:
print("\n✓ All images matched successfully!")
print("\n✓ All images processed successfully!")
print("="*80)
@@ -723,7 +762,7 @@ def normalize_global_values(df: pl.DataFrame, target_cols: list[str]) -> pl.Data
class QualtricsSurvey(QualtricsPlotsMixin):
"""Class to handle Qualtrics survey data."""
def __init__(self, data_path: Union[str, Path], qsf_path: Union[str, Path]):
def __init__(self, data_path: Union[str, Path], qsf_path: Union[str, Path], figures_dir: Union[str, Path] = None):
if isinstance(data_path, str):
data_path = Path(data_path)
@@ -735,8 +774,12 @@ class QualtricsSurvey(QualtricsPlotsMixin):
self.qid_descr_map = self._extract_qid_descr_map()
self.qsf:dict = self._load_qsf()
# get export directory name for saving figures ie if data_path='data/exports/OneDrive_2026-01-21/...' should be 'figures/OneDrive_2026-01-21'
self.fig_save_dir = Path('figures') / self.data_filepath.parts[2]
if figures_dir:
self.fig_save_dir = Path(figures_dir)
else:
# get export directory name for saving figures ie if data_path='data/exports/OneDrive_2026-01-21/...' should be 'figures/OneDrive_2026-01-21'
self.fig_save_dir = Path('figures') / self.data_filepath.parts[2]
if not self.fig_save_dir.exists():
self.fig_save_dir.mkdir(parents=True, exist_ok=True)
@@ -879,40 +922,42 @@ class QualtricsSurvey(QualtricsPlotsMixin):
"""
# Apply filters - skip if empty list (columns with all NULLs produce empty options)
# OR if all options are selected (to avoid dropping NULLs)
self.filter_age = age
if age is not None and len(age) > 0:
if age is not None and len(age) > 0 and set(age) != set(self.options_age):
q = q.filter(pl.col('QID1').is_in(age))
self.filter_gender = gender
if gender is not None and len(gender) > 0:
if gender is not None and len(gender) > 0 and set(gender) != set(self.options_gender):
q = q.filter(pl.col('QID2').is_in(gender))
self.filter_consumer = consumer
if consumer is not None and len(consumer) > 0:
if consumer is not None and len(consumer) > 0 and set(consumer) != set(self.options_consumer):
q = q.filter(pl.col('Consumer').is_in(consumer))
self.filter_ethnicity = ethnicity
if ethnicity is not None and len(ethnicity) > 0:
if ethnicity is not None and len(ethnicity) > 0 and set(ethnicity) != set(self.options_ethnicity):
q = q.filter(pl.col('QID3').is_in(ethnicity))
self.filter_income = income
if income is not None and len(income) > 0:
if income is not None and len(income) > 0 and set(income) != set(self.options_income):
q = q.filter(pl.col('QID15').is_in(income))
self.filter_business_owner = business_owner
if business_owner is not None and len(business_owner) > 0:
if business_owner is not None and len(business_owner) > 0 and set(business_owner) != set(self.options_business_owner):
q = q.filter(pl.col('QID4').is_in(business_owner))
self.filter_ai_user = ai_user
if ai_user is not None and len(ai_user) > 0:
if ai_user is not None and len(ai_user) > 0 and set(ai_user) != set(self.options_ai_user):
q = q.filter(pl.col('QID22').is_in(ai_user))
self.filter_investable_assets = investable_assets
if investable_assets is not None and len(investable_assets) > 0:
if investable_assets is not None and len(investable_assets) > 0 and set(investable_assets) != set(self.options_investable_assets):
q = q.filter(pl.col('QID16').is_in(investable_assets))
self.filter_industry = industry
if industry is not None and len(industry) > 0:
if industry is not None and len(industry) > 0 and set(industry) != set(self.options_industry):
q = q.filter(pl.col('QID17').is_in(industry))
self.data_filtered = q
@@ -1070,6 +1115,60 @@ class QualtricsSurvey(QualtricsPlotsMixin):
return self._get_subset(q, list(QIDs_map.keys()), rename_cols=False).rename(QIDs_map), None
def get_top_3_voices_missing_ranking(
    self, q: pl.LazyFrame
) -> pl.DataFrame:
    """Find respondents who completed the top-3 voice selection (QID36)
    but never answered the explicit ranking question (QID98).

    Such respondents have selection-order data in ``QID36_G0_*_RANK`` yet
    every ``QID98_*`` ranking column is null, so ``get_top_3_voices()``
    yields all-null rows for them and rank-based plots (e.g.
    ``plot_most_ranked_1``) undercount.

    Parameters:
        q: The (optionally filtered) LazyFrame from ``load_data()``.

    Returns:
        A collected ``pl.DataFrame`` with columns:

        - ``_recordId``: the respondent identifier
        - ``3_Ranked``: comma-separated text of the 3 voices they selected
    """
    # QID98-based ranking data: one column per voice, values are ranks.
    ranked, _ = self.get_top_3_voices(q)
    ranked_df = ranked.collect()
    voice_cols = [name for name in ranked_df.columns if name != '_recordId']

    # A respondent is "missing" when every ranking column is null.
    every_col_null = pl.lit(True)
    for name in voice_cols:
        every_col_null = every_col_null & pl.col(name).is_null()

    missing = ranked_df.filter(every_col_null).select('_recordId')
    if missing.height == 0:
        # Empty result with a stable schema so callers can rely on columns.
        return pl.DataFrame(schema={
            '_recordId': pl.Utf8,
            '3_Ranked': pl.Utf8,
        })

    # Attach the selected-voices text from the 18 -> 8 -> 3 funnel question.
    funnel, _ = self.get_18_8_3(q)
    selections = funnel.collect().select(['_recordId', '3_Ranked'])
    return missing.join(selections, on='_recordId', how='left')
def get_ss_orange_red(self, q: pl.LazyFrame) -> Union[pl.LazyFrame, dict]:
"""Extract columns containing the SS Orange/Red ratings for the Chase virtual assistant.
@@ -1543,6 +1642,235 @@ class QualtricsSurvey(QualtricsPlotsMixin):
return results_df, metadata
def compute_mentions_significance(
    self,
    data: pl.LazyFrame | pl.DataFrame,
    alpha: float = 0.05,
    correction: str = "bonferroni",
) -> tuple[pl.DataFrame, dict]:
    """Compute statistical significance for Total Mentions (Rank 1+2+3).

    Tests whether the proportion of respondents who included a voice in
    their Top 3 is significantly different between voices, using a
    two-proportion z-test for every voice pair.

    NOTE(review): the z-test treats the two proportions as independent
    samples, but both come from the same respondents (paired data) — a
    McNemar-style test would be stricter. Kept as-is for consistency with
    existing reports; confirm before changing.

    Args:
        data: Ranking data (rows=respondents, cols=voices, values=rank).
        alpha: Significance level.
        correction: Multiple comparison correction method. One of
            'bonferroni', 'holm', or 'none'.

    Returns:
        tuple: (pairwise_df, metadata)

    Raises:
        ValueError: If ``correction`` is unknown, there are fewer than 2
            ranking columns, or the data has no respondents.
    """
    from scipy import stats as scipy_stats
    import numpy as np

    # Fail fast on an unknown correction: previously it silently left the
    # adjusted p-values as NaN, marking every pair non-significant.
    if correction not in ("bonferroni", "holm", "none"):
        raise ValueError(f"Unknown correction method: {correction!r}")

    df = data.collect() if isinstance(data, pl.LazyFrame) else data

    ranking_cols = [c for c in df.columns if c != '_recordId']
    if len(ranking_cols) < 2:
        raise ValueError("Need at least 2 ranking columns")

    total_respondents = df.height
    if total_respondents == 0:
        # Avoid an opaque ZeroDivisionError below.
        raise ValueError("No respondents in data; cannot compute proportions")

    # Count mentions (any rank) for each voice.
    mentions_data = {}
    for col in ranking_cols:
        label = self._clean_voice_label(col)
        mentions_data[label] = df.filter(pl.col(col).is_not_null()).height

    labels = sorted(mentions_data.keys())
    results = []
    n_comparisons = len(labels) * (len(labels) - 1) // 2

    for i, label1 in enumerate(labels):
        for label2 in labels[i+1:]:
            count1 = mentions_data[label1]
            count2 = mentions_data[label2]
            pct1 = count1 / total_respondents
            pct2 = count2 / total_respondents

            # Z-test for two proportions (same denominator for both).
            n1 = total_respondents
            n2 = total_respondents
            p_pooled = (count1 + count2) / (n1 + n2)
            se = np.sqrt(p_pooled * (1 - p_pooled) * (1/n1 + 1/n2))
            if se > 0:
                z_stat = (pct1 - pct2) / se
                p_value = 2 * (1 - scipy_stats.norm.cdf(abs(z_stat)))
            else:
                # Both proportions are 0 or both are 1: no evidence of a
                # difference.
                p_value = 1.0

            results.append({
                'group1': label1,
                'group2': label2,
                'p_value': float(p_value),
                # Column names reuse the rank1_* scheme for compatibility
                # with the shared heatmap plotting code.
                'rank1_count1': count1,
                'rank1_count2': count2,
                'rank1_pct1': round(pct1 * 100, 1),
                'rank1_pct2': round(pct2 * 100, 1),
                'total1': n1,
                'total2': n2,
                'effect_size': pct1 - pct2,  # Difference in proportions
            })

    results_df = pl.DataFrame(results)
    p_values = results_df['p_value'].to_numpy()

    if correction == "bonferroni":
        p_adjusted = np.minimum(p_values * n_comparisons, 1.0)
    elif correction == "holm":
        # Holm step-down: multiply the j-th smallest p-value by (m - j),
        # enforce monotonicity, cap at 1, then undo the sort.
        sorted_idx = np.argsort(p_values)
        sorted_p = p_values[sorted_idx]
        m = len(sorted_p)
        adjusted = np.zeros(m)
        for j in range(m):
            adjusted[j] = sorted_p[j] * (m - j)
        for j in range(1, m):
            adjusted[j] = max(adjusted[j], adjusted[j-1])
        adjusted = np.minimum(adjusted, 1.0)
        p_adjusted = adjusted[np.argsort(sorted_idx)]
    else:  # correction == "none" (validated above)
        p_adjusted = p_values.astype(float)

    results_df = results_df.with_columns([
        pl.Series('p_adjusted', p_adjusted),
        pl.Series('significant', p_adjusted < alpha),
    ]).sort('p_value')

    metadata = {
        'test_type': 'proportion_z_test_mentions',
        'alpha': alpha,
        'correction': correction,
        'n_comparisons': n_comparisons,
    }
    return results_df, metadata
def compute_rank1_significance(
    self,
    data: pl.LazyFrame | pl.DataFrame,
    alpha: float = 0.05,
    correction: str = "bonferroni",
) -> tuple[pl.DataFrame, dict]:
    """Compute statistical significance for Rank 1 selections only.

    Like ``compute_mentions_significance`` but counts only how many times
    each voice/character was ranked **1st**, using total respondents as
    the denominator. This tests whether first-choice preference differs
    significantly between voices via pairwise two-proportion z-tests.

    NOTE(review): the z-test treats the two proportions as independent
    samples, but both come from the same respondents (paired data) — a
    McNemar-style test would be stricter. Kept as-is for consistency with
    existing reports; confirm before changing.

    Args:
        data: Ranking data (rows=respondents, cols=voices, values=rank).
        alpha: Significance level.
        correction: Multiple comparison correction method. One of
            'bonferroni', 'holm', or 'none'.

    Returns:
        tuple: (pairwise_df, metadata)

    Raises:
        ValueError: If ``correction`` is unknown, there are fewer than 2
            ranking columns, or the data has no respondents.
    """
    from scipy import stats as scipy_stats
    import numpy as np

    # Fail fast on an unknown correction: previously it silently left the
    # adjusted p-values as NaN, marking every pair non-significant.
    if correction not in ("bonferroni", "holm", "none"):
        raise ValueError(f"Unknown correction method: {correction!r}")

    df = data.collect() if isinstance(data, pl.LazyFrame) else data

    ranking_cols = [c for c in df.columns if c != '_recordId']
    if len(ranking_cols) < 2:
        raise ValueError("Need at least 2 ranking columns")

    total_respondents = df.height
    if total_respondents == 0:
        # Avoid an opaque ZeroDivisionError below.
        raise ValueError("No respondents in data; cannot compute proportions")

    # Count rank-1 selections for each voice.
    rank1_data: dict[str, int] = {}
    for col in ranking_cols:
        label = self._clean_voice_label(col)
        rank1_data[label] = df.filter(pl.col(col) == 1).height

    labels = sorted(rank1_data.keys())
    results = []
    n_comparisons = len(labels) * (len(labels) - 1) // 2

    for i, label1 in enumerate(labels):
        for label2 in labels[i+1:]:
            count1 = rank1_data[label1]
            count2 = rank1_data[label2]
            pct1 = count1 / total_respondents
            pct2 = count2 / total_respondents

            # Z-test for two proportions (same denominator for both).
            n1 = total_respondents
            n2 = total_respondents
            p_pooled = (count1 + count2) / (n1 + n2)
            se = np.sqrt(p_pooled * (1 - p_pooled) * (1/n1 + 1/n2))
            if se > 0:
                z_stat = (pct1 - pct2) / se
                p_value = 2 * (1 - scipy_stats.norm.cdf(abs(z_stat)))
            else:
                # Both proportions are 0 or both are 1: no evidence of a
                # difference.
                p_value = 1.0

            results.append({
                'group1': label1,
                'group2': label2,
                'p_value': float(p_value),
                'rank1_count1': count1,
                'rank1_count2': count2,
                'rank1_pct1': round(pct1 * 100, 1),
                'rank1_pct2': round(pct2 * 100, 1),
                'total1': n1,
                'total2': n2,
                'effect_size': pct1 - pct2,
            })

    results_df = pl.DataFrame(results)
    p_values = results_df['p_value'].to_numpy()

    if correction == "bonferroni":
        p_adjusted = np.minimum(p_values * n_comparisons, 1.0)
    elif correction == "holm":
        # Holm step-down: multiply the j-th smallest p-value by (m - j),
        # enforce monotonicity, cap at 1, then undo the sort.
        sorted_idx = np.argsort(p_values)
        sorted_p = p_values[sorted_idx]
        m = len(sorted_p)
        adjusted = np.zeros(m)
        for j in range(m):
            adjusted[j] = sorted_p[j] * (m - j)
        for j in range(1, m):
            adjusted[j] = max(adjusted[j], adjusted[j-1])
        adjusted = np.minimum(adjusted, 1.0)
        p_adjusted = adjusted[np.argsort(sorted_idx)]
    else:  # correction == "none" (validated above)
        p_adjusted = p_values.astype(float)

    results_df = results_df.with_columns([
        pl.Series('p_adjusted', p_adjusted),
        pl.Series('significant', p_adjusted < alpha),
    ]).sort('p_value')

    metadata = {
        'test_type': 'proportion_z_test_rank1',
        'alpha': alpha,
        'correction': correction,
        'n_comparisons': n_comparisons,
    }
    return results_df, metadata
def process_speaking_style_data(
df: Union[pl.LazyFrame, pl.DataFrame],