Compare commits
28 Commits
e7166a7957
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
| 03a716e8ec | |||
| 8720bb670d | |||
| 9dfab75925 | |||
| 14e28cf368 | |||
| 8e181e193a | |||
| 6c16993cb3 | |||
| 92c6fc03ab | |||
| 7fb6570190 | |||
| 840bd2940d | |||
| af9a15ccb0 | |||
| a3cf9f103d | |||
| f0eab32c34 | |||
| d231fc02db | |||
| fc76bb0ab5 | |||
| ab78276a97 | |||
| e17646eb70 | |||
| ad1d8c6e58 | |||
| f5b4c247b8 | |||
| a35670aa72 | |||
| 36280a6ff8 | |||
| 9a587dcc4c | |||
| 9a49d1c690 | |||
| 8f505da550 | |||
| 495b56307c | |||
| 1e76a82f24 | |||
| 01b7d50637 | |||
| dca9ac11ba | |||
| 081fb0dd6e |
5
.vscode/extensions.json
vendored
Normal file
5
.vscode/extensions.json
vendored
Normal file
@@ -0,0 +1,5 @@
|
||||
{
|
||||
"recommendations": [
|
||||
"wakatime.vscode-wakatime"
|
||||
]
|
||||
}
|
||||
5
.vscode/settings.json
vendored
Normal file
5
.vscode/settings.json
vendored
Normal file
@@ -0,0 +1,5 @@
|
||||
{
|
||||
"chat.tools.terminal.autoApprove": {
|
||||
"/home/luigi/Documents/VoiceBranding/JPMC/Phase-3/.venv/bin/python": true
|
||||
}
|
||||
}
|
||||
@@ -1,691 +0,0 @@
|
||||
|
||||
__generated_with = "0.19.7"
|
||||
|
||||
# %%
|
||||
import marimo as mo
|
||||
import polars as pl
|
||||
from pathlib import Path
|
||||
import argparse
|
||||
import json
|
||||
|
||||
from validation import check_progress, duration_validation, check_straight_liners
|
||||
from utils import QualtricsSurvey, combine_exclusive_columns, calculate_weighted_ranking_scores
|
||||
import utils
|
||||
|
||||
from speaking_styles import SPEAKING_STYLES
|
||||
|
||||
# %%
|
||||
# CLI argument parsing for batch automation
|
||||
# When run as script: python 03_quant_report.script.py --age '["18 to 21 years"]' --consumer '["Starter"]'
|
||||
# When run in Jupyter: args will use defaults (all filters = None = all options selected)
|
||||
|
||||
# Central filter configuration - add new filters here only
# Format: 'cli_arg_name': 'QualtricsSurvey.options_* attribute name'
FILTER_CONFIG = {
    'age': 'options_age',
    'gender': 'options_gender',
    'ethnicity': 'options_ethnicity',
    'income': 'options_income',
    'consumer': 'options_consumer',
    # Add new filters here: 'newfilter': 'options_newfilter',
}


def parse_cli_args():
    """Parse CLI filter arguments, or return all-None defaults in notebooks.

    Each key of ``FILTER_CONFIG`` becomes a ``--<name>`` option that expects a
    JSON list, e.g. ``--age '["18 to 21 years"]'``.  ``--filter-name`` labels
    the combination for the .txt description file.

    Returns:
        argparse.Namespace: one attribute per filter key (a list of selected
        values, or ``None`` meaning "no filter / all options"), plus
        ``filter_name``.
    """
    parser = argparse.ArgumentParser(description='Generate quant report with optional filters')

    # Dynamically add filter arguments from config
    for filter_name in FILTER_CONFIG:
        parser.add_argument(f'--{filter_name}', type=str, default=None, help=f'JSON list of {filter_name} values')

    parser.add_argument('--filter-name', type=str, default=None, help='Name for this filter combination (used for .txt description file)')

    # Only parse argv when running as a script; inside Jupyter/marimo the
    # kernel injects its own argv, so fall back to "no filters selected".
    try:
        get_ipython()  # noqa: F821  # defined only inside IPython-like kernels
        # Return namespace with all filters set to None
        return argparse.Namespace(**{f: None for f in FILTER_CONFIG}, filter_name=None)
    except NameError:
        args = parser.parse_args()
        # Decode each JSON-list filter value.  Fail with a readable CLI error
        # instead of a raw json traceback when the JSON is malformed; empty /
        # missing values stay None (no filter applied).
        for filter_name in FILTER_CONFIG:
            val = getattr(args, filter_name)
            if not val:
                setattr(args, filter_name, None)
                continue
            try:
                setattr(args, filter_name, json.loads(val))
            except json.JSONDecodeError as e:
                parser.error(f"--{filter_name} must be a JSON list (got {val!r}): {e}")
        return args
|
||||
|
||||
cli_args = parse_cli_args()
|
||||
|
||||
# %%
|
||||
|
||||
# file_browser = mo.ui.file_browser(
|
||||
# initial_path="./data/exports", multiple=False, restrict_navigation=True, filetypes=[".csv"], label="Select 'Labels' File"
|
||||
# )
|
||||
# file_browser
|
||||
|
||||
# # %%
|
||||
# mo.stop(file_browser.path(index=0) is None, mo.md("**⚠️ Please select a `_Labels.csv` file above to proceed**"))
|
||||
# RESULTS_FILE = Path(file_browser.path(index=0))
|
||||
|
||||
RESULTS_FILE = 'data/exports/2-3-26_Copy-2-2-26/JPMC_Chase Brand Personality_Quant Round 1_February 2, 2026_Labels.csv'
|
||||
QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'
|
||||
|
||||
# %%
|
||||
S = QualtricsSurvey(RESULTS_FILE, QSF_FILE)
|
||||
try:
|
||||
data_all = S.load_data()
|
||||
except NotImplementedError as e:
|
||||
mo.stop(True, mo.md(f"**⚠️ {str(e)}**"))
|
||||
|
||||
# %%
|
||||
BEST_CHOSEN_CHARACTER = "the_coach"
|
||||
|
||||
# # %%
|
||||
# filter_form = mo.md('''
|
||||
|
||||
|
||||
|
||||
# {age}
|
||||
|
||||
# {gender}
|
||||
|
||||
# {ethnicity}
|
||||
|
||||
# {income}
|
||||
|
||||
# {consumer}
|
||||
# '''
|
||||
# ).batch(
|
||||
# age=mo.ui.multiselect(options=S.options_age, value=S.options_age, label="Select Age Group(s):"),
|
||||
# gender=mo.ui.multiselect(options=S.options_gender, value=S.options_gender, label="Select Gender(s):"),
|
||||
# ethnicity=mo.ui.multiselect(options=S.options_ethnicity, value=S.options_ethnicity, label="Select Ethnicities:"),
|
||||
# income=mo.ui.multiselect(options=S.options_income, value=S.options_income, label="Select Income Group(s):"),
|
||||
# consumer=mo.ui.multiselect(options=S.options_consumer, value=S.options_consumer, label="Select Consumer Groups:")
|
||||
# ).form()
|
||||
# mo.md(f'''
|
||||
# ---
|
||||
|
||||
# # Data Filter
|
||||
|
||||
# {filter_form}
|
||||
# ''')
|
||||
|
||||
# %%
|
||||
# mo.stop(filter_form.value is None, mo.md("**Please submit filter above to proceed**"))
|
||||
# CLI args: None means "all options selected" (use S.options_* defaults)
|
||||
# Build filter values dict dynamically from FILTER_CONFIG
|
||||
# Take the CLI-supplied list for each configured filter when present;
# otherwise fall back to the survey's full option list, which is equivalent
# to "all options selected".
_active_filters = {
    name: getattr(cli_args, name) if getattr(cli_args, name) is not None else getattr(S, attr)
    for name, attr in FILTER_CONFIG.items()
}

_d = S.filter_data(data_all, **_active_filters)
|
||||
|
||||
# Write filter description file if filter-name is provided
if cli_args.filter_name and S.fig_save_dir:
    # Get the filter slug (e.g., "All_Respondents", "Cons-Starter", etc.)
    _filter_slug = S._get_filter_slug()
    _filter_slug_dir = S.fig_save_dir / _filter_slug
    _filter_slug_dir.mkdir(parents=True, exist_ok=True)

    # Build filter description: one line per filter, listing either the
    # selected values or "All" when every option is selected.
    _filter_desc_lines = [
        f"Filter: {cli_args.filter_name}",
        "",
        "Applied Filters:",
    ]
    _short_desc_parts = []
    for filter_name, options_attr in FILTER_CONFIG.items():
        all_options = getattr(S, options_attr)
        values = _active_filters[filter_name]
        display_name = filter_name.replace('_', ' ').title()
        if values != all_options:
            _short_desc_parts.append(f"{display_name}: {', '.join(values)}")
            _filter_desc_lines.append(f" {display_name}: {', '.join(values)}")
        else:
            _filter_desc_lines.append(f" {display_name}: All")

    # Write detailed description INSIDE the filter-slug directory
    _filter_file = _filter_slug_dir / f"{cli_args.filter_name}.txt"
    _filter_file.write_text('\n'.join(_filter_desc_lines))

    # Append to summary index file at figures/<export_date>/filter_index.txt
    _summary_file = S.fig_save_dir / "filter_index.txt"
    _short_desc = "; ".join(_short_desc_parts) if _short_desc_parts else "All Respondents"
    _summary_line = f"{_filter_slug} | {cli_args.filter_name} | {_short_desc}\n"

    # Append or create the summary file
    if _summary_file.exists():
        # Avoid duplicate entries for the same slug.  Compare against the
        # exact "Directory" column of each existing line — a raw substring
        # test would also suppress slugs that happen to appear inside a
        # longer slug or in a description.
        _existing_slugs = {
            line.split(' | ', 1)[0].strip()
            for line in _summary_file.read_text().splitlines()
        }
        if _filter_slug not in _existing_slugs:
            with _summary_file.open('a') as f:
                f.write(_summary_line)
    else:
        _header = "Filter Index\n" + "=" * 80 + "\n\n"
        _header += "Directory | Filter Name | Description\n"
        _header += "-" * 80 + "\n"
        _summary_file.write_text(_header + _summary_line)
|
||||
|
||||
# Stop execution and prevent other cells from running if no data is selected
|
||||
# mo.stop(len(_d.collect()) == 0, mo.md("**No Data available for current filter combination**"))
|
||||
data = _d
|
||||
|
||||
# data = data_validated
|
||||
data.collect()
|
||||
|
||||
# %%
|
||||
|
||||
|
||||
# %%
|
||||
# Check if all business owners are missing a 'Consumer type' in demographics
|
||||
# assert all([a is None for a in data_all.filter(pl.col('QID4') == 'Yes').collect()['Consumer'].unique()]) , "Not all business owners are missing 'Consumer type' in demographics."
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
# Demographic Distributions
|
||||
""")
|
||||
|
||||
# %%
|
||||
demo_plot_cols = [
|
||||
'Age',
|
||||
'Gender',
|
||||
# 'Race/Ethnicity',
|
||||
'Bussiness_Owner',
|
||||
'Consumer'
|
||||
]
|
||||
|
||||
# %%
|
||||
_content = """
|
||||
|
||||
"""
|
||||
for c in demo_plot_cols:
|
||||
_fig = S.plot_demographic_distribution(
|
||||
data=S.get_demographics(data)[0],
|
||||
column=c,
|
||||
title=f"{c.replace('Bussiness', 'Business').replace('_', ' ')} Distribution of Survey Respondents"
|
||||
)
|
||||
_content += f"""{mo.ui.altair_chart(_fig)}\n\n"""
|
||||
|
||||
mo.md(_content)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
---
|
||||
|
||||
# Brand Character Results
|
||||
""")
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Best performing: Original vs Refined frankenstein
|
||||
""")
|
||||
|
||||
# %%
|
||||
char_refine_rank = S.get_character_refine(data)[0]
|
||||
# print(char_rank.collect().head())
|
||||
print(char_refine_rank.collect().head())
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Character ranking points
|
||||
""")
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Character ranking 1-2-3
|
||||
""")
|
||||
|
||||
# %%
|
||||
char_rank = S.get_character_ranking(data)[0]
|
||||
|
||||
# %%
|
||||
char_rank_weighted = calculate_weighted_ranking_scores(char_rank)
|
||||
S.plot_weighted_ranking_score(char_rank_weighted, title="Most Popular Character - Weighted Popularity Score<br>(1st=3pts, 2nd=2pts, 3rd=1pt)", x_label='Voice')
|
||||
|
||||
# %%
|
||||
S.plot_top3_ranking_distribution(char_rank, x_label='Character Personality', title='Character Personality: Rankings Top 3')
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
### Statistical Significance Character Ranking
|
||||
""")
|
||||
|
||||
# %%
|
||||
_pairwise_df, _meta = S.compute_ranking_significance(char_rank)
|
||||
|
||||
# print(_pairwise_df.columns)
|
||||
|
||||
mo.md(f"""
|
||||
|
||||
|
||||
{mo.ui.altair_chart(S.plot_significance_heatmap(_pairwise_df, metadata=_meta))}
|
||||
|
||||
{mo.ui.altair_chart(S.plot_significance_summary(_pairwise_df, metadata=_meta))}
|
||||
""")
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Character Ranking: times 1st place
|
||||
""")
|
||||
|
||||
# %%
|
||||
S.plot_most_ranked_1(char_rank, title="Most Popular Character<br>(Number of Times Ranked 1st)", x_label='Character Personality')
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Prominent predefined personality traits wordcloud
|
||||
""")
|
||||
|
||||
# %%
|
||||
top8_traits = S.get_top_8_traits(data)[0]
|
||||
S.plot_traits_wordcloud(
|
||||
data=top8_traits,
|
||||
column='Top_8_Traits',
|
||||
title="Most Prominent Personality Traits",
|
||||
)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Trait frequency per brand character
|
||||
""")
|
||||
|
||||
# %%
|
||||
char_df = S.get_character_refine(data)[0]
|
||||
|
||||
# %%
|
||||
from theme import ColorPalette
|
||||
|
||||
# Assuming you already have char_df (your data from get_character_refine or similar)
|
||||
characters = ['Bank Teller', 'Familiar Friend', 'The Coach', 'Personal Assistant']
|
||||
character_colors = {
|
||||
'Bank Teller': (ColorPalette.CHARACTER_BANK_TELLER, ColorPalette.CHARACTER_BANK_TELLER_HIGHLIGHT),
|
||||
'Familiar Friend': (ColorPalette.CHARACTER_FAMILIAR_FRIEND, ColorPalette.CHARACTER_FAMILIAR_FRIEND_HIGHLIGHT),
|
||||
'The Coach': (ColorPalette.CHARACTER_COACH, ColorPalette.CHARACTER_COACH_HIGHLIGHT),
|
||||
'Personal Assistant': (ColorPalette.CHARACTER_PERSONAL_ASSISTANT, ColorPalette.CHARACTER_PERSONAL_ASSISTANT_HIGHLIGHT),
|
||||
}
|
||||
|
||||
# Build consistent sort order (by total frequency across all characters)
|
||||
all_trait_counts = {}
|
||||
for char in characters:
|
||||
freq_df, _ = S.transform_character_trait_frequency(char_df, char)
|
||||
for row in freq_df.iter_rows(named=True):
|
||||
all_trait_counts[row['trait']] = all_trait_counts.get(row['trait'], 0) + row['count']
|
||||
|
||||
consistent_sort_order = sorted(all_trait_counts.keys(), key=lambda x: -all_trait_counts[x])
|
||||
|
||||
_content = """"""
|
||||
# Generate 4 plots (one per character)
|
||||
for char in characters:
|
||||
freq_df, _ = S.transform_character_trait_frequency(char_df, char)
|
||||
main_color, highlight_color = character_colors[char]
|
||||
chart = S.plot_single_character_trait_frequency(
|
||||
data=freq_df,
|
||||
character_name=char,
|
||||
bar_color=main_color,
|
||||
highlight_color=highlight_color,
|
||||
trait_sort_order=consistent_sort_order,
|
||||
)
|
||||
_content += f"""
|
||||
{mo.ui.altair_chart(chart)}
|
||||
|
||||
|
||||
"""
|
||||
|
||||
mo.md(_content)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Statistical significance best characters
|
||||
|
||||
zie chat
|
||||
> voorbeeld: als de nr 1 en 2 niet significant verschillen maar wel van de nr 3 bijvoorbeeld is dat ook top. Beetje meedenkend over hoe ik het kan presenteren weetje wat ik bedoel?:)
|
||||
>
|
||||
""")
|
||||
|
||||
# %%
|
||||
|
||||
|
||||
# %%
|
||||
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
---
|
||||
|
||||
# Spoken Voice Results
|
||||
""")
|
||||
|
||||
# %%
|
||||
COLOR_GENDER = True
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Top 8 Most Chosen out of 18
|
||||
""")
|
||||
|
||||
# %%
|
||||
v_18_8_3 = S.get_18_8_3(data)[0]
|
||||
|
||||
# %%
|
||||
S.plot_voice_selection_counts(v_18_8_3, title="Top 8 Voice Selection from 18 Voices", x_label='Voice', color_gender=COLOR_GENDER)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Top 3 most chosen out of 8
|
||||
""")
|
||||
|
||||
# %%
|
||||
S.plot_top3_selection_counts(v_18_8_3, title="Top 3 Voice Selection Counts from 8 Voices", x_label='Voice', color_gender=COLOR_GENDER)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Voice Ranking Weighted Score
|
||||
""")
|
||||
|
||||
# %%
|
||||
top3_voices = S.get_top_3_voices(data)[0]
|
||||
top3_voices_weighted = calculate_weighted_ranking_scores(top3_voices)
|
||||
|
||||
# %%
|
||||
S.plot_weighted_ranking_score(top3_voices_weighted, title="Most Popular Voice - Weighted Popularity Score<br>(1st = 3pts, 2nd = 2pts, 3rd = 1pt)", color_gender=COLOR_GENDER)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Which voice is ranked best in the ranking question for top 3?
|
||||
|
||||
(not best 3 out of 8 question)
|
||||
""")
|
||||
|
||||
# %%
|
||||
S.plot_ranking_distribution(top3_voices, x_label='Voice', title="Distribution of Top 3 Voice Rankings (1st, 2nd, 3rd)", color_gender=COLOR_GENDER)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
### Statistical significance for voice ranking
|
||||
""")
|
||||
|
||||
# %%
|
||||
# print(top3_voices.collect().head())
|
||||
|
||||
# %%
|
||||
|
||||
# _pairwise_df, _metadata = S.compute_ranking_significance(
|
||||
# top3_voices,alpha=0.05,correction="none")
|
||||
|
||||
# # View significant pairs
|
||||
# # print(pairwise_df.filter(pl.col('significant') == True))
|
||||
|
||||
# # Create heatmap visualization
|
||||
# _heatmap = S.plot_significance_heatmap(
|
||||
# _pairwise_df,
|
||||
# metadata=_metadata,
|
||||
# title="Weighted Voice Ranking Significance<br>(Pairwise Comparisons)"
|
||||
# )
|
||||
|
||||
# # Create summary bar chart
|
||||
# _summary = S.plot_significance_summary(
|
||||
# _pairwise_df,
|
||||
# metadata=_metadata
|
||||
# )
|
||||
|
||||
# mo.md(f"""
|
||||
# {mo.ui.altair_chart(_heatmap)}
|
||||
|
||||
# {mo.ui.altair_chart(_summary)}
|
||||
# """)
|
||||
|
||||
# %%
|
||||
## Voice Ranked 1st the most
|
||||
|
||||
# %%
|
||||
S.plot_most_ranked_1(top3_voices, title="Most Popular Voice<br>(Number of Times Ranked 1st)", x_label='Voice', color_gender=COLOR_GENDER)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Voice Scale 1-10
|
||||
""")
|
||||
|
||||
# %%
|
||||
# Get your voice scale data (from notebook)
|
||||
voice_1_10, _ = S.get_voice_scale_1_10(data)
|
||||
S.plot_average_scores_with_counts(voice_1_10, x_label='Voice', domain=[1,10], title="Voice General Impression (Scale 1-10)", color_gender=COLOR_GENDER)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
### Statistical Significance (Scale 1-10)
|
||||
""")
|
||||
|
||||
# %%
|
||||
# Compute pairwise significance tests
|
||||
pairwise_df, metadata = S.compute_pairwise_significance(
|
||||
voice_1_10,
|
||||
test_type="mannwhitney", # or "ttest", "chi2", "auto"
|
||||
alpha=0.05,
|
||||
correction="bonferroni" # or "holm", "none"
|
||||
)
|
||||
|
||||
# View significant pairs
|
||||
# print(pairwise_df.filter(pl.col('significant') == True))
|
||||
|
||||
# Create heatmap visualization
|
||||
_heatmap = S.plot_significance_heatmap(
|
||||
pairwise_df,
|
||||
metadata=metadata,
|
||||
title="Voice Rating Significance<br>(Pairwise Comparisons)"
|
||||
)
|
||||
|
||||
# Create summary bar chart
|
||||
_summary = S.plot_significance_summary(
|
||||
pairwise_df,
|
||||
metadata=metadata
|
||||
)
|
||||
|
||||
mo.md(f"""
|
||||
{mo.ui.altair_chart(_heatmap)}
|
||||
|
||||
{mo.ui.altair_chart(_summary)}
|
||||
""")
|
||||
|
||||
# %%
|
||||
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Ranking points for Voice per Chosen Brand Character
|
||||
|
||||
**missing mapping**
|
||||
""")
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Correlation Speaking Styles
|
||||
""")
|
||||
|
||||
# %%
|
||||
ss_or, choice_map_or = S.get_ss_orange_red(data)
|
||||
ss_gb, choice_map_gb = S.get_ss_green_blue(data)
|
||||
|
||||
# Combine the data
|
||||
ss_all = ss_or.join(ss_gb, on='_recordId')
|
||||
_d = ss_all.collect()
|
||||
|
||||
choice_map = {**choice_map_or, **choice_map_gb}
|
||||
# print(_d.head())
|
||||
# print(choice_map)
|
||||
ss_long = utils.process_speaking_style_data(ss_all, choice_map)
|
||||
|
||||
df_style = utils.process_speaking_style_data(ss_all, choice_map)
|
||||
|
||||
vscales = S.get_voice_scale_1_10(data)[0]
|
||||
df_scale_long = utils.process_voice_scale_data(vscales)
|
||||
|
||||
joined_scale = df_style.join(df_scale_long, on=["_recordId", "Voice"], how="inner")
|
||||
|
||||
df_ranking = utils.process_voice_ranking_data(top3_voices)
|
||||
joined_ranking = df_style.join(df_ranking, on=['_recordId', 'Voice'], how='inner')
|
||||
|
||||
# %%
|
||||
joined_ranking.head()
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
### Colors vs Scale 1-10
|
||||
""")
|
||||
|
||||
# %%
|
||||
# Transform to get one row per color with average correlation
|
||||
color_corr_scale, _ = utils.transform_speaking_style_color_correlation(joined_scale, SPEAKING_STYLES)
|
||||
S.plot_speaking_style_color_correlation(
|
||||
data=color_corr_scale,
|
||||
title="Correlation: Speaking Style Colors and Voice Scale 1-10"
|
||||
)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
### Colors vs Ranking Points
|
||||
""")
|
||||
|
||||
# %%
|
||||
color_corr_ranking, _ = utils.transform_speaking_style_color_correlation(
|
||||
joined_ranking,
|
||||
SPEAKING_STYLES,
|
||||
target_column="Ranking_Points"
|
||||
)
|
||||
S.plot_speaking_style_color_correlation(
|
||||
data=color_corr_ranking,
|
||||
title="Correlation: Speaking Style Colors and Voice Ranking Points"
|
||||
)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
### Individual Traits vs Scale 1-10
|
||||
""")
|
||||
|
||||
# %%
|
||||
_content = """"""
|
||||
|
||||
for _style, _traits in SPEAKING_STYLES.items():
|
||||
# print(f"Correlation plot for {style}...")
|
||||
_fig = S.plot_speaking_style_correlation(
|
||||
data=joined_scale,
|
||||
style_color=_style,
|
||||
style_traits=_traits,
|
||||
title=f"Correlation: Speaking Style {_style} and Voice Scale 1-10",
|
||||
)
|
||||
_content += f"""
|
||||
#### Speaking Style **{_style}**:
|
||||
|
||||
{mo.ui.altair_chart(_fig)}
|
||||
|
||||
"""
|
||||
mo.md(_content)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
### Individual Traits vs Ranking Points
|
||||
""")
|
||||
|
||||
# %%
|
||||
_content = """"""
|
||||
|
||||
for _style, _traits in SPEAKING_STYLES.items():
|
||||
# print(f"Correlation plot for {style}...")
|
||||
_fig = S.plot_speaking_style_ranking_correlation(
|
||||
data=joined_ranking,
|
||||
style_color=_style,
|
||||
style_traits=_traits,
|
||||
title=f"Correlation: Speaking Style {_style} and Voice Ranking Points",
|
||||
)
|
||||
_content += f"""
|
||||
#### Speaking Style **{_style}**:
|
||||
|
||||
{mo.ui.altair_chart(_fig)}
|
||||
|
||||
"""
|
||||
mo.md(_content)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Correlations when "Best Brand Character" is chosen
|
||||
|
||||
Select only the traits that fit with that character
|
||||
""")
|
||||
|
||||
# %%
|
||||
from reference import ORIGINAL_CHARACTER_TRAITS
|
||||
chosen_bc_traits = ORIGINAL_CHARACTER_TRAITS[BEST_CHOSEN_CHARACTER]
|
||||
|
||||
# %%
|
||||
STYLES_SUBSET = utils.filter_speaking_styles(SPEAKING_STYLES, chosen_bc_traits)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
### Individual Traits vs Ranking Points
|
||||
""")
|
||||
|
||||
# %%
|
||||
_content = ""
|
||||
for _style, _traits in STYLES_SUBSET.items():
|
||||
_fig = S.plot_speaking_style_ranking_correlation(
|
||||
data=joined_ranking,
|
||||
style_color=_style,
|
||||
style_traits=_traits,
|
||||
title=f"""Brand Character "{BEST_CHOSEN_CHARACTER.replace('_', ' ').title()}" - Correlation: Speaking Style {_style} and Voice Ranking Points"""
|
||||
)
|
||||
_content += f"""
|
||||
{mo.ui.altair_chart(_fig)}
|
||||
|
||||
"""
|
||||
mo.md(_content)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
### Individual Traits vs Scale 1-10
|
||||
""")
|
||||
|
||||
# %%
|
||||
_content = """"""
|
||||
|
||||
for _style, _traits in STYLES_SUBSET.items():
|
||||
# print(f"Correlation plot for {style}...")
|
||||
_fig = S.plot_speaking_style_correlation(
|
||||
data=joined_scale,
|
||||
style_color=_style,
|
||||
style_traits=_traits,
|
||||
title=f"""Brand Character "{BEST_CHOSEN_CHARACTER.replace('_', ' ').title()}" - Correlation: Speaking Style {_style} and Voice Scale 1-10""",
|
||||
)
|
||||
_content += f"""
|
||||
{mo.ui.altair_chart(_fig)}
|
||||
|
||||
"""
|
||||
mo.md(_content)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
### Colors vs Scale 1-10 (Best Character)
|
||||
""")
|
||||
|
||||
# %%
|
||||
# Transform to get one row per color with average correlation
|
||||
_color_corr_scale, _ = utils.transform_speaking_style_color_correlation(joined_scale, STYLES_SUBSET)
|
||||
S.plot_speaking_style_color_correlation(
|
||||
data=_color_corr_scale,
|
||||
title=f"""Brand Character "{BEST_CHOSEN_CHARACTER.replace('_', ' ').title()}" - Correlation: Speaking Style Colors and Voice Scale 1-10"""
|
||||
)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
### Colors vs Ranking Points (Best Character)
|
||||
""")
|
||||
|
||||
# %%
|
||||
_color_corr_ranking, _ = utils.transform_speaking_style_color_correlation(
|
||||
joined_ranking,
|
||||
STYLES_SUBSET,
|
||||
target_column="Ranking_Points"
|
||||
)
|
||||
S.plot_speaking_style_color_correlation(
|
||||
data=_color_corr_ranking,
|
||||
title=f"""Brand Character "{BEST_CHOSEN_CHARACTER.replace('_', ' ').title()}" - Correlation: Speaking Style Colors and Voice Ranking Points"""
|
||||
)
|
||||
@@ -21,9 +21,14 @@ def _():
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
TAG_SOURCE = Path('data/reports/Perception-Research-Report_2-2.pptx')
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
TAG_SOURCE = Path('data/reports/VOICE_Perception-Research-Report_4-2-26_19-30.pptx')
|
||||
# TAG_TARGET = Path('data/reports/Perception-Research-Report_2-2_tagged.pptx')
|
||||
TAG_IMAGE_DIR = Path('figures/2-2-26')
|
||||
TAG_IMAGE_DIR = Path('figures/debug')
|
||||
return TAG_IMAGE_DIR, TAG_SOURCE
|
||||
|
||||
|
||||
@@ -47,10 +52,10 @@ def _():
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
REPLACE_SOURCE = Path('data/reports/Perception-Research-Report_2-2.pptx')
|
||||
REPLACE_TARGET = Path('data/reports/Perception-Research-Report_2-2_updated.pptx')
|
||||
REPLACE_SOURCE = Path('data/reports/VOICE_Perception-Research-Report_4-2-26_19-30.pptx')
|
||||
# REPLACE_TARGET = Path('data/reports/Perception-Research-Report_2-2_updated.pptx')
|
||||
|
||||
NEW_IMAGES_DIR = Path('figures/2-2-26')
|
||||
NEW_IMAGES_DIR = Path('figures/2-4-26')
|
||||
return NEW_IMAGES_DIR, REPLACE_SOURCE
|
||||
|
||||
|
||||
|
||||
84
README.md
84
README.md
@@ -150,36 +150,57 @@ combinations.append({
|
||||
|
||||
## Adding a New Filter Dimension
|
||||
|
||||
To add an entirely new filter dimension (e.g., a new demographic question), edit **only** `FILTER_CONFIG` in `03_quant_report.script.py`:
|
||||
To add an entirely new filter dimension (e.g., a new demographic question), you need to update several files:
|
||||
|
||||
### Checklist
|
||||
|
||||
1. **Ensure `QualtricsSurvey`** has the corresponding `options_*` attribute and `filter_data()` accepts the parameter
|
||||
|
||||
2. **Open** `03_quant_report.script.py`
|
||||
|
||||
3. **Find** `FILTER_CONFIG` near the top of the file:
|
||||
1. **Update `utils.py` — `QualtricsSurvey.__init__()`** to initialize the filter state attribute:
|
||||
|
||||
```python
|
||||
FILTER_CONFIG = {
|
||||
'age': 'options_age',
|
||||
'gender': 'options_gender',
|
||||
'ethnicity': 'options_ethnicity',
|
||||
'income': 'options_income',
|
||||
'consumer': 'options_consumer',
|
||||
# Add new filters here: 'newfilter': 'options_newfilter',
|
||||
}
|
||||
# In __init__(), add after existing filter_ attributes (around line 758):
|
||||
self.filter_region:list = None # QID99
|
||||
```
|
||||
|
||||
4. **Add** your new filter:
|
||||
2. **Update `utils.py` — `load_data()`** to populate the `options_*` attribute:
|
||||
|
||||
```python
|
||||
# In load_data(), add after existing options:
|
||||
self.options_region = sorted(df['QID99'].drop_nulls().unique().to_list()) if 'QID99' in df.columns else []
|
||||
```
|
||||
|
||||
3. **Update `utils.py` — `filter_data()`** to accept and apply the filter:
|
||||
|
||||
```python
|
||||
# Add parameter to function signature:
|
||||
def filter_data(self, q: pl.LazyFrame, ..., region:list=None) -> pl.LazyFrame:
|
||||
|
||||
# Add filter logic in function body:
|
||||
self.filter_region = region
|
||||
if region is not None:
|
||||
q = q.filter(pl.col('QID99').is_in(region))
|
||||
```
|
||||
|
||||
4. **Update `plots.py` — `_get_filter_slug()`** to include the filter in directory slugs:
|
||||
|
||||
```python
|
||||
# Add to the filters list:
|
||||
('region', 'Reg', getattr(self, 'filter_region', None), 'options_region'),
|
||||
```
|
||||
|
||||
5. **Update `plots.py` — `_get_filter_description()`** for human-readable descriptions:
|
||||
|
||||
```python
|
||||
# Add to the filters list:
|
||||
('Region', getattr(self, 'filter_region', None), 'options_region'),
|
||||
```
|
||||
|
||||
6. **Update `03_quant_report.script.py` — `FILTER_CONFIG`**:
|
||||
|
||||
```python
|
||||
FILTER_CONFIG = {
|
||||
'age': 'options_age',
|
||||
'gender': 'options_gender',
|
||||
'ethnicity': 'options_ethnicity',
|
||||
'income': 'options_income',
|
||||
'consumer': 'options_consumer',
|
||||
# ... existing filters ...
|
||||
'region': 'options_region', # ← New filter
|
||||
}
|
||||
```
|
||||
@@ -190,4 +211,29 @@ This **automatically**:
|
||||
- Passes it to `S.filter_data()`
|
||||
- Writes it to the `.txt` filter description file
|
||||
|
||||
5. **Update** `run_filter_combinations.py` to generate combinations for the new filter (optional)
|
||||
7. **Update `run_filter_combinations.py`** to generate combinations (optional):
|
||||
|
||||
```python
|
||||
# Add after existing filter loops:
|
||||
for region in survey.options_region:
|
||||
combinations.append({
|
||||
'name': f'Region-{region}',
|
||||
'filters': {'region': [region]}
|
||||
})
|
||||
```
|
||||
|
||||
### Currently Available Filters
|
||||
|
||||
| CLI Argument | Options Attribute | QID Column | Description |
|
||||
|--------------|-------------------|------------|-------------|
|
||||
| `--age` | `options_age` | QID1 | Age groups |
|
||||
| `--gender` | `options_gender` | QID2 | Gender |
|
||||
| `--ethnicity` | `options_ethnicity` | QID3 | Ethnicity |
|
||||
| `--income` | `options_income` | QID15 | Income brackets |
|
||||
| `--consumer` | `options_consumer` | Consumer | Consumer segments |
|
||||
| `--business_owner` | `options_business_owner` | QID4 | Business owner status |
|
||||
| `--employment_status` | `options_employment_status` | QID13 | Employment status |
|
||||
| `--personal_products` | `options_personal_products` | QID14 | Personal products |
|
||||
| `--ai_user` | `options_ai_user` | QID22 | AI user status |
|
||||
| `--investable_assets` | `options_investable_assets` | QID16 | Investable assets |
|
||||
| `--industry` | `options_industry` | QID17 | Industry |
|
||||
263
XX_detailed_trait_analysis.py
Normal file
263
XX_detailed_trait_analysis.py
Normal file
@@ -0,0 +1,263 @@
|
||||
"""Extra analyses of the traits"""
|
||||
# %% Imports
|
||||
|
||||
import utils
|
||||
import polars as pl
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
from pathlib import Path
|
||||
from validation import check_straight_liners
|
||||
|
||||
|
||||
# %% Fixed Variables
|
||||
RESULTS_FILE = 'data/exports/2-4-26/JPMC_Chase Brand Personality_Quant Round 1_February 4, 2026_Labels.csv'
QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'


# %% CLI argument parsing for batch automation
# When run as script: uv run XX_statistical_significance.script.py --age '["18 to 21 years"]'
# Central filter configuration - add new filters here only
# Format: 'cli_arg_name': 'QualtricsSurvey.options_* attribute name'
FILTER_CONFIG = {
    'age': 'options_age',
    'gender': 'options_gender',
    'ethnicity': 'options_ethnicity',
    'income': 'options_income',
    'consumer': 'options_consumer',
    'business_owner': 'options_business_owner',
    'ai_user': 'options_ai_user',
    'investable_assets': 'options_investable_assets',
    'industry': 'options_industry',
}


def parse_cli_args():
    """Parse CLI filter arguments, or return all-None defaults in notebooks.

    Each key of ``FILTER_CONFIG`` becomes a ``--<name>`` option expecting a
    JSON list.  ``--filter-name`` labels the combination; ``--figures-dir``
    overrides the default figure output directory (derived from the export
    date segment of ``RESULTS_FILE``).

    Returns:
        argparse.Namespace: one attribute per filter key (list or ``None``),
        plus ``filter_name`` and ``figures_dir``.
    """
    parser = argparse.ArgumentParser(description='Generate quant report with optional filters')

    # Dynamically add filter arguments from config
    for filter_name in FILTER_CONFIG:
        parser.add_argument(f'--{filter_name}', type=str, default=None, help=f'JSON list of {filter_name} values')

    parser.add_argument('--filter-name', type=str, default=None, help='Name for this filter combination (used for .txt description file)')
    parser.add_argument('--figures-dir', type=str, default=f'figures/traits-likert-analysis/{Path(RESULTS_FILE).parts[2]}', help='Override the default figures directory')

    # Only parse argv when running as a script (not in Jupyter/interactive)
    try:
        # Check if running in Jupyter by looking for ipykernel
        get_ipython()  # noqa: F821  # type: ignore
        # Return namespace with all filters set to None.  Reuse the parser's
        # own default for figures_dir instead of duplicating the expression.
        no_filters = {f: None for f in FILTER_CONFIG}
        return argparse.Namespace(**no_filters, filter_name=None, figures_dir=parser.get_default('figures_dir'))
    except NameError:
        args = parser.parse_args()
        # Parse JSON strings to lists
        for filter_name in FILTER_CONFIG:
            val = getattr(args, filter_name)
            setattr(args, filter_name, json.loads(val) if val else None)
        return args
|
||||
|
||||
cli_args = parse_cli_args()
|
||||
|
||||
|
||||
# %%
|
||||
S = utils.QualtricsSurvey(RESULTS_FILE, QSF_FILE, figures_dir=cli_args.figures_dir)
|
||||
data_all = S.load_data()
|
||||
|
||||
|
||||
# %% Build filtered dataset based on CLI args
|
||||
|
||||
# CLI args: None means "no filter applied" - filter_data() will skip None filters
|
||||
|
||||
# Build filter values dict dynamically from FILTER_CONFIG
|
||||
_active_filters = {filter_name: getattr(cli_args, filter_name) for filter_name in FILTER_CONFIG}
|
||||
|
||||
_d = S.filter_data(data_all, **_active_filters)
|
||||
|
||||
# Write filter description file if filter-name is provided
|
||||
if cli_args.filter_name and S.fig_save_dir:
|
||||
# Get the filter slug (e.g., "All_Respondents", "Cons-Starter", etc.)
|
||||
_filter_slug = S._get_filter_slug()
|
||||
_filter_slug_dir = S.fig_save_dir / _filter_slug
|
||||
_filter_slug_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Build filter description
|
||||
_filter_desc_lines = [
|
||||
f"Filter: {cli_args.filter_name}",
|
||||
"",
|
||||
"Applied Filters:",
|
||||
]
|
||||
_short_desc_parts = []
|
||||
for filter_name, options_attr in FILTER_CONFIG.items():
|
||||
all_options = getattr(S, options_attr)
|
||||
values = _active_filters[filter_name]
|
||||
display_name = filter_name.replace('_', ' ').title()
|
||||
# None means no filter applied (same as "All")
|
||||
if values is not None and values != all_options:
|
||||
_short_desc_parts.append(f"{display_name}: {', '.join(values)}")
|
||||
_filter_desc_lines.append(f" {display_name}: {', '.join(values)}")
|
||||
else:
|
||||
_filter_desc_lines.append(f" {display_name}: All")
|
||||
|
||||
# Write detailed description INSIDE the filter-slug directory
|
||||
# Sanitize filter name for filename usage (replace / and other chars)
|
||||
_safe_filter_name = re.sub(r'[^\w\s-]', '_', cli_args.filter_name)
|
||||
_filter_file = _filter_slug_dir / f"{_safe_filter_name}.txt"
|
||||
_filter_file.write_text('\n'.join(_filter_desc_lines))
|
||||
|
||||
# Append to summary index file at figures/<export_date>/filter_index.txt
|
||||
_summary_file = S.fig_save_dir / "filter_index.txt"
|
||||
_short_desc = "; ".join(_short_desc_parts) if _short_desc_parts else "All Respondents"
|
||||
_summary_line = f"{_filter_slug} | {cli_args.filter_name} | {_short_desc}\n"
|
||||
|
||||
# Append or create the summary file
|
||||
if _summary_file.exists():
|
||||
_existing = _summary_file.read_text()
|
||||
# Avoid duplicate entries for same slug
|
||||
if _filter_slug not in _existing:
|
||||
with _summary_file.open('a') as f:
|
||||
f.write(_summary_line)
|
||||
else:
|
||||
_header = "Filter Index\n" + "=" * 80 + "\n\n"
|
||||
_header += "Directory | Filter Name | Description\n"
|
||||
_header += "-" * 80 + "\n"
|
||||
_summary_file.write_text(_header + _summary_line)
|
||||
|
||||
# Save to logical variable name for further analysis
|
||||
data = _d
|
||||
data.collect()
|
||||
|
||||
# %% Voices per trait
|
||||
|
||||
|
||||
ss_or, choice_map_or = S.get_ss_orange_red(data)
|
||||
ss_gb, choice_map_gb = S.get_ss_green_blue(data)
|
||||
|
||||
# Combine the data
|
||||
ss_all = ss_or.join(ss_gb, on='_recordId')
|
||||
_d = ss_all.collect()
|
||||
|
||||
choice_map = {**choice_map_or, **choice_map_gb}
|
||||
# print(_d.head())
|
||||
# print(choice_map)
|
||||
ss_long = utils.process_speaking_style_data(ss_all, choice_map)
|
||||
|
||||
|
||||
# %% Create plots
|
||||
|
||||
for i, trait in enumerate(ss_long.select("Description").unique().to_series().to_list()):
|
||||
trait_d = ss_long.filter(pl.col("Description") == trait)
|
||||
|
||||
S.plot_speaking_style_trait_scores(trait_d, title=trait.replace(":", " ↔ "), height=550, color_gender=True)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# %% Filter out straight-liner (PER TRAIT) and re-plot to see if any changes
|
||||
# Save with different filename suffix so we can compare with/without straight-liners
|
||||
|
||||
print("\n--- Straight-lining Checks on TRAITS ---")
|
||||
sl_report_traits, sl_traits_df = check_straight_liners(ss_all, max_score=5)
|
||||
sl_traits_df
|
||||
|
||||
# %%
|
||||
|
||||
if sl_traits_df is not None and not sl_traits_df.is_empty():
|
||||
sl_ids = sl_traits_df.select(pl.col("Record ID").unique()).to_series().to_list()
|
||||
n_sl_groups = sl_traits_df.height
|
||||
print(f"\nExcluding {n_sl_groups} straight-lined question blocks from {len(sl_ids)} respondents.")
|
||||
|
||||
# Create key in ss_long to match sl_traits_df for anti-join
|
||||
# Question Group key in sl_traits_df is like "SS_Orange_Red__V14"
|
||||
# ss_long has "Style_Group" and "Voice"
|
||||
ss_long_w_key = ss_long.with_columns(
|
||||
(pl.col("Style_Group") + "__" + pl.col("Voice")).alias("Question Group")
|
||||
)
|
||||
|
||||
# Prepare filter table: Record ID + Question Group
|
||||
sl_filter = sl_traits_df.select([
|
||||
pl.col("Record ID").alias("_recordId"),
|
||||
pl.col("Question Group")
|
||||
])
|
||||
|
||||
# Anti-join to remove specific question blocks that were straight-lined
|
||||
ss_long_clean = ss_long_w_key.join(sl_filter, on=["_recordId", "Question Group"], how="anti").drop("Question Group")
|
||||
|
||||
# Re-plot with suffix in title
|
||||
print("Re-plotting traits (Cleaned)...")
|
||||
for i, trait in enumerate(ss_long_clean.select("Description").unique().to_series().to_list()):
|
||||
trait_d = ss_long_clean.filter(pl.col("Description") == trait)
|
||||
|
||||
# Modify title to create unique filename (and display title)
|
||||
title_clean = trait.replace(":", " ↔ ") + " (Excl. Straight-Liners)"
|
||||
|
||||
S.plot_speaking_style_trait_scores(trait_d, title=title_clean, height=550, color_gender=True)
|
||||
else:
|
||||
print("No straight-liners found on traits.")
|
||||
|
||||
|
||||
|
||||
|
||||
# %% Compare All vs Cleaned
|
||||
if sl_traits_df is not None and not sl_traits_df.is_empty():
|
||||
print("Generating Comparison Plots (All vs Cleaned)...")
|
||||
|
||||
# Always apply the per-question-group filtering here to ensure consistency
|
||||
# (Matches the logic used in the re-plotting section above)
|
||||
print("Applying filter to remove straight-lined question blocks...")
|
||||
ss_long_w_key = ss_long.with_columns(
|
||||
(pl.col("Style_Group") + "__" + pl.col("Voice")).alias("Question Group")
|
||||
)
|
||||
sl_filter = sl_traits_df.select([
|
||||
pl.col("Record ID").alias("_recordId"),
|
||||
pl.col("Question Group")
|
||||
])
|
||||
ss_long_clean = ss_long_w_key.join(sl_filter, on=["_recordId", "Question Group"], how="anti").drop("Question Group")
|
||||
|
||||
sl_ids = sl_traits_df.select(pl.col("Record ID").unique()).to_series().to_list()
|
||||
|
||||
# --- Verification Prints ---
|
||||
print(f"\n--- Verification of Filter ---")
|
||||
print(f"Original Row Count: {ss_long.height}")
|
||||
print(f"Number of Straight-Liner Question Blocks: {sl_traits_df.height}")
|
||||
print(f"Sample IDs affected: {sl_ids[:5]}")
|
||||
print(f"Cleaned Row Count: {ss_long_clean.height}")
|
||||
print(f"Rows Removed: {ss_long.height - ss_long_clean.height}")
|
||||
|
||||
# Verify removal
|
||||
# Re-construct key to verify
|
||||
ss_long_check = ss_long.with_columns(
|
||||
(pl.col("Style_Group") + "__" + pl.col("Voice")).alias("Question Group")
|
||||
)
|
||||
sl_filter_check = sl_traits_df.select([
|
||||
pl.col("Record ID").alias("_recordId"),
|
||||
pl.col("Question Group")
|
||||
])
|
||||
|
||||
should_be_removed = ss_long_check.join(sl_filter_check, on=["_recordId", "Question Group"], how="inner").height
|
||||
print(f"Discrepancy Check (Should be 0): { (ss_long.height - ss_long_clean.height) - should_be_removed }")
|
||||
|
||||
# Show what was removed (the straight lining behavior)
|
||||
print("\nSample of Straight-Liner Data (Values that caused removal):")
|
||||
print(sl_traits_df.head(5))
|
||||
print("-" * 30 + "\n")
|
||||
# ---------------------------
|
||||
|
||||
for i, trait in enumerate(ss_long.select("Description").unique().to_series().to_list()):
|
||||
|
||||
# Get data for this trait from both datasets
|
||||
trait_d_all = ss_long.filter(pl.col("Description") == trait)
|
||||
trait_d_clean = ss_long_clean.filter(pl.col("Description") == trait)
|
||||
|
||||
# Plot comparison
|
||||
title_comp = trait.replace(":", " ↔ ") + " (Impact of Straight-Liners)"
|
||||
|
||||
S.plot_speaking_style_trait_scores_comparison(
|
||||
trait_d_all,
|
||||
trait_d_clean,
|
||||
title=title_comp,
|
||||
height=600 # Slightly taller for grouped bars
|
||||
)
|
||||
|
||||
849
XX_quant_report.script.py
Normal file
849
XX_quant_report.script.py
Normal file
@@ -0,0 +1,849 @@
|
||||
|
||||
__generated_with = "0.19.7"
|
||||
|
||||
# %%
|
||||
import marimo as mo
|
||||
import polars as pl
|
||||
from pathlib import Path
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
from validation import check_progress, duration_validation, check_straight_liners
|
||||
from utils import QualtricsSurvey, combine_exclusive_columns, calculate_weighted_ranking_scores
|
||||
import utils
|
||||
|
||||
from speaking_styles import SPEAKING_STYLES
|
||||
|
||||
# %% Fixed Variables
|
||||
|
||||
RESULTS_FILE = 'data/exports/2-4-26/JPMC_Chase Brand Personality_Quant Round 1_February 4, 2026_Labels.csv'
|
||||
# RESULTS_FILE = 'data/exports/debug/JPMC_Chase Brand Personality_Quant Round 1_February 2, 2026_Labels.csv'
|
||||
QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'
|
||||
|
||||
|
||||
# %%
|
||||
# CLI argument parsing for batch automation
|
||||
# When run as script: python 03_quant_report.script.py --age '["18 to 21 years"]' --consumer '["Starter"]'
|
||||
# When run in Jupyter: args will use defaults (all filters = None = all options selected)
|
||||
|
||||
# Central filter configuration - add new filters here only
|
||||
# Format: 'cli_arg_name': 'QualtricsSurvey.options_* attribute name'
|
||||
FILTER_CONFIG = {
|
||||
'age': 'options_age',
|
||||
'gender': 'options_gender',
|
||||
'ethnicity': 'options_ethnicity',
|
||||
'income': 'options_income',
|
||||
'consumer': 'options_consumer',
|
||||
'business_owner': 'options_business_owner',
|
||||
'ai_user': 'options_ai_user',
|
||||
'investable_assets': 'options_investable_assets',
|
||||
'industry': 'options_industry',
|
||||
}
|
||||
|
||||
def parse_cli_args():
|
||||
parser = argparse.ArgumentParser(description='Generate quant report with optional filters')
|
||||
|
||||
# Dynamically add filter arguments from config
|
||||
for filter_name in FILTER_CONFIG:
|
||||
parser.add_argument(f'--{filter_name}', type=str, default=None, help=f'JSON list of {filter_name} values')
|
||||
|
||||
parser.add_argument('--filter-name', type=str, default=None, help='Name for this filter combination (used for .txt description file)')
|
||||
parser.add_argument('--figures-dir', type=str, default=f'figures/{Path(RESULTS_FILE).parts[2]}', help='Override the default figures directory')
|
||||
parser.add_argument('--best-character', type=str, default="the_coach", help='Slug of the best chosen character (default: "the_coach")')
|
||||
parser.add_argument('--sl-threshold', type=int, default=None, help='Exclude respondents who straight-lined >= N question groups (e.g. 3 removes anyone with 3+ straight-lined groups)')
|
||||
parser.add_argument('--voice-ranking-filter', type=str, default=None, choices=['only-missing', 'exclude-missing'], help='Filter by voice ranking completeness: "only-missing" keeps only respondents missing QID98 ranking data, "exclude-missing" removes them')
|
||||
|
||||
# Only parse if running as script (not in Jupyter/interactive)
|
||||
try:
|
||||
# Check if running in Jupyter by looking for ipykernel
|
||||
get_ipython() # noqa: F821 # type: ignore
|
||||
# Return namespace with all filters set to None
|
||||
no_filters = {f: None for f in FILTER_CONFIG}
|
||||
return argparse.Namespace(**no_filters, filter_name=None, figures_dir=f'figures/statistical_significance/{Path(RESULTS_FILE).parts[2]}', best_character="the_coach", sl_threshold=None, voice_ranking_filter=None)
|
||||
except NameError:
|
||||
args = parser.parse_args()
|
||||
# Parse JSON strings to lists
|
||||
for filter_name in FILTER_CONFIG:
|
||||
val = getattr(args, filter_name)
|
||||
setattr(args, filter_name, json.loads(val) if val else None)
|
||||
return args
|
||||
|
||||
cli_args = parse_cli_args()
|
||||
BEST_CHOSEN_CHARACTER = cli_args.best_character
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
S = QualtricsSurvey(RESULTS_FILE, QSF_FILE, figures_dir=cli_args.figures_dir)
|
||||
try:
|
||||
data_all = S.load_data()
|
||||
except NotImplementedError as e:
|
||||
mo.stop(True, mo.md(f"**⚠️ {str(e)}**"))
|
||||
|
||||
|
||||
# %% Build filtered dataset based on CLI args
|
||||
|
||||
# CLI args: None means "no filter applied" - filter_data() will skip None filters
|
||||
|
||||
# Build filter values dict dynamically from FILTER_CONFIG
|
||||
_active_filters = {filter_name: getattr(cli_args, filter_name) for filter_name in FILTER_CONFIG}
|
||||
|
||||
# %% Apply filters
|
||||
_d = S.filter_data(data_all, **_active_filters)
|
||||
|
||||
# Write filter description file if filter-name is provided
|
||||
if cli_args.filter_name and S.fig_save_dir:
|
||||
# Get the filter slug (e.g., "All_Respondents", "Cons-Starter", etc.)
|
||||
_filter_slug = S._get_filter_slug()
|
||||
_filter_slug_dir = S.fig_save_dir / _filter_slug
|
||||
_filter_slug_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Build filter description
|
||||
_filter_desc_lines = [
|
||||
f"Filter: {cli_args.filter_name}",
|
||||
"",
|
||||
"Applied Filters:",
|
||||
]
|
||||
_short_desc_parts = []
|
||||
for filter_name, options_attr in FILTER_CONFIG.items():
|
||||
all_options = getattr(S, options_attr)
|
||||
values = _active_filters[filter_name]
|
||||
display_name = filter_name.replace('_', ' ').title()
|
||||
# None means no filter applied (same as "All")
|
||||
if values is not None and values != all_options:
|
||||
_short_desc_parts.append(f"{display_name}: {', '.join(values)}")
|
||||
_filter_desc_lines.append(f" {display_name}: {', '.join(values)}")
|
||||
else:
|
||||
_filter_desc_lines.append(f" {display_name}: All")
|
||||
|
||||
# Write detailed description INSIDE the filter-slug directory
|
||||
# Sanitize filter name for filename usage (replace / and other chars)
|
||||
_safe_filter_name = re.sub(r'[^\w\s-]', '_', cli_args.filter_name)
|
||||
_filter_file = _filter_slug_dir / f"{_safe_filter_name}.txt"
|
||||
_filter_file.write_text('\n'.join(_filter_desc_lines))
|
||||
|
||||
# Append to summary index file at figures/<export_date>/filter_index.txt
|
||||
_summary_file = S.fig_save_dir / "filter_index.txt"
|
||||
_short_desc = "; ".join(_short_desc_parts) if _short_desc_parts else "All Respondents"
|
||||
_summary_line = f"{_filter_slug} | {cli_args.filter_name} | {_short_desc}\n"
|
||||
|
||||
# Append or create the summary file
|
||||
if _summary_file.exists():
|
||||
_existing = _summary_file.read_text()
|
||||
# Avoid duplicate entries for same slug
|
||||
if _filter_slug not in _existing:
|
||||
with _summary_file.open('a') as f:
|
||||
f.write(_summary_line)
|
||||
else:
|
||||
_header = "Filter Index\n" + "=" * 80 + "\n\n"
|
||||
_header += "Directory | Filter Name | Description\n"
|
||||
_header += "-" * 80 + "\n"
|
||||
_summary_file.write_text(_header + _summary_line)
|
||||
|
||||
# %% Apply straight-liner threshold filter (if specified)
|
||||
# Removes respondents who straight-lined >= N question groups across
|
||||
# speaking style and voice scale questions.
|
||||
if cli_args.sl_threshold is not None:
|
||||
_sl_n = cli_args.sl_threshold
|
||||
S.sl_threshold = _sl_n # Store on Survey so filter slug/description include it
|
||||
print(f"Applying straight-liner filter: excluding respondents with ≥{_sl_n} straight-lined question groups...")
|
||||
_n_before = _d.select(pl.len()).collect().item()
|
||||
|
||||
# Extract question groups with renamed columns for check_straight_liners
|
||||
_sl_ss_or, _ = S.get_ss_orange_red(_d)
|
||||
_sl_ss_gb, _ = S.get_ss_green_blue(_d)
|
||||
_sl_vs, _ = S.get_voice_scale_1_10(_d)
|
||||
_sl_all_q = _sl_ss_or.join(_sl_ss_gb, on='_recordId').join(_sl_vs, on='_recordId')
|
||||
|
||||
_, _sl_df = check_straight_liners(_sl_all_q, max_score=5)
|
||||
|
||||
if _sl_df is not None and not _sl_df.is_empty():
|
||||
# Count straight-lined question groups per respondent
|
||||
_sl_counts = (
|
||||
_sl_df
|
||||
.group_by("Record ID")
|
||||
.agg(pl.len().alias("sl_count"))
|
||||
.filter(pl.col("sl_count") >= _sl_n)
|
||||
.select(pl.col("Record ID").alias("_recordId"))
|
||||
)
|
||||
# Anti-join to remove offending respondents
|
||||
_d = _d.collect().join(_sl_counts, on="_recordId", how="anti").lazy()
|
||||
# Update filtered data on the Survey object so sample size is correct
|
||||
S.data_filtered = _d
|
||||
_n_after = _d.select(pl.len()).collect().item()
|
||||
print(f" Removed {_n_before - _n_after} respondents ({_n_before} → {_n_after})")
|
||||
else:
|
||||
print(" No straight-liners detected — no respondents removed.")
|
||||
|
||||
# %% Apply voice-ranking completeness filter (if specified)
|
||||
# Keeps only / excludes respondents who are missing the explicit voice
|
||||
# ranking question (QID98) despite completing the top-3 selection (QID36).
|
||||
if cli_args.voice_ranking_filter is not None:
|
||||
S.voice_ranking_filter = cli_args.voice_ranking_filter # Store on Survey so filter slug/description include it
|
||||
_vr_missing = S.get_top_3_voices_missing_ranking(_d)
|
||||
_vr_missing_ids = _vr_missing.select('_recordId')
|
||||
_n_before = _d.select(pl.len()).collect().item()
|
||||
|
||||
if cli_args.voice_ranking_filter == 'only-missing':
|
||||
print(f"Voice ranking filter: keeping ONLY respondents missing QID98 ranking data...")
|
||||
_d = _d.collect().join(_vr_missing_ids, on='_recordId', how='inner').lazy()
|
||||
elif cli_args.voice_ranking_filter == 'exclude-missing':
|
||||
print(f"Voice ranking filter: EXCLUDING respondents missing QID98 ranking data...")
|
||||
_d = _d.collect().join(_vr_missing_ids, on='_recordId', how='anti').lazy()
|
||||
|
||||
S.data_filtered = _d
|
||||
_n_after = _d.select(pl.len()).collect().item()
|
||||
print(f" {_n_before} → {_n_after} respondents ({_vr_missing_ids.height} missing ranking data)")
|
||||
|
||||
# Save to logical variable name for further analysis
|
||||
data = _d
|
||||
data.collect()
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
# Check if all business owners are missing a 'Consumer type' in demographics
|
||||
# assert all([a is None for a in data_all.filter(pl.col('QID4') == 'Yes').collect()['Consumer'].unique()]) , "Not all business owners are missing 'Consumer type' in demographics."
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
# Demographic Distributions
|
||||
""")
|
||||
|
||||
# %%
|
||||
demo_plot_cols = [
|
||||
'Age',
|
||||
'Gender',
|
||||
# 'Race/Ethnicity',
|
||||
'Bussiness_Owner',
|
||||
'Consumer'
|
||||
]
|
||||
|
||||
# %%
|
||||
_content = """
|
||||
|
||||
"""
|
||||
for c in demo_plot_cols:
|
||||
_fig = S.plot_demographic_distribution(
|
||||
data=S.get_demographics(data)[0],
|
||||
column=c,
|
||||
title=f"{c.replace('Bussiness', 'Business').replace('_', ' ')} Distribution of Survey Respondents"
|
||||
)
|
||||
_content += f"""{mo.ui.altair_chart(_fig)}\n\n"""
|
||||
|
||||
mo.md(_content)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
---
|
||||
|
||||
# Brand Character Results
|
||||
""")
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Best performing: Original vs Refined frankenstein
|
||||
""")
|
||||
|
||||
# %%
|
||||
char_refine_rank = S.get_character_refine(data)[0]
|
||||
# print(char_rank.collect().head())
|
||||
print(char_refine_rank.collect().head())
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Character ranking points
|
||||
""")
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Character ranking 1-2-3
|
||||
""")
|
||||
|
||||
# %%
|
||||
char_rank = S.get_character_ranking(data)[0]
|
||||
|
||||
# %%
|
||||
char_rank_weighted = calculate_weighted_ranking_scores(char_rank)
|
||||
S.plot_weighted_ranking_score(char_rank_weighted, title="Most Popular Character - Weighted Popularity Score<br>(1st=3pts, 2nd=2pts, 3rd=1pt)", x_label='Voice')
|
||||
|
||||
# %%
|
||||
S.plot_top3_ranking_distribution(char_rank, x_label='Character Personality', title='Character Personality: Rankings Top 3')
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
### Statistical Significance Character Ranking
|
||||
""")
|
||||
|
||||
# %%
|
||||
# _pairwise_df, _meta = S.compute_ranking_significance(char_rank)
|
||||
|
||||
# # print(_pairwise_df.columns)
|
||||
|
||||
# mo.md(f"""
|
||||
|
||||
|
||||
# {mo.ui.altair_chart(S.plot_significance_heatmap(_pairwise_df, metadata=_meta))}
|
||||
|
||||
# {mo.ui.altair_chart(S.plot_significance_summary(_pairwise_df, metadata=_meta))}
|
||||
# """)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Character Ranking: times 1st place
|
||||
""")
|
||||
|
||||
# %%
|
||||
S.plot_most_ranked_1(char_rank, title="Most Popular Character<br>(Number of Times Ranked 1st)", x_label='Character Personality')
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Prominent predefined personality traits wordcloud
|
||||
""")
|
||||
|
||||
# %%
|
||||
top8_traits = S.get_top_8_traits(data)[0]
|
||||
S.plot_traits_wordcloud(
|
||||
data=top8_traits,
|
||||
column='Top_8_Traits',
|
||||
title="Most Prominent Personality Traits",
|
||||
)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Trait frequency per brand character
|
||||
""")
|
||||
|
||||
# %%
|
||||
char_df = S.get_character_refine(data)[0]
|
||||
|
||||
# %%
|
||||
from theme import ColorPalette
|
||||
|
||||
# Assuming you already have char_df (your data from get_character_refine or similar)
|
||||
characters = ['Bank Teller', 'Familiar Friend', 'The Coach', 'Personal Assistant']
|
||||
character_colors = {
|
||||
'Bank Teller': (ColorPalette.CHARACTER_BANK_TELLER, ColorPalette.CHARACTER_BANK_TELLER_HIGHLIGHT),
|
||||
'Familiar Friend': (ColorPalette.CHARACTER_FAMILIAR_FRIEND, ColorPalette.CHARACTER_FAMILIAR_FRIEND_HIGHLIGHT),
|
||||
'The Coach': (ColorPalette.CHARACTER_COACH, ColorPalette.CHARACTER_COACH_HIGHLIGHT),
|
||||
'Personal Assistant': (ColorPalette.CHARACTER_PERSONAL_ASSISTANT, ColorPalette.CHARACTER_PERSONAL_ASSISTANT_HIGHLIGHT),
|
||||
}
|
||||
|
||||
# Build consistent sort order (by total frequency across all characters)
|
||||
all_trait_counts = {}
|
||||
for char in characters:
|
||||
freq_df, _ = S.transform_character_trait_frequency(char_df, char)
|
||||
for row in freq_df.iter_rows(named=True):
|
||||
all_trait_counts[row['trait']] = all_trait_counts.get(row['trait'], 0) + row['count']
|
||||
|
||||
consistent_sort_order = sorted(all_trait_counts.keys(), key=lambda x: -all_trait_counts[x])
|
||||
|
||||
_content = """"""
|
||||
# Generate 4 plots (one per character)
|
||||
for char in characters:
|
||||
freq_df, _ = S.transform_character_trait_frequency(char_df, char)
|
||||
main_color, highlight_color = character_colors[char]
|
||||
chart = S.plot_single_character_trait_frequency(
|
||||
data=freq_df,
|
||||
character_name=char,
|
||||
bar_color=main_color,
|
||||
highlight_color=highlight_color,
|
||||
trait_sort_order=consistent_sort_order,
|
||||
)
|
||||
_content += f"""
|
||||
{mo.ui.altair_chart(chart)}
|
||||
|
||||
|
||||
"""
|
||||
|
||||
mo.md(_content)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Statistical significance best characters
|
||||
|
||||
zie chat
|
||||
> voorbeeld: als de nr 1 en 2 niet significant verschillen maar wel van de nr 3 bijvoorbeeld is dat ook top. Beetje meedenkend over hoe ik het kan presenteren weetje wat ik bedoel?:)
|
||||
>
|
||||
""")
|
||||
|
||||
# %%
|
||||
|
||||
|
||||
# %%
|
||||
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
---
|
||||
|
||||
# Spoken Voice Results
|
||||
""")
|
||||
|
||||
# %%
|
||||
COLOR_GENDER = True
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Top 8 Most Chosen out of 18
|
||||
""")
|
||||
|
||||
# %%
|
||||
v_18_8_3 = S.get_18_8_3(data)[0]
|
||||
|
||||
# %%
|
||||
S.plot_voice_selection_counts(v_18_8_3, title="Top 8 Voice Selection from 18 Voices", x_label='Voice', color_gender=COLOR_GENDER)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Top 3 most chosen out of 8
|
||||
""")
|
||||
|
||||
# %%
|
||||
S.plot_top3_selection_counts(v_18_8_3, title="Top 3 Voice Selection Counts from 8 Voices", x_label='Voice', color_gender=COLOR_GENDER)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Voice Ranking Weighted Score
|
||||
""")
|
||||
|
||||
# %%
|
||||
top3_voices = S.get_top_3_voices(data)[0]
|
||||
top3_voices_weighted = calculate_weighted_ranking_scores(top3_voices)
|
||||
|
||||
# %%
|
||||
S.plot_weighted_ranking_score(top3_voices_weighted, title="Most Popular Voice - Weighted Popularity Score<br>(1st = 3pts, 2nd = 2pts, 3rd = 1pt)", color_gender=COLOR_GENDER)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Which voice is ranked best in the ranking question for top 3?
|
||||
|
||||
(not best 3 out of 8 question)
|
||||
""")
|
||||
|
||||
# %%
|
||||
S.plot_ranking_distribution(top3_voices, x_label='Voice', title="Distribution of Top 3 Voice Rankings (1st, 2nd, 3rd)", color_gender=COLOR_GENDER)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
### Statistical significance for voice ranking
|
||||
""")
|
||||
|
||||
# %%
|
||||
# print(top3_voices.collect().head())
|
||||
|
||||
# %%
|
||||
|
||||
# _pairwise_df, _metadata = S.compute_ranking_significance(
|
||||
# top3_voices,alpha=0.05,correction="none")
|
||||
|
||||
# # View significant pairs
|
||||
# # print(pairwise_df.filter(pl.col('significant') == True))
|
||||
|
||||
# # Create heatmap visualization
|
||||
# _heatmap = S.plot_significance_heatmap(
|
||||
# _pairwise_df,
|
||||
# metadata=_metadata,
|
||||
# title="Weighted Voice Ranking Significance<br>(Pairwise Comparisons)"
|
||||
# )
|
||||
|
||||
# # Create summary bar chart
|
||||
# _summary = S.plot_significance_summary(
|
||||
# _pairwise_df,
|
||||
# metadata=_metadata
|
||||
# )
|
||||
|
||||
# mo.md(f"""
|
||||
# {mo.ui.altair_chart(_heatmap)}
|
||||
|
||||
# {mo.ui.altair_chart(_summary)}
|
||||
# """)
|
||||
|
||||
# %%
|
||||
## Voice Ranked 1st the most
|
||||
|
||||
# %%
|
||||
S.plot_most_ranked_1(top3_voices, title="Most Popular Voice<br>(Number of Times Ranked 1st)", x_label='Voice', color_gender=COLOR_GENDER)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Voice Scale 1-10
|
||||
""")
|
||||
|
||||
# %%
|
||||
# Get your voice scale data (from notebook)
|
||||
voice_1_10, _ = S.get_voice_scale_1_10(data)
|
||||
S.plot_average_scores_with_counts(voice_1_10, x_label='Voice', domain=[1,10], title="Voice General Impression (Scale 1-10)", color_gender=COLOR_GENDER)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
### Statistical Significance (Scale 1-10)
|
||||
""")
|
||||
|
||||
# %%
|
||||
# Compute pairwise significance tests
|
||||
# pairwise_df, metadata = S.compute_pairwise_significance(
|
||||
# voice_1_10,
|
||||
# test_type="mannwhitney", # or "ttest", "chi2", "auto"
|
||||
# alpha=0.05,
|
||||
# correction="bonferroni" # or "holm", "none"
|
||||
# )
|
||||
|
||||
# # View significant pairs
|
||||
# # print(pairwise_df.filter(pl.col('significant') == True))
|
||||
|
||||
# # Create heatmap visualization
|
||||
# _heatmap = S.plot_significance_heatmap(
|
||||
# pairwise_df,
|
||||
# metadata=metadata,
|
||||
# title="Voice Rating Significance<br>(Pairwise Comparisons)"
|
||||
# )
|
||||
|
||||
# # Create summary bar chart
|
||||
# _summary = S.plot_significance_summary(
|
||||
# pairwise_df,
|
||||
# metadata=metadata
|
||||
# )
|
||||
|
||||
# mo.md(f"""
|
||||
# {mo.ui.altair_chart(_heatmap)}
|
||||
|
||||
# {mo.ui.altair_chart(_summary)}
|
||||
# """)
|
||||
|
||||
# %%
|
||||
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Ranking points for Voice per Chosen Brand Character
|
||||
|
||||
**missing mapping**
|
||||
""")
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Correlation Speaking Styles
|
||||
""")
|
||||
|
||||
# %%
|
||||
ss_or, choice_map_or = S.get_ss_orange_red(data)
|
||||
ss_gb, choice_map_gb = S.get_ss_green_blue(data)
|
||||
|
||||
# Combine the data
|
||||
ss_all = ss_or.join(ss_gb, on='_recordId')
|
||||
_d = ss_all.collect()
|
||||
|
||||
choice_map = {**choice_map_or, **choice_map_gb}
|
||||
# print(_d.head())
|
||||
# print(choice_map)
|
||||
ss_long = utils.process_speaking_style_data(ss_all, choice_map)
|
||||
|
||||
df_style = utils.process_speaking_style_data(ss_all, choice_map)
|
||||
|
||||
vscales = S.get_voice_scale_1_10(data)[0]
|
||||
df_scale_long = utils.process_voice_scale_data(vscales)
|
||||
|
||||
joined_scale = df_style.join(df_scale_long, on=["_recordId", "Voice"], how="inner")
|
||||
|
||||
df_ranking = utils.process_voice_ranking_data(top3_voices)
|
||||
joined_ranking = df_style.join(df_ranking, on=['_recordId', 'Voice'], how='inner')
|
||||
|
||||
# %%
|
||||
joined_ranking.head()
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
### Colors vs Scale 1-10
|
||||
""")
|
||||
|
||||
# %%
|
||||
# Transform to get one row per color with average correlation
|
||||
color_corr_scale, _ = utils.transform_speaking_style_color_correlation(joined_scale, SPEAKING_STYLES)
|
||||
S.plot_speaking_style_color_correlation(
|
||||
data=color_corr_scale,
|
||||
title="Correlation: Speaking Style Colors and Voice Scale 1-10"
|
||||
)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
### Colors vs Ranking Points
|
||||
""")
|
||||
|
||||
# %%
|
||||
color_corr_ranking, _ = utils.transform_speaking_style_color_correlation(
|
||||
joined_ranking,
|
||||
SPEAKING_STYLES,
|
||||
target_column="Ranking_Points"
|
||||
)
|
||||
S.plot_speaking_style_color_correlation(
|
||||
data=color_corr_ranking,
|
||||
title="Correlation: Speaking Style Colors and Voice Ranking Points"
|
||||
)
|
||||
|
||||
# %%
# Gender-filtered correlation plots (Male vs Female voices)
from reference import VOICE_GENDER_MAPPING

# Partition voice ids by the gender recorded in the reference mapping.
MALE_VOICES = [v for v, g in VOICE_GENDER_MAPPING.items() if g == "Male"]
FEMALE_VOICES = [v for v, g in VOICE_GENDER_MAPPING.items() if g == "Female"]

# Filter joined data by voice gender
joined_scale_male = joined_scale.filter(pl.col("Voice").is_in(MALE_VOICES))
joined_scale_female = joined_scale.filter(pl.col("Voice").is_in(FEMALE_VOICES))
joined_ranking_male = joined_ranking.filter(pl.col("Voice").is_in(MALE_VOICES))
joined_ranking_female = joined_ranking.filter(pl.col("Voice").is_in(FEMALE_VOICES))

# Colors vs Scale 1-10 (grouped by voice gender)
S.plot_speaking_style_color_correlation_by_gender(
    data_male=joined_scale_male,
    data_female=joined_scale_female,
    speaking_styles=SPEAKING_STYLES,
    target_column="Voice_Scale_Score",
    title="Correlation: Speaking Style Colors and Voice Scale 1-10 (by Voice Gender)",
    filename="correlation_speaking_style_and_voice_scale_1-10_by_voice_gender_color",
)

# Colors vs Ranking Points (grouped by voice gender)
S.plot_speaking_style_color_correlation_by_gender(
    data_male=joined_ranking_male,
    data_female=joined_ranking_female,
    speaking_styles=SPEAKING_STYLES,
    target_column="Ranking_Points",
    title="Correlation: Speaking Style Colors and Voice Ranking Points (by Voice Gender)",
    filename="correlation_speaking_style_and_voice_ranking_points_by_voice_gender_color",
)
|
||||
|
||||
# %%
mo.md(r"""
### Individual Traits vs Scale 1-10
""")

# %%
# One correlation chart per speaking-style color, accumulated into a single
# markdown cell. Underscore-prefixed names stay cell-local in marimo.
_content = """"""

for _style, _traits in SPEAKING_STYLES.items():
    # print(f"Correlation plot for {style}...")
    _fig = S.plot_speaking_style_scale_correlation(
        data=joined_scale,
        style_color=_style,
        style_traits=_traits,
        title=f"Correlation: Speaking Style {_style} and Voice Scale 1-10",
    )
    _content += f"""
#### Speaking Style **{_style}**:

{mo.ui.altair_chart(_fig)}

"""
mo.md(_content)

# %%
mo.md(r"""
### Individual Traits vs Ranking Points
""")

# %%
# Same per-style loop, but correlating traits against ranking points.
_content = """"""

for _style, _traits in SPEAKING_STYLES.items():
    # print(f"Correlation plot for {style}...")
    _fig = S.plot_speaking_style_ranking_correlation(
        data=joined_ranking,
        style_color=_style,
        style_traits=_traits,
        title=f"Correlation: Speaking Style {_style} and Voice Ranking Points",
    )
    _content += f"""
#### Speaking Style **{_style}**:

{mo.ui.altair_chart(_fig)}

"""
mo.md(_content)

# %%
# Individual Traits vs Scale 1-10 (grouped by voice gender)
_content = """### Individual Traits vs Scale 1-10 (by Voice Gender)\n\n"""

for _style, _traits in SPEAKING_STYLES.items():
    _fig = S.plot_speaking_style_scale_correlation_by_gender(
        data_male=joined_scale_male,
        data_female=joined_scale_female,
        style_color=_style,
        style_traits=_traits,
        title=f"Correlation: Speaking Style {_style} and Voice Scale 1-10 (by Voice Gender)",
        filename=f"correlation_speaking_style_and_voice_scale_1-10_by_voice_gender_{_style.lower()}",
    )
    _content += f"""
#### Speaking Style **{_style}**:

{mo.ui.altair_chart(_fig)}

"""
mo.md(_content)

# %%
# Individual Traits vs Ranking Points (grouped by voice gender)
_content = """### Individual Traits vs Ranking Points (by Voice Gender)\n\n"""

for _style, _traits in SPEAKING_STYLES.items():
    _fig = S.plot_speaking_style_ranking_correlation_by_gender(
        data_male=joined_ranking_male,
        data_female=joined_ranking_female,
        style_color=_style,
        style_traits=_traits,
        title=f"Correlation: Speaking Style {_style} and Voice Ranking Points (by Voice Gender)",
        filename=f"correlation_speaking_style_and_voice_ranking_points_by_voice_gender_{_style.lower()}",
    )
    _content += f"""
#### Speaking Style **{_style}**:

{mo.ui.altair_chart(_fig)}

"""
mo.md(_content)
|
||||
|
||||
# %%
# ## Correlations when "Best Brand Character" is chosen
# For each of the 4 brand characters, filter the dataset to only those respondents
# who selected that character as their #1 choice.

# %%
# Prepare character-filtered data subsets
char_rank_for_filter = S.get_character_ranking(data)[0].collect()

# Display name -> ranking column in char_rank_for_filter.
# NOTE(review): the keys 'Personal Assistant' and 'Bank Teller' map to columns
# containing "The_" ('..._The_Personal_Assistant', '..._The_Bank_Teller') —
# verify this matches the survey export's column naming.
CHARACTER_FILTER_MAP = {
    'Familiar Friend': 'Character_Ranking_Familiar_Friend',
    'The Coach': 'Character_Ranking_The_Coach',
    'Personal Assistant': 'Character_Ranking_The_Personal_Assistant',
    'Bank Teller': 'Character_Ranking_The_Bank_Teller',
}
|
||||
|
||||
def get_filtered_data_for_character(char_name: str) -> tuple[pl.DataFrame, pl.DataFrame, int]:
    """Restrict joined_scale / joined_ranking to respondents whose #1 character is *char_name*.

    Returns (scale_subset, ranking_subset, respondent_count).
    """
    rank_col = CHARACTER_FILTER_MAP[char_name]
    ids = (
        char_rank_for_filter
        .filter(pl.col(rank_col) == 1)
        .select('_recordId')
    )
    scale_subset = joined_scale.join(ids, on='_recordId', how='inner')
    ranking_subset = joined_ranking.join(ids, on='_recordId', how='inner')
    return scale_subset, ranking_subset, ids.height
|
||||
|
||||
def _char_filename(char_name: str, suffix: str) -> str:
|
||||
"""Generate filename for character-filtered plots (without n-value).
|
||||
|
||||
Format: bc_ranked_1_{suffix}__{char_slug}
|
||||
This groups all plot types together in directory listings.
|
||||
"""
|
||||
char_slug = char_name.lower().replace(' ', '_')
|
||||
return f"bc_ranked_1_{suffix}__{char_slug}"
|
||||
|
||||
|
||||
|
||||
# %%
# ### Voice Weighted Ranking Score (by Best Character)
# Materialise top3_voices ONCE — it does not depend on the loop variable, so
# collecting it inside the loop repeated the same (potentially expensive)
# query for every character.
top3_df = top3_voices.collect() if isinstance(top3_voices, pl.LazyFrame) else top3_voices
for char_name in CHARACTER_FILTER_MAP:
    _, _, n = get_filtered_data_for_character(char_name)
    # Respondents who ranked this character #1 (same filter the helper uses).
    respondents = char_rank_for_filter.filter(
        pl.col(CHARACTER_FILTER_MAP[char_name]) == 1
    ).select('_recordId')
    filtered_top3 = top3_df.join(respondents, on='_recordId', how='inner')
    weighted = calculate_weighted_ranking_scores(filtered_top3)
    S.plot_weighted_ranking_score(
        data=weighted,
        title=f'"{char_name}" Ranked #1 (n={n})<br>Most Popular Voice - Weighted Score (1st=3pts, 2nd=2pts, 3rd=1pt)',
        filename=_char_filename(char_name, "voice_weighted_ranking_score"),
        color_gender=COLOR_GENDER,
    )
|
||||
|
||||
# %%
# ### Voice Scale 1-10 Average Scores (by Best Character)
# Materialise voice_1_10 ONCE — loop-invariant, previously re-collected for
# every character iteration.
voice_1_10_df = voice_1_10.collect() if isinstance(voice_1_10, pl.LazyFrame) else voice_1_10
for char_name in CHARACTER_FILTER_MAP:
    _, _, n = get_filtered_data_for_character(char_name)
    # Respondents who ranked this character #1 (same filter the helper uses).
    respondents = char_rank_for_filter.filter(
        pl.col(CHARACTER_FILTER_MAP[char_name]) == 1
    ).select('_recordId')
    filtered_voice_1_10 = voice_1_10_df.join(respondents, on='_recordId', how='inner')
    S.plot_average_scores_with_counts(
        data=filtered_voice_1_10,
        title=f'"{char_name}" Ranked #1 (n={n})<br>Voice General Impression (Scale 1-10)',
        filename=_char_filename(char_name, "voice_scale_1-10"),
        x_label='Voice',
        domain=[1, 10],
        color_gender=COLOR_GENDER,
    )
|
||||
|
||||
|
||||
|
||||
# %%
# ### Speaking Style Colors vs Scale 1-10 (only for Best Character)
# BEST_CHOSEN_CHARACTER (defined earlier in the file) is a slug such as
# 'the_coach'; each loop below skips every other character.
for char_name in CHARACTER_FILTER_MAP:
    if char_name.lower().replace(' ', '_') != BEST_CHOSEN_CHARACTER:
        continue

    filtered_scale, _, n = get_filtered_data_for_character(char_name)
    color_corr, _ = utils.transform_speaking_style_color_correlation(filtered_scale, SPEAKING_STYLES)
    S.plot_speaking_style_color_correlation(
        data=color_corr,
        title=f'"{char_name}" Ranked #1 (n={n})<br>Correlation: Speaking Style Colors vs Voice Scale 1-10',
        filename=_char_filename(char_name, "colors_vs_voice_scale_1-10"),
    )

# %%
# ### Speaking Style Colors vs Ranking Points (only for Best Character)
for char_name in CHARACTER_FILTER_MAP:
    if char_name.lower().replace(' ', '_') != BEST_CHOSEN_CHARACTER:
        continue

    _, filtered_ranking, n = get_filtered_data_for_character(char_name)
    color_corr, _ = utils.transform_speaking_style_color_correlation(
        filtered_ranking, SPEAKING_STYLES, target_column="Ranking_Points"
    )
    S.plot_speaking_style_color_correlation(
        data=color_corr,
        title=f'"{char_name}" Ranked #1 (n={n})<br>Correlation: Speaking Style Colors vs Voice Ranking Points',
        filename=_char_filename(char_name, "colors_vs_voice_ranking_points"),
    )

# %%
# ### Individual Traits vs Scale 1-10 (only for Best Character)
for _style, _traits in SPEAKING_STYLES.items():
    print(f"--- Speaking Style: {_style} ---")
    for char_name in CHARACTER_FILTER_MAP:
        if char_name.lower().replace(' ', '_') != BEST_CHOSEN_CHARACTER:
            continue

        filtered_scale, _, n = get_filtered_data_for_character(char_name)
        S.plot_speaking_style_scale_correlation(
            data=filtered_scale,
            style_color=_style,
            style_traits=_traits,
            title=f'"{char_name}" Ranked #1 (n={n})<br>Correlation: {_style} vs Voice Scale 1-10',
            filename=_char_filename(char_name, f"{_style.lower()}_vs_voice_scale_1-10"),
        )

# %%
# ### Individual Traits vs Ranking Points (only for Best Character)
for _style, _traits in SPEAKING_STYLES.items():
    print(f"--- Speaking Style: {_style} ---")
    for char_name in CHARACTER_FILTER_MAP:
        if char_name.lower().replace(' ', '_') != BEST_CHOSEN_CHARACTER:
            continue

        _, filtered_ranking, n = get_filtered_data_for_character(char_name)
        S.plot_speaking_style_ranking_correlation(
            data=filtered_ranking,
            style_color=_style,
            style_traits=_traits,
            title=f'"{char_name}" Ranked #1 (n={n})<br>Correlation: {_style} vs Voice Ranking Points',
            filename=_char_filename(char_name, f"{_style.lower()}_vs_voice_ranking_points"),
        )
|
||||
|
||||
|
||||
# %%
|
||||
370
XX_statistical_significance.script.py
Normal file
370
XX_statistical_significance.script.py
Normal file
@@ -0,0 +1,370 @@
|
||||
"""Extra statistical significance analyses for quant report."""
|
||||
# %% Imports
|
||||
|
||||
import utils
|
||||
import polars as pl
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
|
||||
# %% Fixed Variables
# Paths to the Qualtrics CSV export and the survey definition (.qsf).
RESULTS_FILE = 'data/exports/2-4-26/JPMC_Chase Brand Personality_Quant Round 1_February 4, 2026_Labels.csv'
QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'


# %% CLI argument parsing for batch automation
# When run as script: uv run XX_statistical_significance.script.py --age '["18 to 21 years"]'
# Central filter configuration - add new filters here only
# Format: 'cli_arg_name': 'QualtricsSurvey.options_* attribute name'
# NOTE(review): this config is duplicated across the XX_*.py scripts and the
# quant report — consider extracting to a shared module.
FILTER_CONFIG = {
    'age': 'options_age',
    'gender': 'options_gender',
    'ethnicity': 'options_ethnicity',
    'income': 'options_income',
    'consumer': 'options_consumer',
    'business_owner': 'options_business_owner',
    'ai_user': 'options_ai_user',
    'investable_assets': 'options_investable_assets',
    'industry': 'options_industry',
}
|
||||
|
||||
def parse_cli_args():
    """Parse CLI filter arguments, or return defaults when interactive.

    Returns an argparse.Namespace with one attribute per FILTER_CONFIG key
    (each a list of values, or None meaning "no filter"), plus `filter_name`
    and `figures_dir`.
    """
    # Single source of truth for the default figures directory; previously
    # this string was duplicated between the argparse default and the
    # interactive fallback below.
    default_fig_dir = f'figures/statistical_significance/{Path(RESULTS_FILE).parts[2]}'

    parser = argparse.ArgumentParser(description='Generate quant report with optional filters')

    # Dynamically add filter arguments from config
    for filter_name in FILTER_CONFIG:
        parser.add_argument(f'--{filter_name}', type=str, default=None, help=f'JSON list of {filter_name} values')

    parser.add_argument('--filter-name', type=str, default=None, help='Name for this filter combination (used for .txt description file)')
    parser.add_argument('--figures-dir', type=str, default=default_fig_dir, help='Override the default figures directory')

    # Only parse if running as script (not in Jupyter/interactive)
    try:
        # Check if running in Jupyter by looking for ipykernel
        get_ipython()  # noqa: F821 # type: ignore
        # Interactive session: no CLI args; all filters off.
        no_filters = {f: None for f in FILTER_CONFIG}
        return argparse.Namespace(**no_filters, filter_name=None, figures_dir=default_fig_dir)
    except NameError:
        args = parser.parse_args()
        # Parse JSON strings to lists
        for filter_name in FILTER_CONFIG:
            val = getattr(args, filter_name)
            setattr(args, filter_name, json.loads(val) if val else None)
        return args
|
||||
|
||||
cli_args = parse_cli_args()


# %%
# Survey wrapper: owns data loading, filtering and the figures output dir.
S = utils.QualtricsSurvey(RESULTS_FILE, QSF_FILE, figures_dir=cli_args.figures_dir)
data_all = S.load_data()


# %% Build filtered dataset based on CLI args

# CLI args: None means "no filter applied" - filter_data() will skip None filters

# Build filter values dict dynamically from FILTER_CONFIG
_active_filters = {filter_name: getattr(cli_args, filter_name) for filter_name in FILTER_CONFIG}

_d = S.filter_data(data_all, **_active_filters)

# Write filter description file if filter-name is provided
if cli_args.filter_name and S.fig_save_dir:
    # Get the filter slug (e.g., "All_Respondents", "Cons-Starter", etc.)
    _filter_slug = S._get_filter_slug()
    _filter_slug_dir = S.fig_save_dir / _filter_slug
    _filter_slug_dir.mkdir(parents=True, exist_ok=True)

    # Build filter description
    _filter_desc_lines = [
        f"Filter: {cli_args.filter_name}",
        "",
        "Applied Filters:",
    ]
    _short_desc_parts = []
    for filter_name, options_attr in FILTER_CONFIG.items():
        all_options = getattr(S, options_attr)
        values = _active_filters[filter_name]
        display_name = filter_name.replace('_', ' ').title()
        # None means no filter applied (same as "All")
        if values is not None and values != all_options:
            _short_desc_parts.append(f"{display_name}: {', '.join(values)}")
            _filter_desc_lines.append(f" {display_name}: {', '.join(values)}")
        else:
            _filter_desc_lines.append(f" {display_name}: All")

    # Write detailed description INSIDE the filter-slug directory
    # Sanitize filter name for filename usage (replace / and other chars)
    _safe_filter_name = re.sub(r'[^\w\s-]', '_', cli_args.filter_name)
    _filter_file = _filter_slug_dir / f"{_safe_filter_name}.txt"
    _filter_file.write_text('\n'.join(_filter_desc_lines))

    # Append to summary index file at figures/<export_date>/filter_index.txt
    _summary_file = S.fig_save_dir / "filter_index.txt"
    _short_desc = "; ".join(_short_desc_parts) if _short_desc_parts else "All Respondents"
    _summary_line = f"{_filter_slug} | {cli_args.filter_name} | {_short_desc}\n"

    # Append or create the summary file
    if _summary_file.exists():
        _existing = _summary_file.read_text()
        # Avoid duplicate entries for same slug
        if _filter_slug not in _existing:
            with _summary_file.open('a') as f:
                f.write(_summary_line)
    else:
        _header = "Filter Index\n" + "=" * 80 + "\n\n"
        _header += "Directory | Filter Name | Description\n"
        _header += "-" * 80 + "\n"
        _summary_file.write_text(_header + _summary_line)

# Save to logical variable name for further analysis
data = _d
# Eagerly evaluate once so filter errors surface here rather than downstream.
data.collect()
|
||||
|
||||
# %% Character coach significantly higher than others

# Per-respondent character rankings (columns = characters, values = rank).
char_rank = S.get_character_ranking(data)[0]


# Pairwise z-tests on each character's rank-1 share (see methodology below).
_pairwise_df, _meta = S.compute_ranking_significance(
    char_rank,
    alpha=0.05,
    correction="none",
)

# %% [markdown]
"""
### Methodology Analysis

**Input Data (`char_rank`)**:
* Generated by `S.get_character_ranking(data)`.
* Contains the ranking values (1st, 2nd, 3rd, 4th) assigned by each respondent to the four options ("The Coach", etc.).
* Columns represent the characters; rows represent individual respondents; values are the numerical rank (1 = Top Choice).

**Processing**:
* The function `compute_ranking_significance` aggregates these rankings to find the **"Rank 1 Share"** (the percentage of respondents who picked that character as their #1 favorite).
* It builds a contingency table of how many times each character was ranked 1st vs. not 1st (or 1st v 2nd v 3rd).

**Statistical Test**:
* **Test Used**: Pairwise Z-test for two proportions (uncorrected).
* **Comparison**: It compares the **Rank 1 Share** of every pair of characters.
    * *Example*: "Is the 42% of people who chose 'Coach' significantly different from the 29% who chose 'Familiar Friend'?"
* **Significance**: A result of `p < 0.05` means the difference in popularity (top-choice preference) is statistically significant and not due to random chance.
"""


# %% Plot heatmap of pairwise significance
S.plot_significance_heatmap(_pairwise_df, metadata=_meta, title="Statistical Significance: Character Top Choice Preference")

# %% Plot summary of significant differences (e.g., which characters are significantly higher than others)
# S.plot_significance_summary(_pairwise_df, metadata=_meta)

# %% [markdown]
"""
# Analysis: Significance of "The Coach"

**Parameters**: `alpha=0.05`, `correction='none'`
* **Rationale**: No correction was applied to allow for detection of all potential pairwise differences (uncorrected p < 0.05). If strict control for family-wise error rate were required (e.g., Bonferroni), the significance threshold would be lower (p < 0.0083).

**Results**:
"The Coach" is the top-ranked option (42.0% Rank 1 share) and shows strong separation from the field.

* **Vs. Bottom Two**: "The Coach" is significantly higher than both "The Bank Teller" (26.9%, p < 0.001) and "Familiar Friend" (29.4%, p < 0.001).
* **Vs. Runner-Up**: "The Coach" is widely preferred over "The Personal Assistant" (33.4%). The difference of **8.6 percentage points** is statistically significant (p = 0.017) at the standard 0.05 level.
    * *Note*: While p=0.017 is significant in isolation, it would not meet the stricter Bonferroni threshold (0.0083). However, the effect size (+8.6%) is commercially meaningful.

**Conclusion**:
Yes, "The Coach" can be considered statistically more significant than the other options. It is clearly superior to the bottom two options and holds a statistically significant lead over the runner-up ("Personal Assistant") in direct comparison.
"""


# %% Mentions significance analysis

# Total-mentions test: appearances anywhere in the top 3 (visibility),
# not just rank-1 wins.
char_pairwise_df_mentions, _meta_mentions = S.compute_mentions_significance(
    char_rank,
    alpha=0.05,
    correction="none",
)

S.plot_significance_heatmap(
    char_pairwise_df_mentions,
    metadata=_meta_mentions,
    title="Statistical Significance: Character Total Mentions (Top 3 Visibility)"
)
|
||||
|
||||
|
||||
# %% voices analysis

# Top-3 voice rankings per respondent.
top3_voices = S.get_top_3_voices(data)[0]

# Same pairwise rank-1-share test as the character analysis above.
_pairwise_df_voice, _metadata = S.compute_ranking_significance(
    top3_voices,
    alpha=0.05,
    correction="none",
)

S.plot_significance_heatmap(
    _pairwise_df_voice,
    metadata=_metadata,
    title="Statistical Significance: Voice Top Choice Preference"
)
|
||||
# %% Total Mentions Significance (Rank 1+2+3 Combined)
# This tests "Quantity" (Visibility) instead of "Quality" (Preference)

# Counts appearances anywhere in a respondent's top 3, then pairwise-tests
# the resulting shares.
_pairwise_df_mentions, _meta_mentions = S.compute_mentions_significance(
    top3_voices,
    alpha=0.05,
    correction="none"
)

S.plot_significance_heatmap(
    _pairwise_df_mentions,
    metadata=_meta_mentions,
    title="Statistical Significance: Voice Total Mentions (Top 3 Visibility)"
)
|
||||
# %% Male Voices Only Analysis
|
||||
import reference
|
||||
|
||||
def filter_voices_by_gender(df: pl.DataFrame, target_gender: str) -> pl.DataFrame:
    """Keep only the ranking columns whose voice matches *target_gender*.

    Voice columns look like ``Top_3_Voices_ranking__V14``; the identifier
    column ``_recordId`` is retained when present. Any other non-voice
    column is dropped.
    """
    gender_of = reference.VOICE_GENDER_MAPPING.get
    keep = ['_recordId'] if '_recordId' in df.columns else []
    keep.extend(
        col for col in df.columns
        if '__V' in col and gender_of(col.split('__')[1]) == target_gender
    )
    return df.select(keep)
|
||||
|
||||
# Get full ranking data as DataFrame
df_voices = top3_voices.collect()

# Filter for Male voices
df_male_voices = filter_voices_by_gender(df_voices, 'Male')

# 1. Male Voices: Top Choice Preference (Rank 1)
_pairwise_male_pref, _meta_male_pref = S.compute_ranking_significance(
    df_male_voices,
    alpha=0.05,
    correction="none"
)

S.plot_significance_heatmap(
    _pairwise_male_pref,
    metadata=_meta_male_pref,
    title="Male Voices Only: Top Choice Preference Significance"
)

# 2. Male Voices: Total Mentions (Visibility)
_pairwise_male_vis, _meta_male_vis = S.compute_mentions_significance(
    df_male_voices,
    alpha=0.05,
    correction="none"
)

S.plot_significance_heatmap(
    _pairwise_male_vis,
    metadata=_meta_male_vis,
    title="Male Voices Only: Total Mentions Significance"
)
# %% Male Voices (Excluding Bottom 3: V88, V86, V81)

# Start with the male voices dataframe from the previous step
# NOTE(review): the "bottom 3" voices are hard-coded from an earlier read of
# the results — re-check if the data export changes.
voices_to_exclude = ['V88', 'V86', 'V81']
|
||||
def filter_exclude_voices(df: pl.DataFrame, exclude_list: list[str]) -> pl.DataFrame:
    """Drop the ranking columns whose voice id appears in *exclude_list*.

    Mirrors filter_voices_by_gender: ``_recordId`` is kept when present,
    every remaining ``...__Vxx`` column is kept unless its voice id is
    excluded, and any other non-voice column is dropped.
    """
    excluded = set(exclude_list)
    keep = ['_recordId'] if '_recordId' in df.columns else []
    keep.extend(
        col for col in df.columns
        if '__V' in col and col.split('__')[1] not in excluded
    )
    return df.select(keep)
|
||||
|
||||
# Male voices with the hard-coded bottom 3 removed.
df_male_top = filter_exclude_voices(df_male_voices, voices_to_exclude)

# 1. Male Top Candidates: Top Choice Preference
_pairwise_male_top_pref, _meta_male_top_pref = S.compute_ranking_significance(
    df_male_top,
    alpha=0.05,
    correction="none"
)

S.plot_significance_heatmap(
    _pairwise_male_top_pref,
    metadata=_meta_male_top_pref,
    title="Male Voices (Excl. Bottom 3): Top Choice Preference Significance"
)

# 2. Male Top Candidates: Total Mentions
_pairwise_male_top_vis, _meta_male_top_vis = S.compute_mentions_significance(
    df_male_top,
    alpha=0.05,
    correction="none"
)

S.plot_significance_heatmap(
    _pairwise_male_top_vis,
    metadata=_meta_male_top_vis,
    title="Male Voices (Excl. Bottom 3): Total Mentions Significance"
)

# %% [markdown]
"""
# Rank 1 Selection Significance (Voice Level)

Similar to the Total Mentions significance analysis above, but counting
only how many times each voice was ranked **1st** (out of all respondents).
This isolates first-choice preference rather than overall top-3 visibility.
"""

# %% Rank 1 Significance: All Voices

_pairwise_df_rank1, _meta_rank1 = S.compute_rank1_significance(
    top3_voices,
    alpha=0.05,
    correction="none",
)

S.plot_significance_heatmap(
    _pairwise_df_rank1,
    metadata=_meta_rank1,
    title="Statistical Significance: Voice Rank 1 Selection"
)

# %% Rank 1 Significance: Male Voices Only

_pairwise_df_rank1_male, _meta_rank1_male = S.compute_rank1_significance(
    df_male_voices,
    alpha=0.05,
    correction="none",
)

S.plot_significance_heatmap(
    _pairwise_df_rank1_male,
    metadata=_meta_rank1_male,
    title="Male Voices Only: Rank 1 Selection Significance"
)
||||
# %%
|
||||
267
XX_straight_liners.py
Normal file
267
XX_straight_liners.py
Normal file
@@ -0,0 +1,267 @@
|
||||
"""Extra analyses of the straight-liners"""
|
||||
# %% Imports
|
||||
|
||||
import utils
|
||||
import polars as pl
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
from pathlib import Path
|
||||
from validation import check_straight_liners
|
||||
|
||||
|
||||
# %% Fixed Variables
# Paths to the Qualtrics CSV export and the survey definition (.qsf).
RESULTS_FILE = 'data/exports/2-4-26/JPMC_Chase Brand Personality_Quant Round 1_February 4, 2026_Labels.csv'
QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'


# %% CLI argument parsing for batch automation
# When run as script: uv run XX_straight_liners.py --age '["18 to 21 years"]'
# Central filter configuration - add new filters here only
# Format: 'cli_arg_name': 'QualtricsSurvey.options_* attribute name'
# NOTE(review): duplicated verbatim from XX_statistical_significance.script.py —
# consider extracting to a shared module.
FILTER_CONFIG = {
    'age': 'options_age',
    'gender': 'options_gender',
    'ethnicity': 'options_ethnicity',
    'income': 'options_income',
    'consumer': 'options_consumer',
    'business_owner': 'options_business_owner',
    'ai_user': 'options_ai_user',
    'investable_assets': 'options_investable_assets',
    'industry': 'options_industry',
}
|
||||
|
||||
def parse_cli_args():
    """Parse CLI filter arguments, or return defaults when interactive.

    Returns an argparse.Namespace with one attribute per FILTER_CONFIG key
    (each a list of values, or None meaning "no filter"), plus `filter_name`
    and `figures_dir`.
    """
    # Single source of truth for the default figures directory; previously
    # this string was duplicated between the argparse default and the
    # interactive fallback below.
    default_fig_dir = f'figures/straight-liner-analysis/{Path(RESULTS_FILE).parts[2]}'

    parser = argparse.ArgumentParser(description='Generate quant report with optional filters')

    # Dynamically add filter arguments from config
    for filter_name in FILTER_CONFIG:
        parser.add_argument(f'--{filter_name}', type=str, default=None, help=f'JSON list of {filter_name} values')

    parser.add_argument('--filter-name', type=str, default=None, help='Name for this filter combination (used for .txt description file)')
    parser.add_argument('--figures-dir', type=str, default=default_fig_dir, help='Override the default figures directory')

    # Only parse if running as script (not in Jupyter/interactive)
    try:
        # Check if running in Jupyter by looking for ipykernel
        get_ipython()  # noqa: F821 # type: ignore
        # Interactive session: no CLI args; all filters off.
        no_filters = {f: None for f in FILTER_CONFIG}
        return argparse.Namespace(**no_filters, filter_name=None, figures_dir=default_fig_dir)
    except NameError:
        args = parser.parse_args()
        # Parse JSON strings to lists
        for filter_name in FILTER_CONFIG:
            val = getattr(args, filter_name)
            setattr(args, filter_name, json.loads(val) if val else None)
        return args
|
||||
|
||||
cli_args = parse_cli_args()


# %%
# Survey wrapper: owns data loading, filtering and the figures output dir.
S = utils.QualtricsSurvey(RESULTS_FILE, QSF_FILE, figures_dir=cli_args.figures_dir)
data_all = S.load_data()


# %% Build filtered dataset based on CLI args

# CLI args: None means "no filter applied" - filter_data() will skip None filters

# Build filter values dict dynamically from FILTER_CONFIG
_active_filters = {filter_name: getattr(cli_args, filter_name) for filter_name in FILTER_CONFIG}

_d = S.filter_data(data_all, **_active_filters)

# Write filter description file if filter-name is provided
if cli_args.filter_name and S.fig_save_dir:
    # Get the filter slug (e.g., "All_Respondents", "Cons-Starter", etc.)
    _filter_slug = S._get_filter_slug()
    _filter_slug_dir = S.fig_save_dir / _filter_slug
    _filter_slug_dir.mkdir(parents=True, exist_ok=True)

    # Build filter description
    _filter_desc_lines = [
        f"Filter: {cli_args.filter_name}",
        "",
        "Applied Filters:",
    ]
    _short_desc_parts = []
    for filter_name, options_attr in FILTER_CONFIG.items():
        all_options = getattr(S, options_attr)
        values = _active_filters[filter_name]
        display_name = filter_name.replace('_', ' ').title()
        # None means no filter applied (same as "All")
        if values is not None and values != all_options:
            _short_desc_parts.append(f"{display_name}: {', '.join(values)}")
            _filter_desc_lines.append(f" {display_name}: {', '.join(values)}")
        else:
            _filter_desc_lines.append(f" {display_name}: All")

    # Write detailed description INSIDE the filter-slug directory
    # Sanitize filter name for filename usage (replace / and other chars)
    _safe_filter_name = re.sub(r'[^\w\s-]', '_', cli_args.filter_name)
    _filter_file = _filter_slug_dir / f"{_safe_filter_name}.txt"
    _filter_file.write_text('\n'.join(_filter_desc_lines))

    # Append to summary index file at figures/<export_date>/filter_index.txt
    _summary_file = S.fig_save_dir / "filter_index.txt"
    _short_desc = "; ".join(_short_desc_parts) if _short_desc_parts else "All Respondents"
    _summary_line = f"{_filter_slug} | {cli_args.filter_name} | {_short_desc}\n"

    # Append or create the summary file
    if _summary_file.exists():
        _existing = _summary_file.read_text()
        # Avoid duplicate entries for same slug
        if _filter_slug not in _existing:
            with _summary_file.open('a') as f:
                f.write(_summary_line)
    else:
        _header = "Filter Index\n" + "=" * 80 + "\n\n"
        _header += "Directory | Filter Name | Description\n"
        _header += "-" * 80 + "\n"
        _summary_file.write_text(_header + _summary_line)

# Save to logical variable name for further analysis
data = _d
# Eagerly evaluate once so filter errors surface here rather than downstream.
data.collect()
|
||||
|
||||
|
||||
# %% Determine straight-liner repeat offenders
# Extract question groups with renamed columns that check_straight_liners expects.
# The raw `data` has QID-based column names; the getter methods rename them to
# patterns like SS_Green_Blue__V14__Choice_1, Voice_Scale_1_10__V48, etc.

ss_or, _ = S.get_ss_orange_red(data)
ss_gb, _ = S.get_ss_green_blue(data)
vs, _ = S.get_voice_scale_1_10(data)

# Combine all question groups into one wide LazyFrame (joined on _recordId)
all_questions = ss_or.join(ss_gb, on='_recordId').join(vs, on='_recordId')

# Run straight-liner detection across all question groups
# max_score=5 catches all speaking-style straight-lining (1-5 scale)
# and voice-scale values ≤5 on the 1-10 scale
# Note: sl_threshold is NOT set on S here — this script analyses straight-liners,
# it doesn't filter them out of the dataset.
print("Running straight-liner detection across all question groups...")
sl_report, sl_df = check_straight_liners(all_questions, max_score=5)
|
||||
|
||||
# %% Quantify repeat offenders
# sl_df has one row per (Record ID, Question Group) that was straight-lined.
# Group by Record ID to count how many question groups each person SL'd.
# NOTE(review): the `# %%` markers further down sit INSIDE this `if` body —
# the file only runs correctly as a plain script, not cell-by-cell; confirm
# this is intentional.

if sl_df is not None and not sl_df.is_empty():
    total_respondents = data.select(pl.len()).collect().item()

    # Per-respondent count of straight-lined question groups
    respondent_sl_counts = (
        sl_df
        .group_by("Record ID")
        .agg(pl.len().alias("sl_count"))
        .sort("sl_count", descending=True)
    )

    max_sl = respondent_sl_counts["sl_count"].max()
    print(f"\nTotal respondents: {total_respondents}")
    print(f"Respondents who straight-lined at least 1 question group: "
          f"{respondent_sl_counts.height}")
    print(f"Maximum question groups straight-lined by one person: {max_sl}")
    print()

    # Build cumulative distribution: for each threshold N, count respondents
    # who straight-lined >= N question groups
    cumulative_rows = []
    for threshold in range(1, max_sl + 1):
        count = respondent_sl_counts.filter(
            pl.col("sl_count") >= threshold
        ).height
        pct = (count / total_respondents) * 100
        cumulative_rows.append({
            "threshold": threshold,
            "count": count,
            "pct": pct,
        })
        print(
            f" ≥{threshold} question groups straight-lined: "
            f"{count} respondents ({pct:.1f}%)"
        )

    cumulative_df = pl.DataFrame(cumulative_rows)
    print(f"\n{cumulative_df}")

# %% Save cumulative data to CSV
    _filter_slug = S._get_filter_slug()
    _csv_dir = Path(S.fig_save_dir) / _filter_slug
    _csv_dir.mkdir(parents=True, exist_ok=True)

    _csv_path = _csv_dir / "straight_liner_repeat_offenders.csv"
    cumulative_df.write_csv(_csv_path)
    print(f"Saved cumulative data to {_csv_path}")

# %% Plot the cumulative distribution
    S.plot_straight_liner_repeat_offenders(
        cumulative_df,
        total_respondents=total_respondents,
    )

# %% Per-question straight-lining frequency
    # Build human-readable question group names from the raw keys
    def _humanise_question_group(key: str) -> str:
        """Convert internal question group key to a readable label.

        Examples:
            SS_Green_Blue__V14 → Green/Blue – V14
            SS_Orange_Red__V48 → Orange/Red – V48
            Voice_Scale_1_10 → Voice Scale (1-10)
        """
        if key.startswith("SS_Green_Blue__"):
            voice = key.split("__")[1]
            return f"Green/Blue – {voice}"
        if key.startswith("SS_Orange_Red__"):
            voice = key.split("__")[1]
            return f"Orange/Red – {voice}"
        if key == "Voice_Scale_1_10":
            return "Voice Scale (1-10)"
        # Fallback: replace underscores
        return key.replace("_", " ")

    # Unique straight-liners per question group, plus share of all respondents.
    per_question_counts = (
        sl_df
        .group_by("Question Group")
        .agg(pl.col("Record ID").n_unique().alias("count"))
        .sort("count", descending=True)
        .with_columns(
            (pl.col("count") / total_respondents * 100).alias("pct")
        )
    )

    # Add human-readable names
    per_question_counts = per_question_counts.with_columns(
        pl.col("Question Group").map_elements(
            _humanise_question_group, return_dtype=pl.Utf8
        ).alias("question")
    )

    print("\n--- Per-Question Straight-Lining Frequency ---")
    print(per_question_counts)

    # Save per-question data to CSV
    _csv_path_pq = _csv_dir / "straight_liner_per_question.csv"
    per_question_counts.write_csv(_csv_path_pq)
    print(f"Saved per-question data to {_csv_path_pq}")

    # Plot
    S.plot_straight_liner_per_question(
        per_question_counts,
        total_respondents=total_respondents,
    )

# %% Show the top repeat offenders (respondents with most SL'd groups)
    print("\n--- Top Repeat Offenders ---")
    print(respondent_sl_counts.head(20))

else:
    print("No straight-liners detected in the dataset.")
|
||||
1359
analysis_missing_voice_ranking.ipynb
Normal file
1359
analysis_missing_voice_ranking.ipynb
Normal file
File diff suppressed because one or more lines are too long
BIN
docs/README.pdf
Normal file
BIN
docs/README.pdf
Normal file
Binary file not shown.
104
docs/figures_structure_manual.md
Normal file
104
docs/figures_structure_manual.md
Normal file
@@ -0,0 +1,104 @@
|
||||
# Appendix: Quantitative Analysis Plots - Folder Structure Manual
|
||||
|
||||
This folder contains all the quantitative analysis plots, sorted by the filters applied to the dataset. Each folder corresponds to a specific demographic cut.
|
||||
|
||||
## Folder Overview
|
||||
|
||||
* `All_Respondents/`: Analysis of the full dataset (no filters).
|
||||
* `filter_index.txt`: A master list of every folder code and its corresponding demographic filter.
|
||||
* **Filter Folders**: All other folders represent specific demographic cuts (e.g., `Age-18to21years`, `Gen-Woman`).
|
||||
|
||||
## How to Navigate
|
||||
|
||||
Each folder contains the same set of charts generated for that specific filter.
|
||||
|
||||
## Directory Reference Table
|
||||
|
||||
Below is the complete list of folder names. These names are encodings of the filters applied to the dataset, which we use to maintain consistency across our analysis.
|
||||
|
||||
| Directory Code | Filter Description |
|
||||
| :--- | :--- |
|
||||
| All_Respondents | All Respondents |
|
||||
| Age-18to21years | Age: 18 to 21 years |
|
||||
| Age-22to24years | Age: 22 to 24 years |
|
||||
| Age-25to34years | Age: 25 to 34 years |
|
||||
| Age-35to40years | Age: 35 to 40 years |
|
||||
| Age-41to50years | Age: 41 to 50 years |
|
||||
| Age-51to59years | Age: 51 to 59 years |
|
||||
| Age-60to70years | Age: 60 to 70 years |
|
||||
| Age-70yearsormore | Age: 70 years or more |
|
||||
| Gen-Man | Gender: Man |
|
||||
| Gen-Prefernottosay | Gender: Prefer not to say |
|
||||
| Gen-Woman | Gender: Woman |
|
||||
| Eth-6_grps_c64411 | Ethnicity: All options containing 'Alaska Native or Indigenous American' |
|
||||
| Eth-6_grps_8f145b | Ethnicity: All options containing 'Asian or Asian American' |
|
||||
| Eth-8_grps_71ac47 | Ethnicity: All options containing 'Black or African American' |
|
||||
| Eth-7_grps_c5b3ce | Ethnicity: All options containing 'Hispanic or Latinx' |
|
||||
| Eth-BlackorAfricanAmerican<br>MiddleEasternorNorthAfrican<br>WhiteorCaucasian+<br>MiddleEasternorNorthAfrican | Ethnicity: Middle Eastern or North African |
|
||||
| Eth-AsianorAsianAmericanBlackorAfricanAmerican<br>NativeHawaiianorOtherPacificIslander+<br>NativeHawaiianorOtherPacificIslander | Ethnicity: Native Hawaiian or Other Pacific Islander |
|
||||
| Eth-10_grps_cef760 | Ethnicity: All options containing 'White or Caucasian' |
|
||||
| Inc-100000to149999 | Income: $100,000 to $149,999 |
|
||||
| Inc-150000to199999 | Income: $150,000 to $199,999 |
|
||||
| Inc-200000ormore | Income: $200,000 or more |
|
||||
| Inc-25000to34999 | Income: $25,000 to $34,999 |
|
||||
| Inc-35000to54999 | Income: $35,000 to $54,999 |
|
||||
| Inc-55000to79999 | Income: $55,000 to $79,999 |
|
||||
| Inc-80000to99999 | Income: $80,000 to $99,999 |
|
||||
| Inc-Lessthan25000 | Income: Less than $25,000 |
|
||||
| Cons-Lower_Mass_A+Lower_Mass_B | Consumer: Lower_Mass_A, Lower_Mass_B |
|
||||
| Cons-MassAffluent_A+MassAffluent_B | Consumer: MassAffluent_A, MassAffluent_B |
|
||||
| Cons-Mass_A+Mass_B | Consumer: Mass_A, Mass_B |
|
||||
| Cons-Mix_of_Affluent_Wealth__<br>High_Net_Woth_A+<br>Mix_of_Affluent_Wealth__<br>High_Net_Woth_B | Consumer: Mix_of_Affluent_Wealth_&_High_Net_Woth_A, Mix_of_Affluent_Wealth_&_High_Net_Woth_B |
|
||||
| Cons-Early_Professional | Consumer: Early_Professional |
|
||||
| Cons-Lower_Mass_B | Consumer: Lower_Mass_B |
|
||||
| Cons-MassAffluent_B | Consumer: MassAffluent_B |
|
||||
| Cons-Mass_B | Consumer: Mass_B |
|
||||
| Cons-Mix_of_Affluent_Wealth__<br>High_Net_Woth_B | Consumer: Mix_of_Affluent_Wealth_&_High_Net_Woth_B |
|
||||
| Cons-Starter | Consumer: Starter |
|
||||
| BizOwn-No | Business Owner: No |
|
||||
| BizOwn-Yes | Business Owner: Yes |
|
||||
| AI-Daily | Ai User: Daily |
|
||||
| AI-Lessthanonceamonth | Ai User: Less than once a month |
|
||||
| AI-Morethanoncedaily | Ai User: More than once daily |
|
||||
| AI-Multipletimesperweek | Ai User: Multiple times per week |
|
||||
| AI-Onceamonth | Ai User: Once a month |
|
||||
| AI-Onceaweek | Ai User: Once a week |
|
||||
| AI-RarelyNever | Ai User: Rarely/Never |
|
||||
| AI-Daily+<br>Morethanoncedaily+<br>Multipletimesperweek | Ai User: Daily, More than once daily, Multiple times per week |
|
||||
| AI-4_grps_d4f57a | Ai User: Once a week, Once a month, Less than once a month, Rarely/Never |
|
||||
| InvAsts-0to24999 | Investable Assets: $0 to $24,999 |
|
||||
| InvAsts-150000to249999 | Investable Assets: $150,000 to $249,999 |
|
||||
| InvAsts-1Mto4.9M | Investable Assets: $1M to $4.9M |
|
||||
| InvAsts-25000to49999 | Investable Assets: $25,000 to $49,999 |
|
||||
| InvAsts-250000to499999 | Investable Assets: $250,000 to $499,999 |
|
||||
| InvAsts-50000to149999 | Investable Assets: $50,000 to $149,999 |
|
||||
| InvAsts-500000to999999 | Investable Assets: $500,000 to $999,999 |
|
||||
| InvAsts-5Mormore | Investable Assets: $5M or more |
|
||||
| InvAsts-Prefernottoanswer | Investable Assets: Prefer not to answer |
|
||||
| Ind-Agricultureforestryfishingorhunting | Industry: Agriculture, forestry, fishing, or hunting |
|
||||
| Ind-Artsentertainmentorrecreation | Industry: Arts, entertainment, or recreation |
|
||||
| Ind-Broadcasting | Industry: Broadcasting |
|
||||
| Ind-Construction | Industry: Construction |
|
||||
| Ind-EducationCollegeuniversityoradult | Industry: Education – College, university, or adult |
|
||||
| Ind-EducationOther | Industry: Education – Other |
|
||||
| Ind-EducationPrimarysecondaryK-12 | Industry: Education – Primary/secondary (K-12) |
|
||||
| Ind-Governmentandpublicadministration | Industry: Government and public administration |
|
||||
| Ind-Hotelandfoodservices | Industry: Hotel and food services |
|
||||
| Ind-InformationOther | Industry: Information – Other |
|
||||
| Ind-InformationServicesanddata | Industry: Information – Services and data |
|
||||
| Ind-Legalservices | Industry: Legal services |
|
||||
| Ind-ManufacturingComputerandelectronics | Industry: Manufacturing – Computer and electronics |
|
||||
| Ind-ManufacturingOther | Industry: Manufacturing – Other |
|
||||
| Ind-Notemployed | Industry: Not employed |
|
||||
| Ind-Otherindustrypleasespecify | Industry: Other industry (please specify) |
|
||||
| Ind-Processing | Industry: Processing |
|
||||
| Ind-Publishing | Industry: Publishing |
|
||||
| Ind-Realestaterentalorleasing | Industry: Real estate, rental, or leasing |
|
||||
| Ind-Retired | Industry: Retired |
|
||||
| Ind-Scientificortechnicalservices | Industry: Scientific or technical services |
|
||||
| Ind-Software | Industry: Software |
|
||||
| Ind-Telecommunications | Industry: Telecommunications |
|
||||
| Ind-Transportationandwarehousing | Industry: Transportation and warehousing |
|
||||
| Ind-Utilities | Industry: Utilities |
|
||||
| Ind-Wholesale | Industry: Wholesale |
|
||||
|
||||
3
potential_dataset_issues.md
Normal file
3
potential_dataset_issues.md
Normal file
@@ -0,0 +1,3 @@
|
||||
- V46 not in scale 1-10. Qualtrics
|
||||
- Straightliners
|
||||
- V45 goed in qual maar slecht in quant
|
||||
@@ -12,6 +12,8 @@ Runs 03_quant_report.script.py for each single-filter combination:
|
||||
Usage:
|
||||
uv run python run_filter_combinations.py
|
||||
uv run python run_filter_combinations.py --dry-run # Preview combinations without running
|
||||
uv run python run_filter_combinations.py --category age # Only run age combinations
|
||||
uv run python run_filter_combinations.py --category consumer # Only run consumer segment combinations
|
||||
"""
|
||||
|
||||
import subprocess
|
||||
@@ -31,86 +33,171 @@ QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_P
|
||||
REPORT_SCRIPT = Path(__file__).parent / '03_quant_report.script.py'
|
||||
|
||||
|
||||
def get_filter_combinations(survey: QualtricsSurvey) -> list[dict]:
|
||||
def get_filter_combinations(survey: QualtricsSurvey, category: str = None) -> list[dict]:
|
||||
"""
|
||||
Generate all single-filter combinations.
|
||||
|
||||
Each combination isolates ONE filter value while keeping all others at "all selected".
|
||||
Returns list of dicts with filter kwargs for each run.
|
||||
|
||||
Args:
|
||||
survey: QualtricsSurvey instance with loaded data
|
||||
category: Optional filter category to limit combinations to.
|
||||
Valid values: 'all', 'age', 'gender', 'ethnicity', 'income', 'consumer',
|
||||
'business_owner', 'ai_user', 'investable_assets', 'industry'
|
||||
If None or 'all', generates all combinations.
|
||||
|
||||
Returns:
|
||||
List of dicts with filter kwargs for each run.
|
||||
"""
|
||||
combinations = []
|
||||
|
||||
# Add "All Respondents" run (no filters = all options selected)
|
||||
combinations.append({
|
||||
'name': 'All_Respondents',
|
||||
'filters': {} # Empty = use defaults (all selected)
|
||||
})
|
||||
if not category or category in ['all_filters', 'all']:
|
||||
combinations.append({
|
||||
'name': 'All_Respondents',
|
||||
'filters': {} # Empty = use defaults (all selected)
|
||||
})
|
||||
|
||||
# Age groups - one at a time
|
||||
for age in survey.options_age:
|
||||
combinations.append({
|
||||
'name': f'Age-{age}',
|
||||
'filters': {'age': [age]}
|
||||
})
|
||||
if not category or category in ['all_filters', 'age']:
|
||||
for age in survey.options_age:
|
||||
combinations.append({
|
||||
'name': f'Age-{age}',
|
||||
'filters': {'age': [age]}
|
||||
})
|
||||
|
||||
# Gender - one at a time
|
||||
for gender in survey.options_gender:
|
||||
combinations.append({
|
||||
'name': f'Gender-{gender}',
|
||||
'filters': {'gender': [gender]}
|
||||
})
|
||||
if not category or category in ['all_filters', 'gender']:
|
||||
for gender in survey.options_gender:
|
||||
combinations.append({
|
||||
'name': f'Gender-{gender}',
|
||||
'filters': {'gender': [gender]}
|
||||
})
|
||||
|
||||
# Ethnicity - grouped by individual values
|
||||
# Ethnicity options are comma-separated (e.g., "White or Caucasian, Hispanic or Latino")
|
||||
# Create filters that include ALL options containing each individual ethnicity value
|
||||
ethnicity_values = set()
|
||||
for ethnicity_option in survey.options_ethnicity:
|
||||
# Split by comma and strip whitespace
|
||||
values = [v.strip() for v in ethnicity_option.split(',')]
|
||||
ethnicity_values.update(values)
|
||||
|
||||
for ethnicity_value in sorted(ethnicity_values):
|
||||
# Find all options that contain this value
|
||||
matching_options = [
|
||||
opt for opt in survey.options_ethnicity
|
||||
if ethnicity_value in [v.strip() for v in opt.split(',')]
|
||||
]
|
||||
combinations.append({
|
||||
'name': f'Ethnicity-{ethnicity_value}',
|
||||
'filters': {'ethnicity': matching_options}
|
||||
})
|
||||
if not category or category in ['all_filters', 'ethnicity']:
|
||||
# Ethnicity options are comma-separated (e.g., "White or Caucasian, Hispanic or Latino")
|
||||
# Create filters that include ALL options containing each individual ethnicity value
|
||||
ethnicity_values = set()
|
||||
for ethnicity_option in survey.options_ethnicity:
|
||||
# Split by comma and strip whitespace
|
||||
values = [v.strip() for v in ethnicity_option.split(',')]
|
||||
ethnicity_values.update(values)
|
||||
|
||||
for ethnicity_value in sorted(ethnicity_values):
|
||||
# Find all options that contain this value
|
||||
matching_options = [
|
||||
opt for opt in survey.options_ethnicity
|
||||
if ethnicity_value in [v.strip() for v in opt.split(',')]
|
||||
]
|
||||
combinations.append({
|
||||
'name': f'Ethnicity-{ethnicity_value}',
|
||||
'filters': {'ethnicity': matching_options}
|
||||
})
|
||||
|
||||
# Income - one at a time
|
||||
for income in survey.options_income:
|
||||
if not category or category in ['all_filters', 'income']:
|
||||
for income in survey.options_income:
|
||||
combinations.append({
|
||||
'name': f'Income-{income}',
|
||||
'filters': {'income': [income]}
|
||||
})
|
||||
|
||||
# Consumer segments - combine _A and _B options, and also include standalone
|
||||
if not category or category in ['all_filters', 'consumer']:
|
||||
# Group options by base name (removing _A/_B suffix)
|
||||
consumer_groups = {}
|
||||
for consumer in survey.options_consumer:
|
||||
# Check if ends with _A or _B
|
||||
if consumer.endswith('_A') or consumer.endswith('_B'):
|
||||
base_name = consumer[:-2] # Remove last 2 chars (_A or _B)
|
||||
if base_name not in consumer_groups:
|
||||
consumer_groups[base_name] = []
|
||||
consumer_groups[base_name].append(consumer)
|
||||
else:
|
||||
# Not an _A/_B option, keep as-is
|
||||
consumer_groups[consumer] = [consumer]
|
||||
|
||||
# Add combined _A+_B options
|
||||
for base_name, options in consumer_groups.items():
|
||||
if len(options) > 1: # Only combine if there are multiple (_A and _B)
|
||||
combinations.append({
|
||||
'name': f'Consumer-{base_name}',
|
||||
'filters': {'consumer': options}
|
||||
})
|
||||
|
||||
# Add standalone options (including individual _A and _B)
|
||||
for consumer in survey.options_consumer:
|
||||
combinations.append({
|
||||
'name': f'Consumer-{consumer}',
|
||||
'filters': {'consumer': [consumer]}
|
||||
})
|
||||
|
||||
# Business Owner - one at a time
|
||||
if not category or category in ['all_filters', 'business_owner']:
|
||||
for business_owner in survey.options_business_owner:
|
||||
combinations.append({
|
||||
'name': f'BusinessOwner-{business_owner}',
|
||||
'filters': {'business_owner': [business_owner]}
|
||||
})
|
||||
|
||||
# AI User - one at a time
|
||||
if not category or category in ['all_filters', 'ai_user']:
|
||||
for ai_user in survey.options_ai_user:
|
||||
combinations.append({
|
||||
'name': f'AIUser-{ai_user}',
|
||||
'filters': {'ai_user': [ai_user]}
|
||||
})
|
||||
|
||||
# AI user daily, more than once daily, en multiple times a week = frequent
|
||||
combinations.append({
|
||||
'name': f'Income-{income}',
|
||||
'filters': {'income': [income]}
|
||||
'name': 'AIUser-Frequent',
|
||||
'filters': {'ai_user': [
|
||||
'Daily', 'More than once daily', 'Multiple times per week'
|
||||
]}
|
||||
})
|
||||
combinations.append({
|
||||
'name': 'AIUser-RarelyNever',
|
||||
'filters': {'ai_user': [
|
||||
'Once a month', 'Less than once a month', 'Once a week', 'Rarely/Never'
|
||||
]}
|
||||
})
|
||||
|
||||
# Consumer segments - combine _A and _B options
|
||||
# Group options by base name (removing _A/_B suffix)
|
||||
consumer_groups = {}
|
||||
for consumer in survey.options_consumer:
|
||||
# Check if ends with _A or _B
|
||||
if consumer.endswith('_A') or consumer.endswith('_B'):
|
||||
base_name = consumer[:-2] # Remove last 2 chars (_A or _B)
|
||||
if base_name not in consumer_groups:
|
||||
consumer_groups[base_name] = []
|
||||
consumer_groups[base_name].append(consumer)
|
||||
else:
|
||||
# Not an _A/_B option, keep as-is
|
||||
consumer_groups[consumer] = [consumer]
|
||||
# Investable Assets - one at a time
|
||||
if not category or category in ['all_filters', 'investable_assets']:
|
||||
for investable_assets in survey.options_investable_assets:
|
||||
combinations.append({
|
||||
'name': f'Assets-{investable_assets}',
|
||||
'filters': {'investable_assets': [investable_assets]}
|
||||
})
|
||||
|
||||
for base_name, options in consumer_groups.items():
|
||||
# Industry - one at a time
|
||||
if not category or category in ['all_filters', 'industry']:
|
||||
for industry in survey.options_industry:
|
||||
combinations.append({
|
||||
'name': f'Industry-{industry}',
|
||||
'filters': {'industry': [industry]}
|
||||
})
|
||||
|
||||
# Voice ranking completeness filter
|
||||
# These use a special flag rather than demographic filters, so we store
|
||||
# the mode in a dedicated key that run_report passes as --voice-ranking-filter.
|
||||
if not category or category in ['all_filters', 'voice_ranking']:
|
||||
combinations.append({
|
||||
'name': f'Consumer-{base_name}',
|
||||
'filters': {'consumer': options}
|
||||
'name': 'VoiceRanking-OnlyMissing',
|
||||
'filters': {},
|
||||
'voice_ranking_filter': 'only-missing',
|
||||
})
|
||||
combinations.append({
|
||||
'name': 'VoiceRanking-ExcludeMissing',
|
||||
'filters': {},
|
||||
'voice_ranking_filter': 'exclude-missing',
|
||||
})
|
||||
|
||||
return combinations
|
||||
|
||||
|
||||
def run_report(filters: dict, name: str = None, dry_run: bool = False) -> bool:
|
||||
def run_report(filters: dict, name: str = None, dry_run: bool = False, sl_threshold: int = None, voice_ranking_filter: str = None) -> bool:
|
||||
"""
|
||||
Run the report script with given filters.
|
||||
|
||||
@@ -118,6 +205,10 @@ def run_report(filters: dict, name: str = None, dry_run: bool = False) -> bool:
|
||||
filters: Dict of filter_name -> list of values
|
||||
name: Name for this filter combination (used for .txt description file)
|
||||
dry_run: If True, just print command without running
|
||||
sl_threshold: If set, exclude respondents with >= N straight-lined question groups
|
||||
voice_ranking_filter: If set, filter by voice ranking completeness.
|
||||
'only-missing' keeps only respondents missing QID98 data,
|
||||
'exclude-missing' removes them.
|
||||
|
||||
Returns:
|
||||
True if successful, False otherwise
|
||||
@@ -128,6 +219,14 @@ def run_report(filters: dict, name: str = None, dry_run: bool = False) -> bool:
|
||||
if name:
|
||||
cmd.extend(['--filter-name', name])
|
||||
|
||||
# Pass straight-liner threshold if specified
|
||||
if sl_threshold is not None:
|
||||
cmd.extend(['--sl-threshold', str(sl_threshold)])
|
||||
|
||||
# Pass voice ranking filter if specified
|
||||
if voice_ranking_filter is not None:
|
||||
cmd.extend(['--voice-ranking-filter', voice_ranking_filter])
|
||||
|
||||
for filter_name, values in filters.items():
|
||||
if values:
|
||||
cmd.extend([f'--{filter_name}', json.dumps(values)])
|
||||
@@ -156,6 +255,13 @@ def main():
|
||||
import argparse
|
||||
parser = argparse.ArgumentParser(description='Run quant report for all filter combinations')
|
||||
parser.add_argument('--dry-run', action='store_true', help='Preview combinations without running')
|
||||
parser.add_argument(
|
||||
'--category',
|
||||
choices=['all_filters', 'all', 'age', 'gender', 'ethnicity', 'income', 'consumer', 'business_owner', 'ai_user', 'investable_assets', 'industry', 'voice_ranking'],
|
||||
default='all_filters',
|
||||
help='Filter category to run combinations for (default: all_filters)'
|
||||
)
|
||||
parser.add_argument('--sl-threshold', type=int, default=None, help='Exclude respondents who straight-lined >= N question groups (passed to report script)')
|
||||
args = parser.parse_args()
|
||||
|
||||
# Load survey to get available filter options
|
||||
@@ -163,15 +269,19 @@ def main():
|
||||
survey = QualtricsSurvey(RESULTS_FILE, QSF_FILE)
|
||||
survey.load_data() # Populates options_* attributes
|
||||
|
||||
# Generate all combinations
|
||||
combinations = get_filter_combinations(survey)
|
||||
print(f"Generated {len(combinations)} filter combinations")
|
||||
# Generate combinations for specified category
|
||||
combinations = get_filter_combinations(survey, category=args.category)
|
||||
category_desc = f" for category '{args.category}'" if args.category != 'all' else ''
|
||||
print(f"Generated {len(combinations)} filter combinations{category_desc}")
|
||||
|
||||
if args.sl_threshold is not None:
|
||||
print(f"Straight-liner threshold: excluding respondents with ≥{args.sl_threshold} straight-lined question groups")
|
||||
|
||||
if args.dry_run:
|
||||
print("\nDRY RUN - Commands that would be executed:")
|
||||
for combo in combinations:
|
||||
print(f"\n{combo['name']}:")
|
||||
run_report(combo['filters'], name=combo['name'], dry_run=True)
|
||||
run_report(combo['filters'], name=combo['name'], dry_run=True, sl_threshold=args.sl_threshold, voice_ranking_filter=combo.get('voice_ranking_filter'))
|
||||
return
|
||||
|
||||
# Run each combination with progress bar
|
||||
@@ -180,7 +290,7 @@ def main():
|
||||
|
||||
for combo in tqdm(combinations, desc="Running reports", unit="filter"):
|
||||
tqdm.write(f"Running: {combo['name']}")
|
||||
if run_report(combo['filters'], name=combo['name']):
|
||||
if run_report(combo['filters'], name=combo['name'], sl_threshold=args.sl_threshold, voice_ranking_filter=combo.get('voice_ranking_filter')):
|
||||
successful += 1
|
||||
else:
|
||||
failed.append(combo['name'])
|
||||
|
||||
992
speech_data_correlation.ipynb
Normal file
992
speech_data_correlation.ipynb
Normal file
File diff suppressed because one or more lines are too long
7
theme.py
7
theme.py
@@ -77,6 +77,13 @@ class ColorPalette:
|
||||
GENDER_MALE_NEUTRAL = "#B8C9D9" # Grey-Blue
|
||||
GENDER_FEMALE_NEUTRAL = "#D9B8C9" # Grey-Pink
|
||||
|
||||
# Gender colors for correlation plots (green/red indicate +/- correlation)
|
||||
# Male = darker shade, Female = lighter shade
|
||||
CORR_MALE_POSITIVE = "#1B5E20" # Dark Green
|
||||
CORR_FEMALE_POSITIVE = "#81C784" # Light Green
|
||||
CORR_MALE_NEGATIVE = "#B71C1C" # Dark Red
|
||||
CORR_FEMALE_NEGATIVE = "#E57373" # Light Red
|
||||
|
||||
# Speaking Style Colors (named after the style quadrant colors)
|
||||
STYLE_GREEN = "#2E7D32" # Forest Green
|
||||
STYLE_BLUE = "#1565C0" # Strong Blue
|
||||
|
||||
561
utils.py
561
utils.py
@@ -413,10 +413,30 @@ def _iter_picture_shapes(shapes):
|
||||
yield shape
|
||||
|
||||
|
||||
def _set_shape_alt_text(shape, alt_text: str):
|
||||
"""
|
||||
Set alt text (descr attribute) for a PowerPoint shape.
|
||||
"""
|
||||
nvPr = None
|
||||
# Check for common property names used by python-pptx elements
|
||||
for attr in ['nvPicPr', 'nvSpPr', 'nvGrpSpPr', 'nvGraphicFramePr', 'nvCxnSpPr']:
|
||||
if hasattr(shape._element, attr):
|
||||
nvPr = getattr(shape._element, attr)
|
||||
break
|
||||
|
||||
if nvPr and hasattr(nvPr, 'cNvPr'):
|
||||
nvPr.cNvPr.set("descr", alt_text)
|
||||
|
||||
|
||||
def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str, Path], output_path: Union[str, Path] = None, use_perceptual_hash: bool = True):
|
||||
"""
|
||||
Updates the alt text of images in a PowerPoint presentation by matching
|
||||
their content with images in a source directory.
|
||||
Updates the alt text of images in a PowerPoint presentation.
|
||||
|
||||
1. First pass: Validates existing alt-text format (<filter>/<filename>).
|
||||
- Fixes full paths by keeping only the last two parts.
|
||||
- Clears invalid alt-text.
|
||||
2. Second pass: If images are missing alt-text, matches them against source directory
|
||||
using perceptual hash or SHA1.
|
||||
|
||||
Args:
|
||||
ppt_path (str/Path): Path to the PowerPoint file.
|
||||
@@ -430,10 +450,7 @@ def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str,
|
||||
if output_path is None:
|
||||
output_path = ppt_path
|
||||
|
||||
# 1. Build lookup map of {hash: file_path} from the source directory
|
||||
image_hash_map = _build_image_hash_map(image_source_dir, use_perceptual_hash=use_perceptual_hash)
|
||||
|
||||
# 2. Open Presentation
|
||||
# Open Presentation
|
||||
try:
|
||||
prs = Presentation(ppt_path)
|
||||
except Exception as e:
|
||||
@@ -441,110 +458,132 @@ def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str,
|
||||
return
|
||||
|
||||
updates_count = 0
|
||||
unmatched_images = [] # Collect unmatched images to report at the end
|
||||
images_needing_match = []
|
||||
|
||||
slides = list(prs.slides)
|
||||
total_slides = len(slides)
|
||||
|
||||
print(f"Processing {total_slides} slides...")
|
||||
print(f"Scanning {total_slides} slides for existing alt-text...")
|
||||
|
||||
# Pass 1: Scan and clean existing alt-text
|
||||
for i, slide in enumerate(slides):
|
||||
# Use recursive iterator to find all pictures including those in groups/placeholders
|
||||
picture_shapes = list(_iter_picture_shapes(slide.shapes))
|
||||
|
||||
for shape in picture_shapes:
|
||||
try:
|
||||
# Get image hash based on selected method
|
||||
if use_perceptual_hash:
|
||||
# Use perceptual hash of the image blob for visual content matching
|
||||
current_hash = _calculate_perceptual_hash(shape.image.blob)
|
||||
else:
|
||||
# Use SHA1 hash from python-pptx (exact byte match)
|
||||
current_hash = shape.image.sha1
|
||||
alt_text = _get_shape_alt_text(shape)
|
||||
has_valid_alt = False
|
||||
|
||||
if alt_text:
|
||||
# Handle potential path separators and whitespace
|
||||
clean_alt = alt_text.strip().replace('\\', '/')
|
||||
parts = clean_alt.split('/')
|
||||
|
||||
if current_hash in image_hash_map:
|
||||
original_path = image_hash_map[current_hash]
|
||||
# Check if it looks like a path/file reference (at least 2 parts like dir/file)
|
||||
if len(parts) >= 2:
|
||||
# Enforce format: keep last 2 parts (e.g. filter/image.png)
|
||||
new_alt = '/'.join(parts[-2:])
|
||||
|
||||
# Generate Alt Text
|
||||
try:
|
||||
# Prepare path for generator.
|
||||
# Try to relativize to CWD if capable
|
||||
pass_path = original_path
|
||||
try:
|
||||
pass_path = original_path.relative_to(Path.cwd())
|
||||
except ValueError:
|
||||
pass
|
||||
if new_alt != alt_text:
|
||||
print(f"Slide {i+1}: Fixing alt-text format: '{alt_text}' -> '{new_alt}'")
|
||||
_set_shape_alt_text(shape, new_alt)
|
||||
updates_count += 1
|
||||
|
||||
new_alt_text = image_alt_text_generator(pass_path)
|
||||
|
||||
# Check existing alt text to avoid redundant updates/log them
|
||||
# Accessing alt text via cNvPr
|
||||
# Note: Different shape types might store non-visual props differently
|
||||
# Picture: nvPicPr.cNvPr
|
||||
# GraphicFrame: nvGraphicFramePr.cNvPr
|
||||
# Group: nvGrpSpPr.cNvPr
|
||||
# Shape/Placeholder: nvSpPr.cNvPr
|
||||
|
||||
nvPr = None
|
||||
for attr in ['nvPicPr', 'nvSpPr', 'nvGrpSpPr', 'nvGraphicFramePr', 'nvCxnSpPr']:
|
||||
if hasattr(shape._element, attr):
|
||||
nvPr = getattr(shape._element, attr)
|
||||
break
|
||||
|
||||
if nvPr and hasattr(nvPr, 'cNvPr'):
|
||||
cNvPr = nvPr.cNvPr
|
||||
existing_alt_text = cNvPr.get("descr", "")
|
||||
|
||||
if existing_alt_text != new_alt_text:
|
||||
print(f"Slide {i+1}: Updating alt text for image matches '{pass_path}'")
|
||||
print(f" Old: '{existing_alt_text}' -> New: '{new_alt_text}'")
|
||||
cNvPr.set("descr", new_alt_text)
|
||||
updates_count += 1
|
||||
else:
|
||||
print(f"Could not find cNvPr for shape on slide {i+1}")
|
||||
|
||||
except AssertionError as e:
|
||||
print(f"Skipping match for {original_path} due to generator error: {e}")
|
||||
except Exception as e:
|
||||
print(f"Error updating alt text for {original_path}: {e}")
|
||||
|
||||
has_valid_alt = True
|
||||
else:
|
||||
# Check if image already has alt text set - if so, skip reporting as unmatched
|
||||
existing_alt = _get_shape_alt_text(shape)
|
||||
if existing_alt:
|
||||
# Image already has alt text, no need to report as unmatched
|
||||
continue
|
||||
|
||||
shape_id = getattr(shape, 'shape_id', getattr(shape, 'id', 'Unknown ID'))
|
||||
shape_name = shape.name if shape.name else f"Unnamed Shape (ID: {shape_id})"
|
||||
hash_type = "pHash" if use_perceptual_hash else "SHA1"
|
||||
|
||||
unmatched_images.append({
|
||||
'slide': i+1,
|
||||
'shape_name': shape_name,
|
||||
'hash_type': hash_type,
|
||||
'hash': current_hash
|
||||
})
|
||||
|
||||
except AttributeError:
|
||||
continue
|
||||
except Exception as e:
|
||||
print(f"Error processing shape on slide {i+1}: {e}")
|
||||
# User requested deleting other cases that do not meet format
|
||||
# If it's single word or doesn't look like our path format
|
||||
pass # logic below handles this
|
||||
|
||||
if not has_valid_alt:
|
||||
if alt_text:
|
||||
print(f"Slide {i+1}: Invalid/Legacy alt-text '{alt_text}'. Clearing for re-matching.")
|
||||
_set_shape_alt_text(shape, "")
|
||||
updates_count += 1
|
||||
|
||||
# Queue for hash matching
|
||||
shape_id = getattr(shape, 'shape_id', getattr(shape, 'id', 'Unknown ID'))
|
||||
shape_name = shape.name if shape.name else f"Unnamed Shape (ID: {shape_id})"
|
||||
images_needing_match.append({
|
||||
'slide_idx': i, # 0-based
|
||||
'slide_num': i+1,
|
||||
'shape': shape,
|
||||
'shape_name': shape_name
|
||||
})
|
||||
|
||||
# Print summary
|
||||
if not images_needing_match:
|
||||
print("\nAll images have valid alt-text format. No hash matching needed.")
|
||||
if updates_count > 0:
|
||||
prs.save(output_path)
|
||||
print(f"✓ Saved updated presentation to {output_path} with {updates_count} updates.")
|
||||
else:
|
||||
print("Presentation is up to date.")
|
||||
return
|
||||
|
||||
# Pass 2: Hash Matching
|
||||
print(f"\n{len(images_needing_match)} images missing proper alt-text. Proceeding with hash matching...")
|
||||
|
||||
# Build lookup map of {hash: file_path} only if needed
|
||||
image_hash_map = _build_image_hash_map(image_source_dir, use_perceptual_hash=use_perceptual_hash)
|
||||
|
||||
unmatched_images = []
|
||||
|
||||
for item in images_needing_match:
|
||||
shape = item['shape']
|
||||
slide_num = item['slide_num']
|
||||
|
||||
try:
|
||||
# Get image hash
|
||||
if use_perceptual_hash:
|
||||
current_hash = _calculate_perceptual_hash(shape.image.blob)
|
||||
else:
|
||||
current_hash = shape.image.sha1
|
||||
|
||||
if current_hash in image_hash_map:
|
||||
original_path = image_hash_map[current_hash]
|
||||
|
||||
# Generate Alt Text
|
||||
try:
|
||||
# Try to relativize to CWD if capable
|
||||
pass_path = original_path
|
||||
try:
|
||||
pass_path = original_path.relative_to(Path.cwd())
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
new_alt_text = image_alt_text_generator(pass_path)
|
||||
|
||||
print(f"Slide {slide_num}: Match found! Assigning alt-text '{new_alt_text}'")
|
||||
_set_shape_alt_text(shape, new_alt_text)
|
||||
updates_count += 1
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error generating alt text for {original_path}: {e}")
|
||||
else:
|
||||
hash_type = "pHash" if use_perceptual_hash else "SHA1"
|
||||
unmatched_images.append({
|
||||
'slide': slide_num,
|
||||
'shape_name': item['shape_name'],
|
||||
'hash_type': hash_type,
|
||||
'hash': current_hash
|
||||
})
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error processing shape on slide {slide_num}: {e}")
|
||||
|
||||
# Save and Print Summary
|
||||
print("\n" + "="*80)
|
||||
if updates_count > 0:
|
||||
prs.save(output_path)
|
||||
print(f"✓ Saved updated presentation to {output_path} with {updates_count} updates.")
|
||||
else:
|
||||
print("No images matched or required updates.")
|
||||
print("No matches found for missing images.")
|
||||
|
||||
# List unmatched images at the end
|
||||
if unmatched_images:
|
||||
print(f"\n⚠ {len(unmatched_images)} image(s) not found in source directory:")
|
||||
print(f"\n⚠ {len(unmatched_images)} image(s) could not be matched:")
|
||||
for img in unmatched_images:
|
||||
print(f" • Slide {img['slide']}: '{img['shape_name']}' ({img['hash_type']}: {img['hash']})")
|
||||
else:
|
||||
print("\n✓ All images matched successfully!")
|
||||
print("\n✓ All images processed successfully!")
|
||||
print("="*80)
|
||||
|
||||
|
||||
@@ -723,7 +762,7 @@ def normalize_global_values(df: pl.DataFrame, target_cols: list[str]) -> pl.Data
|
||||
class QualtricsSurvey(QualtricsPlotsMixin):
|
||||
"""Class to handle Qualtrics survey data."""
|
||||
|
||||
def __init__(self, data_path: Union[str, Path], qsf_path: Union[str, Path]):
|
||||
def __init__(self, data_path: Union[str, Path], qsf_path: Union[str, Path], figures_dir: Union[str, Path] = None):
|
||||
if isinstance(data_path, str):
|
||||
data_path = Path(data_path)
|
||||
|
||||
@@ -735,8 +774,12 @@ class QualtricsSurvey(QualtricsPlotsMixin):
|
||||
self.qid_descr_map = self._extract_qid_descr_map()
|
||||
self.qsf:dict = self._load_qsf()
|
||||
|
||||
# get export directory name for saving figures ie if data_path='data/exports/OneDrive_2026-01-21/...' should be 'figures/OneDrive_2026-01-21'
|
||||
self.fig_save_dir = Path('figures') / self.data_filepath.parts[2]
|
||||
if figures_dir:
|
||||
self.fig_save_dir = Path(figures_dir)
|
||||
else:
|
||||
# get export directory name for saving figures ie if data_path='data/exports/OneDrive_2026-01-21/...' should be 'figures/OneDrive_2026-01-21'
|
||||
self.fig_save_dir = Path('figures') / self.data_filepath.parts[2]
|
||||
|
||||
if not self.fig_save_dir.exists():
|
||||
self.fig_save_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
@@ -750,7 +793,10 @@ class QualtricsSurvey(QualtricsPlotsMixin):
|
||||
self.filter_consumer:list = None
|
||||
self.filter_ethnicity:list = None
|
||||
self.filter_income:list = None
|
||||
|
||||
self.filter_business_owner:list = None # QID4
|
||||
self.filter_ai_user:list = None # QID22
|
||||
self.filter_investable_assets:list = None # QID16
|
||||
self.filter_industry:list = None # QID17
|
||||
|
||||
|
||||
|
||||
@@ -838,6 +884,10 @@ class QualtricsSurvey(QualtricsPlotsMixin):
|
||||
self.options_consumer = sorted(df['Consumer'].drop_nulls().unique().to_list()) if 'Consumer' in df.columns else []
|
||||
self.options_ethnicity = sorted(df['QID3'].drop_nulls().unique().to_list()) if 'QID3' in df.columns else []
|
||||
self.options_income = sorted(df['QID15'].drop_nulls().unique().to_list()) if 'QID15' in df.columns else []
|
||||
self.options_business_owner = sorted(df['QID4'].drop_nulls().unique().to_list()) if 'QID4' in df.columns else []
|
||||
self.options_ai_user = sorted(df['QID22'].drop_nulls().unique().to_list()) if 'QID22' in df.columns else []
|
||||
self.options_investable_assets = sorted(df['QID16'].drop_nulls().unique().to_list()) if 'QID16' in df.columns else []
|
||||
self.options_industry = sorted(df['QID17'].drop_nulls().unique().to_list()) if 'QID17' in df.columns else []
|
||||
|
||||
return df.lazy()
|
||||
|
||||
@@ -854,41 +904,61 @@ class QualtricsSurvey(QualtricsPlotsMixin):
|
||||
|
||||
return q.select(QIDs).rename(rename_dict)
|
||||
|
||||
def filter_data(self, q: pl.LazyFrame, age:list=None, gender:list=None, consumer:list=None, ethnicity:list=None, income:list=None, business_owner:list=None, ai_user:list=None, investable_assets:list=None, industry:list=None) -> pl.LazyFrame:
    """Filter survey responses on demographic / screener columns.

    Each parameter is a list of allowed answer values for the matching
    Qualtrics column:

    - age: list of age groups to include (QID1)
    - gender: list (QID2)
    - consumer: list (Consumer)
    - ethnicity: list (QID3)
    - income: list (QID15)
    - business_owner: list (QID4)
    - ai_user: list (QID22)
    - investable_assets: list (QID16)
    - industry: list (QID17)

    A filter is skipped when it is None, empty (columns that are all NULL
    produce empty option lists), or when it selects every known option —
    the last case avoids accidentally dropping rows that are NULL in that
    column.

    Also saves the result to self.data_filtered.
    """
    # (attribute recorded on self, column to filter, requested values, all known options)
    filter_specs = [
        ('filter_age', 'QID1', age, self.options_age),
        ('filter_gender', 'QID2', gender, self.options_gender),
        ('filter_consumer', 'Consumer', consumer, self.options_consumer),
        ('filter_ethnicity', 'QID3', ethnicity, self.options_ethnicity),
        ('filter_income', 'QID15', income, self.options_income),
        ('filter_business_owner', 'QID4', business_owner, self.options_business_owner),
        ('filter_ai_user', 'QID22', ai_user, self.options_ai_user),
        ('filter_investable_assets', 'QID16', investable_assets, self.options_investable_assets),
        ('filter_industry', 'QID17', industry, self.options_industry),
    ]

    for attr, column, values, options in filter_specs:
        # Remember the last-applied selection so callers/UI can inspect it,
        # even when the filter itself is skipped.
        setattr(self, attr, values)
        # Apply only meaningful filters: non-None, non-empty, and not "select all".
        if values is not None and len(values) > 0 and set(values) != set(options):
            q = q.filter(pl.col(column).is_in(values))

    self.data_filtered = q
    return self.data_filtered
|
||||
@@ -1045,6 +1115,60 @@ class QualtricsSurvey(QualtricsPlotsMixin):
|
||||
|
||||
return self._get_subset(q, list(QIDs_map.keys()), rename_cols=False).rename(QIDs_map), None
|
||||
|
||||
def get_top_3_voices_missing_ranking(
    self, q: pl.LazyFrame
) -> pl.DataFrame:
    """Identify respondents who completed the top-3 voice selection (QID36)
    but are missing the explicit ranking question (QID98).

    These respondents picked 3 voices in the selection step and have
    selection-order data in ``QID36_G0_*_RANK``, but all 18 ``QID98_*``
    ranking columns are null. This means ``get_top_3_voices()`` will
    return all-null rows for them, causing plots like
    ``plot_most_ranked_1`` to undercount.

    Parameters:
        q: The (optionally filtered) LazyFrame from ``load_data()``.

    Returns:
        A collected ``pl.DataFrame`` with columns:

        - ``_recordId`` – the respondent identifier
        - ``3_Ranked`` – comma-separated text of the 3 voices they selected
          (null if the respondent has no 18→8→3 row to join against)
    """
    # Get the top-3 ranking data (QID98-based)
    top3, _ = self.get_top_3_voices(q)
    top3_df = top3.collect()

    ranking_cols = [c for c in top3_df.columns if c != '_recordId']

    # Respondents where every QID98 ranking column is null.
    # Start from lit(True) so an empty column list selects everyone
    # rather than raising.
    all_null_expr = pl.lit(True)
    for col in ranking_cols:
        all_null_expr = all_null_expr & pl.col(col).is_null()

    missing_ids = top3_df.filter(all_null_expr).select('_recordId')

    if missing_ids.height == 0:
        # Nobody is missing a ranking: return an empty frame with the
        # documented schema so downstream code can rely on the columns.
        return pl.DataFrame(schema={
            '_recordId': pl.Utf8,
            '3_Ranked': pl.Utf8,
        })

    # Enrich with the 3_Ranked text from the 18→8→3 question
    v_18_8_3, _ = self.get_18_8_3(q)
    v_df = v_18_8_3.collect()

    result = missing_ids.join(
        v_df.select(['_recordId', '3_Ranked']),
        on='_recordId',
        how='left',
    )

    return result
|
||||
|
||||
|
||||
def get_ss_orange_red(self, q: pl.LazyFrame) -> Union[pl.LazyFrame, dict]:
|
||||
"""Extract columns containing the SS Orange/Red ratings for the Chase virtual assistant.
|
||||
@@ -1518,6 +1642,235 @@ class QualtricsSurvey(QualtricsPlotsMixin):
|
||||
|
||||
return results_df, metadata
|
||||
|
||||
def compute_mentions_significance(
    self,
    data: pl.LazyFrame | pl.DataFrame,
    alpha: float = 0.05,
    correction: str = "bonferroni",
) -> tuple[pl.DataFrame, dict]:
    """Compute statistical significance for Total Mentions (Rank 1+2+3).

    Tests whether the proportion of respondents who included a voice in
    their Top 3 is significantly different between voices, using pairwise
    two-proportion z-tests (both proportions share the same denominator:
    the total number of respondents).

    Args:
        data: Ranking data (rows=respondents, cols=voices, values=rank).
        alpha: Significance level.
        correction: Multiple comparison correction method:
            ``"bonferroni"``, ``"holm"``, or ``"none"``.

    Returns:
        tuple: (pairwise_df, metadata)

    Raises:
        ValueError: If fewer than 2 ranking columns are present, or if
            ``correction`` is not a recognised method.
    """
    from scipy import stats as scipy_stats
    import numpy as np

    if isinstance(data, pl.LazyFrame):
        df = data.collect()
    else:
        df = data

    ranking_cols = [c for c in df.columns if c != '_recordId']
    if len(ranking_cols) < 2:
        raise ValueError("Need at least 2 ranking columns")

    total_respondents = df.height
    mentions_data = {}

    # Count mentions (any rank) for each voice
    for col in ranking_cols:
        label = self._clean_voice_label(col)
        count = df.filter(pl.col(col).is_not_null()).height
        mentions_data[label] = count

    labels = sorted(mentions_data.keys())
    results = []
    n_comparisons = len(labels) * (len(labels) - 1) // 2

    for i, label1 in enumerate(labels):
        for label2 in labels[i+1:]:
            count1 = mentions_data[label1]
            count2 = mentions_data[label2]

            pct1 = count1 / total_respondents
            pct2 = count2 / total_respondents

            # Z-test for two proportions (same denominator for both)
            n1 = total_respondents
            n2 = total_respondents

            p_pooled = (count1 + count2) / (n1 + n2)
            se = np.sqrt(p_pooled * (1 - p_pooled) * (1/n1 + 1/n2))

            if se > 0:
                z_stat = (pct1 - pct2) / se
                # norm.sf == 1 - norm.cdf but numerically stable for
                # large |z| (1 - cdf underflows to exactly 0).
                p_value = 2 * scipy_stats.norm.sf(abs(z_stat))
            else:
                # se == 0 means both counts are 0 or both equal N: no evidence.
                p_value = 1.0

            results.append({
                'group1': label1,
                'group2': label2,
                'p_value': float(p_value),
                'rank1_count1': count1,  # Reusing column names for compatibility with heatmap plotting
                'rank1_count2': count2,
                'rank1_pct1': round(pct1 * 100, 1),
                'rank1_pct2': round(pct2 * 100, 1),
                'total1': n1,
                'total2': n2,
                'effect_size': pct1 - pct2  # Difference in proportions
            })

    results_df = pl.DataFrame(results)

    p_values = results_df['p_value'].to_numpy()

    if correction == "bonferroni":
        p_adjusted = np.minimum(p_values * n_comparisons, 1.0)
    elif correction == "holm":
        # Step-down Holm: sort ascending, scale each p by the number of
        # remaining hypotheses, then enforce monotonicity and cap at 1.
        sorted_idx = np.argsort(p_values)
        sorted_p = p_values[sorted_idx]
        m = len(sorted_p)
        adjusted = np.zeros(m)
        for j in range(m):
            adjusted[j] = sorted_p[j] * (m - j)
        for j in range(1, m):
            adjusted[j] = max(adjusted[j], adjusted[j-1])
        adjusted = np.minimum(adjusted, 1.0)
        # Undo the sort so adjusted values line up with results_df rows.
        p_adjusted = adjusted[np.argsort(sorted_idx)]
    elif correction == "none":
        p_adjusted = p_values.astype(float)  # pyright: ignore
    else:
        # Previously an unrecognised method silently produced NaN
        # adjustments (and significant=False everywhere); fail loudly.
        raise ValueError(f"Unknown correction method: {correction!r}")

    results_df = results_df.with_columns([
        pl.Series('p_adjusted', p_adjusted),
        pl.Series('significant', p_adjusted < alpha),
    ]).sort('p_value')

    metadata = {
        'test_type': 'proportion_z_test_mentions',
        'alpha': alpha,
        'correction': correction,
        'n_comparisons': n_comparisons,
    }

    return results_df, metadata
|
||||
|
||||
def compute_rank1_significance(
    self,
    data: pl.LazyFrame | pl.DataFrame,
    alpha: float = 0.05,
    correction: str = "bonferroni",
) -> tuple[pl.DataFrame, dict]:
    """Compute statistical significance for Rank 1 selections only.

    Like compute_mentions_significance but counts only how many times each
    voice/character was ranked **1st**, using total respondents as the
    denominator. This tests whether first-choice preference differs
    significantly between voices.

    Args:
        data: Ranking data (rows=respondents, cols=voices, values=rank).
        alpha: Significance level.
        correction: Multiple comparison correction method:
            ``"bonferroni"``, ``"holm"``, or ``"none"``.

    Returns:
        tuple: (pairwise_df, metadata)

    Raises:
        ValueError: If fewer than 2 ranking columns are present, or if
            ``correction`` is not a recognised method.
    """
    from scipy import stats as scipy_stats
    import numpy as np

    if isinstance(data, pl.LazyFrame):
        df = data.collect()
    else:
        df = data

    ranking_cols = [c for c in df.columns if c != '_recordId']
    if len(ranking_cols) < 2:
        raise ValueError("Need at least 2 ranking columns")

    total_respondents = df.height
    rank1_data: dict[str, int] = {}

    # Count rank-1 selections for each voice
    for col in ranking_cols:
        label = self._clean_voice_label(col)
        count = df.filter(pl.col(col) == 1).height
        rank1_data[label] = count

    labels = sorted(rank1_data.keys())
    results = []
    n_comparisons = len(labels) * (len(labels) - 1) // 2

    for i, label1 in enumerate(labels):
        for label2 in labels[i+1:]:
            count1 = rank1_data[label1]
            count2 = rank1_data[label2]

            pct1 = count1 / total_respondents
            pct2 = count2 / total_respondents

            # Z-test for two proportions (same denominator for both)
            n1 = total_respondents
            n2 = total_respondents

            p_pooled = (count1 + count2) / (n1 + n2)
            se = np.sqrt(p_pooled * (1 - p_pooled) * (1/n1 + 1/n2))

            if se > 0:
                z_stat = (pct1 - pct2) / se
                # norm.sf == 1 - norm.cdf but numerically stable for
                # large |z| (1 - cdf underflows to exactly 0).
                p_value = 2 * scipy_stats.norm.sf(abs(z_stat))
            else:
                # se == 0 means both counts are 0 or both equal N: no evidence.
                p_value = 1.0

            results.append({
                'group1': label1,
                'group2': label2,
                'p_value': float(p_value),
                'rank1_count1': count1,
                'rank1_count2': count2,
                'rank1_pct1': round(pct1 * 100, 1),
                'rank1_pct2': round(pct2 * 100, 1),
                'total1': n1,
                'total2': n2,
                'effect_size': pct1 - pct2,
            })

    results_df = pl.DataFrame(results)

    p_values = results_df['p_value'].to_numpy()

    if correction == "bonferroni":
        p_adjusted = np.minimum(p_values * n_comparisons, 1.0)
    elif correction == "holm":
        # Step-down Holm: sort ascending, scale each p by the number of
        # remaining hypotheses, then enforce monotonicity and cap at 1.
        sorted_idx = np.argsort(p_values)
        sorted_p = p_values[sorted_idx]
        m = len(sorted_p)
        adjusted = np.zeros(m)
        for j in range(m):
            adjusted[j] = sorted_p[j] * (m - j)
        for j in range(1, m):
            adjusted[j] = max(adjusted[j], adjusted[j-1])
        adjusted = np.minimum(adjusted, 1.0)
        # Undo the sort so adjusted values line up with results_df rows.
        p_adjusted = adjusted[np.argsort(sorted_idx)]
    elif correction == "none":
        p_adjusted = p_values.astype(float)  # pyright: ignore
    else:
        # Previously an unrecognised method silently produced NaN
        # adjustments (and significant=False everywhere); fail loudly.
        raise ValueError(f"Unknown correction method: {correction!r}")

    results_df = results_df.with_columns([
        pl.Series('p_adjusted', p_adjusted),
        pl.Series('significant', p_adjusted < alpha),
    ]).sort('p_value')

    metadata = {
        'test_type': 'proportion_z_test_rank1',
        'alpha': alpha,
        'correction': correction,
        'n_comparisons': n_comparisons,
    }

    return results_df, metadata
|
||||
|
||||
|
||||
|
||||
def process_speaking_style_data(
|
||||
df: Union[pl.LazyFrame, pl.DataFrame],
|
||||
|
||||
Reference in New Issue
Block a user