Compare commits

...

25 Commits

Author SHA1 Message Date
03a716e8ec correlation matrix speech characteristics vs score 2026-02-10 16:50:47 +01:00
8720bb670d started speech data notebook 2026-02-10 14:58:13 +01:00
9dfab75925 missing data analysis 2026-02-10 14:24:26 +01:00
14e28cf368 stat significance nr times ranked 1st 2026-02-09 18:37:41 +01:00
8e181e193a SL filter 2026-02-09 17:57:04 +01:00
6c16993cb3 straight-liner plot analysis 2026-02-09 17:26:45 +01:00
92c6fc03ab docs datasets 2026-02-09 13:17:59 +01:00
7fb6570190 statistical significance 2026-02-05 19:49:19 +01:00
840bd2940d other top bc's 2026-02-05 11:50:00 +01:00
af9a15ccb0 renamed notebooks and added significance test 2026-02-05 10:14:53 +01:00
a3cf9f103d update plots with final data release 2026-02-04 21:15:03 +01:00
f0eab32c34 update alt-text with full filepaths 2026-02-04 17:48:48 +01:00
d231fc02db fix missing filter descr in correlation plots 2026-02-04 14:48:14 +01:00
fc76bb0ab5 voice gender split correlation plots 2026-02-04 13:44:51 +01:00
ab78276a97 male/female voices in separate plots for correlations 2026-02-04 12:35:24 +01:00
e17646eb70 correlation plots for best bc 2026-02-04 10:46:31 +01:00
ad1d8c6e58 all plots offline update 2026-02-03 22:38:15 +01:00
f5b4c247b8 tidy plots 2026-02-03 22:12:17 +01:00
a35670aa72 fixed missing ai_user category 2026-02-03 21:13:29 +01:00
36280a6ff8 fix sample size 2026-02-03 20:48:34 +01:00
9a587dcc4c add ai-user filter combinations 2026-02-03 19:46:07 +01:00
9a49d1c690 added sample size to filter text 2026-02-03 19:16:39 +01:00
8f505da550 offline update 18-30 2026-02-03 18:43:20 +01:00
495b56307c fixed filter to none 2026-02-03 18:19:06 +01:00
1e76a82f24 fix wordcloud filter values 2026-02-03 17:41:12 +01:00
16 changed files with 5444 additions and 460 deletions

5
.vscode/extensions.json vendored Normal file
View File

@@ -0,0 +1,5 @@
{
"recommendations": [
"wakatime.vscode-wakatime"
]
}

5
.vscode/settings.json vendored Normal file
View File

@@ -0,0 +1,5 @@
{
"chat.tools.terminal.autoApprove": {
"/home/luigi/Documents/VoiceBranding/JPMC/Phase-3/.venv/bin/python": true
}
}

View File

@@ -21,9 +21,14 @@ def _():
@app.cell
def _():
TAG_SOURCE = Path('data/reports/Perception-Research-Report_2-2.pptx')
return
@app.cell
def _():
TAG_SOURCE = Path('data/reports/VOICE_Perception-Research-Report_4-2-26_19-30.pptx')
# TAG_TARGET = Path('data/reports/Perception-Research-Report_2-2_tagged.pptx')
TAG_IMAGE_DIR = Path('figures/2-2-26')
TAG_IMAGE_DIR = Path('figures/debug')
return TAG_IMAGE_DIR, TAG_SOURCE
@@ -47,10 +52,10 @@ def _():
@app.cell
def _():
REPLACE_SOURCE = Path('data/reports/Perception-Research-Report_2-2.pptx')
REPLACE_TARGET = Path('data/reports/Perception-Research-Report_2-2_updated.pptx')
REPLACE_SOURCE = Path('data/reports/VOICE_Perception-Research-Report_4-2-26_19-30.pptx')
# REPLACE_TARGET = Path('data/reports/Perception-Research-Report_2-2_updated.pptx')
NEW_IMAGES_DIR = Path('figures/2-2-26')
NEW_IMAGES_DIR = Path('figures/2-4-26')
return NEW_IMAGES_DIR, REPLACE_SOURCE

View File

@@ -0,0 +1,263 @@
"""Extra analyses of the traits"""
# %% Imports
import utils
import polars as pl
import argparse
import json
import re
from pathlib import Path
from validation import check_straight_liners
# %% Fixed Variables
RESULTS_FILE = 'data/exports/2-4-26/JPMC_Chase Brand Personality_Quant Round 1_February 4, 2026_Labels.csv'
QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'
# %% CLI argument parsing for batch automation
# When run as script: uv run XX_statistical_significance.script.py --age '["18
# Central filter configuration - add new filters here only
# Format: 'cli_arg_name': 'QualtricsSurvey.options_* attribute name'
FILTER_CONFIG = {
'age': 'options_age',
'gender': 'options_gender',
'ethnicity': 'options_ethnicity',
'income': 'options_income',
'consumer': 'options_consumer',
'business_owner': 'options_business_owner',
'ai_user': 'options_ai_user',
'investable_assets': 'options_investable_assets',
'industry': 'options_industry',
}
def parse_cli_args():
    """Parse CLI filter arguments, or return all-default args when interactive.

    Every key in ``FILTER_CONFIG`` becomes a ``--<name>`` option that takes a
    JSON-encoded list of values; after parsing, those JSON strings are decoded
    into Python lists (``None`` means "filter not applied").

    Returns:
        argparse.Namespace with one attribute per FILTER_CONFIG key, plus
        ``filter_name`` and ``figures_dir``.
    """
    parser = argparse.ArgumentParser(description='Generate quant report with optional filters')
    # Dynamically add filter arguments from config
    for filter_name in FILTER_CONFIG:
        parser.add_argument(f'--{filter_name}', type=str, default=None, help=f'JSON list of {filter_name} values')
    parser.add_argument('--filter-name', type=str, default=None, help='Name for this filter combination (used for .txt description file)')
    parser.add_argument('--figures-dir', type=str, default=f'figures/traits-likert-analysis/{Path(RESULTS_FILE).parts[2]}', help='Override the default figures directory')
    # Only parse sys.argv when running as a script (not in Jupyter/interactive)
    try:
        # get_ipython() is only defined inside an IPython/Jupyter kernel;
        # in a plain script it raises NameError and we fall through below.
        get_ipython()  # noqa: F821 # type: ignore
        # Interactive: parse an empty argv so every option takes its argparse
        # default. This replaces a hand-built Namespace that duplicated the
        # defaults above and could silently drift out of sync with them.
        return parser.parse_args([])
    except NameError:
        args = parser.parse_args()
        # Decode JSON strings to lists; options that were not passed stay None
        for filter_name in FILTER_CONFIG:
            val = getattr(args, filter_name)
            setattr(args, filter_name, json.loads(val) if val else None)
        return args
cli_args = parse_cli_args()
# %%
# Load the survey with the (possibly CLI-overridden) figures directory.
S = utils.QualtricsSurvey(RESULTS_FILE, QSF_FILE, figures_dir=cli_args.figures_dir)
data_all = S.load_data()
# %% Build filtered dataset based on CLI args
# CLI args: None means "no filter applied" - filter_data() will skip None filters
# Build filter values dict dynamically from FILTER_CONFIG
_active_filters = {filter_name: getattr(cli_args, filter_name) for filter_name in FILTER_CONFIG}
_d = S.filter_data(data_all, **_active_filters)
# Write filter description file if filter-name is provided
if cli_args.filter_name and S.fig_save_dir:
    # Get the filter slug (e.g., "All_Respondents", "Cons-Starter", etc.)
    _filter_slug = S._get_filter_slug()
    _filter_slug_dir = S.fig_save_dir / _filter_slug
    _filter_slug_dir.mkdir(parents=True, exist_ok=True)
    # Build filter description (one line per configured filter)
    _filter_desc_lines = [
        f"Filter: {cli_args.filter_name}",
        "",
        "Applied Filters:",
    ]
    _short_desc_parts = []
    for filter_name, options_attr in FILTER_CONFIG.items():
        all_options = getattr(S, options_attr)
        values = _active_filters[filter_name]
        display_name = filter_name.replace('_', ' ').title()
        # None means no filter applied (same as "All"); a value equal to the
        # full option list is also treated as "All" (no effective filtering)
        if values is not None and values != all_options:
            _short_desc_parts.append(f"{display_name}: {', '.join(values)}")
            _filter_desc_lines.append(f" {display_name}: {', '.join(values)}")
        else:
            _filter_desc_lines.append(f" {display_name}: All")
    # Write detailed description INSIDE the filter-slug directory
    # Sanitize filter name for filename usage (replace / and other chars)
    _safe_filter_name = re.sub(r'[^\w\s-]', '_', cli_args.filter_name)
    _filter_file = _filter_slug_dir / f"{_safe_filter_name}.txt"
    _filter_file.write_text('\n'.join(_filter_desc_lines))
    # Append to summary index file at figures/<export_date>/filter_index.txt
    _summary_file = S.fig_save_dir / "filter_index.txt"
    _short_desc = "; ".join(_short_desc_parts) if _short_desc_parts else "All Respondents"
    _summary_line = f"{_filter_slug} | {cli_args.filter_name} | {_short_desc}\n"
    # Append or create the summary file
    if _summary_file.exists():
        _existing = _summary_file.read_text()
        # Avoid duplicate entries for same slug
        if _filter_slug not in _existing:
            with _summary_file.open('a') as f:
                f.write(_summary_line)
    else:
        _header = "Filter Index\n" + "=" * 80 + "\n\n"
        _header += "Directory | Filter Name | Description\n"
        _header += "-" * 80 + "\n"
        _summary_file.write_text(_header + _summary_line)
# Save to logical variable name for further analysis
data = _d
data.collect()
# %% Voices per trait
# Extract the two speaking-style question groups and their answer mappings.
ss_or, choice_map_or = S.get_ss_orange_red(data)
ss_gb, choice_map_gb = S.get_ss_green_blue(data)
# Combine the data: one row per respondent with both question groups
ss_all = ss_or.join(ss_gb, on='_recordId')
_d = ss_all.collect()
choice_map = {**choice_map_or, **choice_map_gb}
# print(_d.head())
# print(choice_map)
# Reshape to long format (one row per respondent/trait observation).
ss_long = utils.process_speaking_style_data(ss_all, choice_map)
# %% Create plots
# One score plot per unique trait description.
for i, trait in enumerate(ss_long.select("Description").unique().to_series().to_list()):
    trait_d = ss_long.filter(pl.col("Description") == trait)
    S.plot_speaking_style_trait_scores(trait_d, title=trait.replace(":", ""), height=550, color_gender=True)
# %% Filter out straight-liner (PER TRAIT) and re-plot to see if any changes
# Save with different filename suffix so we can compare with/without straight-liners
print("\n--- Straight-lining Checks on TRAITS ---")
sl_report_traits, sl_traits_df = check_straight_liners(ss_all, max_score=5)
sl_traits_df
# %%
# Remove the specific straight-lined question blocks (not whole respondents)
# and regenerate the trait plots for a with/without comparison.
if sl_traits_df is not None and not sl_traits_df.is_empty():
    sl_ids = sl_traits_df.select(pl.col("Record ID").unique()).to_series().to_list()
    n_sl_groups = sl_traits_df.height
    print(f"\nExcluding {n_sl_groups} straight-lined question blocks from {len(sl_ids)} respondents.")
    # Create key in ss_long to match sl_traits_df for anti-join
    # Question Group key in sl_traits_df is like "SS_Orange_Red__V14"
    # ss_long has "Style_Group" and "Voice"
    ss_long_w_key = ss_long.with_columns(
        (pl.col("Style_Group") + "__" + pl.col("Voice")).alias("Question Group")
    )
    # Prepare filter table: Record ID + Question Group
    sl_filter = sl_traits_df.select([
        pl.col("Record ID").alias("_recordId"),
        pl.col("Question Group")
    ])
    # Anti-join to remove specific question blocks that were straight-lined
    ss_long_clean = ss_long_w_key.join(sl_filter, on=["_recordId", "Question Group"], how="anti").drop("Question Group")
    # Re-plot with suffix in title
    print("Re-plotting traits (Cleaned)...")
    for i, trait in enumerate(ss_long_clean.select("Description").unique().to_series().to_list()):
        trait_d = ss_long_clean.filter(pl.col("Description") == trait)
        # Modify title to create unique filename (and display title)
        title_clean = trait.replace(":", "") + " (Excl. Straight-Liners)"
        S.plot_speaking_style_trait_scores(trait_d, title=title_clean, height=550, color_gender=True)
else:
    print("No straight-liners found on traits.")
# %% Compare All vs Cleaned
if sl_traits_df is not None and not sl_traits_df.is_empty():
    print("Generating Comparison Plots (All vs Cleaned)...")
    # Always apply the per-question-group filtering here to ensure consistency
    # (Matches the logic used in the re-plotting section above)
    print("Applying filter to remove straight-lined question blocks...")
    ss_long_w_key = ss_long.with_columns(
        (pl.col("Style_Group") + "__" + pl.col("Voice")).alias("Question Group")
    )
    sl_filter = sl_traits_df.select([
        pl.col("Record ID").alias("_recordId"),
        pl.col("Question Group")
    ])
    ss_long_clean = ss_long_w_key.join(sl_filter, on=["_recordId", "Question Group"], how="anti").drop("Question Group")
    sl_ids = sl_traits_df.select(pl.col("Record ID").unique()).to_series().to_list()
    # --- Verification Prints ---
    # Sanity-check that the row-count delta matches the number of rows that
    # an inner join on the same key finds (i.e. the anti-join removed exactly
    # the straight-lined blocks and nothing else).
    print(f"\n--- Verification of Filter ---")
    print(f"Original Row Count: {ss_long.height}")
    print(f"Number of Straight-Liner Question Blocks: {sl_traits_df.height}")
    print(f"Sample IDs affected: {sl_ids[:5]}")
    print(f"Cleaned Row Count: {ss_long_clean.height}")
    print(f"Rows Removed: {ss_long.height - ss_long_clean.height}")
    # Verify removal
    # Re-construct key to verify
    ss_long_check = ss_long.with_columns(
        (pl.col("Style_Group") + "__" + pl.col("Voice")).alias("Question Group")
    )
    sl_filter_check = sl_traits_df.select([
        pl.col("Record ID").alias("_recordId"),
        pl.col("Question Group")
    ])
    should_be_removed = ss_long_check.join(sl_filter_check, on=["_recordId", "Question Group"], how="inner").height
    print(f"Discrepancy Check (Should be 0): { (ss_long.height - ss_long_clean.height) - should_be_removed }")
    # Show what was removed (the straight lining behavior)
    print("\nSample of Straight-Liner Data (Values that caused removal):")
    print(sl_traits_df.head(5))
    print("-" * 30 + "\n")
    # ---------------------------
    # Side-by-side (All vs Cleaned) plot per trait.
    for i, trait in enumerate(ss_long.select("Description").unique().to_series().to_list()):
        # Get data for this trait from both datasets
        trait_d_all = ss_long.filter(pl.col("Description") == trait)
        trait_d_clean = ss_long_clean.filter(pl.col("Description") == trait)
        # Plot comparison
        title_comp = trait.replace(":", "") + " (Impact of Straight-Liners)"
        S.plot_speaking_style_trait_scores_comparison(
            trait_d_all,
            trait_d_clean,
            title=title_comp,
            height=600  # Slightly taller for grouped bars
        )

View File

@@ -7,13 +7,20 @@ import polars as pl
from pathlib import Path
import argparse
import json
import re
from validation import check_progress, duration_validation, check_straight_liners
from utils import QualtricsSurvey, combine_exclusive_columns, calculate_weighted_ranking_scores
import utils
from speaking_styles import SPEAKING_STYLES
# %% Fixed Variables
RESULTS_FILE = 'data/exports/2-4-26/JPMC_Chase Brand Personality_Quant Round 1_February 4, 2026_Labels.csv'
# RESULTS_FILE = 'data/exports/debug/JPMC_Chase Brand Personality_Quant Round 1_February 2, 2026_Labels.csv'
QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'
# %%
# CLI argument parsing for batch automation
# When run as script: python 03_quant_report.script.py --age '["18 to 21 years"]' --consumer '["Starter"]'
@@ -41,13 +48,18 @@ def parse_cli_args():
parser.add_argument(f'--{filter_name}', type=str, default=None, help=f'JSON list of {filter_name} values')
parser.add_argument('--filter-name', type=str, default=None, help='Name for this filter combination (used for .txt description file)')
parser.add_argument('--figures-dir', type=str, default=f'figures/{Path(RESULTS_FILE).parts[2]}', help='Override the default figures directory')
parser.add_argument('--best-character', type=str, default="the_coach", help='Slug of the best chosen character (default: "the_coach")')
parser.add_argument('--sl-threshold', type=int, default=None, help='Exclude respondents who straight-lined >= N question groups (e.g. 3 removes anyone with 3+ straight-lined groups)')
parser.add_argument('--voice-ranking-filter', type=str, default=None, choices=['only-missing', 'exclude-missing'], help='Filter by voice ranking completeness: "only-missing" keeps only respondents missing QID98 ranking data, "exclude-missing" removes them')
# Only parse if running as script (not in Jupyter/interactive)
try:
# Check if running in Jupyter by looking for ipykernel
get_ipython() # noqa: F821
get_ipython() # noqa: F821 # type: ignore
# Return namespace with all filters set to None
return argparse.Namespace(**{f: None for f in FILTER_CONFIG}, filter_name=None)
no_filters = {f: None for f in FILTER_CONFIG}
return argparse.Namespace(**no_filters, filter_name=None, figures_dir=f'figures/statistical_significance/{Path(RESULTS_FILE).parts[2]}', best_character="the_coach", sl_threshold=None, voice_ranking_filter=None)
except NameError:
args = parser.parse_args()
# Parse JSON strings to lists
@@ -57,71 +69,26 @@ def parse_cli_args():
return args
cli_args = parse_cli_args()
BEST_CHOSEN_CHARACTER = cli_args.best_character
# %%
# file_browser = mo.ui.file_browser(
# initial_path="./data/exports", multiple=False, restrict_navigation=True, filetypes=[".csv"], label="Select 'Labels' File"
# )
# file_browser
# # %%
# mo.stop(file_browser.path(index=0) is None, mo.md("**⚠️ Please select a `_Labels.csv` file above to proceed**"))
# RESULTS_FILE = Path(file_browser.path(index=0))
RESULTS_FILE = 'data/exports/2-3-26_Copy-2-2-26/JPMC_Chase Brand Personality_Quant Round 1_February 2, 2026_Labels.csv'
QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'
# %%
S = QualtricsSurvey(RESULTS_FILE, QSF_FILE)
S = QualtricsSurvey(RESULTS_FILE, QSF_FILE, figures_dir=cli_args.figures_dir)
try:
data_all = S.load_data()
except NotImplementedError as e:
mo.stop(True, mo.md(f"**⚠️ {str(e)}**"))
# %%
BEST_CHOSEN_CHARACTER = "the_coach"
# # %%
# filter_form = mo.md('''
# %% Build filtered dataset based on CLI args
# CLI args: None means "no filter applied" - filter_data() will skip None filters
# {age}
# {gender}
# {ethnicity}
# {income}
# {consumer}
# '''
# ).batch(
# age=mo.ui.multiselect(options=S.options_age, value=S.options_age, label="Select Age Group(s):"),
# gender=mo.ui.multiselect(options=S.options_gender, value=S.options_gender, label="Select Gender(s):"),
# ethnicity=mo.ui.multiselect(options=S.options_ethnicity, value=S.options_ethnicity, label="Select Ethnicities:"),
# income=mo.ui.multiselect(options=S.options_income, value=S.options_income, label="Select Income Group(s):"),
# consumer=mo.ui.multiselect(options=S.options_consumer, value=S.options_consumer, label="Select Consumer Groups:")
# ).form()
# mo.md(f'''
# ---
# # Data Filter
# {filter_form}
# ''')
# %%
# mo.stop(filter_form.value is None, mo.md("**Please submit filter above to proceed**"))
# CLI args: None means "all options selected" (use S.options_* defaults)
# Build filter values dict dynamically from FILTER_CONFIG
_active_filters = {}
for filter_name, options_attr in FILTER_CONFIG.items():
cli_value = getattr(cli_args, filter_name)
all_options = getattr(S, options_attr)
_active_filters[filter_name] = cli_value if cli_value is not None else all_options
_active_filters = {filter_name: getattr(cli_args, filter_name) for filter_name in FILTER_CONFIG}
# %% Apply filters
_d = S.filter_data(data_all, **_active_filters)
# Write filter description file if filter-name is provided
@@ -142,14 +109,17 @@ if cli_args.filter_name and S.fig_save_dir:
all_options = getattr(S, options_attr)
values = _active_filters[filter_name]
display_name = filter_name.replace('_', ' ').title()
if values != all_options:
# None means no filter applied (same as "All")
if values is not None and values != all_options:
_short_desc_parts.append(f"{display_name}: {', '.join(values)}")
_filter_desc_lines.append(f" {display_name}: {', '.join(values)}")
else:
_filter_desc_lines.append(f" {display_name}: All")
# Write detailed description INSIDE the filter-slug directory
_filter_file = _filter_slug_dir / f"{cli_args.filter_name}.txt"
# Sanitize filter name for filename usage (replace / and other chars)
_safe_filter_name = re.sub(r'[^\w\s-]', '_', cli_args.filter_name)
_filter_file = _filter_slug_dir / f"{_safe_filter_name}.txt"
_filter_file.write_text('\n'.join(_filter_desc_lines))
# Append to summary index file at figures/<export_date>/filter_index.txt
@@ -170,14 +140,65 @@ if cli_args.filter_name and S.fig_save_dir:
_header += "-" * 80 + "\n"
_summary_file.write_text(_header + _summary_line)
# Stop execution and prevent other cells from running if no data is selected
# mo.stop(len(_d.collect()) == 0, mo.md("**No Data available for current filter combination**"))
data = _d
# %% Apply straight-liner threshold filter (if specified)
# Removes respondents who straight-lined >= N question groups across
# speaking style and voice scale questions.
if cli_args.sl_threshold is not None:
_sl_n = cli_args.sl_threshold
S.sl_threshold = _sl_n # Store on Survey so filter slug/description include it
print(f"Applying straight-liner filter: excluding respondents with ≥{_sl_n} straight-lined question groups...")
_n_before = _d.select(pl.len()).collect().item()
# data = data_validated
# Extract question groups with renamed columns for check_straight_liners
_sl_ss_or, _ = S.get_ss_orange_red(_d)
_sl_ss_gb, _ = S.get_ss_green_blue(_d)
_sl_vs, _ = S.get_voice_scale_1_10(_d)
_sl_all_q = _sl_ss_or.join(_sl_ss_gb, on='_recordId').join(_sl_vs, on='_recordId')
_, _sl_df = check_straight_liners(_sl_all_q, max_score=5)
if _sl_df is not None and not _sl_df.is_empty():
# Count straight-lined question groups per respondent
_sl_counts = (
_sl_df
.group_by("Record ID")
.agg(pl.len().alias("sl_count"))
.filter(pl.col("sl_count") >= _sl_n)
.select(pl.col("Record ID").alias("_recordId"))
)
# Anti-join to remove offending respondents
_d = _d.collect().join(_sl_counts, on="_recordId", how="anti").lazy()
# Update filtered data on the Survey object so sample size is correct
S.data_filtered = _d
_n_after = _d.select(pl.len()).collect().item()
print(f" Removed {_n_before - _n_after} respondents ({_n_before}{_n_after})")
else:
print(" No straight-liners detected — no respondents removed.")
# %% Apply voice-ranking completeness filter (if specified)
# Keeps only / excludes respondents who are missing the explicit voice
# ranking question (QID98) despite completing the top-3 selection (QID36).
if cli_args.voice_ranking_filter is not None:
S.voice_ranking_filter = cli_args.voice_ranking_filter # Store on Survey so filter slug/description include it
_vr_missing = S.get_top_3_voices_missing_ranking(_d)
_vr_missing_ids = _vr_missing.select('_recordId')
_n_before = _d.select(pl.len()).collect().item()
if cli_args.voice_ranking_filter == 'only-missing':
print(f"Voice ranking filter: keeping ONLY respondents missing QID98 ranking data...")
_d = _d.collect().join(_vr_missing_ids, on='_recordId', how='inner').lazy()
elif cli_args.voice_ranking_filter == 'exclude-missing':
print(f"Voice ranking filter: EXCLUDING respondents missing QID98 ranking data...")
_d = _d.collect().join(_vr_missing_ids, on='_recordId', how='anti').lazy()
S.data_filtered = _d
_n_after = _d.select(pl.len()).collect().item()
print(f" {_n_before}{_n_after} respondents ({_vr_missing_ids.height} missing ranking data)")
# Save to logical variable name for further analysis
data = _d
data.collect()
# %%
# %%
@@ -560,6 +581,39 @@ S.plot_speaking_style_color_correlation(
title="Correlation: Speaking Style Colors and Voice Ranking Points"
)
# %%
# Gender-filtered correlation plots (Male vs Female voices)
from reference import VOICE_GENDER_MAPPING
MALE_VOICES = [v for v, g in VOICE_GENDER_MAPPING.items() if g == "Male"]
FEMALE_VOICES = [v for v, g in VOICE_GENDER_MAPPING.items() if g == "Female"]
# Filter joined data by voice gender
joined_scale_male = joined_scale.filter(pl.col("Voice").is_in(MALE_VOICES))
joined_scale_female = joined_scale.filter(pl.col("Voice").is_in(FEMALE_VOICES))
joined_ranking_male = joined_ranking.filter(pl.col("Voice").is_in(MALE_VOICES))
joined_ranking_female = joined_ranking.filter(pl.col("Voice").is_in(FEMALE_VOICES))
# Colors vs Scale 1-10 (grouped by voice gender)
S.plot_speaking_style_color_correlation_by_gender(
data_male=joined_scale_male,
data_female=joined_scale_female,
speaking_styles=SPEAKING_STYLES,
target_column="Voice_Scale_Score",
title="Correlation: Speaking Style Colors and Voice Scale 1-10 (by Voice Gender)",
filename="correlation_speaking_style_and_voice_scale_1-10_by_voice_gender_color",
)
# Colors vs Ranking Points (grouped by voice gender)
S.plot_speaking_style_color_correlation_by_gender(
data_male=joined_ranking_male,
data_female=joined_ranking_female,
speaking_styles=SPEAKING_STYLES,
target_column="Ranking_Points",
title="Correlation: Speaking Style Colors and Voice Ranking Points (by Voice Gender)",
filename="correlation_speaking_style_and_voice_ranking_points_by_voice_gender_color",
)
# %%
mo.md(r"""
### Individual Traits vs Scale 1-10
@@ -570,7 +624,7 @@ _content = """"""
for _style, _traits in SPEAKING_STYLES.items():
# print(f"Correlation plot for {style}...")
_fig = S.plot_speaking_style_correlation(
_fig = S.plot_speaking_style_scale_correlation(
data=joined_scale,
style_color=_style,
style_traits=_traits,
@@ -609,86 +663,187 @@ for _style, _traits in SPEAKING_STYLES.items():
mo.md(_content)
# %%
mo.md(r"""
## Correlations when "Best Brand Character" is chosen
# Individual Traits vs Scale 1-10 (grouped by voice gender)
_content = """### Individual Traits vs Scale 1-10 (by Voice Gender)\n\n"""
Select only the traits that fit with that character
""")
# %%
from reference import ORIGINAL_CHARACTER_TRAITS
chosen_bc_traits = ORIGINAL_CHARACTER_TRAITS[BEST_CHOSEN_CHARACTER]
# %%
STYLES_SUBSET = utils.filter_speaking_styles(SPEAKING_STYLES, chosen_bc_traits)
# %%
mo.md(r"""
### Individual Traits vs Ranking Points
""")
# %%
_content = ""
for _style, _traits in STYLES_SUBSET.items():
_fig = S.plot_speaking_style_ranking_correlation(
data=joined_ranking,
for _style, _traits in SPEAKING_STYLES.items():
_fig = S.plot_speaking_style_scale_correlation_by_gender(
data_male=joined_scale_male,
data_female=joined_scale_female,
style_color=_style,
style_traits=_traits,
title=f"""Brand Character "{BEST_CHOSEN_CHARACTER.replace('_', ' ').title()}" - Correlation: Speaking Style {_style} and Voice Ranking Points"""
title=f"Correlation: Speaking Style {_style} and Voice Scale 1-10 (by Voice Gender)",
filename=f"correlation_speaking_style_and_voice_scale_1-10_by_voice_gender_{_style.lower()}",
)
_content += f"""
#### Speaking Style **{_style}**:
{mo.ui.altair_chart(_fig)}
"""
mo.md(_content)
# %%
mo.md(r"""
### Individual Traits vs Scale 1-10
""")
# Individual Traits vs Ranking Points (grouped by voice gender)
_content = """### Individual Traits vs Ranking Points (by Voice Gender)\n\n"""
# %%
_content = """"""
for _style, _traits in STYLES_SUBSET.items():
# print(f"Correlation plot for {style}...")
_fig = S.plot_speaking_style_correlation(
data=joined_scale,
for _style, _traits in SPEAKING_STYLES.items():
_fig = S.plot_speaking_style_ranking_correlation_by_gender(
data_male=joined_ranking_male,
data_female=joined_ranking_female,
style_color=_style,
style_traits=_traits,
title=f"""Brand Character "{BEST_CHOSEN_CHARACTER.replace('_', ' ').title()}" - Correlation: Speaking Style {_style} and Voice Scale 1-10""",
title=f"Correlation: Speaking Style {_style} and Voice Ranking Points (by Voice Gender)",
filename=f"correlation_speaking_style_and_voice_ranking_points_by_voice_gender_{_style.lower()}",
)
_content += f"""
#### Speaking Style **{_style}**:
{mo.ui.altair_chart(_fig)}
"""
mo.md(_content)
# %%
mo.md(r"""
### Colors vs Scale 1-10 (Best Character)
""")
# ## Correlations when "Best Brand Character" is chosen
# For each of the 4 brand characters, filter the dataset to only those respondents
# who selected that character as their #1 choice.
# %%
# Transform to get one row per color with average correlation
_color_corr_scale, _ = utils.transform_speaking_style_color_correlation(joined_scale, STYLES_SUBSET)
S.plot_speaking_style_color_correlation(
data=_color_corr_scale,
title=f"""Brand Character "{BEST_CHOSEN_CHARACTER.replace('_', ' ').title()}" - Correlation: Speaking Style Colors and Voice Scale 1-10"""
)
# Prepare character-filtered data subsets
char_rank_for_filter = S.get_character_ranking(data)[0].collect()
CHARACTER_FILTER_MAP = {
'Familiar Friend': 'Character_Ranking_Familiar_Friend',
'The Coach': 'Character_Ranking_The_Coach',
'Personal Assistant': 'Character_Ranking_The_Personal_Assistant',
'Bank Teller': 'Character_Ranking_The_Bank_Teller',
}
def get_filtered_data_for_character(char_name: str) -> tuple[pl.DataFrame, pl.DataFrame, int]:
"""Filter joined_scale and joined_ranking to respondents who ranked char_name #1."""
col = CHARACTER_FILTER_MAP[char_name]
respondents = char_rank_for_filter.filter(pl.col(col) == 1).select('_recordId')
n = respondents.height
filtered_scale = joined_scale.join(respondents, on='_recordId', how='inner')
filtered_ranking = joined_ranking.join(respondents, on='_recordId', how='inner')
return filtered_scale, filtered_ranking, n
def _char_filename(char_name: str, suffix: str) -> str:
"""Generate filename for character-filtered plots (without n-value).
Format: bc_ranked_1_{suffix}__{char_slug}
This groups all plot types together in directory listings.
"""
char_slug = char_name.lower().replace(' ', '_')
return f"bc_ranked_1_{suffix}__{char_slug}"
# %%
mo.md(r"""
### Colors vs Ranking Points (Best Character)
""")
# ### Voice Weighted Ranking Score (by Best Character)
for char_name in CHARACTER_FILTER_MAP:
_, _, n = get_filtered_data_for_character(char_name)
# Get top3 voices for this character subset using _recordIds
respondents = char_rank_for_filter.filter(
pl.col(CHARACTER_FILTER_MAP[char_name]) == 1
).select('_recordId')
# Collect top3_voices if it's a LazyFrame, then join
top3_df = top3_voices.collect() if isinstance(top3_voices, pl.LazyFrame) else top3_voices
filtered_top3 = top3_df.join(respondents, on='_recordId', how='inner')
weighted = calculate_weighted_ranking_scores(filtered_top3)
S.plot_weighted_ranking_score(
data=weighted,
title=f'"{char_name}" Ranked #1 (n={n})<br>Most Popular Voice - Weighted Score (1st=3pts, 2nd=2pts, 3rd=1pt)',
filename=_char_filename(char_name, "voice_weighted_ranking_score"),
color_gender=COLOR_GENDER,
)
# %%
# ### Voice Scale 1-10 Average Scores (by Best Character)
for char_name in CHARACTER_FILTER_MAP:
_, _, n = get_filtered_data_for_character(char_name)
# Get voice scale data for this character subset using _recordIds
respondents = char_rank_for_filter.filter(
pl.col(CHARACTER_FILTER_MAP[char_name]) == 1
).select('_recordId')
# Collect voice_1_10 if it's a LazyFrame, then join
voice_1_10_df = voice_1_10.collect() if isinstance(voice_1_10, pl.LazyFrame) else voice_1_10
filtered_voice_1_10 = voice_1_10_df.join(respondents, on='_recordId', how='inner')
S.plot_average_scores_with_counts(
data=filtered_voice_1_10,
title=f'"{char_name}" Ranked #1 (n={n})<br>Voice General Impression (Scale 1-10)',
filename=_char_filename(char_name, "voice_scale_1-10"),
x_label='Voice',
domain=[1, 10],
color_gender=COLOR_GENDER,
)
# %%
# ### Speaking Style Colors vs Scale 1-10 (only for Best Character)
for char_name in CHARACTER_FILTER_MAP:
if char_name.lower().replace(' ', '_') != BEST_CHOSEN_CHARACTER:
continue
filtered_scale, _, n = get_filtered_data_for_character(char_name)
color_corr, _ = utils.transform_speaking_style_color_correlation(filtered_scale, SPEAKING_STYLES)
S.plot_speaking_style_color_correlation(
data=color_corr,
title=f'"{char_name}" Ranked #1 (n={n})<br>Correlation: Speaking Style Colors vs Voice Scale 1-10',
filename=_char_filename(char_name, "colors_vs_voice_scale_1-10"),
)
# %%
# ### Speaking Style Colors vs Ranking Points (only for Best Character)
for char_name in CHARACTER_FILTER_MAP:
if char_name.lower().replace(' ', '_') != BEST_CHOSEN_CHARACTER:
continue
_, filtered_ranking, n = get_filtered_data_for_character(char_name)
color_corr, _ = utils.transform_speaking_style_color_correlation(
filtered_ranking, SPEAKING_STYLES, target_column="Ranking_Points"
)
S.plot_speaking_style_color_correlation(
data=color_corr,
title=f'"{char_name}" Ranked #1 (n={n})<br>Correlation: Speaking Style Colors vs Voice Ranking Points',
filename=_char_filename(char_name, "colors_vs_voice_ranking_points"),
)
# %%
# ### Individual Traits vs Scale 1-10 (only for Best Character)
for _style, _traits in SPEAKING_STYLES.items():
print(f"--- Speaking Style: {_style} ---")
for char_name in CHARACTER_FILTER_MAP:
if char_name.lower().replace(' ', '_') != BEST_CHOSEN_CHARACTER:
continue
filtered_scale, _, n = get_filtered_data_for_character(char_name)
S.plot_speaking_style_scale_correlation(
data=filtered_scale,
style_color=_style,
style_traits=_traits,
title=f'"{char_name}" Ranked #1 (n={n})<br>Correlation: {_style} vs Voice Scale 1-10',
filename=_char_filename(char_name, f"{_style.lower()}_vs_voice_scale_1-10"),
)
# %%
# ### Individual Traits vs Ranking Points (only for Best Character)
for _style, _traits in SPEAKING_STYLES.items():
print(f"--- Speaking Style: {_style} ---")
for char_name in CHARACTER_FILTER_MAP:
if char_name.lower().replace(' ', '_') != BEST_CHOSEN_CHARACTER:
continue
_, filtered_ranking, n = get_filtered_data_for_character(char_name)
S.plot_speaking_style_ranking_correlation(
data=filtered_ranking,
style_color=_style,
style_traits=_traits,
title=f'"{char_name}" Ranked #1 (n={n})<br>Correlation: {_style} vs Voice Ranking Points',
filename=_char_filename(char_name, f"{_style.lower()}_vs_voice_ranking_points"),
)
# %%
_color_corr_ranking, _ = utils.transform_speaking_style_color_correlation(
joined_ranking,
STYLES_SUBSET,
target_column="Ranking_Points"
)
S.plot_speaking_style_color_correlation(
data=_color_corr_ranking,
title=f"""Brand Character "{BEST_CHOSEN_CHARACTER.replace('_', ' ').title()}" - Correlation: Speaking Style Colors and Voice Ranking Points"""
)

View File

@@ -0,0 +1,370 @@
"""Extra statistical significance analyses for quant report."""
# %% Imports
import utils
import polars as pl
import argparse
import json
import re
from pathlib import Path
# %% Fixed Variables
# Qualtrics CSV export (labels, not numeric codes) and the QSF survey
# definition it was generated from.
RESULTS_FILE = 'data/exports/2-4-26/JPMC_Chase Brand Personality_Quant Round 1_February 4, 2026_Labels.csv'
QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'
# %% CLI argument parsing for batch automation
# When run as script: uv run XX_statistical_significance.script.py --age '["18 to 21 years"]'
# Central filter configuration - add new filters here only
# Format: 'cli_arg_name': 'QualtricsSurvey.options_* attribute name'
FILTER_CONFIG = {
    'age': 'options_age',
    'gender': 'options_gender',
    'ethnicity': 'options_ethnicity',
    'income': 'options_income',
    'consumer': 'options_consumer',
    'business_owner': 'options_business_owner',
    'ai_user': 'options_ai_user',
    'investable_assets': 'options_investable_assets',
    'industry': 'options_industry',
}
def parse_cli_args():
    """Parse CLI filter arguments, or return an all-None namespace in Jupyter.

    One ``--<filter>`` argument is generated per entry in FILTER_CONFIG; each
    accepts a JSON-encoded list of option values (e.g. '["18 to 21 years"]')
    which is decoded to a Python list. None means "filter not applied".

    Returns:
        argparse.Namespace with one attribute per FILTER_CONFIG key plus
        ``filter_name`` and ``figures_dir``.
    """
    parser = argparse.ArgumentParser(description='Generate quant report with optional filters')
    # Dynamically add filter arguments from config
    for filter_name in FILTER_CONFIG:
        parser.add_argument(f'--{filter_name}', type=str, default=None, help=f'JSON list of {filter_name} values')
    parser.add_argument('--filter-name', type=str, default=None, help='Name for this filter combination (used for .txt description file)')
    parser.add_argument('--figures-dir', type=str, default=f'figures/statistical_significance/{Path(RESULTS_FILE).parts[2]}', help='Override the default figures directory')
    # Only parse argv when running as a script (not in Jupyter/interactive).
    # Keep the try body minimal: get_ipython is only defined inside
    # IPython/Jupyter kernels and raises NameError elsewhere.
    try:
        get_ipython()  # noqa: F821 # type: ignore
    except NameError:
        args = parser.parse_args()
        # Decode JSON strings to lists; unset filters stay None
        for filter_name in FILTER_CONFIG:
            val = getattr(args, filter_name)
            setattr(args, filter_name, json.loads(val) if val else None)
        return args
    # Interactive session: no CLI args; all filters off. Pull figures_dir from
    # the parser instead of re-stating the literal, so the two defaults can
    # never drift apart.
    no_filters = {f: None for f in FILTER_CONFIG}
    return argparse.Namespace(**no_filters, filter_name=None, figures_dir=parser.get_default('figures_dir'))
cli_args = parse_cli_args()
# %%
S = utils.QualtricsSurvey(RESULTS_FILE, QSF_FILE, figures_dir=cli_args.figures_dir)
data_all = S.load_data()
# %% Build filtered dataset based on CLI args
# CLI args: None means "no filter applied" - filter_data() will skip None filters
# Build filter values dict dynamically from FILTER_CONFIG
_active_filters = {filter_name: getattr(cli_args, filter_name) for filter_name in FILTER_CONFIG}
_d = S.filter_data(data_all, **_active_filters)
# Write filter description file if filter-name is provided
# NOTE(review): this description/index section is duplicated in
# XX_straight_liners.py - consider extracting a shared helper.
if cli_args.filter_name and S.fig_save_dir:
    # Get the filter slug (e.g., "All_Respondents", "Cons-Starter", etc.)
    _filter_slug = S._get_filter_slug()
    _filter_slug_dir = S.fig_save_dir / _filter_slug
    _filter_slug_dir.mkdir(parents=True, exist_ok=True)
    # Build filter description
    _filter_desc_lines = [
        f"Filter: {cli_args.filter_name}",
        "",
        "Applied Filters:",
    ]
    _short_desc_parts = []
    for filter_name, options_attr in FILTER_CONFIG.items():
        all_options = getattr(S, options_attr)
        values = _active_filters[filter_name]
        display_name = filter_name.replace('_', ' ').title()
        # None means no filter applied (same as "All");
        # selecting every option is also treated as "All".
        if values is not None and values != all_options:
            _short_desc_parts.append(f"{display_name}: {', '.join(values)}")
            _filter_desc_lines.append(f" {display_name}: {', '.join(values)}")
        else:
            _filter_desc_lines.append(f" {display_name}: All")
    # Write detailed description INSIDE the filter-slug directory
    # Sanitize filter name for filename usage (replace / and other chars)
    _safe_filter_name = re.sub(r'[^\w\s-]', '_', cli_args.filter_name)
    _filter_file = _filter_slug_dir / f"{_safe_filter_name}.txt"
    _filter_file.write_text('\n'.join(_filter_desc_lines))
    # Append to summary index file at figures/<export_date>/filter_index.txt
    _summary_file = S.fig_save_dir / "filter_index.txt"
    _short_desc = "; ".join(_short_desc_parts) if _short_desc_parts else "All Respondents"
    _summary_line = f"{_filter_slug} | {cli_args.filter_name} | {_short_desc}\n"
    # Append or create the summary file
    if _summary_file.exists():
        _existing = _summary_file.read_text()
        # Avoid duplicate entries for same slug
        if _filter_slug not in _existing:
            with _summary_file.open('a') as f:
                f.write(_summary_line)
    else:
        _header = "Filter Index\n" + "=" * 80 + "\n\n"
        _header += "Directory | Filter Name | Description\n"
        _header += "-" * 80 + "\n"
        _summary_file.write_text(_header + _summary_line)
# Save to logical variable name for further analysis
data = _d
# NOTE(review): result of collect() is discarded - presumably to materialise
# the lazy frame so filter errors surface here; confirm intent.
data.collect()
# %% Character coach significantly higher than others
# char_rank: per-respondent rank (1-4) for each of the four characters.
char_rank = S.get_character_ranking(data)[0]
_pairwise_df, _meta = S.compute_ranking_significance(
    char_rank,
    alpha=0.05,
    correction="none",  # uncorrected pairwise tests; see methodology note below
)
# %% [markdown]
"""
### Methodology Analysis
**Input Data (`char_rank`)**:
* Generated by `S.get_character_ranking(data)`.
* Contains the ranking values (1st, 2nd, 3rd, 4th) assigned by each respondent to the four options ("The Coach", etc.).
* Columns represent the characters; rows represent individual respondents; values are the numerical rank (1 = Top Choice).
**Processing**:
* The function `compute_ranking_significance` aggregates these rankings to find the **"Rank 1 Share"** (the percentage of respondents who picked that character as their #1 favorite).
* It builds a contingency table of how many times each character was ranked 1st vs. not 1st (or 1st vs. 2nd vs. 3rd).
**Statistical Test**:
* **Test Used**: Pairwise Z-test for two proportions (uncorrected).
* **Comparison**: It compares the **Rank 1 Share** of every pair of characters.
* *Example*: "Is the 42% of people who chose 'Coach' significantly different from the 29% who chose 'Familiar Friend'?"
* **Significance**: A result of `p < 0.05` means the difference in popularity (top-choice preference) is statistically significant and not due to random chance.
"""
# %% Plot heatmap of pairwise significance
# One cell per character pair; see methodology note above for the test used.
S.plot_significance_heatmap(_pairwise_df, metadata=_meta, title="Statistical Significance: Character Top Choice Preference")
# %% Plot summary of significant differences (e.g., which characters are significantly higher than others)
# S.plot_significance_summary(_pairwise_df, metadata=_meta)
# %% [markdown]
"""
# Analysis: Significance of "The Coach"
**Parameters**: `alpha=0.05`, `correction='none'`
* **Rationale**: No correction was applied to allow for detection of all potential pairwise differences (uncorrected p < 0.05). If strict control for family-wise error rate were required (e.g., Bonferroni), the significance threshold would be lower (p < 0.0083).
**Results**:
"The Coach" is the top-ranked option (42.0% Rank 1 share) and shows strong separation from the field.
* **Vs. Bottom Two**: "The Coach" is significantly higher than both "The Bank Teller" (26.9%, p < 0.001) and "Familiar Friend" (29.4%, p < 0.001).
* **Vs. Runner-Up**: "The Coach" is widely preferred over "The Personal Assistant" (33.4%). The difference of **8.6 percentage points** is statistically significant (p = 0.017) at the standard 0.05 level.
* *Note*: While p=0.017 is significant in isolation, it would not meet the stricter Bonferroni threshold (0.0083). However, the effect size (+8.6%) is commercially meaningful.
**Conclusion**:
Yes, "The Coach" can be considered statistically more significant than the other options. It is clearly superior to the bottom two options and holds a statistically significant lead over the runner-up ("Personal Assistant") in direct comparison.
"""
# %% Mentions significance analysis
# "Mentions" = how often a character appears anywhere in the top-3 ranking
# (visibility), as opposed to rank-1-only preference above.
char_pairwise_df_mentions, _meta_mentions = S.compute_mentions_significance(
    char_rank,
    alpha=0.05,
    correction="none",
)
S.plot_significance_heatmap(
    char_pairwise_df_mentions,
    metadata=_meta_mentions,
    title="Statistical Significance: Character Total Mentions (Top 3 Visibility)"
)
# %% voices analysis
# Same pairwise tests, now at the voice level (top-3 voice ranking).
top3_voices = S.get_top_3_voices(data)[0]
_pairwise_df_voice, _metadata = S.compute_ranking_significance(
    top3_voices, alpha=0.05, correction="none")
S.plot_significance_heatmap(
    _pairwise_df_voice,
    metadata=_metadata,
    title="Statistical Significance: Voice Top Choice Preference"
)
# %% Total Mentions Significance (Rank 1+2+3 Combined)
# This tests "Quantity" (Visibility) instead of "Quality" (Preference)
# NOTE(review): _meta_mentions is reassigned here, shadowing the character-level
# metadata above - rename one of the two if both are needed later.
_pairwise_df_mentions, _meta_mentions = S.compute_mentions_significance(
    top3_voices,
    alpha=0.05,
    correction="none"
)
S.plot_significance_heatmap(
    _pairwise_df_mentions,
    metadata=_meta_mentions,
    title="Statistical Significance: Voice Total Mentions (Top 3 Visibility)"
)
# %% Male Voices Only Analysis
import reference
def filter_voices_by_gender(df: pl.DataFrame, target_gender: str) -> pl.DataFrame:
    """Keep only the ranking columns whose voice matches *target_gender*.

    Voice columns follow the pattern ``<prefix>__Vxx`` (e.g.
    ``Top_3_Voices_ranking__V14``); the ``Vxx`` id is looked up in
    ``reference.VOICE_GENDER_MAPPING``. A ``_recordId`` identifier column
    is always retained when present.
    """
    selected = ['_recordId'] if '_recordId' in df.columns else []
    selected.extend(
        col for col in df.columns
        if '__V' in col
        and reference.VOICE_GENDER_MAPPING.get(col.split('__')[1]) == target_gender
    )
    return df.select(selected)
# Get full ranking data as DataFrame
df_voices = top3_voices.collect()
# Filter for Male voices
df_male_voices = filter_voices_by_gender(df_voices, 'Male')
# 1. Male Voices: Top Choice Preference (Rank 1)
_pairwise_male_pref, _meta_male_pref = S.compute_ranking_significance(
    df_male_voices,
    alpha=0.05,
    correction="none"
)
S.plot_significance_heatmap(
    _pairwise_male_pref,
    metadata=_meta_male_pref,
    title="Male Voices Only: Top Choice Preference Significance"
)
# 2. Male Voices: Total Mentions (Visibility)
_pairwise_male_vis, _meta_male_vis = S.compute_mentions_significance(
    df_male_voices,
    alpha=0.05,
    correction="none"
)
S.plot_significance_heatmap(
    _pairwise_male_vis,
    metadata=_meta_male_vis,
    title="Male Voices Only: Total Mentions Significance"
)
# %% Male Voices (Excluding Bottom 3: V88, V86, V81)
# Start with the male voices dataframe from the previous step
# NOTE(review): the "bottom 3" voice ids are hard-coded - confirm they still
# match the latest data export before re-running.
voices_to_exclude = ['V88', 'V86', 'V81']
def filter_exclude_voices(df: pl.DataFrame, exclude_list: list[str]) -> pl.DataFrame:
    """Drop the ranking columns of the voices named in *exclude_list*.

    Columns matching ``<prefix>__Vxx`` are kept unless their ``Vxx`` id is in
    *exclude_list*; a ``_recordId`` identifier column is always retained when
    present. Non-voice columns (no ``__V`` marker) are dropped.
    """
    keep = ['_recordId'] if '_recordId' in df.columns else []
    for column in df.columns:
        if '__V' not in column:
            continue
        if column.split('__')[1] not in exclude_list:
            keep.append(column)
    return df.select(keep)
# Re-run both significance tests on the reduced male-voice candidate set.
df_male_top = filter_exclude_voices(df_male_voices, voices_to_exclude)
# 1. Male Top Candidates: Top Choice Preference
_pairwise_male_top_pref, _meta_male_top_pref = S.compute_ranking_significance(
    df_male_top,
    alpha=0.05,
    correction="none"
)
S.plot_significance_heatmap(
    _pairwise_male_top_pref,
    metadata=_meta_male_top_pref,
    title="Male Voices (Excl. Bottom 3): Top Choice Preference Significance"
)
# 2. Male Top Candidates: Total Mentions
_pairwise_male_top_vis, _meta_male_top_vis = S.compute_mentions_significance(
    df_male_top,
    alpha=0.05,
    correction="none"
)
S.plot_significance_heatmap(
    _pairwise_male_top_vis,
    metadata=_meta_male_top_vis,
    title="Male Voices (Excl. Bottom 3): Total Mentions Significance"
)
# %% [markdown]
"""
# Rank 1 Selection Significance (Voice Level)
Similar to the Total Mentions significance analysis above, but counting
only how many times each voice was ranked **1st** (out of all respondents).
This isolates first-choice preference rather than overall top-3 visibility.
"""
# %% Rank 1 Significance: All Voices
# Counts only first-choice selections (see markdown cell above).
_pairwise_df_rank1, _meta_rank1 = S.compute_rank1_significance(
    top3_voices,
    alpha=0.05,
    correction="none",
)
S.plot_significance_heatmap(
    _pairwise_df_rank1,
    metadata=_meta_rank1,
    title="Statistical Significance: Voice Rank 1 Selection"
)
# %% Rank 1 Significance: Male Voices Only
# Same test restricted to the male-voice columns selected earlier.
_pairwise_df_rank1_male, _meta_rank1_male = S.compute_rank1_significance(
    df_male_voices,
    alpha=0.05,
    correction="none",
)
S.plot_significance_heatmap(
    _pairwise_df_rank1_male,
    metadata=_meta_rank1_male,
    title="Male Voices Only: Rank 1 Selection Significance"
)
# %%

267
XX_straight_liners.py Normal file
View File

@@ -0,0 +1,267 @@
"""Extra analyses of the straight-liners"""
# %% Imports
import utils
import polars as pl
import argparse
import json
import re
from pathlib import Path
from validation import check_straight_liners
# %% Fixed Variables
# Qualtrics CSV export (labels, not numeric codes) and the QSF survey
# definition it was generated from.
RESULTS_FILE = 'data/exports/2-4-26/JPMC_Chase Brand Personality_Quant Round 1_February 4, 2026_Labels.csv'
QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'
# %% CLI argument parsing for batch automation
# When run as script: uv run XX_straight_liners.py --age '["18 to 21 years"]'
# Central filter configuration - add new filters here only
# Format: 'cli_arg_name': 'QualtricsSurvey.options_* attribute name'
# NOTE(review): identical to FILTER_CONFIG in XX_statistical_significance -
# consider moving it to a shared module.
FILTER_CONFIG = {
    'age': 'options_age',
    'gender': 'options_gender',
    'ethnicity': 'options_ethnicity',
    'income': 'options_income',
    'consumer': 'options_consumer',
    'business_owner': 'options_business_owner',
    'ai_user': 'options_ai_user',
    'investable_assets': 'options_investable_assets',
    'industry': 'options_industry',
}
def parse_cli_args():
    """Parse CLI filter arguments, or return an all-None namespace in Jupyter.

    One ``--<filter>`` argument is generated per entry in FILTER_CONFIG; each
    accepts a JSON-encoded list of option values (e.g. '["18 to 21 years"]')
    which is decoded to a Python list. None means "filter not applied".

    Returns:
        argparse.Namespace with one attribute per FILTER_CONFIG key plus
        ``filter_name`` and ``figures_dir``.
    """
    parser = argparse.ArgumentParser(description='Generate quant report with optional filters')
    # Dynamically add filter arguments from config
    for filter_name in FILTER_CONFIG:
        parser.add_argument(f'--{filter_name}', type=str, default=None, help=f'JSON list of {filter_name} values')
    parser.add_argument('--filter-name', type=str, default=None, help='Name for this filter combination (used for .txt description file)')
    parser.add_argument('--figures-dir', type=str, default=f'figures/straight-liner-analysis/{Path(RESULTS_FILE).parts[2]}', help='Override the default figures directory')
    # Only parse argv when running as a script (not in Jupyter/interactive).
    # Keep the try body minimal: get_ipython is only defined inside
    # IPython/Jupyter kernels and raises NameError elsewhere.
    try:
        get_ipython()  # noqa: F821 # type: ignore
    except NameError:
        args = parser.parse_args()
        # Decode JSON strings to lists; unset filters stay None
        for filter_name in FILTER_CONFIG:
            val = getattr(args, filter_name)
            setattr(args, filter_name, json.loads(val) if val else None)
        return args
    # Interactive session: no CLI args; all filters off. Pull figures_dir from
    # the parser instead of re-stating the literal, so the two defaults can
    # never drift apart.
    no_filters = {f: None for f in FILTER_CONFIG}
    return argparse.Namespace(**no_filters, filter_name=None, figures_dir=parser.get_default('figures_dir'))
cli_args = parse_cli_args()
# %%
S = utils.QualtricsSurvey(RESULTS_FILE, QSF_FILE, figures_dir=cli_args.figures_dir)
data_all = S.load_data()
# %% Build filtered dataset based on CLI args
# CLI args: None means "no filter applied" - filter_data() will skip None filters
# Build filter values dict dynamically from FILTER_CONFIG
_active_filters = {filter_name: getattr(cli_args, filter_name) for filter_name in FILTER_CONFIG}
_d = S.filter_data(data_all, **_active_filters)
# Write filter description file if filter-name is provided
# NOTE(review): this description/index section is duplicated in
# XX_statistical_significance - consider extracting a shared helper.
if cli_args.filter_name and S.fig_save_dir:
    # Get the filter slug (e.g., "All_Respondents", "Cons-Starter", etc.)
    _filter_slug = S._get_filter_slug()
    _filter_slug_dir = S.fig_save_dir / _filter_slug
    _filter_slug_dir.mkdir(parents=True, exist_ok=True)
    # Build filter description
    _filter_desc_lines = [
        f"Filter: {cli_args.filter_name}",
        "",
        "Applied Filters:",
    ]
    _short_desc_parts = []
    for filter_name, options_attr in FILTER_CONFIG.items():
        all_options = getattr(S, options_attr)
        values = _active_filters[filter_name]
        display_name = filter_name.replace('_', ' ').title()
        # None means no filter applied (same as "All");
        # selecting every option is also treated as "All".
        if values is not None and values != all_options:
            _short_desc_parts.append(f"{display_name}: {', '.join(values)}")
            _filter_desc_lines.append(f" {display_name}: {', '.join(values)}")
        else:
            _filter_desc_lines.append(f" {display_name}: All")
    # Write detailed description INSIDE the filter-slug directory
    # Sanitize filter name for filename usage (replace / and other chars)
    _safe_filter_name = re.sub(r'[^\w\s-]', '_', cli_args.filter_name)
    _filter_file = _filter_slug_dir / f"{_safe_filter_name}.txt"
    _filter_file.write_text('\n'.join(_filter_desc_lines))
    # Append to summary index file at figures/<export_date>/filter_index.txt
    _summary_file = S.fig_save_dir / "filter_index.txt"
    _short_desc = "; ".join(_short_desc_parts) if _short_desc_parts else "All Respondents"
    _summary_line = f"{_filter_slug} | {cli_args.filter_name} | {_short_desc}\n"
    # Append or create the summary file
    if _summary_file.exists():
        _existing = _summary_file.read_text()
        # Avoid duplicate entries for same slug
        if _filter_slug not in _existing:
            with _summary_file.open('a') as f:
                f.write(_summary_line)
    else:
        _header = "Filter Index\n" + "=" * 80 + "\n\n"
        _header += "Directory | Filter Name | Description\n"
        _header += "-" * 80 + "\n"
        _summary_file.write_text(_header + _summary_line)
# Save to logical variable name for further analysis
data = _d
# NOTE(review): result of collect() is discarded - presumably to materialise
# the lazy frame so filter errors surface here; confirm intent.
data.collect()
# %% Determine straight-liner repeat offenders
# Extract question groups with renamed columns that check_straight_liners expects.
# The raw `data` has QID-based column names; the getter methods rename them to
# patterns like SS_Green_Blue__V14__Choice_1, Voice_Scale_1_10__V48, etc.
ss_or, _ = S.get_ss_orange_red(data)
ss_gb, _ = S.get_ss_green_blue(data)
vs, _ = S.get_voice_scale_1_10(data)
# Combine all question groups into one wide LazyFrame (joined on _recordId)
# NOTE(review): default joins - respondents missing from any one group would
# be dropped from all; confirm every respondent appears in all three frames.
all_questions = ss_or.join(ss_gb, on='_recordId').join(vs, on='_recordId')
# Run straight-liner detection across all question groups
# max_score=5 catches all speaking-style straight-lining (1-5 scale)
# and voice-scale values ≤5 on the 1-10 scale
# Note: sl_threshold is NOT set on S here — this script analyses straight-liners,
# it doesn't filter them out of the dataset.
print("Running straight-liner detection across all question groups...")
sl_report, sl_df = check_straight_liners(all_questions, max_score=5)
# %% Quantify repeat offenders
# sl_df has one row per (Record ID, Question Group) that was straight-lined.
# Group by Record ID to count how many question groups each person SL'd.
if sl_df is not None and not sl_df.is_empty():
    total_respondents = data.select(pl.len()).collect().item()
    # Per-respondent count of straight-lined question groups
    respondent_sl_counts = (
        sl_df
        .group_by("Record ID")
        .agg(pl.len().alias("sl_count"))
        .sort("sl_count", descending=True)
    )
    max_sl = respondent_sl_counts["sl_count"].max()
    print(f"\nTotal respondents: {total_respondents}")
    print(f"Respondents who straight-lined at least 1 question group: "
          f"{respondent_sl_counts.height}")
    print(f"Maximum question groups straight-lined by one person: {max_sl}")
    print()
    # Build cumulative distribution: for each threshold N, count respondents
    # who straight-lined >= N question groups
    cumulative_rows = []
    for threshold in range(1, max_sl + 1):
        count = respondent_sl_counts.filter(
            pl.col("sl_count") >= threshold
        ).height
        pct = (count / total_respondents) * 100
        cumulative_rows.append({
            "threshold": threshold,
            "count": count,
            "pct": pct,
        })
        print(
            f"{threshold} question groups straight-lined: "
            f"{count} respondents ({pct:.1f}%)"
        )
    cumulative_df = pl.DataFrame(cumulative_rows)
    print(f"\n{cumulative_df}")
    # %% Save cumulative data to CSV
    _filter_slug = S._get_filter_slug()
    _csv_dir = Path(S.fig_save_dir) / _filter_slug
    _csv_dir.mkdir(parents=True, exist_ok=True)
    _csv_path = _csv_dir / "straight_liner_repeat_offenders.csv"
    cumulative_df.write_csv(_csv_path)
    print(f"Saved cumulative data to {_csv_path}")
    # %% Plot the cumulative distribution
    S.plot_straight_liner_repeat_offenders(
        cumulative_df,
        total_respondents=total_respondents,
    )
# %% Per-question straight-lining frequency
# Build human-readable question group names from the raw keys
def _humanise_question_group(key: str) -> str:
"""Convert internal question group key to a readable label.
Examples:
SS_Green_Blue__V14 → Green/Blue V14
SS_Orange_Red__V48 → Orange/Red V48
Voice_Scale_1_10 → Voice Scale (1-10)
"""
if key.startswith("SS_Green_Blue__"):
voice = key.split("__")[1]
return f"Green/Blue {voice}"
if key.startswith("SS_Orange_Red__"):
voice = key.split("__")[1]
return f"Orange/Red {voice}"
if key == "Voice_Scale_1_10":
return "Voice Scale (1-10)"
# Fallback: replace underscores
return key.replace("_", " ")
# Count UNIQUE respondents who straight-lined each question group,
# plus their share of the total sample.
per_question_counts = (
    sl_df
    .group_by("Question Group")
    .agg(pl.col("Record ID").n_unique().alias("count"))
    .sort("count", descending=True)
    .with_columns(
        (pl.col("count") / total_respondents * 100).alias("pct")
    )
)
# Add human-readable names
per_question_counts = per_question_counts.with_columns(
    pl.col("Question Group").map_elements(
        _humanise_question_group, return_dtype=pl.Utf8
    ).alias("question")
)
print("\n--- Per-Question Straight-Lining Frequency ---")
print(per_question_counts)
# Save per-question data to CSV
_csv_path_pq = _csv_dir / "straight_liner_per_question.csv"
per_question_counts.write_csv(_csv_path_pq)
print(f"Saved per-question data to {_csv_path_pq}")
# Plot
S.plot_straight_liner_per_question(
    per_question_counts,
    total_respondents=total_respondents,
)
# %% Show the top repeat offenders (respondents with most SL'd groups)
print("\n--- Top Repeat Offenders ---")
print(respondent_sl_counts.head(20))
else:
    print("No straight-liners detected in the dataset.")

File diff suppressed because one or more lines are too long

BIN
docs/README.pdf Normal file

Binary file not shown.

View File

@@ -0,0 +1,104 @@
# Appendix: Quantitative Analysis Plots - Folder Structure Manual
This folder contains all the quantitative analysis plots, sorted by the filters applied to the dataset. Each folder corresponds to a specific demographic cut.
## Folder Overview
* `All_Respondents/`: Analysis of the full dataset (no filters).
* `filter_index.txt`: A master list of every folder code and its corresponding demographic filter.
* **Filter Folders**: All other folders represent specific demographic cuts (e.g., `Age-18to21years`, `Gen-Woman`).
## How to Navigate
Each folder contains the same set of charts generated for that specific filter.
## Directory Reference Table
Below is the complete list of folder names. These names are encodings of the filters applied to the dataset, which we use to maintain consistency across our analysis.
| Directory Code | Filter Description |
| :--- | :--- |
| All_Respondents | All Respondents |
| Age-18to21years | Age: 18 to 21 years |
| Age-22to24years | Age: 22 to 24 years |
| Age-25to34years | Age: 25 to 34 years |
| Age-35to40years | Age: 35 to 40 years |
| Age-41to50years | Age: 41 to 50 years |
| Age-51to59years | Age: 51 to 59 years |
| Age-60to70years | Age: 60 to 70 years |
| Age-70yearsormore | Age: 70 years or more |
| Gen-Man | Gender: Man |
| Gen-Prefernottosay | Gender: Prefer not to say |
| Gen-Woman | Gender: Woman |
| Eth-6_grps_c64411 | Ethnicity: All options containing 'Alaska Native or Indigenous American' |
| Eth-6_grps_8f145b | Ethnicity: All options containing 'Asian or Asian American' |
| Eth-8_grps_71ac47 | Ethnicity: All options containing 'Black or African American' |
| Eth-7_grps_c5b3ce | Ethnicity: All options containing 'Hispanic or Latinx' |
| Eth-BlackorAfricanAmerican<br>MiddleEasternorNorthAfrican<br>WhiteorCaucasian+<br>MiddleEasternorNorthAfrican | Ethnicity: Middle Eastern or North African |
| Eth-AsianorAsianAmericanBlackorAfricanAmerican<br>NativeHawaiianorOtherPacificIslander+<br>NativeHawaiianorOtherPacificIslander | Ethnicity: Native Hawaiian or Other Pacific Islander |
| Eth-10_grps_cef760 | Ethnicity: All options containing 'White or Caucasian' |
| Inc-100000to149999 | Income: $100,000 to $149,999 |
| Inc-150000to199999 | Income: $150,000 to $199,999 |
| Inc-200000ormore | Income: $200,000 or more |
| Inc-25000to34999 | Income: $25,000 to $34,999 |
| Inc-35000to54999 | Income: $35,000 to $54,999 |
| Inc-55000to79999 | Income: $55,000 to $79,999 |
| Inc-80000to99999 | Income: $80,000 to $99,999 |
| Inc-Lessthan25000 | Income: Less than $25,000 |
| Cons-Lower_Mass_A+Lower_Mass_B | Consumer: Lower_Mass_A, Lower_Mass_B |
| Cons-MassAffluent_A+MassAffluent_B | Consumer: MassAffluent_A, MassAffluent_B |
| Cons-Mass_A+Mass_B | Consumer: Mass_A, Mass_B |
| Cons-Mix_of_Affluent_Wealth__<br>High_Net_Woth_A+<br>Mix_of_Affluent_Wealth__<br>High_Net_Woth_B | Consumer: Mix_of_Affluent_Wealth_&_High_Net_Woth_A, Mix_of_Affluent_Wealth_&_High_Net_Woth_B |
| Cons-Early_Professional | Consumer: Early_Professional |
| Cons-Lower_Mass_B | Consumer: Lower_Mass_B |
| Cons-MassAffluent_B | Consumer: MassAffluent_B |
| Cons-Mass_B | Consumer: Mass_B |
| Cons-Mix_of_Affluent_Wealth__<br>High_Net_Woth_B | Consumer: Mix_of_Affluent_Wealth_&_High_Net_Woth_B |
| Cons-Starter | Consumer: Starter |
| BizOwn-No | Business Owner: No |
| BizOwn-Yes | Business Owner: Yes |
| AI-Daily | Ai User: Daily |
| AI-Lessthanonceamonth | Ai User: Less than once a month |
| AI-Morethanoncedaily | Ai User: More than once daily |
| AI-Multipletimesperweek | Ai User: Multiple times per week |
| AI-Onceamonth | Ai User: Once a month |
| AI-Onceaweek | Ai User: Once a week |
| AI-RarelyNever | Ai User: Rarely/Never |
| AI-Daily+<br>Morethanoncedaily+<br>Multipletimesperweek | Ai User: Daily, More than once daily, Multiple times per week |
| AI-4_grps_d4f57a | Ai User: Once a week, Once a month, Less than once a month, Rarely/Never |
| InvAsts-0to24999 | Investable Assets: $0 to $24,999 |
| InvAsts-150000to249999 | Investable Assets: $150,000 to $249,999 |
| InvAsts-1Mto4.9M | Investable Assets: $1M to $4.9M |
| InvAsts-25000to49999 | Investable Assets: $25,000 to $49,999 |
| InvAsts-250000to499999 | Investable Assets: $250,000 to $499,999 |
| InvAsts-50000to149999 | Investable Assets: $50,000 to $149,999 |
| InvAsts-500000to999999 | Investable Assets: $500,000 to $999,999 |
| InvAsts-5Mormore | Investable Assets: $5M or more |
| InvAsts-Prefernottoanswer | Investable Assets: Prefer not to answer |
| Ind-Agricultureforestryfishingorhunting | Industry: Agriculture, forestry, fishing, or hunting |
| Ind-Artsentertainmentorrecreation | Industry: Arts, entertainment, or recreation |
| Ind-Broadcasting | Industry: Broadcasting |
| Ind-Construction | Industry: Construction |
| Ind-EducationCollegeuniversityoradult | Industry: Education College, university, or adult |
| Ind-EducationOther | Industry: Education Other |
| Ind-EducationPrimarysecondaryK-12 | Industry: Education Primary/secondary (K-12) |
| Ind-Governmentandpublicadministration | Industry: Government and public administration |
| Ind-Hotelandfoodservices | Industry: Hotel and food services |
| Ind-InformationOther | Industry: Information Other |
| Ind-InformationServicesanddata | Industry: Information Services and data |
| Ind-Legalservices | Industry: Legal services |
| Ind-ManufacturingComputerandelectronics | Industry: Manufacturing Computer and electronics |
| Ind-ManufacturingOther | Industry: Manufacturing Other |
| Ind-Notemployed | Industry: Not employed |
| Ind-Otherindustrypleasespecify | Industry: Other industry (please specify) |
| Ind-Processing | Industry: Processing |
| Ind-Publishing | Industry: Publishing |
| Ind-Realestaterentalorleasing | Industry: Real estate, rental, or leasing |
| Ind-Retired | Industry: Retired |
| Ind-Scientificortechnicalservices | Industry: Scientific or technical services |
| Ind-Software | Industry: Software |
| Ind-Telecommunications | Industry: Telecommunications |
| Ind-Transportationandwarehousing | Industry: Transportation and warehousing |
| Ind-Utilities | Industry: Utilities |
| Ind-Wholesale | Industry: Wholesale |

1336
plots.py

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,3 @@
- V46 not in scale 1-10. Qualtrics
- Straightliners
- V45 goed in qual maar slecht in quant

View File

@@ -12,6 +12,8 @@ Runs 03_quant_report.script.py for each single-filter combination:
Usage:
uv run python run_filter_combinations.py
uv run python run_filter_combinations.py --dry-run # Preview combinations without running
uv run python run_filter_combinations.py --category age # Only run age combinations
uv run python run_filter_combinations.py --category consumer # Only run consumer segment combinations
"""
import subprocess
@@ -31,123 +33,171 @@ QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_P
REPORT_SCRIPT = Path(__file__).parent / '03_quant_report.script.py'
def get_filter_combinations(survey: QualtricsSurvey) -> list[dict]:
def get_filter_combinations(survey: QualtricsSurvey, category: str = None) -> list[dict]:
"""
Generate all single-filter combinations.
Each combination isolates ONE filter value while keeping all others at "all selected".
Returns list of dicts with filter kwargs for each run.
Args:
survey: QualtricsSurvey instance with loaded data
category: Optional filter category to limit combinations to.
Valid values: 'all', 'age', 'gender', 'ethnicity', 'income', 'consumer',
'business_owner', 'ai_user', 'investable_assets', 'industry'
If None or 'all', generates all combinations.
Returns:
List of dicts with filter kwargs for each run.
"""
combinations = []
# Add "All Respondents" run (no filters = all options selected)
combinations.append({
'name': 'All_Respondents',
'filters': {} # Empty = use defaults (all selected)
})
if not category or category in ['all_filters', 'all']:
combinations.append({
'name': 'All_Respondents',
'filters': {} # Empty = use defaults (all selected)
})
# Age groups - one at a time
for age in survey.options_age:
combinations.append({
'name': f'Age-{age}',
'filters': {'age': [age]}
})
# Gender - one at a time
for gender in survey.options_gender:
combinations.append({
'name': f'Gender-{gender}',
'filters': {'gender': [gender]}
})
# Ethnicity - grouped by individual values
# Ethnicity options are comma-separated (e.g., "White or Caucasian, Hispanic or Latino")
# Create filters that include ALL options containing each individual ethnicity value
ethnicity_values = set()
for ethnicity_option in survey.options_ethnicity:
# Split by comma and strip whitespace
values = [v.strip() for v in ethnicity_option.split(',')]
ethnicity_values.update(values)
for ethnicity_value in sorted(ethnicity_values):
# Find all options that contain this value
matching_options = [
opt for opt in survey.options_ethnicity
if ethnicity_value in [v.strip() for v in opt.split(',')]
]
combinations.append({
'name': f'Ethnicity-{ethnicity_value}',
'filters': {'ethnicity': matching_options}
})
# Income - one at a time
for income in survey.options_income:
combinations.append({
'name': f'Income-{income}',
'filters': {'income': [income]}
})
# Consumer segments - combine _A and _B options, and also include standalone
# Group options by base name (removing _A/_B suffix)
consumer_groups = {}
for consumer in survey.options_consumer:
# Check if ends with _A or _B
if consumer.endswith('_A') or consumer.endswith('_B'):
base_name = consumer[:-2] # Remove last 2 chars (_A or _B)
if base_name not in consumer_groups:
consumer_groups[base_name] = []
consumer_groups[base_name].append(consumer)
else:
# Not an _A/_B option, keep as-is
consumer_groups[consumer] = [consumer]
# Add combined _A+_B options
for base_name, options in consumer_groups.items():
if len(options) > 1: # Only combine if there are multiple (_A and _B)
if not category or category in ['all_filters', 'age']:
for age in survey.options_age:
combinations.append({
'name': f'Consumer-{base_name}',
'filters': {'consumer': options}
'name': f'Age-{age}',
'filters': {'age': [age]}
})
# Add standalone options (including individual _A and _B)
for consumer in survey.options_consumer:
combinations.append({
'name': f'Consumer-{consumer}',
'filters': {'consumer': [consumer]}
})
# Gender - one at a time
if not category or category in ['all_filters', 'gender']:
for gender in survey.options_gender:
combinations.append({
'name': f'Gender-{gender}',
'filters': {'gender': [gender]}
})
# Ethnicity - grouped by individual values
if not category or category in ['all_filters', 'ethnicity']:
# Ethnicity options are comma-separated (e.g., "White or Caucasian, Hispanic or Latino")
# Create filters that include ALL options containing each individual ethnicity value
ethnicity_values = set()
for ethnicity_option in survey.options_ethnicity:
# Split by comma and strip whitespace
values = [v.strip() for v in ethnicity_option.split(',')]
ethnicity_values.update(values)
for ethnicity_value in sorted(ethnicity_values):
# Find all options that contain this value
matching_options = [
opt for opt in survey.options_ethnicity
if ethnicity_value in [v.strip() for v in opt.split(',')]
]
combinations.append({
'name': f'Ethnicity-{ethnicity_value}',
'filters': {'ethnicity': matching_options}
})
# Income - one at a time
if not category or category in ['all_filters', 'income']:
for income in survey.options_income:
combinations.append({
'name': f'Income-{income}',
'filters': {'income': [income]}
})
# Consumer segments - combine _A and _B options, and also include standalone
if not category or category in ['all_filters', 'consumer']:
# Group options by base name (removing _A/_B suffix)
consumer_groups = {}
for consumer in survey.options_consumer:
# Check if ends with _A or _B
if consumer.endswith('_A') or consumer.endswith('_B'):
base_name = consumer[:-2] # Remove last 2 chars (_A or _B)
if base_name not in consumer_groups:
consumer_groups[base_name] = []
consumer_groups[base_name].append(consumer)
else:
# Not an _A/_B option, keep as-is
consumer_groups[consumer] = [consumer]
# Add combined _A+_B options
for base_name, options in consumer_groups.items():
if len(options) > 1: # Only combine if there are multiple (_A and _B)
combinations.append({
'name': f'Consumer-{base_name}',
'filters': {'consumer': options}
})
# Add standalone options (including individual _A and _B)
for consumer in survey.options_consumer:
combinations.append({
'name': f'Consumer-{consumer}',
'filters': {'consumer': [consumer]}
})
# Business Owner - one at a time
for business_owner in survey.options_business_owner:
combinations.append({
'name': f'BusinessOwner-{business_owner}',
'filters': {'business_owner': [business_owner]}
})
if not category or category in ['all_filters', 'business_owner']:
for business_owner in survey.options_business_owner:
combinations.append({
'name': f'BusinessOwner-{business_owner}',
'filters': {'business_owner': [business_owner]}
})
# AI User - one at a time
for ai_user in survey.options_ai_user:
if not category or category in ['all_filters', 'ai_user']:
for ai_user in survey.options_ai_user:
combinations.append({
'name': f'AIUser-{ai_user}',
'filters': {'ai_user': [ai_user]}
})
# AI user daily, more than once daily, en multiple times a week = frequent
combinations.append({
'name': f'AIUser-{ai_user}',
'filters': {'ai_user': [ai_user]}
'name': 'AIUser-Frequent',
'filters': {'ai_user': [
'Daily', 'More than once daily', 'Multiple times per week'
]}
})
combinations.append({
'name': 'AIUser-RarelyNever',
'filters': {'ai_user': [
'Once a month', 'Less than once a month', 'Once a week', 'Rarely/Never'
]}
})
# Investable Assets - one at a time
for investable_assets in survey.options_investable_assets:
combinations.append({
'name': f'Assets-{investable_assets}',
'filters': {'investable_assets': [investable_assets]}
})
if not category or category in ['all_filters', 'investable_assets']:
for investable_assets in survey.options_investable_assets:
combinations.append({
'name': f'Assets-{investable_assets}',
'filters': {'investable_assets': [investable_assets]}
})
# Industry - one at a time
for industry in survey.options_industry:
if not category or category in ['all_filters', 'industry']:
for industry in survey.options_industry:
combinations.append({
'name': f'Industry-{industry}',
'filters': {'industry': [industry]}
})
# Voice ranking completeness filter
# These use a special flag rather than demographic filters, so we store
# the mode in a dedicated key that run_report passes as --voice-ranking-filter.
if not category or category in ['all_filters', 'voice_ranking']:
combinations.append({
'name': f'Industry-{industry}',
'filters': {'industry': [industry]}
'name': 'VoiceRanking-OnlyMissing',
'filters': {},
'voice_ranking_filter': 'only-missing',
})
combinations.append({
'name': 'VoiceRanking-ExcludeMissing',
'filters': {},
'voice_ranking_filter': 'exclude-missing',
})
return combinations
def run_report(filters: dict, name: str = None, dry_run: bool = False) -> bool:
def run_report(filters: dict, name: str = None, dry_run: bool = False, sl_threshold: int = None, voice_ranking_filter: str = None) -> bool:
"""
Run the report script with given filters.
@@ -155,6 +205,10 @@ def run_report(filters: dict, name: str = None, dry_run: bool = False) -> bool:
filters: Dict of filter_name -> list of values
name: Name for this filter combination (used for .txt description file)
dry_run: If True, just print command without running
sl_threshold: If set, exclude respondents with >= N straight-lined question groups
voice_ranking_filter: If set, filter by voice ranking completeness.
'only-missing' keeps only respondents missing QID98 data,
'exclude-missing' removes them.
Returns:
True if successful, False otherwise
@@ -165,6 +219,14 @@ def run_report(filters: dict, name: str = None, dry_run: bool = False) -> bool:
if name:
cmd.extend(['--filter-name', name])
# Pass straight-liner threshold if specified
if sl_threshold is not None:
cmd.extend(['--sl-threshold', str(sl_threshold)])
# Pass voice ranking filter if specified
if voice_ranking_filter is not None:
cmd.extend(['--voice-ranking-filter', voice_ranking_filter])
for filter_name, values in filters.items():
if values:
cmd.extend([f'--{filter_name}', json.dumps(values)])
@@ -193,6 +255,13 @@ def main():
import argparse
parser = argparse.ArgumentParser(description='Run quant report for all filter combinations')
parser.add_argument('--dry-run', action='store_true', help='Preview combinations without running')
parser.add_argument(
'--category',
choices=['all_filters', 'all', 'age', 'gender', 'ethnicity', 'income', 'consumer', 'business_owner', 'ai_user', 'investable_assets', 'industry', 'voice_ranking'],
default='all_filters',
help='Filter category to run combinations for (default: all_filters)'
)
parser.add_argument('--sl-threshold', type=int, default=None, help='Exclude respondents who straight-lined >= N question groups (passed to report script)')
args = parser.parse_args()
# Load survey to get available filter options
@@ -200,15 +269,19 @@ def main():
survey = QualtricsSurvey(RESULTS_FILE, QSF_FILE)
survey.load_data() # Populates options_* attributes
# Generate all combinations
combinations = get_filter_combinations(survey)
print(f"Generated {len(combinations)} filter combinations")
# Generate combinations for specified category
combinations = get_filter_combinations(survey, category=args.category)
category_desc = f" for category '{args.category}'" if args.category != 'all' else ''
print(f"Generated {len(combinations)} filter combinations{category_desc}")
if args.sl_threshold is not None:
print(f"Straight-liner threshold: excluding respondents with ≥{args.sl_threshold} straight-lined question groups")
if args.dry_run:
print("\nDRY RUN - Commands that would be executed:")
for combo in combinations:
print(f"\n{combo['name']}:")
run_report(combo['filters'], name=combo['name'], dry_run=True)
run_report(combo['filters'], name=combo['name'], dry_run=True, sl_threshold=args.sl_threshold, voice_ranking_filter=combo.get('voice_ranking_filter'))
return
# Run each combination with progress bar
@@ -217,7 +290,7 @@ def main():
for combo in tqdm(combinations, desc="Running reports", unit="filter"):
tqdm.write(f"Running: {combo['name']}")
if run_report(combo['filters'], name=combo['name']):
if run_report(combo['filters'], name=combo['name'], sl_threshold=args.sl_threshold, voice_ranking_filter=combo.get('voice_ranking_filter')):
successful += 1
else:
failed.append(combo['name'])

File diff suppressed because one or more lines are too long

View File

@@ -77,6 +77,13 @@ class ColorPalette:
GENDER_MALE_NEUTRAL = "#B8C9D9" # Grey-Blue
GENDER_FEMALE_NEUTRAL = "#D9B8C9" # Grey-Pink
# Gender colors for correlation plots (green/red indicate +/- correlation)
# Male = darker shade, Female = lighter shade
CORR_MALE_POSITIVE = "#1B5E20" # Dark Green
CORR_FEMALE_POSITIVE = "#81C784" # Light Green
CORR_MALE_NEGATIVE = "#B71C1C" # Dark Red
CORR_FEMALE_NEGATIVE = "#E57373" # Light Red
# Speaking Style Colors (named after the style quadrant colors)
STYLE_GREEN = "#2E7D32" # Forest Green
STYLE_BLUE = "#1565C0" # Strong Blue

526
utils.py
View File

@@ -413,10 +413,30 @@ def _iter_picture_shapes(shapes):
yield shape
def _set_shape_alt_text(shape, alt_text: str):
"""
Set alt text (descr attribute) for a PowerPoint shape.
"""
nvPr = None
# Check for common property names used by python-pptx elements
for attr in ['nvPicPr', 'nvSpPr', 'nvGrpSpPr', 'nvGraphicFramePr', 'nvCxnSpPr']:
if hasattr(shape._element, attr):
nvPr = getattr(shape._element, attr)
break
if nvPr and hasattr(nvPr, 'cNvPr'):
nvPr.cNvPr.set("descr", alt_text)
def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str, Path], output_path: Union[str, Path] = None, use_perceptual_hash: bool = True):
"""
Updates the alt text of images in a PowerPoint presentation by matching
their content with images in a source directory.
Updates the alt text of images in a PowerPoint presentation.
1. First pass: Validates existing alt-text format (<filter>/<filename>).
- Fixes full paths by keeping only the last two parts.
- Clears invalid alt-text.
2. Second pass: If images are missing alt-text, matches them against source directory
using perceptual hash or SHA1.
Args:
ppt_path (str/Path): Path to the PowerPoint file.
@@ -430,10 +450,7 @@ def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str,
if output_path is None:
output_path = ppt_path
# 1. Build lookup map of {hash: file_path} from the source directory
image_hash_map = _build_image_hash_map(image_source_dir, use_perceptual_hash=use_perceptual_hash)
# 2. Open Presentation
# Open Presentation
try:
prs = Presentation(ppt_path)
except Exception as e:
@@ -441,110 +458,132 @@ def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str,
return
updates_count = 0
unmatched_images = [] # Collect unmatched images to report at the end
images_needing_match = []
slides = list(prs.slides)
total_slides = len(slides)
print(f"Processing {total_slides} slides...")
print(f"Scanning {total_slides} slides for existing alt-text...")
# Pass 1: Scan and clean existing alt-text
for i, slide in enumerate(slides):
# Use recursive iterator to find all pictures including those in groups/placeholders
picture_shapes = list(_iter_picture_shapes(slide.shapes))
for shape in picture_shapes:
try:
# Get image hash based on selected method
if use_perceptual_hash:
# Use perceptual hash of the image blob for visual content matching
current_hash = _calculate_perceptual_hash(shape.image.blob)
else:
# Use SHA1 hash from python-pptx (exact byte match)
current_hash = shape.image.sha1
alt_text = _get_shape_alt_text(shape)
has_valid_alt = False
if alt_text:
# Handle potential path separators and whitespace
clean_alt = alt_text.strip().replace('\\', '/')
parts = clean_alt.split('/')
if current_hash in image_hash_map:
original_path = image_hash_map[current_hash]
# Check if it looks like a path/file reference (at least 2 parts like dir/file)
if len(parts) >= 2:
# Enforce format: keep last 2 parts (e.g. filter/image.png)
new_alt = '/'.join(parts[-2:])
# Generate Alt Text
try:
# Prepare path for generator.
# Try to relativize to CWD if capable
pass_path = original_path
try:
pass_path = original_path.relative_to(Path.cwd())
except ValueError:
pass
if new_alt != alt_text:
print(f"Slide {i+1}: Fixing alt-text format: '{alt_text}' -> '{new_alt}'")
_set_shape_alt_text(shape, new_alt)
updates_count += 1
new_alt_text = image_alt_text_generator(pass_path)
# Check existing alt text to avoid redundant updates/log them
# Accessing alt text via cNvPr
# Note: Different shape types might store non-visual props differently
# Picture: nvPicPr.cNvPr
# GraphicFrame: nvGraphicFramePr.cNvPr
# Group: nvGrpSpPr.cNvPr
# Shape/Placeholder: nvSpPr.cNvPr
nvPr = None
for attr in ['nvPicPr', 'nvSpPr', 'nvGrpSpPr', 'nvGraphicFramePr', 'nvCxnSpPr']:
if hasattr(shape._element, attr):
nvPr = getattr(shape._element, attr)
break
if nvPr and hasattr(nvPr, 'cNvPr'):
cNvPr = nvPr.cNvPr
existing_alt_text = cNvPr.get("descr", "")
if existing_alt_text != new_alt_text:
print(f"Slide {i+1}: Updating alt text for image matches '{pass_path}'")
print(f" Old: '{existing_alt_text}' -> New: '{new_alt_text}'")
cNvPr.set("descr", new_alt_text)
updates_count += 1
else:
print(f"Could not find cNvPr for shape on slide {i+1}")
except AssertionError as e:
print(f"Skipping match for {original_path} due to generator error: {e}")
except Exception as e:
print(f"Error updating alt text for {original_path}: {e}")
has_valid_alt = True
else:
# Check if image already has alt text set - if so, skip reporting as unmatched
existing_alt = _get_shape_alt_text(shape)
if existing_alt:
# Image already has alt text, no need to report as unmatched
continue
shape_id = getattr(shape, 'shape_id', getattr(shape, 'id', 'Unknown ID'))
shape_name = shape.name if shape.name else f"Unnamed Shape (ID: {shape_id})"
hash_type = "pHash" if use_perceptual_hash else "SHA1"
unmatched_images.append({
'slide': i+1,
'shape_name': shape_name,
'hash_type': hash_type,
'hash': current_hash
})
except AttributeError:
continue
except Exception as e:
print(f"Error processing shape on slide {i+1}: {e}")
# User requested deleting other cases that do not meet format
# If it's single word or doesn't look like our path format
pass # logic below handles this
if not has_valid_alt:
if alt_text:
print(f"Slide {i+1}: Invalid/Legacy alt-text '{alt_text}'. Clearing for re-matching.")
_set_shape_alt_text(shape, "")
updates_count += 1
# Queue for hash matching
shape_id = getattr(shape, 'shape_id', getattr(shape, 'id', 'Unknown ID'))
shape_name = shape.name if shape.name else f"Unnamed Shape (ID: {shape_id})"
images_needing_match.append({
'slide_idx': i, # 0-based
'slide_num': i+1,
'shape': shape,
'shape_name': shape_name
})
# Print summary
if not images_needing_match:
print("\nAll images have valid alt-text format. No hash matching needed.")
if updates_count > 0:
prs.save(output_path)
print(f"✓ Saved updated presentation to {output_path} with {updates_count} updates.")
else:
print("Presentation is up to date.")
return
# Pass 2: Hash Matching
print(f"\n{len(images_needing_match)} images missing proper alt-text. Proceeding with hash matching...")
# Build lookup map of {hash: file_path} only if needed
image_hash_map = _build_image_hash_map(image_source_dir, use_perceptual_hash=use_perceptual_hash)
unmatched_images = []
for item in images_needing_match:
shape = item['shape']
slide_num = item['slide_num']
try:
# Get image hash
if use_perceptual_hash:
current_hash = _calculate_perceptual_hash(shape.image.blob)
else:
current_hash = shape.image.sha1
if current_hash in image_hash_map:
original_path = image_hash_map[current_hash]
# Generate Alt Text
try:
# Try to relativize to CWD if capable
pass_path = original_path
try:
pass_path = original_path.relative_to(Path.cwd())
except ValueError:
pass
new_alt_text = image_alt_text_generator(pass_path)
print(f"Slide {slide_num}: Match found! Assigning alt-text '{new_alt_text}'")
_set_shape_alt_text(shape, new_alt_text)
updates_count += 1
except Exception as e:
print(f"Error generating alt text for {original_path}: {e}")
else:
hash_type = "pHash" if use_perceptual_hash else "SHA1"
unmatched_images.append({
'slide': slide_num,
'shape_name': item['shape_name'],
'hash_type': hash_type,
'hash': current_hash
})
except Exception as e:
print(f"Error processing shape on slide {slide_num}: {e}")
# Save and Print Summary
print("\n" + "="*80)
if updates_count > 0:
prs.save(output_path)
print(f"✓ Saved updated presentation to {output_path} with {updates_count} updates.")
else:
print("No images matched or required updates.")
print("No matches found for missing images.")
# List unmatched images at the end
if unmatched_images:
print(f"\n{len(unmatched_images)} image(s) not found in source directory:")
print(f"\n{len(unmatched_images)} image(s) could not be matched:")
for img in unmatched_images:
print(f" • Slide {img['slide']}: '{img['shape_name']}' ({img['hash_type']}: {img['hash']})")
else:
print("\n✓ All images matched successfully!")
print("\n✓ All images processed successfully!")
print("="*80)
@@ -723,7 +762,7 @@ def normalize_global_values(df: pl.DataFrame, target_cols: list[str]) -> pl.Data
class QualtricsSurvey(QualtricsPlotsMixin):
"""Class to handle Qualtrics survey data."""
def __init__(self, data_path: Union[str, Path], qsf_path: Union[str, Path]):
def __init__(self, data_path: Union[str, Path], qsf_path: Union[str, Path], figures_dir: Union[str, Path] = None):
if isinstance(data_path, str):
data_path = Path(data_path)
@@ -735,8 +774,12 @@ class QualtricsSurvey(QualtricsPlotsMixin):
self.qid_descr_map = self._extract_qid_descr_map()
self.qsf:dict = self._load_qsf()
# get export directory name for saving figures ie if data_path='data/exports/OneDrive_2026-01-21/...' should be 'figures/OneDrive_2026-01-21'
self.fig_save_dir = Path('figures') / self.data_filepath.parts[2]
if figures_dir:
self.fig_save_dir = Path(figures_dir)
else:
# get export directory name for saving figures ie if data_path='data/exports/OneDrive_2026-01-21/...' should be 'figures/OneDrive_2026-01-21'
self.fig_save_dir = Path('figures') / self.data_filepath.parts[2]
if not self.fig_save_dir.exists():
self.fig_save_dir.mkdir(parents=True, exist_ok=True)
@@ -879,40 +922,42 @@ class QualtricsSurvey(QualtricsPlotsMixin):
"""
# Apply filters - skip if empty list (columns with all NULLs produce empty options)
# OR if all options are selected (to avoid dropping NULLs)
self.filter_age = age
if age is not None and len(age) > 0:
if age is not None and len(age) > 0 and set(age) != set(self.options_age):
q = q.filter(pl.col('QID1').is_in(age))
self.filter_gender = gender
if gender is not None and len(gender) > 0:
if gender is not None and len(gender) > 0 and set(gender) != set(self.options_gender):
q = q.filter(pl.col('QID2').is_in(gender))
self.filter_consumer = consumer
if consumer is not None and len(consumer) > 0:
if consumer is not None and len(consumer) > 0 and set(consumer) != set(self.options_consumer):
q = q.filter(pl.col('Consumer').is_in(consumer))
self.filter_ethnicity = ethnicity
if ethnicity is not None and len(ethnicity) > 0:
if ethnicity is not None and len(ethnicity) > 0 and set(ethnicity) != set(self.options_ethnicity):
q = q.filter(pl.col('QID3').is_in(ethnicity))
self.filter_income = income
if income is not None and len(income) > 0:
if income is not None and len(income) > 0 and set(income) != set(self.options_income):
q = q.filter(pl.col('QID15').is_in(income))
self.filter_business_owner = business_owner
if business_owner is not None and len(business_owner) > 0:
if business_owner is not None and len(business_owner) > 0 and set(business_owner) != set(self.options_business_owner):
q = q.filter(pl.col('QID4').is_in(business_owner))
self.filter_ai_user = ai_user
if ai_user is not None and len(ai_user) > 0:
if ai_user is not None and len(ai_user) > 0 and set(ai_user) != set(self.options_ai_user):
q = q.filter(pl.col('QID22').is_in(ai_user))
self.filter_investable_assets = investable_assets
if investable_assets is not None and len(investable_assets) > 0:
if investable_assets is not None and len(investable_assets) > 0 and set(investable_assets) != set(self.options_investable_assets):
q = q.filter(pl.col('QID16').is_in(investable_assets))
self.filter_industry = industry
if industry is not None and len(industry) > 0:
if industry is not None and len(industry) > 0 and set(industry) != set(self.options_industry):
q = q.filter(pl.col('QID17').is_in(industry))
self.data_filtered = q
@@ -1070,6 +1115,60 @@ class QualtricsSurvey(QualtricsPlotsMixin):
return self._get_subset(q, list(QIDs_map.keys()), rename_cols=False).rename(QIDs_map), None
def get_top_3_voices_missing_ranking(
    self, q: pl.LazyFrame
) -> pl.DataFrame:
    """Find respondents who completed the top-3 voice selection (QID36)
    but never answered the explicit ranking question (QID98).

    Such respondents have selection-order data in ``QID36_G0_*_RANK`` yet
    every ``QID98_*`` ranking column is null, so ``get_top_3_voices()``
    yields all-null rows for them and rank-based plots (e.g.
    ``plot_most_ranked_1``) undercount.

    Parameters:
        q: The (optionally filtered) LazyFrame from ``load_data()``.

    Returns:
        A collected ``pl.DataFrame`` with columns:

        - ``_recordId``: the respondent identifier
        - ``3_Ranked``: comma-separated text of the 3 voices they selected
    """
    # QID98-based ranking data: one column per voice, values are ranks.
    ranked, _ = self.get_top_3_voices(q)
    ranked_df = ranked.collect()
    voice_cols = [name for name in ranked_df.columns if name != '_recordId']

    # A respondent is "missing" when every ranking column is null.
    every_col_null = pl.lit(True)
    for name in voice_cols:
        every_col_null = every_col_null & pl.col(name).is_null()

    missing = ranked_df.filter(every_col_null).select('_recordId')
    if missing.height == 0:
        # Empty result with a stable schema so callers can rely on columns.
        return pl.DataFrame(schema={
            '_recordId': pl.Utf8,
            '3_Ranked': pl.Utf8,
        })

    # Attach the selected-voices text from the 18 -> 8 -> 3 funnel question.
    funnel, _ = self.get_18_8_3(q)
    selections = funnel.collect().select(['_recordId', '3_Ranked'])
    return missing.join(selections, on='_recordId', how='left')
def get_ss_orange_red(self, q: pl.LazyFrame) -> Union[pl.LazyFrame, dict]:
"""Extract columns containing the SS Orange/Red ratings for the Chase virtual assistant.
@@ -1543,6 +1642,235 @@ class QualtricsSurvey(QualtricsPlotsMixin):
return results_df, metadata
def compute_mentions_significance(
    self,
    data: pl.LazyFrame | pl.DataFrame,
    alpha: float = 0.05,
    correction: str = "bonferroni",
) -> tuple[pl.DataFrame, dict]:
    """Compute statistical significance for Total Mentions (Rank 1+2+3).

    Tests whether the proportion of respondents who included a voice in
    their Top 3 is significantly different between voices, using a
    two-proportion z-test for every voice pair.

    NOTE(review): the z-test treats the two proportions as independent
    samples, but both come from the same respondents (paired data) — a
    McNemar-style test would be stricter. Kept as-is for consistency with
    existing reports; confirm before changing.

    Args:
        data: Ranking data (rows=respondents, cols=voices, values=rank).
        alpha: Significance level.
        correction: Multiple comparison correction method. One of
            'bonferroni', 'holm', or 'none'.

    Returns:
        tuple: (pairwise_df, metadata)

    Raises:
        ValueError: If ``correction`` is unknown, there are fewer than 2
            ranking columns, or the data has no respondents.
    """
    from scipy import stats as scipy_stats
    import numpy as np

    # Fail fast on an unknown correction: previously it silently left the
    # adjusted p-values as NaN, marking every pair non-significant.
    if correction not in ("bonferroni", "holm", "none"):
        raise ValueError(f"Unknown correction method: {correction!r}")

    df = data.collect() if isinstance(data, pl.LazyFrame) else data

    ranking_cols = [c for c in df.columns if c != '_recordId']
    if len(ranking_cols) < 2:
        raise ValueError("Need at least 2 ranking columns")

    total_respondents = df.height
    if total_respondents == 0:
        # Avoid an opaque ZeroDivisionError below.
        raise ValueError("No respondents in data; cannot compute proportions")

    # Count mentions (any rank) for each voice.
    mentions_data = {}
    for col in ranking_cols:
        label = self._clean_voice_label(col)
        mentions_data[label] = df.filter(pl.col(col).is_not_null()).height

    labels = sorted(mentions_data.keys())
    results = []
    n_comparisons = len(labels) * (len(labels) - 1) // 2

    for i, label1 in enumerate(labels):
        for label2 in labels[i+1:]:
            count1 = mentions_data[label1]
            count2 = mentions_data[label2]
            pct1 = count1 / total_respondents
            pct2 = count2 / total_respondents

            # Z-test for two proportions (same denominator for both).
            n1 = total_respondents
            n2 = total_respondents
            p_pooled = (count1 + count2) / (n1 + n2)
            se = np.sqrt(p_pooled * (1 - p_pooled) * (1/n1 + 1/n2))
            if se > 0:
                z_stat = (pct1 - pct2) / se
                p_value = 2 * (1 - scipy_stats.norm.cdf(abs(z_stat)))
            else:
                # Both proportions are 0 or both are 1: no evidence of a
                # difference.
                p_value = 1.0

            results.append({
                'group1': label1,
                'group2': label2,
                'p_value': float(p_value),
                # Column names reuse the rank1_* scheme for compatibility
                # with the shared heatmap plotting code.
                'rank1_count1': count1,
                'rank1_count2': count2,
                'rank1_pct1': round(pct1 * 100, 1),
                'rank1_pct2': round(pct2 * 100, 1),
                'total1': n1,
                'total2': n2,
                'effect_size': pct1 - pct2,  # Difference in proportions
            })

    results_df = pl.DataFrame(results)
    p_values = results_df['p_value'].to_numpy()

    if correction == "bonferroni":
        p_adjusted = np.minimum(p_values * n_comparisons, 1.0)
    elif correction == "holm":
        # Holm step-down: multiply the j-th smallest p-value by (m - j),
        # enforce monotonicity, cap at 1, then undo the sort.
        sorted_idx = np.argsort(p_values)
        sorted_p = p_values[sorted_idx]
        m = len(sorted_p)
        adjusted = np.zeros(m)
        for j in range(m):
            adjusted[j] = sorted_p[j] * (m - j)
        for j in range(1, m):
            adjusted[j] = max(adjusted[j], adjusted[j-1])
        adjusted = np.minimum(adjusted, 1.0)
        p_adjusted = adjusted[np.argsort(sorted_idx)]
    else:  # correction == "none" (validated above)
        p_adjusted = p_values.astype(float)

    results_df = results_df.with_columns([
        pl.Series('p_adjusted', p_adjusted),
        pl.Series('significant', p_adjusted < alpha),
    ]).sort('p_value')

    metadata = {
        'test_type': 'proportion_z_test_mentions',
        'alpha': alpha,
        'correction': correction,
        'n_comparisons': n_comparisons,
    }
    return results_df, metadata
def compute_rank1_significance(
    self,
    data: pl.LazyFrame | pl.DataFrame,
    alpha: float = 0.05,
    correction: str = "bonferroni",
) -> tuple[pl.DataFrame, dict]:
    """Compute statistical significance for Rank 1 selections only.

    Like ``compute_mentions_significance`` but counts only how many times
    each voice/character was ranked **1st**, using total respondents as
    the denominator. This tests whether first-choice preference differs
    significantly between voices via pairwise two-proportion z-tests.

    NOTE(review): the z-test treats the two proportions as independent
    samples, but both come from the same respondents (paired data) — a
    McNemar-style test would be stricter. Kept as-is for consistency with
    existing reports; confirm before changing.

    Args:
        data: Ranking data (rows=respondents, cols=voices, values=rank).
        alpha: Significance level.
        correction: Multiple comparison correction method. One of
            'bonferroni', 'holm', or 'none'.

    Returns:
        tuple: (pairwise_df, metadata)

    Raises:
        ValueError: If ``correction`` is unknown, there are fewer than 2
            ranking columns, or the data has no respondents.
    """
    from scipy import stats as scipy_stats
    import numpy as np

    # Fail fast on an unknown correction: previously it silently left the
    # adjusted p-values as NaN, marking every pair non-significant.
    if correction not in ("bonferroni", "holm", "none"):
        raise ValueError(f"Unknown correction method: {correction!r}")

    df = data.collect() if isinstance(data, pl.LazyFrame) else data

    ranking_cols = [c for c in df.columns if c != '_recordId']
    if len(ranking_cols) < 2:
        raise ValueError("Need at least 2 ranking columns")

    total_respondents = df.height
    if total_respondents == 0:
        # Avoid an opaque ZeroDivisionError below.
        raise ValueError("No respondents in data; cannot compute proportions")

    # Count rank-1 selections for each voice.
    rank1_data: dict[str, int] = {}
    for col in ranking_cols:
        label = self._clean_voice_label(col)
        rank1_data[label] = df.filter(pl.col(col) == 1).height

    labels = sorted(rank1_data.keys())
    results = []
    n_comparisons = len(labels) * (len(labels) - 1) // 2

    for i, label1 in enumerate(labels):
        for label2 in labels[i+1:]:
            count1 = rank1_data[label1]
            count2 = rank1_data[label2]
            pct1 = count1 / total_respondents
            pct2 = count2 / total_respondents

            # Z-test for two proportions (same denominator for both).
            n1 = total_respondents
            n2 = total_respondents
            p_pooled = (count1 + count2) / (n1 + n2)
            se = np.sqrt(p_pooled * (1 - p_pooled) * (1/n1 + 1/n2))
            if se > 0:
                z_stat = (pct1 - pct2) / se
                p_value = 2 * (1 - scipy_stats.norm.cdf(abs(z_stat)))
            else:
                # Both proportions are 0 or both are 1: no evidence of a
                # difference.
                p_value = 1.0

            results.append({
                'group1': label1,
                'group2': label2,
                'p_value': float(p_value),
                'rank1_count1': count1,
                'rank1_count2': count2,
                'rank1_pct1': round(pct1 * 100, 1),
                'rank1_pct2': round(pct2 * 100, 1),
                'total1': n1,
                'total2': n2,
                'effect_size': pct1 - pct2,
            })

    results_df = pl.DataFrame(results)
    p_values = results_df['p_value'].to_numpy()

    if correction == "bonferroni":
        p_adjusted = np.minimum(p_values * n_comparisons, 1.0)
    elif correction == "holm":
        # Holm step-down: multiply the j-th smallest p-value by (m - j),
        # enforce monotonicity, cap at 1, then undo the sort.
        sorted_idx = np.argsort(p_values)
        sorted_p = p_values[sorted_idx]
        m = len(sorted_p)
        adjusted = np.zeros(m)
        for j in range(m):
            adjusted[j] = sorted_p[j] * (m - j)
        for j in range(1, m):
            adjusted[j] = max(adjusted[j], adjusted[j-1])
        adjusted = np.minimum(adjusted, 1.0)
        p_adjusted = adjusted[np.argsort(sorted_idx)]
    else:  # correction == "none" (validated above)
        p_adjusted = p_values.astype(float)

    results_df = results_df.with_columns([
        pl.Series('p_adjusted', p_adjusted),
        pl.Series('significant', p_adjusted < alpha),
    ]).sort('p_value')

    metadata = {
        'test_type': 'proportion_z_test_rank1',
        'alpha': alpha,
        'correction': correction,
        'n_comparisons': n_comparisons,
    }
    return results_df, metadata
def process_speaking_style_data(
df: Union[pl.LazyFrame, pl.DataFrame],