Compare commits
28 Commits
e7166a7957
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
| 03a716e8ec | |||
| 8720bb670d | |||
| 9dfab75925 | |||
| 14e28cf368 | |||
| 8e181e193a | |||
| 6c16993cb3 | |||
| 92c6fc03ab | |||
| 7fb6570190 | |||
| 840bd2940d | |||
| af9a15ccb0 | |||
| a3cf9f103d | |||
| f0eab32c34 | |||
| d231fc02db | |||
| fc76bb0ab5 | |||
| ab78276a97 | |||
| e17646eb70 | |||
| ad1d8c6e58 | |||
| f5b4c247b8 | |||
| a35670aa72 | |||
| 36280a6ff8 | |||
| 9a587dcc4c | |||
| 9a49d1c690 | |||
| 8f505da550 | |||
| 495b56307c | |||
| 1e76a82f24 | |||
| 01b7d50637 | |||
| dca9ac11ba | |||
| 081fb0dd6e |
5
.vscode/extensions.json
vendored
Normal file
5
.vscode/extensions.json
vendored
Normal file
@@ -0,0 +1,5 @@
|
||||
{
|
||||
"recommendations": [
|
||||
"wakatime.vscode-wakatime"
|
||||
]
|
||||
}
|
||||
5
.vscode/settings.json
vendored
Normal file
5
.vscode/settings.json
vendored
Normal file
@@ -0,0 +1,5 @@
|
||||
{
|
||||
"chat.tools.terminal.autoApprove": {
|
||||
"/home/luigi/Documents/VoiceBranding/JPMC/Phase-3/.venv/bin/python": true
|
||||
}
|
||||
}
|
||||
@@ -1,691 +0,0 @@
|
||||
|
||||
__generated_with = "0.19.7"
|
||||
|
||||
# %%
|
||||
import marimo as mo
|
||||
import polars as pl
|
||||
from pathlib import Path
|
||||
import argparse
|
||||
import json
|
||||
|
||||
from validation import check_progress, duration_validation, check_straight_liners
|
||||
from utils import QualtricsSurvey, combine_exclusive_columns, calculate_weighted_ranking_scores
|
||||
import utils
|
||||
|
||||
from speaking_styles import SPEAKING_STYLES
|
||||
|
||||
# %%
|
||||
# CLI argument parsing for batch automation
|
||||
# When run as script: python 03_quant_report.script.py --age '["18 to 21 years"]' --consumer '["Starter"]'
|
||||
# When run in Jupyter: args will use defaults (all filters = None = all options selected)
|
||||
|
||||
# Central filter configuration - add new filters here only
# Format: 'cli_arg_name': 'QualtricsSurvey.options_* attribute name'
FILTER_CONFIG = {
    'age': 'options_age',
    'gender': 'options_gender',
    'ethnicity': 'options_ethnicity',
    'income': 'options_income',
    'consumer': 'options_consumer',
    # Add new filters here: 'newfilter': 'options_newfilter',
}


def parse_cli_args():
    """Parse CLI filter arguments, or return all-None defaults in notebooks.

    Each key of ``FILTER_CONFIG`` becomes a ``--<name>`` option that expects a
    JSON list, e.g. ``--age '["18 to 21 years"]'``.  ``--filter-name`` labels
    the combination for the .txt description file.

    Returns:
        argparse.Namespace: one attribute per filter key (a list of selected
        values, or ``None`` meaning "no filter / all options"), plus
        ``filter_name``.
    """
    parser = argparse.ArgumentParser(description='Generate quant report with optional filters')

    # Dynamically add filter arguments from config
    for filter_name in FILTER_CONFIG:
        parser.add_argument(f'--{filter_name}', type=str, default=None, help=f'JSON list of {filter_name} values')

    parser.add_argument('--filter-name', type=str, default=None, help='Name for this filter combination (used for .txt description file)')

    # Only parse argv when running as a script; inside Jupyter/marimo the
    # kernel injects its own argv, so fall back to "no filters selected".
    try:
        get_ipython()  # noqa: F821  # defined only inside IPython-like kernels
        # Return namespace with all filters set to None
        return argparse.Namespace(**{f: None for f in FILTER_CONFIG}, filter_name=None)
    except NameError:
        args = parser.parse_args()
        # Decode each JSON-list filter value.  Fail with a readable CLI error
        # instead of a raw json traceback when the JSON is malformed; empty /
        # missing values stay None (no filter applied).
        for filter_name in FILTER_CONFIG:
            val = getattr(args, filter_name)
            if not val:
                setattr(args, filter_name, None)
                continue
            try:
                setattr(args, filter_name, json.loads(val))
            except json.JSONDecodeError as e:
                parser.error(f"--{filter_name} must be a JSON list (got {val!r}): {e}")
        return args
|
||||
|
||||
cli_args = parse_cli_args()
|
||||
|
||||
# %%
|
||||
|
||||
# file_browser = mo.ui.file_browser(
|
||||
# initial_path="./data/exports", multiple=False, restrict_navigation=True, filetypes=[".csv"], label="Select 'Labels' File"
|
||||
# )
|
||||
# file_browser
|
||||
|
||||
# # %%
|
||||
# mo.stop(file_browser.path(index=0) is None, mo.md("**⚠️ Please select a `_Labels.csv` file above to proceed**"))
|
||||
# RESULTS_FILE = Path(file_browser.path(index=0))
|
||||
|
||||
RESULTS_FILE = 'data/exports/2-3-26_Copy-2-2-26/JPMC_Chase Brand Personality_Quant Round 1_February 2, 2026_Labels.csv'
|
||||
QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'
|
||||
|
||||
# %%
|
||||
S = QualtricsSurvey(RESULTS_FILE, QSF_FILE)
|
||||
try:
|
||||
data_all = S.load_data()
|
||||
except NotImplementedError as e:
|
||||
mo.stop(True, mo.md(f"**⚠️ {str(e)}**"))
|
||||
|
||||
# %%
|
||||
BEST_CHOSEN_CHARACTER = "the_coach"
|
||||
|
||||
# # %%
|
||||
# filter_form = mo.md('''
|
||||
|
||||
|
||||
|
||||
# {age}
|
||||
|
||||
# {gender}
|
||||
|
||||
# {ethnicity}
|
||||
|
||||
# {income}
|
||||
|
||||
# {consumer}
|
||||
# '''
|
||||
# ).batch(
|
||||
# age=mo.ui.multiselect(options=S.options_age, value=S.options_age, label="Select Age Group(s):"),
|
||||
# gender=mo.ui.multiselect(options=S.options_gender, value=S.options_gender, label="Select Gender(s):"),
|
||||
# ethnicity=mo.ui.multiselect(options=S.options_ethnicity, value=S.options_ethnicity, label="Select Ethnicities:"),
|
||||
# income=mo.ui.multiselect(options=S.options_income, value=S.options_income, label="Select Income Group(s):"),
|
||||
# consumer=mo.ui.multiselect(options=S.options_consumer, value=S.options_consumer, label="Select Consumer Groups:")
|
||||
# ).form()
|
||||
# mo.md(f'''
|
||||
# ---
|
||||
|
||||
# # Data Filter
|
||||
|
||||
# {filter_form}
|
||||
# ''')
|
||||
|
||||
# %%
|
||||
# mo.stop(filter_form.value is None, mo.md("**Please submit filter above to proceed**"))
|
||||
# CLI args: None means "all options selected" (use S.options_* defaults)
|
||||
# Build filter values dict dynamically from FILTER_CONFIG
|
||||
# Take the CLI-supplied list for each configured filter when present;
# otherwise fall back to the survey's full option list, which is equivalent
# to "all options selected".
_active_filters = {
    name: getattr(cli_args, name) if getattr(cli_args, name) is not None else getattr(S, attr)
    for name, attr in FILTER_CONFIG.items()
}

_d = S.filter_data(data_all, **_active_filters)
|
||||
|
||||
# Write filter description file if filter-name is provided
if cli_args.filter_name and S.fig_save_dir:
    # Get the filter slug (e.g., "All_Respondents", "Cons-Starter", etc.)
    _filter_slug = S._get_filter_slug()
    _filter_slug_dir = S.fig_save_dir / _filter_slug
    _filter_slug_dir.mkdir(parents=True, exist_ok=True)

    # Build filter description: one line per filter, listing either the
    # selected values or "All" when every option is selected.
    _filter_desc_lines = [
        f"Filter: {cli_args.filter_name}",
        "",
        "Applied Filters:",
    ]
    _short_desc_parts = []
    for filter_name, options_attr in FILTER_CONFIG.items():
        all_options = getattr(S, options_attr)
        values = _active_filters[filter_name]
        display_name = filter_name.replace('_', ' ').title()
        if values != all_options:
            _short_desc_parts.append(f"{display_name}: {', '.join(values)}")
            _filter_desc_lines.append(f" {display_name}: {', '.join(values)}")
        else:
            _filter_desc_lines.append(f" {display_name}: All")

    # Write detailed description INSIDE the filter-slug directory
    _filter_file = _filter_slug_dir / f"{cli_args.filter_name}.txt"
    _filter_file.write_text('\n'.join(_filter_desc_lines))

    # Append to summary index file at figures/<export_date>/filter_index.txt
    _summary_file = S.fig_save_dir / "filter_index.txt"
    _short_desc = "; ".join(_short_desc_parts) if _short_desc_parts else "All Respondents"
    _summary_line = f"{_filter_slug} | {cli_args.filter_name} | {_short_desc}\n"

    # Append or create the summary file
    if _summary_file.exists():
        # Avoid duplicate entries for the same slug.  Compare against the
        # exact "Directory" column of each existing line — a raw substring
        # test would also suppress slugs that happen to appear inside a
        # longer slug or in a description.
        _existing_slugs = {
            line.split(' | ', 1)[0].strip()
            for line in _summary_file.read_text().splitlines()
        }
        if _filter_slug not in _existing_slugs:
            with _summary_file.open('a') as f:
                f.write(_summary_line)
    else:
        _header = "Filter Index\n" + "=" * 80 + "\n\n"
        _header += "Directory | Filter Name | Description\n"
        _header += "-" * 80 + "\n"
        _summary_file.write_text(_header + _summary_line)
|
||||
|
||||
# Stop execution and prevent other cells from running if no data is selected
|
||||
# mo.stop(len(_d.collect()) == 0, mo.md("**No Data available for current filter combination**"))
|
||||
data = _d
|
||||
|
||||
# data = data_validated
|
||||
data.collect()
|
||||
|
||||
# %%
|
||||
|
||||
|
||||
# %%
|
||||
# Check if all business owners are missing a 'Consumer type' in demographics
|
||||
# assert all([a is None for a in data_all.filter(pl.col('QID4') == 'Yes').collect()['Consumer'].unique()]) , "Not all business owners are missing 'Consumer type' in demographics."
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
# Demographic Distributions
|
||||
""")
|
||||
|
||||
# %%
|
||||
demo_plot_cols = [
|
||||
'Age',
|
||||
'Gender',
|
||||
# 'Race/Ethnicity',
|
||||
'Bussiness_Owner',
|
||||
'Consumer'
|
||||
]
|
||||
|
||||
# %%
|
||||
_content = """
|
||||
|
||||
"""
|
||||
for c in demo_plot_cols:
|
||||
_fig = S.plot_demographic_distribution(
|
||||
data=S.get_demographics(data)[0],
|
||||
column=c,
|
||||
title=f"{c.replace('Bussiness', 'Business').replace('_', ' ')} Distribution of Survey Respondents"
|
||||
)
|
||||
_content += f"""{mo.ui.altair_chart(_fig)}\n\n"""
|
||||
|
||||
mo.md(_content)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
---
|
||||
|
||||
# Brand Character Results
|
||||
""")
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Best performing: Original vs Refined frankenstein
|
||||
""")
|
||||
|
||||
# %%
|
||||
char_refine_rank = S.get_character_refine(data)[0]
|
||||
# print(char_rank.collect().head())
|
||||
print(char_refine_rank.collect().head())
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Character ranking points
|
||||
""")
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Character ranking 1-2-3
|
||||
""")
|
||||
|
||||
# %%
|
||||
char_rank = S.get_character_ranking(data)[0]
|
||||
|
||||
# %%
|
||||
char_rank_weighted = calculate_weighted_ranking_scores(char_rank)
|
||||
S.plot_weighted_ranking_score(char_rank_weighted, title="Most Popular Character - Weighted Popularity Score<br>(1st=3pts, 2nd=2pts, 3rd=1pt)", x_label='Voice')
|
||||
|
||||
# %%
|
||||
S.plot_top3_ranking_distribution(char_rank, x_label='Character Personality', title='Character Personality: Rankings Top 3')
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
### Statistical Significance Character Ranking
|
||||
""")
|
||||
|
||||
# %%
|
||||
_pairwise_df, _meta = S.compute_ranking_significance(char_rank)
|
||||
|
||||
# print(_pairwise_df.columns)
|
||||
|
||||
mo.md(f"""
|
||||
|
||||
|
||||
{mo.ui.altair_chart(S.plot_significance_heatmap(_pairwise_df, metadata=_meta))}
|
||||
|
||||
{mo.ui.altair_chart(S.plot_significance_summary(_pairwise_df, metadata=_meta))}
|
||||
""")
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Character Ranking: times 1st place
|
||||
""")
|
||||
|
||||
# %%
|
||||
S.plot_most_ranked_1(char_rank, title="Most Popular Character<br>(Number of Times Ranked 1st)", x_label='Character Personality')
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Prominent predefined personality traits wordcloud
|
||||
""")
|
||||
|
||||
# %%
|
||||
top8_traits = S.get_top_8_traits(data)[0]
|
||||
S.plot_traits_wordcloud(
|
||||
data=top8_traits,
|
||||
column='Top_8_Traits',
|
||||
title="Most Prominent Personality Traits",
|
||||
)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Trait frequency per brand character
|
||||
""")
|
||||
|
||||
# %%
|
||||
char_df = S.get_character_refine(data)[0]
|
||||
|
||||
# %%
|
||||
from theme import ColorPalette
|
||||
|
||||
# Assuming you already have char_df (your data from get_character_refine or similar)
|
||||
characters = ['Bank Teller', 'Familiar Friend', 'The Coach', 'Personal Assistant']
|
||||
character_colors = {
|
||||
'Bank Teller': (ColorPalette.CHARACTER_BANK_TELLER, ColorPalette.CHARACTER_BANK_TELLER_HIGHLIGHT),
|
||||
'Familiar Friend': (ColorPalette.CHARACTER_FAMILIAR_FRIEND, ColorPalette.CHARACTER_FAMILIAR_FRIEND_HIGHLIGHT),
|
||||
'The Coach': (ColorPalette.CHARACTER_COACH, ColorPalette.CHARACTER_COACH_HIGHLIGHT),
|
||||
'Personal Assistant': (ColorPalette.CHARACTER_PERSONAL_ASSISTANT, ColorPalette.CHARACTER_PERSONAL_ASSISTANT_HIGHLIGHT),
|
||||
}
|
||||
|
||||
# Build consistent sort order (by total frequency across all characters)
|
||||
all_trait_counts = {}
|
||||
for char in characters:
|
||||
freq_df, _ = S.transform_character_trait_frequency(char_df, char)
|
||||
for row in freq_df.iter_rows(named=True):
|
||||
all_trait_counts[row['trait']] = all_trait_counts.get(row['trait'], 0) + row['count']
|
||||
|
||||
consistent_sort_order = sorted(all_trait_counts.keys(), key=lambda x: -all_trait_counts[x])
|
||||
|
||||
_content = """"""
|
||||
# Generate 4 plots (one per character)
|
||||
for char in characters:
|
||||
freq_df, _ = S.transform_character_trait_frequency(char_df, char)
|
||||
main_color, highlight_color = character_colors[char]
|
||||
chart = S.plot_single_character_trait_frequency(
|
||||
data=freq_df,
|
||||
character_name=char,
|
||||
bar_color=main_color,
|
||||
highlight_color=highlight_color,
|
||||
trait_sort_order=consistent_sort_order,
|
||||
)
|
||||
_content += f"""
|
||||
{mo.ui.altair_chart(chart)}
|
||||
|
||||
|
||||
"""
|
||||
|
||||
mo.md(_content)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Statistical significance best characters
|
||||
|
||||
zie chat
|
||||
> voorbeeld: als de nr 1 en 2 niet significant verschillen maar wel van de nr 3 bijvoorbeeld is dat ook top. Beetje meedenkend over hoe ik het kan presenteren weetje wat ik bedoel?:)
|
||||
>
|
||||
""")
|
||||
|
||||
# %%
|
||||
|
||||
|
||||
# %%
|
||||
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
---
|
||||
|
||||
# Spoken Voice Results
|
||||
""")
|
||||
|
||||
# %%
|
||||
COLOR_GENDER = True
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Top 8 Most Chosen out of 18
|
||||
""")
|
||||
|
||||
# %%
|
||||
v_18_8_3 = S.get_18_8_3(data)[0]
|
||||
|
||||
# %%
|
||||
S.plot_voice_selection_counts(v_18_8_3, title="Top 8 Voice Selection from 18 Voices", x_label='Voice', color_gender=COLOR_GENDER)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Top 3 most chosen out of 8
|
||||
""")
|
||||
|
||||
# %%
|
||||
S.plot_top3_selection_counts(v_18_8_3, title="Top 3 Voice Selection Counts from 8 Voices", x_label='Voice', color_gender=COLOR_GENDER)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Voice Ranking Weighted Score
|
||||
""")
|
||||
|
||||
# %%
|
||||
top3_voices = S.get_top_3_voices(data)[0]
|
||||
top3_voices_weighted = calculate_weighted_ranking_scores(top3_voices)
|
||||
|
||||
# %%
|
||||
S.plot_weighted_ranking_score(top3_voices_weighted, title="Most Popular Voice - Weighted Popularity Score<br>(1st = 3pts, 2nd = 2pts, 3rd = 1pt)", color_gender=COLOR_GENDER)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Which voice is ranked best in the ranking question for top 3?
|
||||
|
||||
(not best 3 out of 8 question)
|
||||
""")
|
||||
|
||||
# %%
|
||||
S.plot_ranking_distribution(top3_voices, x_label='Voice', title="Distribution of Top 3 Voice Rankings (1st, 2nd, 3rd)", color_gender=COLOR_GENDER)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
### Statistical significance for voice ranking
|
||||
""")
|
||||
|
||||
# %%
|
||||
# print(top3_voices.collect().head())
|
||||
|
||||
# %%
|
||||
|
||||
# _pairwise_df, _metadata = S.compute_ranking_significance(
|
||||
# top3_voices,alpha=0.05,correction="none")
|
||||
|
||||
# # View significant pairs
|
||||
# # print(pairwise_df.filter(pl.col('significant') == True))
|
||||
|
||||
# # Create heatmap visualization
|
||||
# _heatmap = S.plot_significance_heatmap(
|
||||
# _pairwise_df,
|
||||
# metadata=_metadata,
|
||||
# title="Weighted Voice Ranking Significance<br>(Pairwise Comparisons)"
|
||||
# )
|
||||
|
||||
# # Create summary bar chart
|
||||
# _summary = S.plot_significance_summary(
|
||||
# _pairwise_df,
|
||||
# metadata=_metadata
|
||||
# )
|
||||
|
||||
# mo.md(f"""
|
||||
# {mo.ui.altair_chart(_heatmap)}
|
||||
|
||||
# {mo.ui.altair_chart(_summary)}
|
||||
# """)
|
||||
|
||||
# %%
|
||||
## Voice Ranked 1st the most
|
||||
|
||||
# %%
|
||||
S.plot_most_ranked_1(top3_voices, title="Most Popular Voice<br>(Number of Times Ranked 1st)", x_label='Voice', color_gender=COLOR_GENDER)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Voice Scale 1-10
|
||||
""")
|
||||
|
||||
# %%
|
||||
# Get your voice scale data (from notebook)
|
||||
voice_1_10, _ = S.get_voice_scale_1_10(data)
|
||||
S.plot_average_scores_with_counts(voice_1_10, x_label='Voice', domain=[1,10], title="Voice General Impression (Scale 1-10)", color_gender=COLOR_GENDER)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
### Statistical Significance (Scale 1-10)
|
||||
""")
|
||||
|
||||
# %%
|
||||
# Compute pairwise significance tests
|
||||
pairwise_df, metadata = S.compute_pairwise_significance(
|
||||
voice_1_10,
|
||||
test_type="mannwhitney", # or "ttest", "chi2", "auto"
|
||||
alpha=0.05,
|
||||
correction="bonferroni" # or "holm", "none"
|
||||
)
|
||||
|
||||
# View significant pairs
|
||||
# print(pairwise_df.filter(pl.col('significant') == True))
|
||||
|
||||
# Create heatmap visualization
|
||||
_heatmap = S.plot_significance_heatmap(
|
||||
pairwise_df,
|
||||
metadata=metadata,
|
||||
title="Voice Rating Significance<br>(Pairwise Comparisons)"
|
||||
)
|
||||
|
||||
# Create summary bar chart
|
||||
_summary = S.plot_significance_summary(
|
||||
pairwise_df,
|
||||
metadata=metadata
|
||||
)
|
||||
|
||||
mo.md(f"""
|
||||
{mo.ui.altair_chart(_heatmap)}
|
||||
|
||||
{mo.ui.altair_chart(_summary)}
|
||||
""")
|
||||
|
||||
# %%
|
||||
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Ranking points for Voice per Chosen Brand Character
|
||||
|
||||
**missing mapping**
|
||||
""")
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Correlation Speaking Styles
|
||||
""")
|
||||
|
||||
# %%
|
||||
ss_or, choice_map_or = S.get_ss_orange_red(data)
|
||||
ss_gb, choice_map_gb = S.get_ss_green_blue(data)
|
||||
|
||||
# Combine the data
|
||||
ss_all = ss_or.join(ss_gb, on='_recordId')
|
||||
_d = ss_all.collect()
|
||||
|
||||
choice_map = {**choice_map_or, **choice_map_gb}
|
||||
# print(_d.head())
|
||||
# print(choice_map)
|
||||
ss_long = utils.process_speaking_style_data(ss_all, choice_map)
|
||||
|
||||
df_style = utils.process_speaking_style_data(ss_all, choice_map)
|
||||
|
||||
vscales = S.get_voice_scale_1_10(data)[0]
|
||||
df_scale_long = utils.process_voice_scale_data(vscales)
|
||||
|
||||
joined_scale = df_style.join(df_scale_long, on=["_recordId", "Voice"], how="inner")
|
||||
|
||||
df_ranking = utils.process_voice_ranking_data(top3_voices)
|
||||
joined_ranking = df_style.join(df_ranking, on=['_recordId', 'Voice'], how='inner')
|
||||
|
||||
# %%
|
||||
joined_ranking.head()
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
### Colors vs Scale 1-10
|
||||
""")
|
||||
|
||||
# %%
|
||||
# Transform to get one row per color with average correlation
|
||||
color_corr_scale, _ = utils.transform_speaking_style_color_correlation(joined_scale, SPEAKING_STYLES)
|
||||
S.plot_speaking_style_color_correlation(
|
||||
data=color_corr_scale,
|
||||
title="Correlation: Speaking Style Colors and Voice Scale 1-10"
|
||||
)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
### Colors vs Ranking Points
|
||||
""")
|
||||
|
||||
# %%
|
||||
color_corr_ranking, _ = utils.transform_speaking_style_color_correlation(
|
||||
joined_ranking,
|
||||
SPEAKING_STYLES,
|
||||
target_column="Ranking_Points"
|
||||
)
|
||||
S.plot_speaking_style_color_correlation(
|
||||
data=color_corr_ranking,
|
||||
title="Correlation: Speaking Style Colors and Voice Ranking Points"
|
||||
)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
### Individual Traits vs Scale 1-10
|
||||
""")
|
||||
|
||||
# %%
|
||||
_content = """"""
|
||||
|
||||
for _style, _traits in SPEAKING_STYLES.items():
|
||||
# print(f"Correlation plot for {style}...")
|
||||
_fig = S.plot_speaking_style_correlation(
|
||||
data=joined_scale,
|
||||
style_color=_style,
|
||||
style_traits=_traits,
|
||||
title=f"Correlation: Speaking Style {_style} and Voice Scale 1-10",
|
||||
)
|
||||
_content += f"""
|
||||
#### Speaking Style **{_style}**:
|
||||
|
||||
{mo.ui.altair_chart(_fig)}
|
||||
|
||||
"""
|
||||
mo.md(_content)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
### Individual Traits vs Ranking Points
|
||||
""")
|
||||
|
||||
# %%
|
||||
_content = """"""
|
||||
|
||||
for _style, _traits in SPEAKING_STYLES.items():
|
||||
# print(f"Correlation plot for {style}...")
|
||||
_fig = S.plot_speaking_style_ranking_correlation(
|
||||
data=joined_ranking,
|
||||
style_color=_style,
|
||||
style_traits=_traits,
|
||||
title=f"Correlation: Speaking Style {_style} and Voice Ranking Points",
|
||||
)
|
||||
_content += f"""
|
||||
#### Speaking Style **{_style}**:
|
||||
|
||||
{mo.ui.altair_chart(_fig)}
|
||||
|
||||
"""
|
||||
mo.md(_content)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Correlations when "Best Brand Character" is chosen
|
||||
|
||||
Select only the traits that fit with that character
|
||||
""")
|
||||
|
||||
# %%
|
||||
from reference import ORIGINAL_CHARACTER_TRAITS
|
||||
chosen_bc_traits = ORIGINAL_CHARACTER_TRAITS[BEST_CHOSEN_CHARACTER]
|
||||
|
||||
# %%
|
||||
STYLES_SUBSET = utils.filter_speaking_styles(SPEAKING_STYLES, chosen_bc_traits)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
### Individual Traits vs Ranking Points
|
||||
""")
|
||||
|
||||
# %%
|
||||
_content = ""
|
||||
for _style, _traits in STYLES_SUBSET.items():
|
||||
_fig = S.plot_speaking_style_ranking_correlation(
|
||||
data=joined_ranking,
|
||||
style_color=_style,
|
||||
style_traits=_traits,
|
||||
title=f"""Brand Character "{BEST_CHOSEN_CHARACTER.replace('_', ' ').title()}" - Correlation: Speaking Style {_style} and Voice Ranking Points"""
|
||||
)
|
||||
_content += f"""
|
||||
{mo.ui.altair_chart(_fig)}
|
||||
|
||||
"""
|
||||
mo.md(_content)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
### Individual Traits vs Scale 1-10
|
||||
""")
|
||||
|
||||
# %%
|
||||
_content = """"""
|
||||
|
||||
for _style, _traits in STYLES_SUBSET.items():
|
||||
# print(f"Correlation plot for {style}...")
|
||||
_fig = S.plot_speaking_style_correlation(
|
||||
data=joined_scale,
|
||||
style_color=_style,
|
||||
style_traits=_traits,
|
||||
title=f"""Brand Character "{BEST_CHOSEN_CHARACTER.replace('_', ' ').title()}" - Correlation: Speaking Style {_style} and Voice Scale 1-10""",
|
||||
)
|
||||
_content += f"""
|
||||
{mo.ui.altair_chart(_fig)}
|
||||
|
||||
"""
|
||||
mo.md(_content)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
### Colors vs Scale 1-10 (Best Character)
|
||||
""")
|
||||
|
||||
# %%
|
||||
# Transform to get one row per color with average correlation
|
||||
_color_corr_scale, _ = utils.transform_speaking_style_color_correlation(joined_scale, STYLES_SUBSET)
|
||||
S.plot_speaking_style_color_correlation(
|
||||
data=_color_corr_scale,
|
||||
title=f"""Brand Character "{BEST_CHOSEN_CHARACTER.replace('_', ' ').title()}" - Correlation: Speaking Style Colors and Voice Scale 1-10"""
|
||||
)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
### Colors vs Ranking Points (Best Character)
|
||||
""")
|
||||
|
||||
# %%
|
||||
_color_corr_ranking, _ = utils.transform_speaking_style_color_correlation(
|
||||
joined_ranking,
|
||||
STYLES_SUBSET,
|
||||
target_column="Ranking_Points"
|
||||
)
|
||||
S.plot_speaking_style_color_correlation(
|
||||
data=_color_corr_ranking,
|
||||
title=f"""Brand Character "{BEST_CHOSEN_CHARACTER.replace('_', ' ').title()}" - Correlation: Speaking Style Colors and Voice Ranking Points"""
|
||||
)
|
||||
@@ -21,9 +21,14 @@ def _():
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
TAG_SOURCE = Path('data/reports/Perception-Research-Report_2-2.pptx')
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
TAG_SOURCE = Path('data/reports/VOICE_Perception-Research-Report_4-2-26_19-30.pptx')
|
||||
# TAG_TARGET = Path('data/reports/Perception-Research-Report_2-2_tagged.pptx')
|
||||
TAG_IMAGE_DIR = Path('figures/2-2-26')
|
||||
TAG_IMAGE_DIR = Path('figures/debug')
|
||||
return TAG_IMAGE_DIR, TAG_SOURCE
|
||||
|
||||
|
||||
@@ -47,10 +52,10 @@ def _():
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
REPLACE_SOURCE = Path('data/reports/Perception-Research-Report_2-2.pptx')
|
||||
REPLACE_TARGET = Path('data/reports/Perception-Research-Report_2-2_updated.pptx')
|
||||
REPLACE_SOURCE = Path('data/reports/VOICE_Perception-Research-Report_4-2-26_19-30.pptx')
|
||||
# REPLACE_TARGET = Path('data/reports/Perception-Research-Report_2-2_updated.pptx')
|
||||
|
||||
NEW_IMAGES_DIR = Path('figures/2-2-26')
|
||||
NEW_IMAGES_DIR = Path('figures/2-4-26')
|
||||
return NEW_IMAGES_DIR, REPLACE_SOURCE
|
||||
|
||||
|
||||
|
||||
84
README.md
84
README.md
@@ -150,36 +150,57 @@ combinations.append({
|
||||
|
||||
## Adding a New Filter Dimension
|
||||
|
||||
To add an entirely new filter dimension (e.g., a new demographic question), edit **only** `FILTER_CONFIG` in `03_quant_report.script.py`:
|
||||
To add an entirely new filter dimension (e.g., a new demographic question), you need to update several files:
|
||||
|
||||
### Checklist
|
||||
|
||||
1. **Ensure `QualtricsSurvey`** has the corresponding `options_*` attribute and `filter_data()` accepts the parameter
|
||||
|
||||
2. **Open** `03_quant_report.script.py`
|
||||
|
||||
3. **Find** `FILTER_CONFIG` near the top of the file:
|
||||
1. **Update `utils.py` — `QualtricsSurvey.__init__()`** to initialize the filter state attribute:
|
||||
|
||||
```python
|
||||
FILTER_CONFIG = {
|
||||
'age': 'options_age',
|
||||
'gender': 'options_gender',
|
||||
'ethnicity': 'options_ethnicity',
|
||||
'income': 'options_income',
|
||||
'consumer': 'options_consumer',
|
||||
# Add new filters here: 'newfilter': 'options_newfilter',
|
||||
}
|
||||
# In __init__(), add after existing filter_ attributes (around line 758):
|
||||
self.filter_region:list = None # QID99
|
||||
```
|
||||
|
||||
4. **Add** your new filter:
|
||||
2. **Update `utils.py` — `load_data()`** to populate the `options_*` attribute:
|
||||
|
||||
```python
|
||||
# In load_data(), add after existing options:
|
||||
self.options_region = sorted(df['QID99'].drop_nulls().unique().to_list()) if 'QID99' in df.columns else []
|
||||
```
|
||||
|
||||
3. **Update `utils.py` — `filter_data()`** to accept and apply the filter:
|
||||
|
||||
```python
|
||||
# Add parameter to function signature:
|
||||
def filter_data(self, q: pl.LazyFrame, ..., region:list=None) -> pl.LazyFrame:
|
||||
|
||||
# Add filter logic in function body:
|
||||
self.filter_region = region
|
||||
if region is not None:
|
||||
q = q.filter(pl.col('QID99').is_in(region))
|
||||
```
|
||||
|
||||
4. **Update `plots.py` — `_get_filter_slug()`** to include the filter in directory slugs:
|
||||
|
||||
```python
|
||||
# Add to the filters list:
|
||||
('region', 'Reg', getattr(self, 'filter_region', None), 'options_region'),
|
||||
```
|
||||
|
||||
5. **Update `plots.py` — `_get_filter_description()`** for human-readable descriptions:
|
||||
|
||||
```python
|
||||
# Add to the filters list:
|
||||
('Region', getattr(self, 'filter_region', None), 'options_region'),
|
||||
```
|
||||
|
||||
6. **Update `03_quant_report.script.py` — `FILTER_CONFIG`**:
|
||||
|
||||
```python
|
||||
FILTER_CONFIG = {
|
||||
'age': 'options_age',
|
||||
'gender': 'options_gender',
|
||||
'ethnicity': 'options_ethnicity',
|
||||
'income': 'options_income',
|
||||
'consumer': 'options_consumer',
|
||||
# ... existing filters ...
|
||||
'region': 'options_region', # ← New filter
|
||||
}
|
||||
```
|
||||
@@ -190,4 +211,29 @@ This **automatically**:
|
||||
- Passes it to `S.filter_data()`
|
||||
- Writes it to the `.txt` filter description file
|
||||
|
||||
5. **Update** `run_filter_combinations.py` to generate combinations for the new filter (optional)
|
||||
7. **Update `run_filter_combinations.py`** to generate combinations (optional):
|
||||
|
||||
```python
|
||||
# Add after existing filter loops:
|
||||
for region in survey.options_region:
|
||||
combinations.append({
|
||||
'name': f'Region-{region}',
|
||||
'filters': {'region': [region]}
|
||||
})
|
||||
```
|
||||
|
||||
### Currently Available Filters
|
||||
|
||||
| CLI Argument | Options Attribute | QID Column | Description |
|
||||
|--------------|-------------------|------------|-------------|
|
||||
| `--age` | `options_age` | QID1 | Age groups |
|
||||
| `--gender` | `options_gender` | QID2 | Gender |
|
||||
| `--ethnicity` | `options_ethnicity` | QID3 | Ethnicity |
|
||||
| `--income` | `options_income` | QID15 | Income brackets |
|
||||
| `--consumer` | `options_consumer` | Consumer | Consumer segments |
|
||||
| `--business_owner` | `options_business_owner` | QID4 | Business owner status |
|
||||
| `--employment_status` | `options_employment_status` | QID13 | Employment status |
|
||||
| `--personal_products` | `options_personal_products` | QID14 | Personal products |
|
||||
| `--ai_user` | `options_ai_user` | QID22 | AI user status |
|
||||
| `--investable_assets` | `options_investable_assets` | QID16 | Investable assets |
|
||||
| `--industry` | `options_industry` | QID17 | Industry |
|
||||
263
XX_detailed_trait_analysis.py
Normal file
263
XX_detailed_trait_analysis.py
Normal file
@@ -0,0 +1,263 @@
|
||||
"""Extra analyses of the traits"""
|
||||
# %% Imports
|
||||
|
||||
import utils
|
||||
import polars as pl
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
from pathlib import Path
|
||||
from validation import check_straight_liners
|
||||
|
||||
|
||||
# %% Fixed Variables
|
||||
RESULTS_FILE = 'data/exports/2-4-26/JPMC_Chase Brand Personality_Quant Round 1_February 4, 2026_Labels.csv'
QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'


# %% CLI argument parsing for batch automation
# When run as script: uv run XX_statistical_significance.script.py --age '["18 to 21 years"]'
# Central filter configuration - add new filters here only
# Format: 'cli_arg_name': 'QualtricsSurvey.options_* attribute name'
FILTER_CONFIG = {
    'age': 'options_age',
    'gender': 'options_gender',
    'ethnicity': 'options_ethnicity',
    'income': 'options_income',
    'consumer': 'options_consumer',
    'business_owner': 'options_business_owner',
    'ai_user': 'options_ai_user',
    'investable_assets': 'options_investable_assets',
    'industry': 'options_industry',
}


def parse_cli_args():
    """Parse CLI filter arguments, or return all-None defaults in notebooks.

    Each key of ``FILTER_CONFIG`` becomes a ``--<name>`` option expecting a
    JSON list.  ``--filter-name`` labels the combination; ``--figures-dir``
    overrides the default figure output directory (derived from the export
    date segment of ``RESULTS_FILE``).

    Returns:
        argparse.Namespace: one attribute per filter key (list or ``None``),
        plus ``filter_name`` and ``figures_dir``.
    """
    parser = argparse.ArgumentParser(description='Generate quant report with optional filters')

    # Dynamically add filter arguments from config
    for filter_name in FILTER_CONFIG:
        parser.add_argument(f'--{filter_name}', type=str, default=None, help=f'JSON list of {filter_name} values')

    parser.add_argument('--filter-name', type=str, default=None, help='Name for this filter combination (used for .txt description file)')
    parser.add_argument('--figures-dir', type=str, default=f'figures/traits-likert-analysis/{Path(RESULTS_FILE).parts[2]}', help='Override the default figures directory')

    # Only parse argv when running as a script (not in Jupyter/interactive)
    try:
        # Check if running in Jupyter by looking for ipykernel
        get_ipython()  # noqa: F821  # type: ignore
        # Return namespace with all filters set to None.  Reuse the parser's
        # own default for figures_dir instead of duplicating the expression.
        no_filters = {f: None for f in FILTER_CONFIG}
        return argparse.Namespace(**no_filters, filter_name=None, figures_dir=parser.get_default('figures_dir'))
    except NameError:
        args = parser.parse_args()
        # Parse JSON strings to lists
        for filter_name in FILTER_CONFIG:
            val = getattr(args, filter_name)
            setattr(args, filter_name, json.loads(val) if val else None)
        return args
|
||||
|
||||
cli_args = parse_cli_args()
|
||||
|
||||
|
||||
# %%
|
||||
S = utils.QualtricsSurvey(RESULTS_FILE, QSF_FILE, figures_dir=cli_args.figures_dir)
|
||||
data_all = S.load_data()
|
||||
|
||||
|
||||
# %% Build filtered dataset based on CLI args
|
||||
|
||||
# CLI args: None means "no filter applied" - filter_data() will skip None filters
|
||||
|
||||
# Build filter values dict dynamically from FILTER_CONFIG
|
||||
_active_filters = {filter_name: getattr(cli_args, filter_name) for filter_name in FILTER_CONFIG}
|
||||
|
||||
_d = S.filter_data(data_all, **_active_filters)
|
||||
|
||||
# Write filter description file if filter-name is provided
|
||||
if cli_args.filter_name and S.fig_save_dir:
|
||||
# Get the filter slug (e.g., "All_Respondents", "Cons-Starter", etc.)
|
||||
_filter_slug = S._get_filter_slug()
|
||||
_filter_slug_dir = S.fig_save_dir / _filter_slug
|
||||
_filter_slug_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Build filter description
|
||||
_filter_desc_lines = [
|
||||
f"Filter: {cli_args.filter_name}",
|
||||
"",
|
||||
"Applied Filters:",
|
||||
]
|
||||
_short_desc_parts = []
|
||||
for filter_name, options_attr in FILTER_CONFIG.items():
|
||||
all_options = getattr(S, options_attr)
|
||||
values = _active_filters[filter_name]
|
||||
display_name = filter_name.replace('_', ' ').title()
|
||||
# None means no filter applied (same as "All")
|
||||
if values is not None and values != all_options:
|
||||
_short_desc_parts.append(f"{display_name}: {', '.join(values)}")
|
||||
_filter_desc_lines.append(f" {display_name}: {', '.join(values)}")
|
||||
else:
|
||||
_filter_desc_lines.append(f" {display_name}: All")
|
||||
|
||||
# Write detailed description INSIDE the filter-slug directory
|
||||
# Sanitize filter name for filename usage (replace / and other chars)
|
||||
_safe_filter_name = re.sub(r'[^\w\s-]', '_', cli_args.filter_name)
|
||||
_filter_file = _filter_slug_dir / f"{_safe_filter_name}.txt"
|
||||
_filter_file.write_text('\n'.join(_filter_desc_lines))
|
||||
|
||||
# Append to summary index file at figures/<export_date>/filter_index.txt
|
||||
_summary_file = S.fig_save_dir / "filter_index.txt"
|
||||
_short_desc = "; ".join(_short_desc_parts) if _short_desc_parts else "All Respondents"
|
||||
_summary_line = f"{_filter_slug} | {cli_args.filter_name} | {_short_desc}\n"
|
||||
|
||||
# Append or create the summary file
|
||||
if _summary_file.exists():
|
||||
_existing = _summary_file.read_text()
|
||||
# Avoid duplicate entries for same slug
|
||||
if _filter_slug not in _existing:
|
||||
with _summary_file.open('a') as f:
|
||||
f.write(_summary_line)
|
||||
else:
|
||||
_header = "Filter Index\n" + "=" * 80 + "\n\n"
|
||||
_header += "Directory | Filter Name | Description\n"
|
||||
_header += "-" * 80 + "\n"
|
||||
_summary_file.write_text(_header + _summary_line)
|
||||
|
||||
# Save to logical variable name for further analysis
|
||||
data = _d
|
||||
data.collect()
|
||||
|
||||
# %% Voices per trait
|
||||
|
||||
|
||||
ss_or, choice_map_or = S.get_ss_orange_red(data)
|
||||
ss_gb, choice_map_gb = S.get_ss_green_blue(data)
|
||||
|
||||
# Combine the data
|
||||
ss_all = ss_or.join(ss_gb, on='_recordId')
|
||||
_d = ss_all.collect()
|
||||
|
||||
choice_map = {**choice_map_or, **choice_map_gb}
|
||||
# print(_d.head())
|
||||
# print(choice_map)
|
||||
ss_long = utils.process_speaking_style_data(ss_all, choice_map)
|
||||
|
||||
|
||||
# %% Create plots
|
||||
|
||||
for i, trait in enumerate(ss_long.select("Description").unique().to_series().to_list()):
|
||||
trait_d = ss_long.filter(pl.col("Description") == trait)
|
||||
|
||||
S.plot_speaking_style_trait_scores(trait_d, title=trait.replace(":", " ↔ "), height=550, color_gender=True)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# %% Filter out straight-liner (PER TRAIT) and re-plot to see if any changes
|
||||
# Save with different filename suffix so we can compare with/without straight-liners
|
||||
|
||||
print("\n--- Straight-lining Checks on TRAITS ---")
|
||||
sl_report_traits, sl_traits_df = check_straight_liners(ss_all, max_score=5)
|
||||
sl_traits_df
|
||||
|
||||
# %%
|
||||
|
||||
if sl_traits_df is not None and not sl_traits_df.is_empty():
|
||||
sl_ids = sl_traits_df.select(pl.col("Record ID").unique()).to_series().to_list()
|
||||
n_sl_groups = sl_traits_df.height
|
||||
print(f"\nExcluding {n_sl_groups} straight-lined question blocks from {len(sl_ids)} respondents.")
|
||||
|
||||
# Create key in ss_long to match sl_traits_df for anti-join
|
||||
# Question Group key in sl_traits_df is like "SS_Orange_Red__V14"
|
||||
# ss_long has "Style_Group" and "Voice"
|
||||
ss_long_w_key = ss_long.with_columns(
|
||||
(pl.col("Style_Group") + "__" + pl.col("Voice")).alias("Question Group")
|
||||
)
|
||||
|
||||
# Prepare filter table: Record ID + Question Group
|
||||
sl_filter = sl_traits_df.select([
|
||||
pl.col("Record ID").alias("_recordId"),
|
||||
pl.col("Question Group")
|
||||
])
|
||||
|
||||
# Anti-join to remove specific question blocks that were straight-lined
|
||||
ss_long_clean = ss_long_w_key.join(sl_filter, on=["_recordId", "Question Group"], how="anti").drop("Question Group")
|
||||
|
||||
# Re-plot with suffix in title
|
||||
print("Re-plotting traits (Cleaned)...")
|
||||
for i, trait in enumerate(ss_long_clean.select("Description").unique().to_series().to_list()):
|
||||
trait_d = ss_long_clean.filter(pl.col("Description") == trait)
|
||||
|
||||
# Modify title to create unique filename (and display title)
|
||||
title_clean = trait.replace(":", " ↔ ") + " (Excl. Straight-Liners)"
|
||||
|
||||
S.plot_speaking_style_trait_scores(trait_d, title=title_clean, height=550, color_gender=True)
|
||||
else:
|
||||
print("No straight-liners found on traits.")
|
||||
|
||||
|
||||
|
||||
|
||||
# %% Compare All vs Cleaned
|
||||
if sl_traits_df is not None and not sl_traits_df.is_empty():
|
||||
print("Generating Comparison Plots (All vs Cleaned)...")
|
||||
|
||||
# Always apply the per-question-group filtering here to ensure consistency
|
||||
# (Matches the logic used in the re-plotting section above)
|
||||
print("Applying filter to remove straight-lined question blocks...")
|
||||
ss_long_w_key = ss_long.with_columns(
|
||||
(pl.col("Style_Group") + "__" + pl.col("Voice")).alias("Question Group")
|
||||
)
|
||||
sl_filter = sl_traits_df.select([
|
||||
pl.col("Record ID").alias("_recordId"),
|
||||
pl.col("Question Group")
|
||||
])
|
||||
ss_long_clean = ss_long_w_key.join(sl_filter, on=["_recordId", "Question Group"], how="anti").drop("Question Group")
|
||||
|
||||
sl_ids = sl_traits_df.select(pl.col("Record ID").unique()).to_series().to_list()
|
||||
|
||||
# --- Verification Prints ---
|
||||
print(f"\n--- Verification of Filter ---")
|
||||
print(f"Original Row Count: {ss_long.height}")
|
||||
print(f"Number of Straight-Liner Question Blocks: {sl_traits_df.height}")
|
||||
print(f"Sample IDs affected: {sl_ids[:5]}")
|
||||
print(f"Cleaned Row Count: {ss_long_clean.height}")
|
||||
print(f"Rows Removed: {ss_long.height - ss_long_clean.height}")
|
||||
|
||||
# Verify removal
|
||||
# Re-construct key to verify
|
||||
ss_long_check = ss_long.with_columns(
|
||||
(pl.col("Style_Group") + "__" + pl.col("Voice")).alias("Question Group")
|
||||
)
|
||||
sl_filter_check = sl_traits_df.select([
|
||||
pl.col("Record ID").alias("_recordId"),
|
||||
pl.col("Question Group")
|
||||
])
|
||||
|
||||
should_be_removed = ss_long_check.join(sl_filter_check, on=["_recordId", "Question Group"], how="inner").height
|
||||
print(f"Discrepancy Check (Should be 0): { (ss_long.height - ss_long_clean.height) - should_be_removed }")
|
||||
|
||||
# Show what was removed (the straight lining behavior)
|
||||
print("\nSample of Straight-Liner Data (Values that caused removal):")
|
||||
print(sl_traits_df.head(5))
|
||||
print("-" * 30 + "\n")
|
||||
# ---------------------------
|
||||
|
||||
for i, trait in enumerate(ss_long.select("Description").unique().to_series().to_list()):
|
||||
|
||||
# Get data for this trait from both datasets
|
||||
trait_d_all = ss_long.filter(pl.col("Description") == trait)
|
||||
trait_d_clean = ss_long_clean.filter(pl.col("Description") == trait)
|
||||
|
||||
# Plot comparison
|
||||
title_comp = trait.replace(":", " ↔ ") + " (Impact of Straight-Liners)"
|
||||
|
||||
S.plot_speaking_style_trait_scores_comparison(
|
||||
trait_d_all,
|
||||
trait_d_clean,
|
||||
title=title_comp,
|
||||
height=600 # Slightly taller for grouped bars
|
||||
)
|
||||
|
||||
849
XX_quant_report.script.py
Normal file
849
XX_quant_report.script.py
Normal file
@@ -0,0 +1,849 @@
|
||||
|
||||
__generated_with = "0.19.7"
|
||||
|
||||
# %%
|
||||
import marimo as mo
|
||||
import polars as pl
|
||||
from pathlib import Path
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
from validation import check_progress, duration_validation, check_straight_liners
|
||||
from utils import QualtricsSurvey, combine_exclusive_columns, calculate_weighted_ranking_scores
|
||||
import utils
|
||||
|
||||
from speaking_styles import SPEAKING_STYLES
|
||||
|
||||
# %% Fixed Variables
|
||||
|
||||
RESULTS_FILE = 'data/exports/2-4-26/JPMC_Chase Brand Personality_Quant Round 1_February 4, 2026_Labels.csv'
|
||||
# RESULTS_FILE = 'data/exports/debug/JPMC_Chase Brand Personality_Quant Round 1_February 2, 2026_Labels.csv'
|
||||
QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'
|
||||
|
||||
|
||||
# %%
|
||||
# CLI argument parsing for batch automation
|
||||
# When run as script: python 03_quant_report.script.py --age '["18 to 21 years"]' --consumer '["Starter"]'
|
||||
# When run in Jupyter: args will use defaults (all filters = None = all options selected)
|
||||
|
||||
# Central filter configuration - add new filters here only
|
||||
# Format: 'cli_arg_name': 'QualtricsSurvey.options_* attribute name'
|
||||
FILTER_CONFIG = {
|
||||
'age': 'options_age',
|
||||
'gender': 'options_gender',
|
||||
'ethnicity': 'options_ethnicity',
|
||||
'income': 'options_income',
|
||||
'consumer': 'options_consumer',
|
||||
'business_owner': 'options_business_owner',
|
||||
'ai_user': 'options_ai_user',
|
||||
'investable_assets': 'options_investable_assets',
|
||||
'industry': 'options_industry',
|
||||
}
|
||||
|
||||
def parse_cli_args():
|
||||
parser = argparse.ArgumentParser(description='Generate quant report with optional filters')
|
||||
|
||||
# Dynamically add filter arguments from config
|
||||
for filter_name in FILTER_CONFIG:
|
||||
parser.add_argument(f'--{filter_name}', type=str, default=None, help=f'JSON list of {filter_name} values')
|
||||
|
||||
parser.add_argument('--filter-name', type=str, default=None, help='Name for this filter combination (used for .txt description file)')
|
||||
parser.add_argument('--figures-dir', type=str, default=f'figures/{Path(RESULTS_FILE).parts[2]}', help='Override the default figures directory')
|
||||
parser.add_argument('--best-character', type=str, default="the_coach", help='Slug of the best chosen character (default: "the_coach")')
|
||||
parser.add_argument('--sl-threshold', type=int, default=None, help='Exclude respondents who straight-lined >= N question groups (e.g. 3 removes anyone with 3+ straight-lined groups)')
|
||||
parser.add_argument('--voice-ranking-filter', type=str, default=None, choices=['only-missing', 'exclude-missing'], help='Filter by voice ranking completeness: "only-missing" keeps only respondents missing QID98 ranking data, "exclude-missing" removes them')
|
||||
|
||||
# Only parse if running as script (not in Jupyter/interactive)
|
||||
try:
|
||||
# Check if running in Jupyter by looking for ipykernel
|
||||
get_ipython() # noqa: F821 # type: ignore
|
||||
# Return namespace with all filters set to None
|
||||
no_filters = {f: None for f in FILTER_CONFIG}
|
||||
return argparse.Namespace(**no_filters, filter_name=None, figures_dir=f'figures/statistical_significance/{Path(RESULTS_FILE).parts[2]}', best_character="the_coach", sl_threshold=None, voice_ranking_filter=None)
|
||||
except NameError:
|
||||
args = parser.parse_args()
|
||||
# Parse JSON strings to lists
|
||||
for filter_name in FILTER_CONFIG:
|
||||
val = getattr(args, filter_name)
|
||||
setattr(args, filter_name, json.loads(val) if val else None)
|
||||
return args
|
||||
|
||||
cli_args = parse_cli_args()
|
||||
BEST_CHOSEN_CHARACTER = cli_args.best_character
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
S = QualtricsSurvey(RESULTS_FILE, QSF_FILE, figures_dir=cli_args.figures_dir)
|
||||
try:
|
||||
data_all = S.load_data()
|
||||
except NotImplementedError as e:
|
||||
mo.stop(True, mo.md(f"**⚠️ {str(e)}**"))
|
||||
|
||||
|
||||
# %% Build filtered dataset based on CLI args
|
||||
|
||||
# CLI args: None means "no filter applied" - filter_data() will skip None filters
|
||||
|
||||
# Build filter values dict dynamically from FILTER_CONFIG
|
||||
_active_filters = {filter_name: getattr(cli_args, filter_name) for filter_name in FILTER_CONFIG}
|
||||
|
||||
# %% Apply filters
|
||||
_d = S.filter_data(data_all, **_active_filters)
|
||||
|
||||
# Write filter description file if filter-name is provided
|
||||
if cli_args.filter_name and S.fig_save_dir:
|
||||
# Get the filter slug (e.g., "All_Respondents", "Cons-Starter", etc.)
|
||||
_filter_slug = S._get_filter_slug()
|
||||
_filter_slug_dir = S.fig_save_dir / _filter_slug
|
||||
_filter_slug_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Build filter description
|
||||
_filter_desc_lines = [
|
||||
f"Filter: {cli_args.filter_name}",
|
||||
"",
|
||||
"Applied Filters:",
|
||||
]
|
||||
_short_desc_parts = []
|
||||
for filter_name, options_attr in FILTER_CONFIG.items():
|
||||
all_options = getattr(S, options_attr)
|
||||
values = _active_filters[filter_name]
|
||||
display_name = filter_name.replace('_', ' ').title()
|
||||
# None means no filter applied (same as "All")
|
||||
if values is not None and values != all_options:
|
||||
_short_desc_parts.append(f"{display_name}: {', '.join(values)}")
|
||||
_filter_desc_lines.append(f" {display_name}: {', '.join(values)}")
|
||||
else:
|
||||
_filter_desc_lines.append(f" {display_name}: All")
|
||||
|
||||
# Write detailed description INSIDE the filter-slug directory
|
||||
# Sanitize filter name for filename usage (replace / and other chars)
|
||||
_safe_filter_name = re.sub(r'[^\w\s-]', '_', cli_args.filter_name)
|
||||
_filter_file = _filter_slug_dir / f"{_safe_filter_name}.txt"
|
||||
_filter_file.write_text('\n'.join(_filter_desc_lines))
|
||||
|
||||
# Append to summary index file at figures/<export_date>/filter_index.txt
|
||||
_summary_file = S.fig_save_dir / "filter_index.txt"
|
||||
_short_desc = "; ".join(_short_desc_parts) if _short_desc_parts else "All Respondents"
|
||||
_summary_line = f"{_filter_slug} | {cli_args.filter_name} | {_short_desc}\n"
|
||||
|
||||
# Append or create the summary file
|
||||
if _summary_file.exists():
|
||||
_existing = _summary_file.read_text()
|
||||
# Avoid duplicate entries for same slug
|
||||
if _filter_slug not in _existing:
|
||||
with _summary_file.open('a') as f:
|
||||
f.write(_summary_line)
|
||||
else:
|
||||
_header = "Filter Index\n" + "=" * 80 + "\n\n"
|
||||
_header += "Directory | Filter Name | Description\n"
|
||||
_header += "-" * 80 + "\n"
|
||||
_summary_file.write_text(_header + _summary_line)
|
||||
|
||||
# %% Apply straight-liner threshold filter (if specified)
|
||||
# Removes respondents who straight-lined >= N question groups across
|
||||
# speaking style and voice scale questions.
|
||||
if cli_args.sl_threshold is not None:
|
||||
_sl_n = cli_args.sl_threshold
|
||||
S.sl_threshold = _sl_n # Store on Survey so filter slug/description include it
|
||||
print(f"Applying straight-liner filter: excluding respondents with ≥{_sl_n} straight-lined question groups...")
|
||||
_n_before = _d.select(pl.len()).collect().item()
|
||||
|
||||
# Extract question groups with renamed columns for check_straight_liners
|
||||
_sl_ss_or, _ = S.get_ss_orange_red(_d)
|
||||
_sl_ss_gb, _ = S.get_ss_green_blue(_d)
|
||||
_sl_vs, _ = S.get_voice_scale_1_10(_d)
|
||||
_sl_all_q = _sl_ss_or.join(_sl_ss_gb, on='_recordId').join(_sl_vs, on='_recordId')
|
||||
|
||||
_, _sl_df = check_straight_liners(_sl_all_q, max_score=5)
|
||||
|
||||
if _sl_df is not None and not _sl_df.is_empty():
|
||||
# Count straight-lined question groups per respondent
|
||||
_sl_counts = (
|
||||
_sl_df
|
||||
.group_by("Record ID")
|
||||
.agg(pl.len().alias("sl_count"))
|
||||
.filter(pl.col("sl_count") >= _sl_n)
|
||||
.select(pl.col("Record ID").alias("_recordId"))
|
||||
)
|
||||
# Anti-join to remove offending respondents
|
||||
_d = _d.collect().join(_sl_counts, on="_recordId", how="anti").lazy()
|
||||
# Update filtered data on the Survey object so sample size is correct
|
||||
S.data_filtered = _d
|
||||
_n_after = _d.select(pl.len()).collect().item()
|
||||
print(f" Removed {_n_before - _n_after} respondents ({_n_before} → {_n_after})")
|
||||
else:
|
||||
print(" No straight-liners detected — no respondents removed.")
|
||||
|
||||
# %% Apply voice-ranking completeness filter (if specified)
|
||||
# Keeps only / excludes respondents who are missing the explicit voice
|
||||
# ranking question (QID98) despite completing the top-3 selection (QID36).
|
||||
if cli_args.voice_ranking_filter is not None:
|
||||
S.voice_ranking_filter = cli_args.voice_ranking_filter # Store on Survey so filter slug/description include it
|
||||
_vr_missing = S.get_top_3_voices_missing_ranking(_d)
|
||||
_vr_missing_ids = _vr_missing.select('_recordId')
|
||||
_n_before = _d.select(pl.len()).collect().item()
|
||||
|
||||
if cli_args.voice_ranking_filter == 'only-missing':
|
||||
print(f"Voice ranking filter: keeping ONLY respondents missing QID98 ranking data...")
|
||||
_d = _d.collect().join(_vr_missing_ids, on='_recordId', how='inner').lazy()
|
||||
elif cli_args.voice_ranking_filter == 'exclude-missing':
|
||||
print(f"Voice ranking filter: EXCLUDING respondents missing QID98 ranking data...")
|
||||
_d = _d.collect().join(_vr_missing_ids, on='_recordId', how='anti').lazy()
|
||||
|
||||
S.data_filtered = _d
|
||||
_n_after = _d.select(pl.len()).collect().item()
|
||||
print(f" {_n_before} → {_n_after} respondents ({_vr_missing_ids.height} missing ranking data)")
|
||||
|
||||
# Save to logical variable name for further analysis
|
||||
data = _d
|
||||
data.collect()
|
||||
|
||||
|
||||
|
||||
# %%
|
||||
# Check if all business owners are missing a 'Consumer type' in demographics
|
||||
# assert all([a is None for a in data_all.filter(pl.col('QID4') == 'Yes').collect()['Consumer'].unique()]) , "Not all business owners are missing 'Consumer type' in demographics."
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
# Demographic Distributions
|
||||
""")
|
||||
|
||||
# %%
|
||||
demo_plot_cols = [
|
||||
'Age',
|
||||
'Gender',
|
||||
# 'Race/Ethnicity',
|
||||
'Bussiness_Owner',
|
||||
'Consumer'
|
||||
]
|
||||
|
||||
# %%
|
||||
_content = """
|
||||
|
||||
"""
|
||||
for c in demo_plot_cols:
|
||||
_fig = S.plot_demographic_distribution(
|
||||
data=S.get_demographics(data)[0],
|
||||
column=c,
|
||||
title=f"{c.replace('Bussiness', 'Business').replace('_', ' ')} Distribution of Survey Respondents"
|
||||
)
|
||||
_content += f"""{mo.ui.altair_chart(_fig)}\n\n"""
|
||||
|
||||
mo.md(_content)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
---
|
||||
|
||||
# Brand Character Results
|
||||
""")
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Best performing: Original vs Refined frankenstein
|
||||
""")
|
||||
|
||||
# %%
|
||||
char_refine_rank = S.get_character_refine(data)[0]
|
||||
# print(char_rank.collect().head())
|
||||
print(char_refine_rank.collect().head())
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Character ranking points
|
||||
""")
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Character ranking 1-2-3
|
||||
""")
|
||||
|
||||
# %%
|
||||
char_rank = S.get_character_ranking(data)[0]
|
||||
|
||||
# %%
|
||||
char_rank_weighted = calculate_weighted_ranking_scores(char_rank)
|
||||
S.plot_weighted_ranking_score(char_rank_weighted, title="Most Popular Character - Weighted Popularity Score<br>(1st=3pts, 2nd=2pts, 3rd=1pt)", x_label='Voice')
|
||||
|
||||
# %%
|
||||
S.plot_top3_ranking_distribution(char_rank, x_label='Character Personality', title='Character Personality: Rankings Top 3')
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
### Statistical Significance Character Ranking
|
||||
""")
|
||||
|
||||
# %%
|
||||
# _pairwise_df, _meta = S.compute_ranking_significance(char_rank)
|
||||
|
||||
# # print(_pairwise_df.columns)
|
||||
|
||||
# mo.md(f"""
|
||||
|
||||
|
||||
# {mo.ui.altair_chart(S.plot_significance_heatmap(_pairwise_df, metadata=_meta))}
|
||||
|
||||
# {mo.ui.altair_chart(S.plot_significance_summary(_pairwise_df, metadata=_meta))}
|
||||
# """)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Character Ranking: times 1st place
|
||||
""")
|
||||
|
||||
# %%
|
||||
S.plot_most_ranked_1(char_rank, title="Most Popular Character<br>(Number of Times Ranked 1st)", x_label='Character Personality')
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Prominent predefined personality traits wordcloud
|
||||
""")
|
||||
|
||||
# %%
|
||||
top8_traits = S.get_top_8_traits(data)[0]
|
||||
S.plot_traits_wordcloud(
|
||||
data=top8_traits,
|
||||
column='Top_8_Traits',
|
||||
title="Most Prominent Personality Traits",
|
||||
)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Trait frequency per brand character
|
||||
""")
|
||||
|
||||
# %%
|
||||
char_df = S.get_character_refine(data)[0]
|
||||
|
||||
# %%
|
||||
from theme import ColorPalette
|
||||
|
||||
# Assuming you already have char_df (your data from get_character_refine or similar)
|
||||
characters = ['Bank Teller', 'Familiar Friend', 'The Coach', 'Personal Assistant']
|
||||
character_colors = {
|
||||
'Bank Teller': (ColorPalette.CHARACTER_BANK_TELLER, ColorPalette.CHARACTER_BANK_TELLER_HIGHLIGHT),
|
||||
'Familiar Friend': (ColorPalette.CHARACTER_FAMILIAR_FRIEND, ColorPalette.CHARACTER_FAMILIAR_FRIEND_HIGHLIGHT),
|
||||
'The Coach': (ColorPalette.CHARACTER_COACH, ColorPalette.CHARACTER_COACH_HIGHLIGHT),
|
||||
'Personal Assistant': (ColorPalette.CHARACTER_PERSONAL_ASSISTANT, ColorPalette.CHARACTER_PERSONAL_ASSISTANT_HIGHLIGHT),
|
||||
}
|
||||
|
||||
# Build consistent sort order (by total frequency across all characters)
|
||||
all_trait_counts = {}
|
||||
for char in characters:
|
||||
freq_df, _ = S.transform_character_trait_frequency(char_df, char)
|
||||
for row in freq_df.iter_rows(named=True):
|
||||
all_trait_counts[row['trait']] = all_trait_counts.get(row['trait'], 0) + row['count']
|
||||
|
||||
consistent_sort_order = sorted(all_trait_counts.keys(), key=lambda x: -all_trait_counts[x])
|
||||
|
||||
_content = """"""
|
||||
# Generate 4 plots (one per character)
|
||||
for char in characters:
|
||||
freq_df, _ = S.transform_character_trait_frequency(char_df, char)
|
||||
main_color, highlight_color = character_colors[char]
|
||||
chart = S.plot_single_character_trait_frequency(
|
||||
data=freq_df,
|
||||
character_name=char,
|
||||
bar_color=main_color,
|
||||
highlight_color=highlight_color,
|
||||
trait_sort_order=consistent_sort_order,
|
||||
)
|
||||
_content += f"""
|
||||
{mo.ui.altair_chart(chart)}
|
||||
|
||||
|
||||
"""
|
||||
|
||||
mo.md(_content)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Statistical significance best characters
|
||||
|
||||
zie chat
|
||||
> voorbeeld: als de nr 1 en 2 niet significant verschillen maar wel van de nr 3 bijvoorbeeld is dat ook top. Beetje meedenkend over hoe ik het kan presenteren weetje wat ik bedoel?:)
|
||||
>
|
||||
""")
|
||||
|
||||
# %%
|
||||
|
||||
|
||||
# %%
|
||||
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
---
|
||||
|
||||
# Spoken Voice Results
|
||||
""")
|
||||
|
||||
# %%
|
||||
COLOR_GENDER = True
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Top 8 Most Chosen out of 18
|
||||
""")
|
||||
|
||||
# %%
|
||||
v_18_8_3 = S.get_18_8_3(data)[0]
|
||||
|
||||
# %%
|
||||
S.plot_voice_selection_counts(v_18_8_3, title="Top 8 Voice Selection from 18 Voices", x_label='Voice', color_gender=COLOR_GENDER)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Top 3 most chosen out of 8
|
||||
""")
|
||||
|
||||
# %%
|
||||
S.plot_top3_selection_counts(v_18_8_3, title="Top 3 Voice Selection Counts from 8 Voices", x_label='Voice', color_gender=COLOR_GENDER)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Voice Ranking Weighted Score
|
||||
""")
|
||||
|
||||
# %%
|
||||
top3_voices = S.get_top_3_voices(data)[0]
|
||||
top3_voices_weighted = calculate_weighted_ranking_scores(top3_voices)
|
||||
|
||||
# %%
|
||||
S.plot_weighted_ranking_score(top3_voices_weighted, title="Most Popular Voice - Weighted Popularity Score<br>(1st = 3pts, 2nd = 2pts, 3rd = 1pt)", color_gender=COLOR_GENDER)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Which voice is ranked best in the ranking question for top 3?
|
||||
|
||||
(not best 3 out of 8 question)
|
||||
""")
|
||||
|
||||
# %%
|
||||
S.plot_ranking_distribution(top3_voices, x_label='Voice', title="Distribution of Top 3 Voice Rankings (1st, 2nd, 3rd)", color_gender=COLOR_GENDER)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
### Statistical significance for voice ranking
|
||||
""")
|
||||
|
||||
# %%
|
||||
# print(top3_voices.collect().head())
|
||||
|
||||
# %%
|
||||
|
||||
# _pairwise_df, _metadata = S.compute_ranking_significance(
|
||||
# top3_voices,alpha=0.05,correction="none")
|
||||
|
||||
# # View significant pairs
|
||||
# # print(pairwise_df.filter(pl.col('significant') == True))
|
||||
|
||||
# # Create heatmap visualization
|
||||
# _heatmap = S.plot_significance_heatmap(
|
||||
# _pairwise_df,
|
||||
# metadata=_metadata,
|
||||
# title="Weighted Voice Ranking Significance<br>(Pairwise Comparisons)"
|
||||
# )
|
||||
|
||||
# # Create summary bar chart
|
||||
# _summary = S.plot_significance_summary(
|
||||
# _pairwise_df,
|
||||
# metadata=_metadata
|
||||
# )
|
||||
|
||||
# mo.md(f"""
|
||||
# {mo.ui.altair_chart(_heatmap)}
|
||||
|
||||
# {mo.ui.altair_chart(_summary)}
|
||||
# """)
|
||||
|
||||
# %%
|
||||
## Voice Ranked 1st the most
|
||||
|
||||
# %%
|
||||
S.plot_most_ranked_1(top3_voices, title="Most Popular Voice<br>(Number of Times Ranked 1st)", x_label='Voice', color_gender=COLOR_GENDER)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Voice Scale 1-10
|
||||
""")
|
||||
|
||||
# %%
|
||||
# Get your voice scale data (from notebook)
|
||||
voice_1_10, _ = S.get_voice_scale_1_10(data)
|
||||
S.plot_average_scores_with_counts(voice_1_10, x_label='Voice', domain=[1,10], title="Voice General Impression (Scale 1-10)", color_gender=COLOR_GENDER)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
### Statistical Significance (Scale 1-10)
|
||||
""")
|
||||
|
||||
# %%
|
||||
# Compute pairwise significance tests
|
||||
# pairwise_df, metadata = S.compute_pairwise_significance(
|
||||
# voice_1_10,
|
||||
# test_type="mannwhitney", # or "ttest", "chi2", "auto"
|
||||
# alpha=0.05,
|
||||
# correction="bonferroni" # or "holm", "none"
|
||||
# )
|
||||
|
||||
# # View significant pairs
|
||||
# # print(pairwise_df.filter(pl.col('significant') == True))
|
||||
|
||||
# # Create heatmap visualization
|
||||
# _heatmap = S.plot_significance_heatmap(
|
||||
# pairwise_df,
|
||||
# metadata=metadata,
|
||||
# title="Voice Rating Significance<br>(Pairwise Comparisons)"
|
||||
# )
|
||||
|
||||
# # Create summary bar chart
|
||||
# _summary = S.plot_significance_summary(
|
||||
# pairwise_df,
|
||||
# metadata=metadata
|
||||
# )
|
||||
|
||||
# mo.md(f"""
|
||||
# {mo.ui.altair_chart(_heatmap)}
|
||||
|
||||
# {mo.ui.altair_chart(_summary)}
|
||||
# """)
|
||||
|
||||
# %%
|
||||
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Ranking points for Voice per Chosen Brand Character
|
||||
|
||||
**missing mapping**
|
||||
""")
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
## Correlation Speaking Styles
|
||||
""")
|
||||
|
||||
# %%
|
||||
ss_or, choice_map_or = S.get_ss_orange_red(data)
|
||||
ss_gb, choice_map_gb = S.get_ss_green_blue(data)
|
||||
|
||||
# Combine the data
|
||||
ss_all = ss_or.join(ss_gb, on='_recordId')
|
||||
_d = ss_all.collect()
|
||||
|
||||
choice_map = {**choice_map_or, **choice_map_gb}
|
||||
# print(_d.head())
|
||||
# print(choice_map)
|
||||
ss_long = utils.process_speaking_style_data(ss_all, choice_map)
|
||||
|
||||
df_style = utils.process_speaking_style_data(ss_all, choice_map)
|
||||
|
||||
vscales = S.get_voice_scale_1_10(data)[0]
|
||||
df_scale_long = utils.process_voice_scale_data(vscales)
|
||||
|
||||
joined_scale = df_style.join(df_scale_long, on=["_recordId", "Voice"], how="inner")
|
||||
|
||||
df_ranking = utils.process_voice_ranking_data(top3_voices)
|
||||
joined_ranking = df_style.join(df_ranking, on=['_recordId', 'Voice'], how='inner')
|
||||
|
||||
# %%
|
||||
joined_ranking.head()
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
### Colors vs Scale 1-10
|
||||
""")
|
||||
|
||||
# %%
|
||||
# Transform to get one row per color with average correlation
|
||||
color_corr_scale, _ = utils.transform_speaking_style_color_correlation(joined_scale, SPEAKING_STYLES)
|
||||
S.plot_speaking_style_color_correlation(
|
||||
data=color_corr_scale,
|
||||
title="Correlation: Speaking Style Colors and Voice Scale 1-10"
|
||||
)
|
||||
|
||||
# %%
|
||||
mo.md(r"""
|
||||
### Colors vs Ranking Points
|
||||
""")
|
||||
|
||||
# %%
|
||||
color_corr_ranking, _ = utils.transform_speaking_style_color_correlation(
|
||||
joined_ranking,
|
||||
SPEAKING_STYLES,
|
||||
target_column="Ranking_Points"
|
||||
)
|
||||
S.plot_speaking_style_color_correlation(
|
||||
data=color_corr_ranking,
|
||||
title="Correlation: Speaking Style Colors and Voice Ranking Points"
|
||||
)
|
||||
|
||||
# %%
# Gender-filtered correlation plots (Male vs Female voices)
from reference import VOICE_GENDER_MAPPING

# Partition voice ids by the gender recorded in the reference mapping.
MALE_VOICES = [v for v, g in VOICE_GENDER_MAPPING.items() if g == "Male"]
FEMALE_VOICES = [v for v, g in VOICE_GENDER_MAPPING.items() if g == "Female"]

# Filter joined data by voice gender
joined_scale_male = joined_scale.filter(pl.col("Voice").is_in(MALE_VOICES))
joined_scale_female = joined_scale.filter(pl.col("Voice").is_in(FEMALE_VOICES))
joined_ranking_male = joined_ranking.filter(pl.col("Voice").is_in(MALE_VOICES))
joined_ranking_female = joined_ranking.filter(pl.col("Voice").is_in(FEMALE_VOICES))

# Colors vs Scale 1-10 (grouped by voice gender)
S.plot_speaking_style_color_correlation_by_gender(
    data_male=joined_scale_male,
    data_female=joined_scale_female,
    speaking_styles=SPEAKING_STYLES,
    target_column="Voice_Scale_Score",
    title="Correlation: Speaking Style Colors and Voice Scale 1-10 (by Voice Gender)",
    filename="correlation_speaking_style_and_voice_scale_1-10_by_voice_gender_color",
)

# Colors vs Ranking Points (grouped by voice gender)
S.plot_speaking_style_color_correlation_by_gender(
    data_male=joined_ranking_male,
    data_female=joined_ranking_female,
    speaking_styles=SPEAKING_STYLES,
    target_column="Ranking_Points",
    title="Correlation: Speaking Style Colors and Voice Ranking Points (by Voice Gender)",
    filename="correlation_speaking_style_and_voice_ranking_points_by_voice_gender_color",
)
|
||||
|
||||
# %%
mo.md(r"""
### Individual Traits vs Scale 1-10
""")

# %%
# One correlation chart per speaking-style color, accumulated into a single
# markdown cell. Underscore-prefixed names stay cell-local in marimo.
_content = """"""

for _style, _traits in SPEAKING_STYLES.items():
    # print(f"Correlation plot for {style}...")
    _fig = S.plot_speaking_style_scale_correlation(
        data=joined_scale,
        style_color=_style,
        style_traits=_traits,
        title=f"Correlation: Speaking Style {_style} and Voice Scale 1-10",
    )
    _content += f"""
#### Speaking Style **{_style}**:

{mo.ui.altair_chart(_fig)}

"""
mo.md(_content)

# %%
mo.md(r"""
### Individual Traits vs Ranking Points
""")

# %%
# Same per-style loop, but correlating traits against ranking points.
_content = """"""

for _style, _traits in SPEAKING_STYLES.items():
    # print(f"Correlation plot for {style}...")
    _fig = S.plot_speaking_style_ranking_correlation(
        data=joined_ranking,
        style_color=_style,
        style_traits=_traits,
        title=f"Correlation: Speaking Style {_style} and Voice Ranking Points",
    )
    _content += f"""
#### Speaking Style **{_style}**:

{mo.ui.altair_chart(_fig)}

"""
mo.md(_content)

# %%
# Individual Traits vs Scale 1-10 (grouped by voice gender)
_content = """### Individual Traits vs Scale 1-10 (by Voice Gender)\n\n"""

for _style, _traits in SPEAKING_STYLES.items():
    _fig = S.plot_speaking_style_scale_correlation_by_gender(
        data_male=joined_scale_male,
        data_female=joined_scale_female,
        style_color=_style,
        style_traits=_traits,
        title=f"Correlation: Speaking Style {_style} and Voice Scale 1-10 (by Voice Gender)",
        filename=f"correlation_speaking_style_and_voice_scale_1-10_by_voice_gender_{_style.lower()}",
    )
    _content += f"""
#### Speaking Style **{_style}**:

{mo.ui.altair_chart(_fig)}

"""
mo.md(_content)

# %%
# Individual Traits vs Ranking Points (grouped by voice gender)
_content = """### Individual Traits vs Ranking Points (by Voice Gender)\n\n"""

for _style, _traits in SPEAKING_STYLES.items():
    _fig = S.plot_speaking_style_ranking_correlation_by_gender(
        data_male=joined_ranking_male,
        data_female=joined_ranking_female,
        style_color=_style,
        style_traits=_traits,
        title=f"Correlation: Speaking Style {_style} and Voice Ranking Points (by Voice Gender)",
        filename=f"correlation_speaking_style_and_voice_ranking_points_by_voice_gender_{_style.lower()}",
    )
    _content += f"""
#### Speaking Style **{_style}**:

{mo.ui.altair_chart(_fig)}

"""
mo.md(_content)
|
||||
|
||||
# %%
# ## Correlations when "Best Brand Character" is chosen
# For each of the 4 brand characters, filter the dataset to only those respondents
# who selected that character as their #1 choice.

# %%
# Prepare character-filtered data subsets
char_rank_for_filter = S.get_character_ranking(data)[0].collect()

# Display name -> ranking column in char_rank_for_filter.
# NOTE(review): the keys 'Personal Assistant' and 'Bank Teller' map to columns
# containing "The_" ('..._The_Personal_Assistant', '..._The_Bank_Teller') —
# verify this matches the survey export's column naming.
CHARACTER_FILTER_MAP = {
    'Familiar Friend': 'Character_Ranking_Familiar_Friend',
    'The Coach': 'Character_Ranking_The_Coach',
    'Personal Assistant': 'Character_Ranking_The_Personal_Assistant',
    'Bank Teller': 'Character_Ranking_The_Bank_Teller',
}
|
||||
|
||||
def get_filtered_data_for_character(char_name: str) -> tuple[pl.DataFrame, pl.DataFrame, int]:
    """Restrict joined_scale / joined_ranking to respondents whose #1 character is *char_name*.

    Returns (scale_subset, ranking_subset, respondent_count).
    """
    rank_col = CHARACTER_FILTER_MAP[char_name]
    ids = (
        char_rank_for_filter
        .filter(pl.col(rank_col) == 1)
        .select('_recordId')
    )
    scale_subset = joined_scale.join(ids, on='_recordId', how='inner')
    ranking_subset = joined_ranking.join(ids, on='_recordId', how='inner')
    return scale_subset, ranking_subset, ids.height
|
||||
|
||||
def _char_filename(char_name: str, suffix: str) -> str:
|
||||
"""Generate filename for character-filtered plots (without n-value).
|
||||
|
||||
Format: bc_ranked_1_{suffix}__{char_slug}
|
||||
This groups all plot types together in directory listings.
|
||||
"""
|
||||
char_slug = char_name.lower().replace(' ', '_')
|
||||
return f"bc_ranked_1_{suffix}__{char_slug}"
|
||||
|
||||
|
||||
|
||||
# %%
# ### Voice Weighted Ranking Score (by Best Character)
# Materialise top3_voices ONCE — it does not depend on the loop variable, so
# collecting it inside the loop repeated the same (potentially expensive)
# query for every character.
top3_df = top3_voices.collect() if isinstance(top3_voices, pl.LazyFrame) else top3_voices
for char_name in CHARACTER_FILTER_MAP:
    _, _, n = get_filtered_data_for_character(char_name)
    # Respondents who ranked this character #1 (same filter the helper uses).
    respondents = char_rank_for_filter.filter(
        pl.col(CHARACTER_FILTER_MAP[char_name]) == 1
    ).select('_recordId')
    filtered_top3 = top3_df.join(respondents, on='_recordId', how='inner')
    weighted = calculate_weighted_ranking_scores(filtered_top3)
    S.plot_weighted_ranking_score(
        data=weighted,
        title=f'"{char_name}" Ranked #1 (n={n})<br>Most Popular Voice - Weighted Score (1st=3pts, 2nd=2pts, 3rd=1pt)',
        filename=_char_filename(char_name, "voice_weighted_ranking_score"),
        color_gender=COLOR_GENDER,
    )
|
||||
|
||||
# %%
# ### Voice Scale 1-10 Average Scores (by Best Character)
# Materialise voice_1_10 ONCE — loop-invariant, previously re-collected for
# every character iteration.
voice_1_10_df = voice_1_10.collect() if isinstance(voice_1_10, pl.LazyFrame) else voice_1_10
for char_name in CHARACTER_FILTER_MAP:
    _, _, n = get_filtered_data_for_character(char_name)
    # Respondents who ranked this character #1 (same filter the helper uses).
    respondents = char_rank_for_filter.filter(
        pl.col(CHARACTER_FILTER_MAP[char_name]) == 1
    ).select('_recordId')
    filtered_voice_1_10 = voice_1_10_df.join(respondents, on='_recordId', how='inner')
    S.plot_average_scores_with_counts(
        data=filtered_voice_1_10,
        title=f'"{char_name}" Ranked #1 (n={n})<br>Voice General Impression (Scale 1-10)',
        filename=_char_filename(char_name, "voice_scale_1-10"),
        x_label='Voice',
        domain=[1, 10],
        color_gender=COLOR_GENDER,
    )
|
||||
|
||||
|
||||
|
||||
# %%
# ### Speaking Style Colors vs Scale 1-10 (only for Best Character)
# BEST_CHOSEN_CHARACTER (defined earlier in the file) is a slug such as
# 'the_coach'; each loop below skips every other character.
for char_name in CHARACTER_FILTER_MAP:
    if char_name.lower().replace(' ', '_') != BEST_CHOSEN_CHARACTER:
        continue

    filtered_scale, _, n = get_filtered_data_for_character(char_name)
    color_corr, _ = utils.transform_speaking_style_color_correlation(filtered_scale, SPEAKING_STYLES)
    S.plot_speaking_style_color_correlation(
        data=color_corr,
        title=f'"{char_name}" Ranked #1 (n={n})<br>Correlation: Speaking Style Colors vs Voice Scale 1-10',
        filename=_char_filename(char_name, "colors_vs_voice_scale_1-10"),
    )

# %%
# ### Speaking Style Colors vs Ranking Points (only for Best Character)
for char_name in CHARACTER_FILTER_MAP:
    if char_name.lower().replace(' ', '_') != BEST_CHOSEN_CHARACTER:
        continue

    _, filtered_ranking, n = get_filtered_data_for_character(char_name)
    color_corr, _ = utils.transform_speaking_style_color_correlation(
        filtered_ranking, SPEAKING_STYLES, target_column="Ranking_Points"
    )
    S.plot_speaking_style_color_correlation(
        data=color_corr,
        title=f'"{char_name}" Ranked #1 (n={n})<br>Correlation: Speaking Style Colors vs Voice Ranking Points',
        filename=_char_filename(char_name, "colors_vs_voice_ranking_points"),
    )

# %%
# ### Individual Traits vs Scale 1-10 (only for Best Character)
for _style, _traits in SPEAKING_STYLES.items():
    print(f"--- Speaking Style: {_style} ---")
    for char_name in CHARACTER_FILTER_MAP:
        if char_name.lower().replace(' ', '_') != BEST_CHOSEN_CHARACTER:
            continue

        filtered_scale, _, n = get_filtered_data_for_character(char_name)
        S.plot_speaking_style_scale_correlation(
            data=filtered_scale,
            style_color=_style,
            style_traits=_traits,
            title=f'"{char_name}" Ranked #1 (n={n})<br>Correlation: {_style} vs Voice Scale 1-10',
            filename=_char_filename(char_name, f"{_style.lower()}_vs_voice_scale_1-10"),
        )

# %%
# ### Individual Traits vs Ranking Points (only for Best Character)
for _style, _traits in SPEAKING_STYLES.items():
    print(f"--- Speaking Style: {_style} ---")
    for char_name in CHARACTER_FILTER_MAP:
        if char_name.lower().replace(' ', '_') != BEST_CHOSEN_CHARACTER:
            continue

        _, filtered_ranking, n = get_filtered_data_for_character(char_name)
        S.plot_speaking_style_ranking_correlation(
            data=filtered_ranking,
            style_color=_style,
            style_traits=_traits,
            title=f'"{char_name}" Ranked #1 (n={n})<br>Correlation: {_style} vs Voice Ranking Points',
            filename=_char_filename(char_name, f"{_style.lower()}_vs_voice_ranking_points"),
        )
|
||||
|
||||
|
||||
# %%
|
||||
370
XX_statistical_significance.script.py
Normal file
370
XX_statistical_significance.script.py
Normal file
@@ -0,0 +1,370 @@
|
||||
"""Extra statistical significance analyses for quant report."""
|
||||
# %% Imports
|
||||
|
||||
import utils
|
||||
import polars as pl
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
|
||||
# %% Fixed Variables
# Paths to the Qualtrics CSV export and the survey definition (.qsf).
RESULTS_FILE = 'data/exports/2-4-26/JPMC_Chase Brand Personality_Quant Round 1_February 4, 2026_Labels.csv'
QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'


# %% CLI argument parsing for batch automation
# When run as script: uv run XX_statistical_significance.script.py --age '["18 to 21 years"]'
# Central filter configuration - add new filters here only
# Format: 'cli_arg_name': 'QualtricsSurvey.options_* attribute name'
# NOTE(review): this config is duplicated across the XX_*.py scripts and the
# quant report — consider extracting to a shared module.
FILTER_CONFIG = {
    'age': 'options_age',
    'gender': 'options_gender',
    'ethnicity': 'options_ethnicity',
    'income': 'options_income',
    'consumer': 'options_consumer',
    'business_owner': 'options_business_owner',
    'ai_user': 'options_ai_user',
    'investable_assets': 'options_investable_assets',
    'industry': 'options_industry',
}
|
||||
|
||||
def parse_cli_args():
    """Parse CLI filter arguments, or return defaults when interactive.

    Returns an argparse.Namespace with one attribute per FILTER_CONFIG key
    (each a list of values, or None meaning "no filter"), plus `filter_name`
    and `figures_dir`.
    """
    # Single source of truth for the default figures directory; previously
    # this string was duplicated between the argparse default and the
    # interactive fallback below.
    default_fig_dir = f'figures/statistical_significance/{Path(RESULTS_FILE).parts[2]}'

    parser = argparse.ArgumentParser(description='Generate quant report with optional filters')

    # Dynamically add filter arguments from config
    for filter_name in FILTER_CONFIG:
        parser.add_argument(f'--{filter_name}', type=str, default=None, help=f'JSON list of {filter_name} values')

    parser.add_argument('--filter-name', type=str, default=None, help='Name for this filter combination (used for .txt description file)')
    parser.add_argument('--figures-dir', type=str, default=default_fig_dir, help='Override the default figures directory')

    # Only parse if running as script (not in Jupyter/interactive)
    try:
        # Check if running in Jupyter by looking for ipykernel
        get_ipython()  # noqa: F821 # type: ignore
        # Interactive session: no CLI args; all filters off.
        no_filters = {f: None for f in FILTER_CONFIG}
        return argparse.Namespace(**no_filters, filter_name=None, figures_dir=default_fig_dir)
    except NameError:
        args = parser.parse_args()
        # Parse JSON strings to lists
        for filter_name in FILTER_CONFIG:
            val = getattr(args, filter_name)
            setattr(args, filter_name, json.loads(val) if val else None)
        return args
|
||||
|
||||
cli_args = parse_cli_args()


# %%
# Survey wrapper: owns data loading, filtering and the figures output dir.
S = utils.QualtricsSurvey(RESULTS_FILE, QSF_FILE, figures_dir=cli_args.figures_dir)
data_all = S.load_data()


# %% Build filtered dataset based on CLI args

# CLI args: None means "no filter applied" - filter_data() will skip None filters

# Build filter values dict dynamically from FILTER_CONFIG
_active_filters = {filter_name: getattr(cli_args, filter_name) for filter_name in FILTER_CONFIG}

_d = S.filter_data(data_all, **_active_filters)

# Write filter description file if filter-name is provided
if cli_args.filter_name and S.fig_save_dir:
    # Get the filter slug (e.g., "All_Respondents", "Cons-Starter", etc.)
    _filter_slug = S._get_filter_slug()
    _filter_slug_dir = S.fig_save_dir / _filter_slug
    _filter_slug_dir.mkdir(parents=True, exist_ok=True)

    # Build filter description
    _filter_desc_lines = [
        f"Filter: {cli_args.filter_name}",
        "",
        "Applied Filters:",
    ]
    _short_desc_parts = []
    for filter_name, options_attr in FILTER_CONFIG.items():
        all_options = getattr(S, options_attr)
        values = _active_filters[filter_name]
        display_name = filter_name.replace('_', ' ').title()
        # None means no filter applied (same as "All")
        if values is not None and values != all_options:
            _short_desc_parts.append(f"{display_name}: {', '.join(values)}")
            _filter_desc_lines.append(f" {display_name}: {', '.join(values)}")
        else:
            _filter_desc_lines.append(f" {display_name}: All")

    # Write detailed description INSIDE the filter-slug directory
    # Sanitize filter name for filename usage (replace / and other chars)
    _safe_filter_name = re.sub(r'[^\w\s-]', '_', cli_args.filter_name)
    _filter_file = _filter_slug_dir / f"{_safe_filter_name}.txt"
    _filter_file.write_text('\n'.join(_filter_desc_lines))

    # Append to summary index file at figures/<export_date>/filter_index.txt
    _summary_file = S.fig_save_dir / "filter_index.txt"
    _short_desc = "; ".join(_short_desc_parts) if _short_desc_parts else "All Respondents"
    _summary_line = f"{_filter_slug} | {cli_args.filter_name} | {_short_desc}\n"

    # Append or create the summary file
    if _summary_file.exists():
        _existing = _summary_file.read_text()
        # Avoid duplicate entries for same slug
        if _filter_slug not in _existing:
            with _summary_file.open('a') as f:
                f.write(_summary_line)
    else:
        _header = "Filter Index\n" + "=" * 80 + "\n\n"
        _header += "Directory | Filter Name | Description\n"
        _header += "-" * 80 + "\n"
        _summary_file.write_text(_header + _summary_line)

# Save to logical variable name for further analysis
data = _d
# Eagerly evaluate once so filter errors surface here rather than downstream.
data.collect()
|
||||
|
||||
# %% Character coach significantly higher than others

# Per-respondent character rankings (columns = characters, values = rank).
char_rank = S.get_character_ranking(data)[0]


# Pairwise z-tests on each character's rank-1 share (see methodology below).
_pairwise_df, _meta = S.compute_ranking_significance(
    char_rank,
    alpha=0.05,
    correction="none",
)

# %% [markdown]
"""
### Methodology Analysis

**Input Data (`char_rank`)**:
* Generated by `S.get_character_ranking(data)`.
* Contains the ranking values (1st, 2nd, 3rd, 4th) assigned by each respondent to the four options ("The Coach", etc.).
* Columns represent the characters; rows represent individual respondents; values are the numerical rank (1 = Top Choice).

**Processing**:
* The function `compute_ranking_significance` aggregates these rankings to find the **"Rank 1 Share"** (the percentage of respondents who picked that character as their #1 favorite).
* It builds a contingency table of how many times each character was ranked 1st vs. not 1st (or 1st v 2nd v 3rd).

**Statistical Test**:
* **Test Used**: Pairwise Z-test for two proportions (uncorrected).
* **Comparison**: It compares the **Rank 1 Share** of every pair of characters.
    * *Example*: "Is the 42% of people who chose 'Coach' significantly different from the 29% who chose 'Familiar Friend'?"
* **Significance**: A result of `p < 0.05` means the difference in popularity (top-choice preference) is statistically significant and not due to random chance.
"""


# %% Plot heatmap of pairwise significance
S.plot_significance_heatmap(_pairwise_df, metadata=_meta, title="Statistical Significance: Character Top Choice Preference")

# %% Plot summary of significant differences (e.g., which characters are significantly higher than others)
# S.plot_significance_summary(_pairwise_df, metadata=_meta)

# %% [markdown]
"""
# Analysis: Significance of "The Coach"

**Parameters**: `alpha=0.05`, `correction='none'`
* **Rationale**: No correction was applied to allow for detection of all potential pairwise differences (uncorrected p < 0.05). If strict control for family-wise error rate were required (e.g., Bonferroni), the significance threshold would be lower (p < 0.0083).

**Results**:
"The Coach" is the top-ranked option (42.0% Rank 1 share) and shows strong separation from the field.

* **Vs. Bottom Two**: "The Coach" is significantly higher than both "The Bank Teller" (26.9%, p < 0.001) and "Familiar Friend" (29.4%, p < 0.001).
* **Vs. Runner-Up**: "The Coach" is widely preferred over "The Personal Assistant" (33.4%). The difference of **8.6 percentage points** is statistically significant (p = 0.017) at the standard 0.05 level.
    * *Note*: While p=0.017 is significant in isolation, it would not meet the stricter Bonferroni threshold (0.0083). However, the effect size (+8.6%) is commercially meaningful.

**Conclusion**:
Yes, "The Coach" can be considered statistically more significant than the other options. It is clearly superior to the bottom two options and holds a statistically significant lead over the runner-up ("Personal Assistant") in direct comparison.
"""


# %% Mentions significance analysis

# Total-mentions test: appearances anywhere in the top 3 (visibility),
# not just rank-1 wins.
char_pairwise_df_mentions, _meta_mentions = S.compute_mentions_significance(
    char_rank,
    alpha=0.05,
    correction="none",
)

S.plot_significance_heatmap(
    char_pairwise_df_mentions,
    metadata=_meta_mentions,
    title="Statistical Significance: Character Total Mentions (Top 3 Visibility)"
)
|
||||
|
||||
|
||||
# %% voices analysis

# Top-3 voice rankings per respondent.
top3_voices = S.get_top_3_voices(data)[0]

# Same pairwise rank-1-share test as the character analysis above.
_pairwise_df_voice, _metadata = S.compute_ranking_significance(
    top3_voices,
    alpha=0.05,
    correction="none",
)

S.plot_significance_heatmap(
    _pairwise_df_voice,
    metadata=_metadata,
    title="Statistical Significance: Voice Top Choice Preference"
)
|
||||
# %% Total Mentions Significance (Rank 1+2+3 Combined)
# This tests "Quantity" (Visibility) instead of "Quality" (Preference)

# Counts appearances anywhere in a respondent's top 3, then pairwise-tests
# the resulting shares.
_pairwise_df_mentions, _meta_mentions = S.compute_mentions_significance(
    top3_voices,
    alpha=0.05,
    correction="none"
)

S.plot_significance_heatmap(
    _pairwise_df_mentions,
    metadata=_meta_mentions,
    title="Statistical Significance: Voice Total Mentions (Top 3 Visibility)"
)
|
||||
# %% Male Voices Only Analysis
|
||||
import reference
|
||||
|
||||
def filter_voices_by_gender(df: pl.DataFrame, target_gender: str) -> pl.DataFrame:
    """Keep only the ranking columns whose voice matches *target_gender*.

    Voice columns look like ``Top_3_Voices_ranking__V14``; the identifier
    column ``_recordId`` is retained when present. Any other non-voice
    column is dropped.
    """
    gender_of = reference.VOICE_GENDER_MAPPING.get
    keep = ['_recordId'] if '_recordId' in df.columns else []
    keep.extend(
        col for col in df.columns
        if '__V' in col and gender_of(col.split('__')[1]) == target_gender
    )
    return df.select(keep)
|
||||
|
||||
# Get full ranking data as DataFrame
df_voices = top3_voices.collect()

# Filter for Male voices
df_male_voices = filter_voices_by_gender(df_voices, 'Male')

# 1. Male Voices: Top Choice Preference (Rank 1)
_pairwise_male_pref, _meta_male_pref = S.compute_ranking_significance(
    df_male_voices,
    alpha=0.05,
    correction="none"
)

S.plot_significance_heatmap(
    _pairwise_male_pref,
    metadata=_meta_male_pref,
    title="Male Voices Only: Top Choice Preference Significance"
)

# 2. Male Voices: Total Mentions (Visibility)
_pairwise_male_vis, _meta_male_vis = S.compute_mentions_significance(
    df_male_voices,
    alpha=0.05,
    correction="none"
)

S.plot_significance_heatmap(
    _pairwise_male_vis,
    metadata=_meta_male_vis,
    title="Male Voices Only: Total Mentions Significance"
)
# %% Male Voices (Excluding Bottom 3: V88, V86, V81)

# Start with the male voices dataframe from the previous step
# NOTE(review): the "bottom 3" voices are hard-coded from an earlier read of
# the results — re-check if the data export changes.
voices_to_exclude = ['V88', 'V86', 'V81']
|
||||
def filter_exclude_voices(df: pl.DataFrame, exclude_list: list[str]) -> pl.DataFrame:
    """Drop the ranking columns whose voice id appears in *exclude_list*.

    Mirrors filter_voices_by_gender: ``_recordId`` is kept when present,
    every remaining ``...__Vxx`` column is kept unless its voice id is
    excluded, and any other non-voice column is dropped.
    """
    excluded = set(exclude_list)
    keep = ['_recordId'] if '_recordId' in df.columns else []
    keep.extend(
        col for col in df.columns
        if '__V' in col and col.split('__')[1] not in excluded
    )
    return df.select(keep)
|
||||
|
||||
# Male voices with the hard-coded bottom 3 removed.
df_male_top = filter_exclude_voices(df_male_voices, voices_to_exclude)

# 1. Male Top Candidates: Top Choice Preference
_pairwise_male_top_pref, _meta_male_top_pref = S.compute_ranking_significance(
    df_male_top,
    alpha=0.05,
    correction="none"
)

S.plot_significance_heatmap(
    _pairwise_male_top_pref,
    metadata=_meta_male_top_pref,
    title="Male Voices (Excl. Bottom 3): Top Choice Preference Significance"
)

# 2. Male Top Candidates: Total Mentions
_pairwise_male_top_vis, _meta_male_top_vis = S.compute_mentions_significance(
    df_male_top,
    alpha=0.05,
    correction="none"
)

S.plot_significance_heatmap(
    _pairwise_male_top_vis,
    metadata=_meta_male_top_vis,
    title="Male Voices (Excl. Bottom 3): Total Mentions Significance"
)

# %% [markdown]
"""
# Rank 1 Selection Significance (Voice Level)

Similar to the Total Mentions significance analysis above, but counting
only how many times each voice was ranked **1st** (out of all respondents).
This isolates first-choice preference rather than overall top-3 visibility.
"""

# %% Rank 1 Significance: All Voices

_pairwise_df_rank1, _meta_rank1 = S.compute_rank1_significance(
    top3_voices,
    alpha=0.05,
    correction="none",
)

S.plot_significance_heatmap(
    _pairwise_df_rank1,
    metadata=_meta_rank1,
    title="Statistical Significance: Voice Rank 1 Selection"
)

# %% Rank 1 Significance: Male Voices Only

_pairwise_df_rank1_male, _meta_rank1_male = S.compute_rank1_significance(
    df_male_voices,
    alpha=0.05,
    correction="none",
)

S.plot_significance_heatmap(
    _pairwise_df_rank1_male,
    metadata=_meta_rank1_male,
    title="Male Voices Only: Rank 1 Selection Significance"
)
||||
# %%
|
||||
267
XX_straight_liners.py
Normal file
267
XX_straight_liners.py
Normal file
@@ -0,0 +1,267 @@
|
||||
"""Extra analyses of the straight-liners"""
|
||||
# %% Imports
|
||||
|
||||
import utils
|
||||
import polars as pl
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
from pathlib import Path
|
||||
from validation import check_straight_liners
|
||||
|
||||
|
||||
# %% Fixed Variables
# Paths to the Qualtrics CSV export and the survey definition (.qsf).
RESULTS_FILE = 'data/exports/2-4-26/JPMC_Chase Brand Personality_Quant Round 1_February 4, 2026_Labels.csv'
QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'


# %% CLI argument parsing for batch automation
# When run as script: uv run XX_straight_liners.py --age '["18 to 21 years"]'
# Central filter configuration - add new filters here only
# Format: 'cli_arg_name': 'QualtricsSurvey.options_* attribute name'
# NOTE(review): duplicated verbatim from XX_statistical_significance.script.py —
# consider extracting to a shared module.
FILTER_CONFIG = {
    'age': 'options_age',
    'gender': 'options_gender',
    'ethnicity': 'options_ethnicity',
    'income': 'options_income',
    'consumer': 'options_consumer',
    'business_owner': 'options_business_owner',
    'ai_user': 'options_ai_user',
    'investable_assets': 'options_investable_assets',
    'industry': 'options_industry',
}
|
||||
|
||||
def parse_cli_args():
    """Parse CLI filter arguments, or return defaults when interactive.

    Returns an argparse.Namespace with one attribute per FILTER_CONFIG key
    (each a list of values, or None meaning "no filter"), plus `filter_name`
    and `figures_dir`.
    """
    # Single source of truth for the default figures directory; previously
    # this string was duplicated between the argparse default and the
    # interactive fallback below.
    default_fig_dir = f'figures/straight-liner-analysis/{Path(RESULTS_FILE).parts[2]}'

    parser = argparse.ArgumentParser(description='Generate quant report with optional filters')

    # Dynamically add filter arguments from config
    for filter_name in FILTER_CONFIG:
        parser.add_argument(f'--{filter_name}', type=str, default=None, help=f'JSON list of {filter_name} values')

    parser.add_argument('--filter-name', type=str, default=None, help='Name for this filter combination (used for .txt description file)')
    parser.add_argument('--figures-dir', type=str, default=default_fig_dir, help='Override the default figures directory')

    # Only parse if running as script (not in Jupyter/interactive)
    try:
        # Check if running in Jupyter by looking for ipykernel
        get_ipython()  # noqa: F821 # type: ignore
        # Interactive session: no CLI args; all filters off.
        no_filters = {f: None for f in FILTER_CONFIG}
        return argparse.Namespace(**no_filters, filter_name=None, figures_dir=default_fig_dir)
    except NameError:
        args = parser.parse_args()
        # Parse JSON strings to lists
        for filter_name in FILTER_CONFIG:
            val = getattr(args, filter_name)
            setattr(args, filter_name, json.loads(val) if val else None)
        return args
|
||||
|
||||
cli_args = parse_cli_args()


# %%
# Survey wrapper: owns data loading, filtering and the figures output dir.
S = utils.QualtricsSurvey(RESULTS_FILE, QSF_FILE, figures_dir=cli_args.figures_dir)
data_all = S.load_data()


# %% Build filtered dataset based on CLI args

# CLI args: None means "no filter applied" - filter_data() will skip None filters

# Build filter values dict dynamically from FILTER_CONFIG
_active_filters = {filter_name: getattr(cli_args, filter_name) for filter_name in FILTER_CONFIG}

_d = S.filter_data(data_all, **_active_filters)

# Write filter description file if filter-name is provided
if cli_args.filter_name and S.fig_save_dir:
    # Get the filter slug (e.g., "All_Respondents", "Cons-Starter", etc.)
    _filter_slug = S._get_filter_slug()
    _filter_slug_dir = S.fig_save_dir / _filter_slug
    _filter_slug_dir.mkdir(parents=True, exist_ok=True)

    # Build filter description
    _filter_desc_lines = [
        f"Filter: {cli_args.filter_name}",
        "",
        "Applied Filters:",
    ]
    _short_desc_parts = []
    for filter_name, options_attr in FILTER_CONFIG.items():
        all_options = getattr(S, options_attr)
        values = _active_filters[filter_name]
        display_name = filter_name.replace('_', ' ').title()
        # None means no filter applied (same as "All")
        if values is not None and values != all_options:
            _short_desc_parts.append(f"{display_name}: {', '.join(values)}")
            _filter_desc_lines.append(f" {display_name}: {', '.join(values)}")
        else:
            _filter_desc_lines.append(f" {display_name}: All")

    # Write detailed description INSIDE the filter-slug directory
    # Sanitize filter name for filename usage (replace / and other chars)
    _safe_filter_name = re.sub(r'[^\w\s-]', '_', cli_args.filter_name)
    _filter_file = _filter_slug_dir / f"{_safe_filter_name}.txt"
    _filter_file.write_text('\n'.join(_filter_desc_lines))

    # Append to summary index file at figures/<export_date>/filter_index.txt
    _summary_file = S.fig_save_dir / "filter_index.txt"
    _short_desc = "; ".join(_short_desc_parts) if _short_desc_parts else "All Respondents"
    _summary_line = f"{_filter_slug} | {cli_args.filter_name} | {_short_desc}\n"

    # Append or create the summary file
    if _summary_file.exists():
        _existing = _summary_file.read_text()
        # Avoid duplicate entries for same slug
        if _filter_slug not in _existing:
            with _summary_file.open('a') as f:
                f.write(_summary_line)
    else:
        _header = "Filter Index\n" + "=" * 80 + "\n\n"
        _header += "Directory | Filter Name | Description\n"
        _header += "-" * 80 + "\n"
        _summary_file.write_text(_header + _summary_line)

# Save to logical variable name for further analysis
data = _d
# Eagerly evaluate once so filter errors surface here rather than downstream.
data.collect()
|
||||
|
||||
|
||||
# %% Determine straight-liner repeat offenders
# Extract question groups with renamed columns that check_straight_liners expects.
# The raw `data` has QID-based column names; the getter methods rename them to
# patterns like SS_Green_Blue__V14__Choice_1, Voice_Scale_1_10__V48, etc.

ss_or, _ = S.get_ss_orange_red(data)
ss_gb, _ = S.get_ss_green_blue(data)
vs, _ = S.get_voice_scale_1_10(data)

# Combine all question groups into one wide LazyFrame (joined on _recordId)
all_questions = ss_or.join(ss_gb, on='_recordId').join(vs, on='_recordId')

# Run straight-liner detection across all question groups
# max_score=5 catches all speaking-style straight-lining (1-5 scale)
# and voice-scale values ≤5 on the 1-10 scale
# Note: sl_threshold is NOT set on S here — this script analyses straight-liners,
# it doesn't filter them out of the dataset.
print("Running straight-liner detection across all question groups...")
sl_report, sl_df = check_straight_liners(all_questions, max_score=5)
|
||||
|
||||
# %% Quantify repeat offenders
# sl_df has one row per (Record ID, Question Group) that was straight-lined.
# Group by Record ID to count how many question groups each person SL'd.
# NOTE(review): the `# %%` markers further down sit INSIDE this `if` body —
# the file only runs correctly as a plain script, not cell-by-cell; confirm
# this is intentional.

if sl_df is not None and not sl_df.is_empty():
    total_respondents = data.select(pl.len()).collect().item()

    # Per-respondent count of straight-lined question groups
    respondent_sl_counts = (
        sl_df
        .group_by("Record ID")
        .agg(pl.len().alias("sl_count"))
        .sort("sl_count", descending=True)
    )

    max_sl = respondent_sl_counts["sl_count"].max()
    print(f"\nTotal respondents: {total_respondents}")
    print(f"Respondents who straight-lined at least 1 question group: "
          f"{respondent_sl_counts.height}")
    print(f"Maximum question groups straight-lined by one person: {max_sl}")
    print()

    # Build cumulative distribution: for each threshold N, count respondents
    # who straight-lined >= N question groups
    cumulative_rows = []
    for threshold in range(1, max_sl + 1):
        count = respondent_sl_counts.filter(
            pl.col("sl_count") >= threshold
        ).height
        pct = (count / total_respondents) * 100
        cumulative_rows.append({
            "threshold": threshold,
            "count": count,
            "pct": pct,
        })
        print(
            f" ≥{threshold} question groups straight-lined: "
            f"{count} respondents ({pct:.1f}%)"
        )

    cumulative_df = pl.DataFrame(cumulative_rows)
    print(f"\n{cumulative_df}")

# %% Save cumulative data to CSV
    _filter_slug = S._get_filter_slug()
    _csv_dir = Path(S.fig_save_dir) / _filter_slug
    _csv_dir.mkdir(parents=True, exist_ok=True)

    _csv_path = _csv_dir / "straight_liner_repeat_offenders.csv"
    cumulative_df.write_csv(_csv_path)
    print(f"Saved cumulative data to {_csv_path}")

# %% Plot the cumulative distribution
    S.plot_straight_liner_repeat_offenders(
        cumulative_df,
        total_respondents=total_respondents,
    )

# %% Per-question straight-lining frequency
    # Build human-readable question group names from the raw keys
    def _humanise_question_group(key: str) -> str:
        """Convert internal question group key to a readable label.

        Examples:
            SS_Green_Blue__V14 → Green/Blue – V14
            SS_Orange_Red__V48 → Orange/Red – V48
            Voice_Scale_1_10 → Voice Scale (1-10)
        """
        if key.startswith("SS_Green_Blue__"):
            voice = key.split("__")[1]
            return f"Green/Blue – {voice}"
        if key.startswith("SS_Orange_Red__"):
            voice = key.split("__")[1]
            return f"Orange/Red – {voice}"
        if key == "Voice_Scale_1_10":
            return "Voice Scale (1-10)"
        # Fallback: replace underscores
        return key.replace("_", " ")

    # Unique straight-liners per question group, plus share of all respondents.
    per_question_counts = (
        sl_df
        .group_by("Question Group")
        .agg(pl.col("Record ID").n_unique().alias("count"))
        .sort("count", descending=True)
        .with_columns(
            (pl.col("count") / total_respondents * 100).alias("pct")
        )
    )

    # Add human-readable names
    per_question_counts = per_question_counts.with_columns(
        pl.col("Question Group").map_elements(
            _humanise_question_group, return_dtype=pl.Utf8
        ).alias("question")
    )

    print("\n--- Per-Question Straight-Lining Frequency ---")
    print(per_question_counts)

    # Save per-question data to CSV
    _csv_path_pq = _csv_dir / "straight_liner_per_question.csv"
    per_question_counts.write_csv(_csv_path_pq)
    print(f"Saved per-question data to {_csv_path_pq}")

    # Plot
    S.plot_straight_liner_per_question(
        per_question_counts,
        total_respondents=total_respondents,
    )

# %% Show the top repeat offenders (respondents with most SL'd groups)
    print("\n--- Top Repeat Offenders ---")
    print(respondent_sl_counts.head(20))

else:
    print("No straight-liners detected in the dataset.")
|
||||
1359
analysis_missing_voice_ranking.ipynb
Normal file
1359
analysis_missing_voice_ranking.ipynb
Normal file
File diff suppressed because one or more lines are too long
BIN
docs/README.pdf
Normal file
BIN
docs/README.pdf
Normal file
Binary file not shown.
104
docs/figures_structure_manual.md
Normal file
104
docs/figures_structure_manual.md
Normal file
@@ -0,0 +1,104 @@
|
||||
# Appendix: Quantitative Analysis Plots - Folder Structure Manual
|
||||
|
||||
This folder contains all the quantitative analysis plots, sorted by the filters applied to the dataset. Each folder corresponds to a specific demographic cut.
|
||||
|
||||
## Folder Overview
|
||||
|
||||
* `All_Respondents/`: Analysis of the full dataset (no filters).
|
||||
* `filter_index.txt`: A master list of every folder code and its corresponding demographic filter.
|
||||
* **Filter Folders**: All other folders represent specific demographic cuts (e.g., `Age-18to21years`, `Gen-Woman`).
|
||||
|
||||
## How to Navigate
|
||||
|
||||
Each folder contains the same set of charts generated for that specific filter.
|
||||
|
||||
## Directory Reference Table
|
||||
|
||||
Below is the complete list of folder names. These names are encodings of the filters applied to the dataset, which we use to maintain consistency across our analysis.
|
||||
|
||||
| Directory Code | Filter Description |
|
||||
| :--- | :--- |
|
||||
| All_Respondents | All Respondents |
|
||||
| Age-18to21years | Age: 18 to 21 years |
|
||||
| Age-22to24years | Age: 22 to 24 years |
|
||||
| Age-25to34years | Age: 25 to 34 years |
|
||||
| Age-35to40years | Age: 35 to 40 years |
|
||||
| Age-41to50years | Age: 41 to 50 years |
|
||||
| Age-51to59years | Age: 51 to 59 years |
|
||||
| Age-60to70years | Age: 60 to 70 years |
|
||||
| Age-70yearsormore | Age: 70 years or more |
|
||||
| Gen-Man | Gender: Man |
|
||||
| Gen-Prefernottosay | Gender: Prefer not to say |
|
||||
| Gen-Woman | Gender: Woman |
|
||||
| Eth-6_grps_c64411 | Ethnicity: All options containing 'Alaska Native or Indigenous American' |
|
||||
| Eth-6_grps_8f145b | Ethnicity: All options containing 'Asian or Asian American' |
|
||||
| Eth-8_grps_71ac47 | Ethnicity: All options containing 'Black or African American' |
|
||||
| Eth-7_grps_c5b3ce | Ethnicity: All options containing 'Hispanic or Latinx' |
|
||||
| Eth-BlackorAfricanAmerican<br>MiddleEasternorNorthAfrican<br>WhiteorCaucasian+<br>MiddleEasternorNorthAfrican | Ethnicity: Middle Eastern or North African |
|
||||
| Eth-AsianorAsianAmericanBlackorAfricanAmerican<br>NativeHawaiianorOtherPacificIslander+<br>NativeHawaiianorOtherPacificIslander | Ethnicity: Native Hawaiian or Other Pacific Islander |
|
||||
| Eth-10_grps_cef760 | Ethnicity: All options containing 'White or Caucasian' |
|
||||
| Inc-100000to149999 | Income: $100,000 to $149,999 |
|
||||
| Inc-150000to199999 | Income: $150,000 to $199,999 |
|
||||
| Inc-200000ormore | Income: $200,000 or more |
|
||||
| Inc-25000to34999 | Income: $25,000 to $34,999 |
|
||||
| Inc-35000to54999 | Income: $35,000 to $54,999 |
|
||||
| Inc-55000to79999 | Income: $55,000 to $79,999 |
|
||||
| Inc-80000to99999 | Income: $80,000 to $99,999 |
|
||||
| Inc-Lessthan25000 | Income: Less than $25,000 |
|
||||
| Cons-Lower_Mass_A+Lower_Mass_B | Consumer: Lower_Mass_A, Lower_Mass_B |
|
||||
| Cons-MassAffluent_A+MassAffluent_B | Consumer: MassAffluent_A, MassAffluent_B |
|
||||
| Cons-Mass_A+Mass_B | Consumer: Mass_A, Mass_B |
|
||||
| Cons-Mix_of_Affluent_Wealth__<br>High_Net_Woth_A+<br>Mix_of_Affluent_Wealth__<br>High_Net_Woth_B | Consumer: Mix_of_Affluent_Wealth_&_High_Net_Woth_A, Mix_of_Affluent_Wealth_&_High_Net_Woth_B |
|
||||
| Cons-Early_Professional | Consumer: Early_Professional |
|
||||
| Cons-Lower_Mass_B | Consumer: Lower_Mass_B |
|
||||
| Cons-MassAffluent_B | Consumer: MassAffluent_B |
|
||||
| Cons-Mass_B | Consumer: Mass_B |
|
||||
| Cons-Mix_of_Affluent_Wealth__<br>High_Net_Woth_B | Consumer: Mix_of_Affluent_Wealth_&_High_Net_Woth_B |
|
||||
| Cons-Starter | Consumer: Starter |
|
||||
| BizOwn-No | Business Owner: No |
|
||||
| BizOwn-Yes | Business Owner: Yes |
|
||||
| AI-Daily | Ai User: Daily |
|
||||
| AI-Lessthanonceamonth | Ai User: Less than once a month |
|
||||
| AI-Morethanoncedaily | Ai User: More than once daily |
|
||||
| AI-Multipletimesperweek | Ai User: Multiple times per week |
|
||||
| AI-Onceamonth | Ai User: Once a month |
|
||||
| AI-Onceaweek | Ai User: Once a week |
|
||||
| AI-RarelyNever | Ai User: Rarely/Never |
|
||||
| AI-Daily+<br>Morethanoncedaily+<br>Multipletimesperweek | Ai User: Daily, More than once daily, Multiple times per week |
|
||||
| AI-4_grps_d4f57a | Ai User: Once a week, Once a month, Less than once a month, Rarely/Never |
|
||||
| InvAsts-0to24999 | Investable Assets: $0 to $24,999 |
|
||||
| InvAsts-150000to249999 | Investable Assets: $150,000 to $249,999 |
|
||||
| InvAsts-1Mto4.9M | Investable Assets: $1M to $4.9M |
|
||||
| InvAsts-25000to49999 | Investable Assets: $25,000 to $49,999 |
|
||||
| InvAsts-250000to499999 | Investable Assets: $250,000 to $499,999 |
|
||||
| InvAsts-50000to149999 | Investable Assets: $50,000 to $149,999 |
|
||||
| InvAsts-500000to999999 | Investable Assets: $500,000 to $999,999 |
|
||||
| InvAsts-5Mormore | Investable Assets: $5M or more |
|
||||
| InvAsts-Prefernottoanswer | Investable Assets: Prefer not to answer |
|
||||
| Ind-Agricultureforestryfishingorhunting | Industry: Agriculture, forestry, fishing, or hunting |
|
||||
| Ind-Artsentertainmentorrecreation | Industry: Arts, entertainment, or recreation |
|
||||
| Ind-Broadcasting | Industry: Broadcasting |
|
||||
| Ind-Construction | Industry: Construction |
|
||||
| Ind-EducationCollegeuniversityoradult | Industry: Education – College, university, or adult |
|
||||
| Ind-EducationOther | Industry: Education – Other |
|
||||
| Ind-EducationPrimarysecondaryK-12 | Industry: Education – Primary/secondary (K-12) |
|
||||
| Ind-Governmentandpublicadministration | Industry: Government and public administration |
|
||||
| Ind-Hotelandfoodservices | Industry: Hotel and food services |
|
||||
| Ind-InformationOther | Industry: Information – Other |
|
||||
| Ind-InformationServicesanddata | Industry: Information – Services and data |
|
||||
| Ind-Legalservices | Industry: Legal services |
|
||||
| Ind-ManufacturingComputerandelectronics | Industry: Manufacturing – Computer and electronics |
|
||||
| Ind-ManufacturingOther | Industry: Manufacturing – Other |
|
||||
| Ind-Notemployed | Industry: Not employed |
|
||||
| Ind-Otherindustrypleasespecify | Industry: Other industry (please specify) |
|
||||
| Ind-Processing | Industry: Processing |
|
||||
| Ind-Publishing | Industry: Publishing |
|
||||
| Ind-Realestaterentalorleasing | Industry: Real estate, rental, or leasing |
|
||||
| Ind-Retired | Industry: Retired |
|
||||
| Ind-Scientificortechnicalservices | Industry: Scientific or technical services |
|
||||
| Ind-Software | Industry: Software |
|
||||
| Ind-Telecommunications | Industry: Telecommunications |
|
||||
| Ind-Transportationandwarehousing | Industry: Transportation and warehousing |
|
||||
| Ind-Utilities | Industry: Utilities |
|
||||
| Ind-Wholesale | Industry: Wholesale |
|
||||
|
||||
3
potential_dataset_issues.md
Normal file
3
potential_dataset_issues.md
Normal file
@@ -0,0 +1,3 @@
|
||||
- V46 not in scale 1-10. Qualtrics
|
||||
- Straightliners
|
||||
- V45 goed in qual maar slecht in quant
|
||||
@@ -12,6 +12,8 @@ Runs 03_quant_report.script.py for each single-filter combination:
|
||||
Usage:
|
||||
uv run python run_filter_combinations.py
|
||||
uv run python run_filter_combinations.py --dry-run # Preview combinations without running
|
||||
uv run python run_filter_combinations.py --category age # Only run age combinations
|
||||
uv run python run_filter_combinations.py --category consumer # Only run consumer segment combinations
|
||||
"""
|
||||
|
||||
import subprocess
|
||||
@@ -31,86 +33,171 @@ QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_P
|
||||
REPORT_SCRIPT = Path(__file__).parent / '03_quant_report.script.py'
|
||||
|
||||
|
||||
def get_filter_combinations(survey: QualtricsSurvey) -> list[dict]:
|
||||
def get_filter_combinations(survey: QualtricsSurvey, category: str = None) -> list[dict]:
|
||||
"""
|
||||
Generate all single-filter combinations.
|
||||
|
||||
Each combination isolates ONE filter value while keeping all others at "all selected".
|
||||
Returns list of dicts with filter kwargs for each run.
|
||||
|
||||
Args:
|
||||
survey: QualtricsSurvey instance with loaded data
|
||||
category: Optional filter category to limit combinations to.
|
||||
Valid values: 'all', 'age', 'gender', 'ethnicity', 'income', 'consumer',
|
||||
'business_owner', 'ai_user', 'investable_assets', 'industry'
|
||||
If None or 'all', generates all combinations.
|
||||
|
||||
Returns:
|
||||
List of dicts with filter kwargs for each run.
|
||||
"""
|
||||
combinations = []
|
||||
|
||||
# Add "All Respondents" run (no filters = all options selected)
|
||||
combinations.append({
|
||||
'name': 'All_Respondents',
|
||||
'filters': {} # Empty = use defaults (all selected)
|
||||
})
|
||||
if not category or category in ['all_filters', 'all']:
|
||||
combinations.append({
|
||||
'name': 'All_Respondents',
|
||||
'filters': {} # Empty = use defaults (all selected)
|
||||
})
|
||||
|
||||
# Age groups - one at a time
|
||||
for age in survey.options_age:
|
||||
combinations.append({
|
||||
'name': f'Age-{age}',
|
||||
'filters': {'age': [age]}
|
||||
})
|
||||
if not category or category in ['all_filters', 'age']:
|
||||
for age in survey.options_age:
|
||||
combinations.append({
|
||||
'name': f'Age-{age}',
|
||||
'filters': {'age': [age]}
|
||||
})
|
||||
|
||||
# Gender - one at a time
|
||||
for gender in survey.options_gender:
|
||||
combinations.append({
|
||||
'name': f'Gender-{gender}',
|
||||
'filters': {'gender': [gender]}
|
||||
})
|
||||
if not category or category in ['all_filters', 'gender']:
|
||||
for gender in survey.options_gender:
|
||||
combinations.append({
|
||||
'name': f'Gender-{gender}',
|
||||
'filters': {'gender': [gender]}
|
||||
})
|
||||
|
||||
# Ethnicity - grouped by individual values
|
||||
# Ethnicity options are comma-separated (e.g., "White or Caucasian, Hispanic or Latino")
|
||||
# Create filters that include ALL options containing each individual ethnicity value
|
||||
ethnicity_values = set()
|
||||
for ethnicity_option in survey.options_ethnicity:
|
||||
# Split by comma and strip whitespace
|
||||
values = [v.strip() for v in ethnicity_option.split(',')]
|
||||
ethnicity_values.update(values)
|
||||
|
||||
for ethnicity_value in sorted(ethnicity_values):
|
||||
# Find all options that contain this value
|
||||
matching_options = [
|
||||
opt for opt in survey.options_ethnicity
|
||||
if ethnicity_value in [v.strip() for v in opt.split(',')]
|
||||
]
|
||||
combinations.append({
|
||||
'name': f'Ethnicity-{ethnicity_value}',
|
||||
'filters': {'ethnicity': matching_options}
|
||||
})
|
||||
if not category or category in ['all_filters', 'ethnicity']:
|
||||
# Ethnicity options are comma-separated (e.g., "White or Caucasian, Hispanic or Latino")
|
||||
# Create filters that include ALL options containing each individual ethnicity value
|
||||
ethnicity_values = set()
|
||||
for ethnicity_option in survey.options_ethnicity:
|
||||
# Split by comma and strip whitespace
|
||||
values = [v.strip() for v in ethnicity_option.split(',')]
|
||||
ethnicity_values.update(values)
|
||||
|
||||
for ethnicity_value in sorted(ethnicity_values):
|
||||
# Find all options that contain this value
|
||||
matching_options = [
|
||||
opt for opt in survey.options_ethnicity
|
||||
if ethnicity_value in [v.strip() for v in opt.split(',')]
|
||||
]
|
||||
combinations.append({
|
||||
'name': f'Ethnicity-{ethnicity_value}',
|
||||
'filters': {'ethnicity': matching_options}
|
||||
})
|
||||
|
||||
# Income - one at a time
|
||||
for income in survey.options_income:
|
||||
if not category or category in ['all_filters', 'income']:
|
||||
for income in survey.options_income:
|
||||
combinations.append({
|
||||
'name': f'Income-{income}',
|
||||
'filters': {'income': [income]}
|
||||
})
|
||||
|
||||
# Consumer segments - combine _A and _B options, and also include standalone
|
||||
if not category or category in ['all_filters', 'consumer']:
|
||||
# Group options by base name (removing _A/_B suffix)
|
||||
consumer_groups = {}
|
||||
for consumer in survey.options_consumer:
|
||||
# Check if ends with _A or _B
|
||||
if consumer.endswith('_A') or consumer.endswith('_B'):
|
||||
base_name = consumer[:-2] # Remove last 2 chars (_A or _B)
|
||||
if base_name not in consumer_groups:
|
||||
consumer_groups[base_name] = []
|
||||
consumer_groups[base_name].append(consumer)
|
||||
else:
|
||||
# Not an _A/_B option, keep as-is
|
||||
consumer_groups[consumer] = [consumer]
|
||||
|
||||
# Add combined _A+_B options
|
||||
for base_name, options in consumer_groups.items():
|
||||
if len(options) > 1: # Only combine if there are multiple (_A and _B)
|
||||
combinations.append({
|
||||
'name': f'Consumer-{base_name}',
|
||||
'filters': {'consumer': options}
|
||||
})
|
||||
|
||||
# Add standalone options (including individual _A and _B)
|
||||
for consumer in survey.options_consumer:
|
||||
combinations.append({
|
||||
'name': f'Consumer-{consumer}',
|
||||
'filters': {'consumer': [consumer]}
|
||||
})
|
||||
|
||||
# Business Owner - one at a time
|
||||
if not category or category in ['all_filters', 'business_owner']:
|
||||
for business_owner in survey.options_business_owner:
|
||||
combinations.append({
|
||||
'name': f'BusinessOwner-{business_owner}',
|
||||
'filters': {'business_owner': [business_owner]}
|
||||
})
|
||||
|
||||
# AI User - one at a time
|
||||
if not category or category in ['all_filters', 'ai_user']:
|
||||
for ai_user in survey.options_ai_user:
|
||||
combinations.append({
|
||||
'name': f'AIUser-{ai_user}',
|
||||
'filters': {'ai_user': [ai_user]}
|
||||
})
|
||||
|
||||
# AI user daily, more than once daily, en multiple times a week = frequent
|
||||
combinations.append({
|
||||
'name': f'Income-{income}',
|
||||
'filters': {'income': [income]}
|
||||
'name': 'AIUser-Frequent',
|
||||
'filters': {'ai_user': [
|
||||
'Daily', 'More than once daily', 'Multiple times per week'
|
||||
]}
|
||||
})
|
||||
combinations.append({
|
||||
'name': 'AIUser-RarelyNever',
|
||||
'filters': {'ai_user': [
|
||||
'Once a month', 'Less than once a month', 'Once a week', 'Rarely/Never'
|
||||
]}
|
||||
})
|
||||
|
||||
# Consumer segments - combine _A and _B options
|
||||
# Group options by base name (removing _A/_B suffix)
|
||||
consumer_groups = {}
|
||||
for consumer in survey.options_consumer:
|
||||
# Check if ends with _A or _B
|
||||
if consumer.endswith('_A') or consumer.endswith('_B'):
|
||||
base_name = consumer[:-2] # Remove last 2 chars (_A or _B)
|
||||
if base_name not in consumer_groups:
|
||||
consumer_groups[base_name] = []
|
||||
consumer_groups[base_name].append(consumer)
|
||||
else:
|
||||
# Not an _A/_B option, keep as-is
|
||||
consumer_groups[consumer] = [consumer]
|
||||
# Investable Assets - one at a time
|
||||
if not category or category in ['all_filters', 'investable_assets']:
|
||||
for investable_assets in survey.options_investable_assets:
|
||||
combinations.append({
|
||||
'name': f'Assets-{investable_assets}',
|
||||
'filters': {'investable_assets': [investable_assets]}
|
||||
})
|
||||
|
||||
for base_name, options in consumer_groups.items():
|
||||
# Industry - one at a time
|
||||
if not category or category in ['all_filters', 'industry']:
|
||||
for industry in survey.options_industry:
|
||||
combinations.append({
|
||||
'name': f'Industry-{industry}',
|
||||
'filters': {'industry': [industry]}
|
||||
})
|
||||
|
||||
# Voice ranking completeness filter
|
||||
# These use a special flag rather than demographic filters, so we store
|
||||
# the mode in a dedicated key that run_report passes as --voice-ranking-filter.
|
||||
if not category or category in ['all_filters', 'voice_ranking']:
|
||||
combinations.append({
|
||||
'name': f'Consumer-{base_name}',
|
||||
'filters': {'consumer': options}
|
||||
'name': 'VoiceRanking-OnlyMissing',
|
||||
'filters': {},
|
||||
'voice_ranking_filter': 'only-missing',
|
||||
})
|
||||
combinations.append({
|
||||
'name': 'VoiceRanking-ExcludeMissing',
|
||||
'filters': {},
|
||||
'voice_ranking_filter': 'exclude-missing',
|
||||
})
|
||||
|
||||
return combinations
|
||||
|
||||
|
||||
def run_report(filters: dict, name: str = None, dry_run: bool = False) -> bool:
|
||||
def run_report(filters: dict, name: str = None, dry_run: bool = False, sl_threshold: int = None, voice_ranking_filter: str = None) -> bool:
|
||||
"""
|
||||
Run the report script with given filters.
|
||||
|
||||
@@ -118,6 +205,10 @@ def run_report(filters: dict, name: str = None, dry_run: bool = False) -> bool:
|
||||
filters: Dict of filter_name -> list of values
|
||||
name: Name for this filter combination (used for .txt description file)
|
||||
dry_run: If True, just print command without running
|
||||
sl_threshold: If set, exclude respondents with >= N straight-lined question groups
|
||||
voice_ranking_filter: If set, filter by voice ranking completeness.
|
||||
'only-missing' keeps only respondents missing QID98 data,
|
||||
'exclude-missing' removes them.
|
||||
|
||||
Returns:
|
||||
True if successful, False otherwise
|
||||
@@ -128,6 +219,14 @@ def run_report(filters: dict, name: str = None, dry_run: bool = False) -> bool:
|
||||
if name:
|
||||
cmd.extend(['--filter-name', name])
|
||||
|
||||
# Pass straight-liner threshold if specified
|
||||
if sl_threshold is not None:
|
||||
cmd.extend(['--sl-threshold', str(sl_threshold)])
|
||||
|
||||
# Pass voice ranking filter if specified
|
||||
if voice_ranking_filter is not None:
|
||||
cmd.extend(['--voice-ranking-filter', voice_ranking_filter])
|
||||
|
||||
for filter_name, values in filters.items():
|
||||
if values:
|
||||
cmd.extend([f'--{filter_name}', json.dumps(values)])
|
||||
@@ -156,6 +255,13 @@ def main():
|
||||
import argparse
|
||||
parser = argparse.ArgumentParser(description='Run quant report for all filter combinations')
|
||||
parser.add_argument('--dry-run', action='store_true', help='Preview combinations without running')
|
||||
parser.add_argument(
|
||||
'--category',
|
||||
choices=['all_filters', 'all', 'age', 'gender', 'ethnicity', 'income', 'consumer', 'business_owner', 'ai_user', 'investable_assets', 'industry', 'voice_ranking'],
|
||||
default='all_filters',
|
||||
help='Filter category to run combinations for (default: all_filters)'
|
||||
)
|
||||
parser.add_argument('--sl-threshold', type=int, default=None, help='Exclude respondents who straight-lined >= N question groups (passed to report script)')
|
||||
args = parser.parse_args()
|
||||
|
||||
# Load survey to get available filter options
|
||||
@@ -163,15 +269,19 @@ def main():
|
||||
survey = QualtricsSurvey(RESULTS_FILE, QSF_FILE)
|
||||
survey.load_data() # Populates options_* attributes
|
||||
|
||||
# Generate all combinations
|
||||
combinations = get_filter_combinations(survey)
|
||||
print(f"Generated {len(combinations)} filter combinations")
|
||||
# Generate combinations for specified category
|
||||
combinations = get_filter_combinations(survey, category=args.category)
|
||||
category_desc = f" for category '{args.category}'" if args.category != 'all' else ''
|
||||
print(f"Generated {len(combinations)} filter combinations{category_desc}")
|
||||
|
||||
if args.sl_threshold is not None:
|
||||
print(f"Straight-liner threshold: excluding respondents with ≥{args.sl_threshold} straight-lined question groups")
|
||||
|
||||
if args.dry_run:
|
||||
print("\nDRY RUN - Commands that would be executed:")
|
||||
for combo in combinations:
|
||||
print(f"\n{combo['name']}:")
|
||||
run_report(combo['filters'], name=combo['name'], dry_run=True)
|
||||
run_report(combo['filters'], name=combo['name'], dry_run=True, sl_threshold=args.sl_threshold, voice_ranking_filter=combo.get('voice_ranking_filter'))
|
||||
return
|
||||
|
||||
# Run each combination with progress bar
|
||||
@@ -180,7 +290,7 @@ def main():
|
||||
|
||||
for combo in tqdm(combinations, desc="Running reports", unit="filter"):
|
||||
tqdm.write(f"Running: {combo['name']}")
|
||||
if run_report(combo['filters'], name=combo['name']):
|
||||
if run_report(combo['filters'], name=combo['name'], sl_threshold=args.sl_threshold, voice_ranking_filter=combo.get('voice_ranking_filter')):
|
||||
successful += 1
|
||||
else:
|
||||
failed.append(combo['name'])
|
||||
|
||||
992
speech_data_correlation.ipynb
Normal file
992
speech_data_correlation.ipynb
Normal file
File diff suppressed because one or more lines are too long
7
theme.py
7
theme.py
@@ -77,6 +77,13 @@ class ColorPalette:
|
||||
GENDER_MALE_NEUTRAL = "#B8C9D9" # Grey-Blue
|
||||
GENDER_FEMALE_NEUTRAL = "#D9B8C9" # Grey-Pink
|
||||
|
||||
# Gender colors for correlation plots (green/red indicate +/- correlation)
|
||||
# Male = darker shade, Female = lighter shade
|
||||
CORR_MALE_POSITIVE = "#1B5E20" # Dark Green
|
||||
CORR_FEMALE_POSITIVE = "#81C784" # Light Green
|
||||
CORR_MALE_NEGATIVE = "#B71C1C" # Dark Red
|
||||
CORR_FEMALE_NEGATIVE = "#E57373" # Light Red
|
||||
|
||||
# Speaking Style Colors (named after the style quadrant colors)
|
||||
STYLE_GREEN = "#2E7D32" # Forest Green
|
||||
STYLE_BLUE = "#1565C0" # Strong Blue
|
||||
|
||||
561
utils.py
561
utils.py
@@ -413,10 +413,30 @@ def _iter_picture_shapes(shapes):
|
||||
yield shape
|
||||
|
||||
|
||||
def _set_shape_alt_text(shape, alt_text: str):
|
||||
"""
|
||||
Set alt text (descr attribute) for a PowerPoint shape.
|
||||
"""
|
||||
nvPr = None
|
||||
# Check for common property names used by python-pptx elements
|
||||
for attr in ['nvPicPr', 'nvSpPr', 'nvGrpSpPr', 'nvGraphicFramePr', 'nvCxnSpPr']:
|
||||
if hasattr(shape._element, attr):
|
||||
nvPr = getattr(shape._element, attr)
|
||||
break
|
||||
|
||||
if nvPr and hasattr(nvPr, 'cNvPr'):
|
||||
nvPr.cNvPr.set("descr", alt_text)
|
||||
|
||||
|
||||
def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str, Path], output_path: Union[str, Path] = None, use_perceptual_hash: bool = True):
|
||||
"""
|
||||
Updates the alt text of images in a PowerPoint presentation by matching
|
||||
their content with images in a source directory.
|
||||
Updates the alt text of images in a PowerPoint presentation.
|
||||
|
||||
1. First pass: Validates existing alt-text format (<filter>/<filename>).
|
||||
- Fixes full paths by keeping only the last two parts.
|
||||
- Clears invalid alt-text.
|
||||
2. Second pass: If images are missing alt-text, matches them against source directory
|
||||
using perceptual hash or SHA1.
|
||||
|
||||
Args:
|
||||
ppt_path (str/Path): Path to the PowerPoint file.
|
||||
@@ -430,10 +450,7 @@ def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str,
|
||||
if output_path is None:
|
||||
output_path = ppt_path
|
||||
|
||||
# 1. Build lookup map of {hash: file_path} from the source directory
|
||||
image_hash_map = _build_image_hash_map(image_source_dir, use_perceptual_hash=use_perceptual_hash)
|
||||
|
||||
# 2. Open Presentation
|
||||
# Open Presentation
|
||||
try:
|
||||
prs = Presentation(ppt_path)
|
||||
except Exception as e:
|
||||
@@ -441,110 +458,132 @@ def update_ppt_alt_text(ppt_path: Union[str, Path], image_source_dir: Union[str,
|
||||
return
|
||||
|
||||
updates_count = 0
|
||||
unmatched_images = [] # Collect unmatched images to report at the end
|
||||
images_needing_match = []
|
||||
|
||||
slides = list(prs.slides)
|
||||
total_slides = len(slides)
|
||||
|
||||
print(f"Processing {total_slides} slides...")
|
||||
print(f"Scanning {total_slides} slides for existing alt-text...")
|
||||
|
||||
# Pass 1: Scan and clean existing alt-text
|
||||
for i, slide in enumerate(slides):
|
||||
# Use recursive iterator to find all pictures including those in groups/placeholders
|
||||
picture_shapes = list(_iter_picture_shapes(slide.shapes))
|
||||
|
||||
for shape in picture_shapes:
|
||||
try:
|
||||
# Get image hash based on selected method
|
||||
if use_perceptual_hash:
|
||||
# Use perceptual hash of the image blob for visual content matching
|
||||
current_hash = _calculate_perceptual_hash(shape.image.blob)
|
||||
else:
|
||||
# Use SHA1 hash from python-pptx (exact byte match)
|
||||
current_hash = shape.image.sha1
|
||||
alt_text = _get_shape_alt_text(shape)
|
||||
has_valid_alt = False
|
||||
|
||||
if alt_text:
|
||||
# Handle potential path separators and whitespace
|
||||
clean_alt = alt_text.strip().replace('\\', '/')
|
||||
parts = clean_alt.split('/')
|
||||
|
||||
if current_hash in image_hash_map:
|
||||
original_path = image_hash_map[current_hash]
|
||||
# Check if it looks like a path/file reference (at least 2 parts like dir/file)
|
||||
if len(parts) >= 2:
|
||||
# Enforce format: keep last 2 parts (e.g. filter/image.png)
|
||||
new_alt = '/'.join(parts[-2:])
|
||||
|
||||
# Generate Alt Text
|
||||
try:
|
||||
# Prepare path for generator.
|
||||
# Try to relativize to CWD if capable
|
||||
pass_path = original_path
|
||||
try:
|
||||
pass_path = original_path.relative_to(Path.cwd())
|
||||
except ValueError:
|
||||
pass
|
||||
if new_alt != alt_text:
|
||||
print(f"Slide {i+1}: Fixing alt-text format: '{alt_text}' -> '{new_alt}'")
|
||||
_set_shape_alt_text(shape, new_alt)
|
||||
updates_count += 1
|
||||
|
||||
new_alt_text = image_alt_text_generator(pass_path)
|
||||
|
||||
# Check existing alt text to avoid redundant updates/log them
|
||||
# Accessing alt text via cNvPr
|
||||
# Note: Different shape types might store non-visual props differently
|
||||
# Picture: nvPicPr.cNvPr
|
||||
# GraphicFrame: nvGraphicFramePr.cNvPr
|
||||
# Group: nvGrpSpPr.cNvPr
|
||||
# Shape/Placeholder: nvSpPr.cNvPr
|
||||
|
||||
nvPr = None
|
||||
for attr in ['nvPicPr', 'nvSpPr', 'nvGrpSpPr', 'nvGraphicFramePr', 'nvCxnSpPr']:
|
||||
if hasattr(shape._element, attr):
|
||||
nvPr = getattr(shape._element, attr)
|
||||
break
|
||||
|
||||
if nvPr and hasattr(nvPr, 'cNvPr'):
|
||||
cNvPr = nvPr.cNvPr
|
||||
existing_alt_text = cNvPr.get("descr", "")
|
||||
|
||||
if existing_alt_text != new_alt_text:
|
||||
print(f"Slide {i+1}: Updating alt text for image matches '{pass_path}'")
|
||||
print(f" Old: '{existing_alt_text}' -> New: '{new_alt_text}'")
|
||||
cNvPr.set("descr", new_alt_text)
|
||||
updates_count += 1
|
||||
else:
|
||||
print(f"Could not find cNvPr for shape on slide {i+1}")
|
||||
|
||||
except AssertionError as e:
|
||||
print(f"Skipping match for {original_path} due to generator error: {e}")
|
||||
except Exception as e:
|
||||
print(f"Error updating alt text for {original_path}: {e}")
|
||||
|
||||
has_valid_alt = True
|
||||
else:
|
||||
# Check if image already has alt text set - if so, skip reporting as unmatched
|
||||
existing_alt = _get_shape_alt_text(shape)
|
||||
if existing_alt:
|
||||
# Image already has alt text, no need to report as unmatched
|
||||
continue
|
||||
|
||||
shape_id = getattr(shape, 'shape_id', getattr(shape, 'id', 'Unknown ID'))
|
||||
shape_name = shape.name if shape.name else f"Unnamed Shape (ID: {shape_id})"
|
||||
hash_type = "pHash" if use_perceptual_hash else "SHA1"
|
||||
|
||||
unmatched_images.append({
|
||||
'slide': i+1,
|
||||
'shape_name': shape_name,
|
||||
'hash_type': hash_type,
|
||||
'hash': current_hash
|
||||
})
|
||||
|
||||
except AttributeError:
|
||||
continue
|
||||
except Exception as e:
|
||||
print(f"Error processing shape on slide {i+1}: {e}")
|
||||
# User requested deleting other cases that do not meet format
|
||||
# If it's single word or doesn't look like our path format
|
||||
pass # logic below handles this
|
||||
|
||||
if not has_valid_alt:
|
||||
if alt_text:
|
||||
print(f"Slide {i+1}: Invalid/Legacy alt-text '{alt_text}'. Clearing for re-matching.")
|
||||
_set_shape_alt_text(shape, "")
|
||||
updates_count += 1
|
||||
|
||||
# Queue for hash matching
|
||||
shape_id = getattr(shape, 'shape_id', getattr(shape, 'id', 'Unknown ID'))
|
||||
shape_name = shape.name if shape.name else f"Unnamed Shape (ID: {shape_id})"
|
||||
images_needing_match.append({
|
||||
'slide_idx': i, # 0-based
|
||||
'slide_num': i+1,
|
||||
'shape': shape,
|
||||
'shape_name': shape_name
|
||||
})
|
||||
|
||||
# Print summary
|
||||
if not images_needing_match:
|
||||
print("\nAll images have valid alt-text format. No hash matching needed.")
|
||||
if updates_count > 0:
|
||||
prs.save(output_path)
|
||||
print(f"✓ Saved updated presentation to {output_path} with {updates_count} updates.")
|
||||
else:
|
||||
print("Presentation is up to date.")
|
||||
return
|
||||
|
||||
# Pass 2: Hash Matching
|
||||
print(f"\n{len(images_needing_match)} images missing proper alt-text. Proceeding with hash matching...")
|
||||
|
||||
# Build lookup map of {hash: file_path} only if needed
|
||||
image_hash_map = _build_image_hash_map(image_source_dir, use_perceptual_hash=use_perceptual_hash)
|
||||
|
||||
unmatched_images = []
|
||||
|
||||
for item in images_needing_match:
|
||||
shape = item['shape']
|
||||
slide_num = item['slide_num']
|
||||
|
||||
try:
|
||||
# Get image hash
|
||||
if use_perceptual_hash:
|
||||
current_hash = _calculate_perceptual_hash(shape.image.blob)
|
||||
else:
|
||||
current_hash = shape.image.sha1
|
||||
|
||||
if current_hash in image_hash_map:
|
||||
original_path = image_hash_map[current_hash]
|
||||
|
||||
# Generate Alt Text
|
||||
try:
|
||||
# Try to relativize to CWD if capable
|
||||
pass_path = original_path
|
||||
try:
|
||||
pass_path = original_path.relative_to(Path.cwd())
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
new_alt_text = image_alt_text_generator(pass_path)
|
||||
|
||||
print(f"Slide {slide_num}: Match found! Assigning alt-text '{new_alt_text}'")
|
||||
_set_shape_alt_text(shape, new_alt_text)
|
||||
updates_count += 1
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error generating alt text for {original_path}: {e}")
|
||||
else:
|
||||
hash_type = "pHash" if use_perceptual_hash else "SHA1"
|
||||
unmatched_images.append({
|
||||
'slide': slide_num,
|
||||
'shape_name': item['shape_name'],
|
||||
'hash_type': hash_type,
|
||||
'hash': current_hash
|
||||
})
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error processing shape on slide {slide_num}: {e}")
|
||||
|
||||
# Save and Print Summary
|
||||
print("\n" + "="*80)
|
||||
if updates_count > 0:
|
||||
prs.save(output_path)
|
||||
print(f"✓ Saved updated presentation to {output_path} with {updates_count} updates.")
|
||||
else:
|
||||
print("No images matched or required updates.")
|
||||
print("No matches found for missing images.")
|
||||
|
||||
# List unmatched images at the end
|
||||
if unmatched_images:
|
||||
print(f"\n⚠ {len(unmatched_images)} image(s) not found in source directory:")
|
||||
print(f"\n⚠ {len(unmatched_images)} image(s) could not be matched:")
|
||||
for img in unmatched_images:
|
||||
print(f" • Slide {img['slide']}: '{img['shape_name']}' ({img['hash_type']}: {img['hash']})")
|
||||
else:
|
||||
print("\n✓ All images matched successfully!")
|
||||
print("\n✓ All images processed successfully!")
|
||||
print("="*80)
|
||||
|
||||
|
||||
@@ -723,7 +762,7 @@ def normalize_global_values(df: pl.DataFrame, target_cols: list[str]) -> pl.Data
|
||||
class QualtricsSurvey(QualtricsPlotsMixin):
|
||||
"""Class to handle Qualtrics survey data."""
|
||||
|
||||
def __init__(self, data_path: Union[str, Path], qsf_path: Union[str, Path]):
|
||||
def __init__(self, data_path: Union[str, Path], qsf_path: Union[str, Path], figures_dir: Union[str, Path] = None):
|
||||
if isinstance(data_path, str):
|
||||
data_path = Path(data_path)
|
||||
|
||||
@@ -735,8 +774,12 @@ class QualtricsSurvey(QualtricsPlotsMixin):
|
||||
self.qid_descr_map = self._extract_qid_descr_map()
|
||||
self.qsf:dict = self._load_qsf()
|
||||
|
||||
# get export directory name for saving figures ie if data_path='data/exports/OneDrive_2026-01-21/...' should be 'figures/OneDrive_2026-01-21'
|
||||
self.fig_save_dir = Path('figures') / self.data_filepath.parts[2]
|
||||
if figures_dir:
|
||||
self.fig_save_dir = Path(figures_dir)
|
||||
else:
|
||||
# get export directory name for saving figures ie if data_path='data/exports/OneDrive_2026-01-21/...' should be 'figures/OneDrive_2026-01-21'
|
||||
self.fig_save_dir = Path('figures') / self.data_filepath.parts[2]
|
||||
|
||||
if not self.fig_save_dir.exists():
|
||||
self.fig_save_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
@@ -750,7 +793,10 @@ class QualtricsSurvey(QualtricsPlotsMixin):
|
||||
self.filter_consumer:list = None
|
||||
self.filter_ethnicity:list = None
|
||||
self.filter_income:list = None
|
||||
|
||||
self.filter_business_owner:list = None # QID4
|
||||
self.filter_ai_user:list = None # QID22
|
||||
self.filter_investable_assets:list = None # QID16
|
||||
self.filter_industry:list = None # QID17
|
||||
|
||||
|
||||
|
||||
@@ -838,6 +884,10 @@ class QualtricsSurvey(QualtricsPlotsMixin):
|
||||
self.options_consumer = sorted(df['Consumer'].drop_nulls().unique().to_list()) if 'Consumer' in df.columns else []
|
||||
self.options_ethnicity = sorted(df['QID3'].drop_nulls().unique().to_list()) if 'QID3' in df.columns else []
|
||||
self.options_income = sorted(df['QID15'].drop_nulls().unique().to_list()) if 'QID15' in df.columns else []
|
||||
self.options_business_owner = sorted(df['QID4'].drop_nulls().unique().to_list()) if 'QID4' in df.columns else []
|
||||
self.options_ai_user = sorted(df['QID22'].drop_nulls().unique().to_list()) if 'QID22' in df.columns else []
|
||||
self.options_investable_assets = sorted(df['QID16'].drop_nulls().unique().to_list()) if 'QID16' in df.columns else []
|
||||
self.options_industry = sorted(df['QID17'].drop_nulls().unique().to_list()) if 'QID17' in df.columns else []
|
||||
|
||||
return df.lazy()
|
||||
|
||||
@@ -854,41 +904,61 @@ class QualtricsSurvey(QualtricsPlotsMixin):
|
||||
|
||||
return q.select(QIDs).rename(rename_dict)
|
||||
|
||||
def filter_data(self, q: pl.LazyFrame, age:list=None, gender:list=None, consumer:list=None, ethnicity:list=None, income:list=None, business_owner:list=None, ai_user:list=None, investable_assets:list=None, industry:list=None) -> pl.LazyFrame:
    """Filter survey responses on demographic / screener columns.

    Each parameter is a list of allowed answer values for the matching
    Qualtrics column:

    - age: list of age groups to include (QID1)
    - gender: list (QID2)
    - consumer: list (Consumer)
    - ethnicity: list (QID3)
    - income: list (QID15)
    - business_owner: list (QID4)
    - ai_user: list (QID22)
    - investable_assets: list (QID16)
    - industry: list (QID17)

    A filter is skipped when it is None, empty (columns that are all NULL
    produce empty option lists), or when it selects every known option —
    the last case avoids accidentally dropping rows that are NULL in that
    column.

    Also saves the result to self.data_filtered.
    """
    # (attribute recorded on self, column to filter, requested values, all known options)
    filter_specs = [
        ('filter_age', 'QID1', age, self.options_age),
        ('filter_gender', 'QID2', gender, self.options_gender),
        ('filter_consumer', 'Consumer', consumer, self.options_consumer),
        ('filter_ethnicity', 'QID3', ethnicity, self.options_ethnicity),
        ('filter_income', 'QID15', income, self.options_income),
        ('filter_business_owner', 'QID4', business_owner, self.options_business_owner),
        ('filter_ai_user', 'QID22', ai_user, self.options_ai_user),
        ('filter_investable_assets', 'QID16', investable_assets, self.options_investable_assets),
        ('filter_industry', 'QID17', industry, self.options_industry),
    ]

    for attr, column, values, options in filter_specs:
        # Remember the last-applied selection so callers/UI can inspect it,
        # even when the filter itself is skipped.
        setattr(self, attr, values)
        # Apply only meaningful filters: non-None, non-empty, and not "select all".
        if values is not None and len(values) > 0 and set(values) != set(options):
            q = q.filter(pl.col(column).is_in(values))

    self.data_filtered = q
    return self.data_filtered
|
||||
@@ -1045,6 +1115,60 @@ class QualtricsSurvey(QualtricsPlotsMixin):
|
||||
|
||||
return self._get_subset(q, list(QIDs_map.keys()), rename_cols=False).rename(QIDs_map), None
|
||||
|
||||
def get_top_3_voices_missing_ranking(
    self, q: pl.LazyFrame
) -> pl.DataFrame:
    """Identify respondents who completed the top-3 voice selection (QID36)
    but are missing the explicit ranking question (QID98).

    These respondents picked 3 voices in the selection step and have
    selection-order data in ``QID36_G0_*_RANK``, but all 18 ``QID98_*``
    ranking columns are null. This means ``get_top_3_voices()`` will
    return all-null rows for them, causing plots like
    ``plot_most_ranked_1`` to undercount.

    Parameters:
        q: The (optionally filtered) LazyFrame from ``load_data()``.

    Returns:
        A collected ``pl.DataFrame`` with columns:

        - ``_recordId`` – the respondent identifier
        - ``3_Ranked`` – comma-separated text of the 3 voices they selected
          (null if the respondent has no 18→8→3 row to join against)
    """
    # Get the top-3 ranking data (QID98-based)
    top3, _ = self.get_top_3_voices(q)
    top3_df = top3.collect()

    ranking_cols = [c for c in top3_df.columns if c != '_recordId']

    # Respondents where every QID98 ranking column is null.
    # Start from lit(True) so an empty column list selects everyone
    # rather than raising.
    all_null_expr = pl.lit(True)
    for col in ranking_cols:
        all_null_expr = all_null_expr & pl.col(col).is_null()

    missing_ids = top3_df.filter(all_null_expr).select('_recordId')

    if missing_ids.height == 0:
        # Nobody is missing a ranking: return an empty frame with the
        # documented schema so downstream code can rely on the columns.
        return pl.DataFrame(schema={
            '_recordId': pl.Utf8,
            '3_Ranked': pl.Utf8,
        })

    # Enrich with the 3_Ranked text from the 18→8→3 question
    v_18_8_3, _ = self.get_18_8_3(q)
    v_df = v_18_8_3.collect()

    result = missing_ids.join(
        v_df.select(['_recordId', '3_Ranked']),
        on='_recordId',
        how='left',
    )

    return result
|
||||
|
||||
|
||||
def get_ss_orange_red(self, q: pl.LazyFrame) -> Union[pl.LazyFrame, dict]:
|
||||
"""Extract columns containing the SS Orange/Red ratings for the Chase virtual assistant.
|
||||
@@ -1518,6 +1642,235 @@ class QualtricsSurvey(QualtricsPlotsMixin):
|
||||
|
||||
return results_df, metadata
|
||||
|
||||
def compute_mentions_significance(
    self,
    data: pl.LazyFrame | pl.DataFrame,
    alpha: float = 0.05,
    correction: str = "bonferroni",
) -> tuple[pl.DataFrame, dict]:
    """Compute statistical significance for Total Mentions (Rank 1+2+3).

    Tests whether the proportion of respondents who included a voice in
    their Top 3 is significantly different between voices, using pairwise
    two-proportion z-tests (both proportions share the same denominator:
    the total number of respondents).

    Args:
        data: Ranking data (rows=respondents, cols=voices, values=rank).
        alpha: Significance level.
        correction: Multiple comparison correction method:
            ``"bonferroni"``, ``"holm"``, or ``"none"``.

    Returns:
        tuple: (pairwise_df, metadata)

    Raises:
        ValueError: If fewer than 2 ranking columns are present, or if
            ``correction`` is not a recognised method.
    """
    from scipy import stats as scipy_stats
    import numpy as np

    if isinstance(data, pl.LazyFrame):
        df = data.collect()
    else:
        df = data

    ranking_cols = [c for c in df.columns if c != '_recordId']
    if len(ranking_cols) < 2:
        raise ValueError("Need at least 2 ranking columns")

    total_respondents = df.height
    mentions_data = {}

    # Count mentions (any rank) for each voice
    for col in ranking_cols:
        label = self._clean_voice_label(col)
        count = df.filter(pl.col(col).is_not_null()).height
        mentions_data[label] = count

    labels = sorted(mentions_data.keys())
    results = []
    n_comparisons = len(labels) * (len(labels) - 1) // 2

    for i, label1 in enumerate(labels):
        for label2 in labels[i+1:]:
            count1 = mentions_data[label1]
            count2 = mentions_data[label2]

            pct1 = count1 / total_respondents
            pct2 = count2 / total_respondents

            # Z-test for two proportions (same denominator for both)
            n1 = total_respondents
            n2 = total_respondents

            p_pooled = (count1 + count2) / (n1 + n2)
            se = np.sqrt(p_pooled * (1 - p_pooled) * (1/n1 + 1/n2))

            if se > 0:
                z_stat = (pct1 - pct2) / se
                # norm.sf == 1 - norm.cdf but numerically stable for
                # large |z| (1 - cdf underflows to exactly 0).
                p_value = 2 * scipy_stats.norm.sf(abs(z_stat))
            else:
                # se == 0 means both counts are 0 or both equal N: no evidence.
                p_value = 1.0

            results.append({
                'group1': label1,
                'group2': label2,
                'p_value': float(p_value),
                'rank1_count1': count1,  # Reusing column names for compatibility with heatmap plotting
                'rank1_count2': count2,
                'rank1_pct1': round(pct1 * 100, 1),
                'rank1_pct2': round(pct2 * 100, 1),
                'total1': n1,
                'total2': n2,
                'effect_size': pct1 - pct2  # Difference in proportions
            })

    results_df = pl.DataFrame(results)

    p_values = results_df['p_value'].to_numpy()

    if correction == "bonferroni":
        p_adjusted = np.minimum(p_values * n_comparisons, 1.0)
    elif correction == "holm":
        # Step-down Holm: sort ascending, scale each p by the number of
        # remaining hypotheses, then enforce monotonicity and cap at 1.
        sorted_idx = np.argsort(p_values)
        sorted_p = p_values[sorted_idx]
        m = len(sorted_p)
        adjusted = np.zeros(m)
        for j in range(m):
            adjusted[j] = sorted_p[j] * (m - j)
        for j in range(1, m):
            adjusted[j] = max(adjusted[j], adjusted[j-1])
        adjusted = np.minimum(adjusted, 1.0)
        # Undo the sort so adjusted values line up with results_df rows.
        p_adjusted = adjusted[np.argsort(sorted_idx)]
    elif correction == "none":
        p_adjusted = p_values.astype(float)  # pyright: ignore
    else:
        # Previously an unrecognised method silently produced NaN
        # adjustments (and significant=False everywhere); fail loudly.
        raise ValueError(f"Unknown correction method: {correction!r}")

    results_df = results_df.with_columns([
        pl.Series('p_adjusted', p_adjusted),
        pl.Series('significant', p_adjusted < alpha),
    ]).sort('p_value')

    metadata = {
        'test_type': 'proportion_z_test_mentions',
        'alpha': alpha,
        'correction': correction,
        'n_comparisons': n_comparisons,
    }

    return results_df, metadata
|
||||
|
||||
def compute_rank1_significance(
    self,
    data: pl.LazyFrame | pl.DataFrame,
    alpha: float = 0.05,
    correction: str = "bonferroni",
) -> tuple[pl.DataFrame, dict]:
    """Compute statistical significance for Rank 1 selections only.

    Like compute_mentions_significance but counts only how many times each
    voice/character was ranked **1st**, using total respondents as the
    denominator. This tests whether first-choice preference differs
    significantly between voices.

    Args:
        data: Ranking data (rows=respondents, cols=voices, values=rank).
        alpha: Significance level.
        correction: Multiple comparison correction method:
            ``"bonferroni"``, ``"holm"``, or ``"none"``.

    Returns:
        tuple: (pairwise_df, metadata)

    Raises:
        ValueError: If fewer than 2 ranking columns are present, or if
            ``correction`` is not a recognised method.
    """
    from scipy import stats as scipy_stats
    import numpy as np

    if isinstance(data, pl.LazyFrame):
        df = data.collect()
    else:
        df = data

    ranking_cols = [c for c in df.columns if c != '_recordId']
    if len(ranking_cols) < 2:
        raise ValueError("Need at least 2 ranking columns")

    total_respondents = df.height
    rank1_data: dict[str, int] = {}

    # Count rank-1 selections for each voice
    for col in ranking_cols:
        label = self._clean_voice_label(col)
        count = df.filter(pl.col(col) == 1).height
        rank1_data[label] = count

    labels = sorted(rank1_data.keys())
    results = []
    n_comparisons = len(labels) * (len(labels) - 1) // 2

    for i, label1 in enumerate(labels):
        for label2 in labels[i+1:]:
            count1 = rank1_data[label1]
            count2 = rank1_data[label2]

            pct1 = count1 / total_respondents
            pct2 = count2 / total_respondents

            # Z-test for two proportions (same denominator for both)
            n1 = total_respondents
            n2 = total_respondents

            p_pooled = (count1 + count2) / (n1 + n2)
            se = np.sqrt(p_pooled * (1 - p_pooled) * (1/n1 + 1/n2))

            if se > 0:
                z_stat = (pct1 - pct2) / se
                # norm.sf == 1 - norm.cdf but numerically stable for
                # large |z| (1 - cdf underflows to exactly 0).
                p_value = 2 * scipy_stats.norm.sf(abs(z_stat))
            else:
                # se == 0 means both counts are 0 or both equal N: no evidence.
                p_value = 1.0

            results.append({
                'group1': label1,
                'group2': label2,
                'p_value': float(p_value),
                'rank1_count1': count1,
                'rank1_count2': count2,
                'rank1_pct1': round(pct1 * 100, 1),
                'rank1_pct2': round(pct2 * 100, 1),
                'total1': n1,
                'total2': n2,
                'effect_size': pct1 - pct2,
            })

    results_df = pl.DataFrame(results)

    p_values = results_df['p_value'].to_numpy()

    if correction == "bonferroni":
        p_adjusted = np.minimum(p_values * n_comparisons, 1.0)
    elif correction == "holm":
        # Step-down Holm: sort ascending, scale each p by the number of
        # remaining hypotheses, then enforce monotonicity and cap at 1.
        sorted_idx = np.argsort(p_values)
        sorted_p = p_values[sorted_idx]
        m = len(sorted_p)
        adjusted = np.zeros(m)
        for j in range(m):
            adjusted[j] = sorted_p[j] * (m - j)
        for j in range(1, m):
            adjusted[j] = max(adjusted[j], adjusted[j-1])
        adjusted = np.minimum(adjusted, 1.0)
        # Undo the sort so adjusted values line up with results_df rows.
        p_adjusted = adjusted[np.argsort(sorted_idx)]
    elif correction == "none":
        p_adjusted = p_values.astype(float)  # pyright: ignore
    else:
        # Previously an unrecognised method silently produced NaN
        # adjustments (and significant=False everywhere); fail loudly.
        raise ValueError(f"Unknown correction method: {correction!r}")

    results_df = results_df.with_columns([
        pl.Series('p_adjusted', p_adjusted),
        pl.Series('significant', p_adjusted < alpha),
    ]).sort('p_value')

    metadata = {
        'test_type': 'proportion_z_test_rank1',
        'alpha': alpha,
        'correction': correction,
        'n_comparisons': n_comparisons,
    }

    return results_df, metadata
|
||||
|
||||
|
||||
|
||||
def process_speaking_style_data(
|
||||
df: Union[pl.LazyFrame, pl.DataFrame],
|
||||
|
||||
Reference in New Issue
Block a user