"""Extra analyses of the traits"""
|
|
# %% Imports
|
|
|
|
import utils
|
|
import polars as pl
|
|
import argparse
|
|
import json
|
|
import re
|
|
from pathlib import Path
|
|
from validation import check_straight_liners
|
|
|
|
|
|
# %% Fixed Variables
|
|
# Survey results export (CSV); its third path component ("2-4-26") is reused
# further down as the default figures sub-directory.
RESULTS_FILE = (
    "data/exports/2-4-26/"
    "JPMC_Chase Brand Personality_Quant Round 1_February 4, 2026_Labels.csv"
)

# Survey definition file handed to utils.QualtricsSurvey alongside the results.
QSF_FILE = (
    "data/exports/OneDrive_2026-01-21/Soft Launch Data/"
    "JPMC_Chase_Brand_Personality_Quant_Round_1.qsf"
)
|
|
|
|
|
|
# %% CLI argument parsing for batch automation
|
|
# When run as a script, pass each filter as a JSON list, e.g.:
#   uv run XX_statistical_significance.script.py --age '["<age option>", ...]' --filter-name "<name>"
|
|
# Central filter configuration - add new filters here only
|
|
# Format: 'cli_arg_name': 'QualtricsSurvey.options_* attribute name'
|
|
# Maps each CLI flag name (also the keyword passed to S.filter_data) to the
# QualtricsSurvey attribute that is read to get that filter's full option set.
FILTER_CONFIG = {
    "age": "options_age",
    "gender": "options_gender",
    "ethnicity": "options_ethnicity",
    "income": "options_income",
    "consumer": "options_consumer",
    "business_owner": "options_business_owner",
    "ai_user": "options_ai_user",
    "investable_assets": "options_investable_assets",
    "industry": "options_industry",
}
|
|
|
|
def parse_cli_args():
    """Build the run configuration from CLI flags, or defaults when interactive.

    One ``--<filter>`` option is registered per key of ``FILTER_CONFIG``; each
    takes a JSON list of values. ``--filter-name`` labels the filter
    combination and ``--figures-dir`` overrides where figures are saved.

    When ``get_ipython`` is defined (IPython/Jupyter session) the command line
    is not parsed; a namespace with every filter set to ``None`` and the
    default figures directory is returned instead.

    Returns:
        argparse.Namespace: one attribute per ``FILTER_CONFIG`` key (the list
        decoded from JSON, or ``None`` when the flag is absent, empty or JSON
        ``null``), plus ``filter_name`` and ``figures_dir``.

    Raises:
        SystemExit: raised through ``parser.error`` when a filter value is not
        valid JSON or decodes to something other than a list, in addition to
        argparse's usual exits for unknown flags / ``--help``.
    """
    # Computed once so the interactive and the scripted path share one default.
    default_fig_dir = f'figures/traits-likert-analysis/{Path(RESULTS_FILE).parts[2]}'

    # Probe for IPython/Jupyter. The try body holds only the probe so that a
    # NameError raised anywhere else is never mistaken for "running as script".
    try:
        get_ipython()  # noqa: F821 # type: ignore
    except NameError:
        interactive = False
    else:
        interactive = True

    if interactive:
        # Interactive sessions run unfiltered with the default output location.
        return argparse.Namespace(
            **{name: None for name in FILTER_CONFIG},
            filter_name=None,
            figures_dir=default_fig_dir,
        )

    parser = argparse.ArgumentParser(description='Generate quant report with optional filters')

    # Dynamically add filter arguments from config
    for filter_name in FILTER_CONFIG:
        parser.add_argument(f'--{filter_name}', type=str, default=None, help=f'JSON list of {filter_name} values')

    parser.add_argument('--filter-name', type=str, default=None, help='Name for this filter combination (used for .txt description file)')
    parser.add_argument('--figures-dir', type=str, default=default_fig_dir, help='Override the default figures directory')

    args = parser.parse_args()

    # Decode the JSON strings into lists; absent or empty flags mean "no filter".
    for filter_name in FILTER_CONFIG:
        raw = getattr(args, filter_name)
        if not raw:
            setattr(args, filter_name, None)
            continue
        try:
            values = json.loads(raw)
        except json.JSONDecodeError as exc:
            # Report a usage error instead of a bare traceback.
            parser.error(f"--{filter_name} expects a JSON list, got {raw!r}: {exc}")
        if values is not None and not isinstance(values, list):
            # Downstream code joins the values, so a scalar would misbehave.
            parser.error(f"--{filter_name} expects a JSON list, got {raw!r}")
        setattr(args, filter_name, values)

    return args
|
|
|
|
# Resolved once at import/run time; every later cell reads from this namespace.
cli_args = parse_cli_args()


# %%
# Survey wrapper built from the results export and the survey definition;
# figures are written under the directory chosen on the command line.
S = utils.QualtricsSurvey(
    RESULTS_FILE,
    QSF_FILE,
    figures_dir=cli_args.figures_dir,
)

# Unfiltered responses; the filtered view is derived from this below.
data_all = S.load_data()
|
|
|
|
|
|
# %% Build filtered dataset based on CLI args

# CLI args: None means "no filter applied" - filter_data() will skip None filters

# Build filter values dict dynamically from FILTER_CONFIG
_active_filters = {filter_name: getattr(cli_args, filter_name) for filter_name in FILTER_CONFIG}

_d = S.filter_data(data_all, **_active_filters)

# Write filter description file if filter-name is provided
if cli_args.filter_name and S.fig_save_dir:
    # Get the filter slug (e.g., "All_Respondents", "Cons-Starter", etc.)
    _filter_slug = S._get_filter_slug()
    _filter_slug_dir = S.fig_save_dir / _filter_slug
    _filter_slug_dir.mkdir(parents=True, exist_ok=True)

    # Build filter description
    _filter_desc_lines = [
        f"Filter: {cli_args.filter_name}",
        "",
        "Applied Filters:",
    ]
    _short_desc_parts = []
    for filter_name, options_attr in FILTER_CONFIG.items():
        all_options = getattr(S, options_attr)
        values = _active_filters[filter_name]
        display_name = filter_name.replace('_', ' ').title()
        # None means no filter applied (same as "All")
        if values is not None and values != all_options:
            # str() each item: JSON lists may carry numbers, which str.join rejects.
            _joined_values = ', '.join(str(v) for v in values)
            _short_desc_parts.append(f"{display_name}: {_joined_values}")
            _filter_desc_lines.append(f" {display_name}: {_joined_values}")
        else:
            _filter_desc_lines.append(f" {display_name}: All")

    # Write detailed description INSIDE the filter-slug directory
    # Sanitize filter name for filename usage (replace / and other chars)
    _safe_filter_name = re.sub(r'[^\w\s-]', '_', cli_args.filter_name)
    _filter_file = _filter_slug_dir / f"{_safe_filter_name}.txt"
    _filter_file.write_text('\n'.join(_filter_desc_lines))

    # Append to summary index file at figures/<export_date>/filter_index.txt
    _summary_file = S.fig_save_dir / "filter_index.txt"
    _short_desc = "; ".join(_short_desc_parts) if _short_desc_parts else "All Respondents"
    _summary_line = f"{_filter_slug} | {cli_args.filter_name} | {_short_desc}\n"

    # Append or create the summary file
    if _summary_file.exists():
        # Avoid duplicate entries for same slug. Match on the leading
        # "<slug> | " column rather than a substring of the whole file, so a
        # slug contained in another slug, filter name or description is not
        # mistaken for an existing entry.
        _slug_prefix = f"{_filter_slug} | "
        _already_indexed = any(
            line.startswith(_slug_prefix)
            for line in _summary_file.read_text().splitlines()
        )
        if not _already_indexed:
            with _summary_file.open('a') as f:
                f.write(_summary_line)
    else:
        _header = "Filter Index\n" + "=" * 80 + "\n\n"
        _header += "Directory | Filter Name | Description\n"
        _header += "-" * 80 + "\n"
        _summary_file.write_text(_header + _summary_line)

# Save to logical variable name for further analysis
data = _d

# Bare expression: the result is not stored (presumably here so an
# interactive cell displays the collected frame).
data.collect()
|
|
|
|
# %% Voices per trait

# Each getter hands back the speaking-style frame together with its choice map.
ss_or, choice_map_or = S.get_ss_orange_red(data)
ss_gb, choice_map_gb = S.get_ss_green_blue(data)

# Put both colour groups side by side, matched on the respondent id.
ss_all = ss_or.join(ss_gb, on='_recordId')

# Materialised copy kept in the scratch variable for ad-hoc inspection.
_d = ss_all.collect()

# Single lookup covering both groups; green/blue entries win on key clashes.
choice_map = dict(choice_map_or)
choice_map.update(choice_map_gb)

# Reshaped frame that the plotting cells below iterate per trait.
ss_long = utils.process_speaking_style_data(ss_all, choice_map)
|
|
|
|
|
|
# %% Create plots

# One figure per distinct trait description. maintain_order=True keeps the
# traits in first-appearance order so the plotting sequence is reproducible
# between runs (plain unique() makes no ordering promise).
_traits = ss_long.select("Description").unique(maintain_order=True).to_series().to_list()

for trait in _traits:
    trait_d = ss_long.filter(pl.col("Description") == trait)

    # The ":" separating the two poles of a trait is shown as an arrow in the title.
    S.plot_speaking_style_trait_scores(trait_d, title=trait.replace(":", " ↔ "), height=550, color_gender=True)
|
|
|
|
|
|
|
|
|
|
|
|
# %% Filter out straight-liner (PER TRAIT) and re-plot to see if any changes
# Save with different filename suffix so we can compare with/without straight-liners

print("\n--- Straight-lining Checks on TRAITS ---")

# The checker returns a report plus a frame of flagged blocks; the later cells
# read "Record ID" and "Question Group" from that frame.
_sl_result = check_straight_liners(ss_all, max_score=5)
sl_report_traits, sl_traits_df = _sl_result

# Bare expression so an interactive cell shows the flagged blocks.
sl_traits_df
|
|
|
|
# %%

# Drop every straight-lined (respondent, question block) pair from the long
# trait data and plot each trait again under a distinct title.
if sl_traits_df is not None and not sl_traits_df.is_empty():
    sl_ids = sl_traits_df.select(pl.col("Record ID").unique()).to_series().to_list()
    n_sl_groups = sl_traits_df.height
    print(f"\nExcluding {n_sl_groups} straight-lined question blocks from {len(sl_ids)} respondents.")

    # Create key in ss_long to match sl_traits_df for anti-join
    # Question Group key in sl_traits_df is like "SS_Orange_Red__V14"
    # ss_long has "Style_Group" and "Voice"
    ss_long_w_key = ss_long.with_columns(
        (pl.col("Style_Group") + "__" + pl.col("Voice")).alias("Question Group")
    )

    # Prepare filter table: Record ID + Question Group
    sl_filter = sl_traits_df.select([
        pl.col("Record ID").alias("_recordId"),
        pl.col("Question Group"),
    ])

    # Anti-join to remove specific question blocks that were straight-lined
    ss_long_clean = ss_long_w_key.join(sl_filter, on=["_recordId", "Question Group"], how="anti").drop("Question Group")

    # Re-plot with suffix in title
    print("Re-plotting traits (Cleaned)...")

    # maintain_order=True keeps traits in first-appearance order so the
    # re-plot sequence is reproducible between runs.
    _clean_traits = ss_long_clean.select("Description").unique(maintain_order=True).to_series().to_list()
    for trait in _clean_traits:
        trait_d = ss_long_clean.filter(pl.col("Description") == trait)

        # Modify title to create unique filename (and display title)
        title_clean = trait.replace(":", " ↔ ") + " (Excl. Straight-Liners)"

        S.plot_speaking_style_trait_scores(trait_d, title=title_clean, height=550, color_gender=True)
else:
    print("No straight-liners found on traits.")
|
|
|
|
|
|
|
|
|
|
# %% Compare All vs Cleaned
if sl_traits_df is not None and not sl_traits_df.is_empty():
    print("Generating Comparison Plots (All vs Cleaned)...")

    # Always apply the per-question-group filtering here to ensure consistency
    # (Matches the logic used in the re-plotting section above)
    print("Applying filter to remove straight-lined question blocks...")
    _join_keys = ["_recordId", "Question Group"]
    ss_long_w_key = ss_long.with_columns(
        (pl.col("Style_Group") + "__" + pl.col("Voice")).alias("Question Group")
    )
    sl_filter = sl_traits_df.select([
        pl.col("Record ID").alias("_recordId"),
        pl.col("Question Group"),
    ])
    ss_long_clean = ss_long_w_key.join(sl_filter, on=_join_keys, how="anti").drop("Question Group")

    sl_ids = sl_traits_df.select(pl.col("Record ID").unique()).to_series().to_list()

    # --- Verification Prints ---
    print("\n--- Verification of Filter ---")
    print(f"Original Row Count: {ss_long.height}")
    print(f"Number of Straight-Liner Question Blocks: {sl_traits_df.height}")
    print(f"Sample IDs affected: {sl_ids[:5]}")
    print(f"Cleaned Row Count: {ss_long_clean.height}")
    print(f"Rows Removed: {ss_long.height - ss_long_clean.height}")

    # Verify removal: count the rows that match a flagged block. The keyed
    # frame and filter table built above are reused instead of rebuilt. A semi
    # join counts each matching row exactly once, making it the exact
    # complement of the anti join even if the filter table repeats a key.
    should_be_removed = ss_long_w_key.join(sl_filter, on=_join_keys, how="semi").height
    print(f"Discrepancy Check (Should be 0): {(ss_long.height - ss_long_clean.height) - should_be_removed}")

    # Show what was removed (the straight lining behavior)
    print("\nSample of Straight-Liner Data (Values that caused removal):")
    print(sl_traits_df.head(5))
    print("-" * 30 + "\n")
    # ---------------------------

    # maintain_order=True keeps traits in first-appearance order so the
    # comparison plots are produced in a reproducible sequence.
    _all_traits = ss_long.select("Description").unique(maintain_order=True).to_series().to_list()
    for trait in _all_traits:
        # Get data for this trait from both datasets
        trait_d_all = ss_long.filter(pl.col("Description") == trait)
        trait_d_clean = ss_long_clean.filter(pl.col("Description") == trait)

        # Plot comparison
        title_comp = trait.replace(":", " ↔ ") + " (Impact of Straight-Liners)"

        S.plot_speaking_style_trait_scores_comparison(
            trait_d_all,
            trait_d_clean,
            title=title_comp,
            height=600,  # Slightly taller for grouped bars
        )
|
|
|