"""Extra analyses of the traits"""
# %% Imports

import argparse
import json
import re
from pathlib import Path

import polars as pl

import utils
from validation import check_straight_liners

# %% Fixed Variables
RESULTS_FILE = 'data/exports/2-4-26/JPMC_Chase Brand Personality_Quant Round 1_February 4, 2026_Labels.csv'
QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'

# Default figures directory: named after the third path component of RESULTS_FILE.
# Defined once so the CLI default and the interactive default cannot drift apart.
DEFAULT_FIGURES_DIR = f'figures/traits-likert-analysis/{Path(RESULTS_FILE).parts[2]}'

# %% CLI argument parsing for batch automation
# When run as a script, every filter option takes a JSON list of values,
# e.g. --age '["<value>", "<value>"]'

# Central filter configuration - add new filters here only
# Format: 'cli_arg_name': 'QualtricsSurvey.options_* attribute name'
FILTER_CONFIG = {
    'age': 'options_age',
    'gender': 'options_gender',
    'ethnicity': 'options_ethnicity',
    'income': 'options_income',
    'consumer': 'options_consumer',
    'business_owner': 'options_business_owner',
    'ai_user': 'options_ai_user',
    'investable_assets': 'options_investable_assets',
    'industry': 'options_industry',
}


def parse_cli_args() -> argparse.Namespace:
    """Return the run configuration as an ``argparse.Namespace``.

    The namespace has one attribute per key in ``FILTER_CONFIG`` (a list of
    values, or ``None`` meaning "no filter"), plus ``filter_name`` and
    ``figures_dir``.

    In an IPython/Jupyter session (detected by ``get_ipython`` being defined)
    the command line is not parsed; all filters are ``None`` and the default
    figures directory is used.

    Exits via ``parser.error`` when a filter argument is not valid JSON or
    decodes to something other than a list (or ``null``).
    """
    # Only parse if running as script (not in Jupyter/interactive).
    # Keep the try body to the single probing call so an unrelated NameError
    # can never be mistaken for "not interactive".
    try:
        get_ipython()  # noqa: F821 # type: ignore
    except NameError:
        pass
    else:
        return argparse.Namespace(
            **dict.fromkeys(FILTER_CONFIG),
            filter_name=None,
            figures_dir=DEFAULT_FIGURES_DIR,
        )

    parser = argparse.ArgumentParser(description='Generate quant report with optional filters')

    # Dynamically add filter arguments from config
    for filter_name in FILTER_CONFIG:
        parser.add_argument(
            f'--{filter_name}',
            type=str,
            default=None,
            help=f'JSON list of {filter_name} values',
        )

    parser.add_argument(
        '--filter-name',
        type=str,
        default=None,
        help='Name for this filter combination (used for .txt description file)',
    )
    parser.add_argument(
        '--figures-dir',
        type=str,
        default=DEFAULT_FIGURES_DIR,
        help='Override the default figures directory',
    )

    args = parser.parse_args()

    # Parse JSON strings to lists
    for filter_name in FILTER_CONFIG:
        raw = getattr(args, filter_name)
        if not raw:
            # Missing or empty argument: no filter for this dimension
            setattr(args, filter_name, None)
            continue
        try:
            values = json.loads(raw)
        except json.JSONDecodeError as err:
            parser.error(f'--{filter_name} must be a JSON list, got {raw!r} ({err})')
        # JSON null still means "no filter"; any other non-list would later be
        # joined character by character in the filter description.
        if values is not None and not isinstance(values, list):
            parser.error(f'--{filter_name} must be a JSON list, got {raw!r}')
        setattr(args, filter_name, values)

    return args


cli_args = parse_cli_args()

# %%
S = utils.QualtricsSurvey(RESULTS_FILE, QSF_FILE, figures_dir=cli_args.figures_dir)
data_all = S.load_data()

# %% Build filtered dataset based on CLI args

# Header row of the filter index file; also used to skip it when de-duplicating
_INDEX_COLUMNS = "Directory | Filter Name | Description"


def _index_has_slug(index_text: str, slug: str) -> bool:
    """Return True if a data row of the index already starts with *slug*.

    Only the first column is compared, so a slug that merely appears inside
    another slug, a filter name, or a description does not count as present.
    """
    row_prefix = f"{slug} | "
    return any(
        line.startswith(row_prefix)
        for line in index_text.splitlines()
        if line != _INDEX_COLUMNS
    )


# CLI args: None means "no filter applied" - filter_data() will skip None filters
# Build filter values dict dynamically from FILTER_CONFIG
_active_filters = {filter_name: getattr(cli_args, filter_name) for filter_name in FILTER_CONFIG}

_d = S.filter_data(data_all, **_active_filters)

# Write filter description file if filter-name is provided
if cli_args.filter_name and S.fig_save_dir:
    # Get the filter slug (e.g., "All_Respondents", "Cons-Starter", etc.)
    _filter_slug = S._get_filter_slug()
    _filter_slug_dir = S.fig_save_dir / _filter_slug
    _filter_slug_dir.mkdir(parents=True, exist_ok=True)

    # Build filter description
    _filter_desc_lines = [
        f"Filter: {cli_args.filter_name}",
        "",
        "Applied Filters:",
    ]
    _short_desc_parts = []
    for filter_name, options_attr in FILTER_CONFIG.items():
        all_options = getattr(S, options_attr)
        values = _active_filters[filter_name]
        display_name = filter_name.replace('_', ' ').title()
        # None means no filter applied (same as "All")
        if values is not None and values != all_options:
            # str() so numeric JSON values do not break the join
            joined_values = ', '.join(str(v) for v in values)
            _short_desc_parts.append(f"{display_name}: {joined_values}")
            _filter_desc_lines.append(f" {display_name}: {joined_values}")
        else:
            _filter_desc_lines.append(f" {display_name}: All")

    # Write detailed description INSIDE the filter-slug directory
    # Sanitize filter name for filename usage (replace / and other chars)
    _safe_filter_name = re.sub(r'[^\w\s-]', '_', cli_args.filter_name)
    _filter_file = _filter_slug_dir / f"{_safe_filter_name}.txt"
    _filter_file.write_text('\n'.join(_filter_desc_lines))

    # Append to summary index file (filter_index.txt) in the figures directory
    _summary_file = S.fig_save_dir / "filter_index.txt"
    _short_desc = "; ".join(_short_desc_parts) if _short_desc_parts else "All Respondents"
    _summary_line = f"{_filter_slug} | {cli_args.filter_name} | {_short_desc}\n"

    # Append or create the summary file
    if _summary_file.exists():
        # Avoid duplicate entries for same slug (exact first-column match)
        if not _index_has_slug(_summary_file.read_text(), _filter_slug):
            with _summary_file.open('a') as f:
                f.write(_summary_line)
    else:
        _header = (
            "Filter Index\n"
            + "=" * 80 + "\n\n"
            + _INDEX_COLUMNS + "\n"
            + "-" * 80 + "\n"
        )
        _summary_file.write_text(_header + _summary_line)

# Save to logical variable name for further analysis
data = _d
# Bare expression: shown as the cell output in an interactive session
data.collect()

# %% Voices per trait
ss_or, choice_map_or = S.get_ss_orange_red(data)
ss_gb, choice_map_gb = S.get_ss_green_blue(data)

# Combine the data
ss_all = ss_or.join(ss_gb, on='_recordId')
# Materialized copy kept for interactive inspection only
_d = ss_all.collect()

choice_map = {**choice_map_or, **choice_map_gb}

ss_long = utils.process_speaking_style_data(ss_all, choice_map)

# %% Create plots
for trait in ss_long.select("Description").unique().to_series().to_list():
    trait_d = ss_long.filter(pl.col("Description") == trait)
    S.plot_speaking_style_trait_scores(trait_d, title=trait.replace(":", " ↔ "), height=550, color_gender=True)

# %% Filter out straight-liner (PER TRAIT) and re-plot to see if any changes
# Save with different filename suffix so we can compare with/without straight-liners

# Join keys shared by the long-format data and the straight-liner report
_SL_JOIN_KEYS = ["_recordId", "Question Group"]


def _with_question_group(ss_long_df: pl.DataFrame) -> pl.DataFrame:
    """Add a "Question Group" key column built as ``Style_Group + "__" + Voice``.

    The key in the straight-liner report looks like "SS_Orange_Red__V14".
    """
    return ss_long_df.with_columns(
        (pl.col("Style_Group") + "__" + pl.col("Voice")).alias("Question Group")
    )


def _straight_liner_keys(sl_df: pl.DataFrame) -> pl.DataFrame:
    """Return the (_recordId, Question Group) pairs listed in the straight-liner report."""
    return sl_df.select([
        pl.col("Record ID").alias("_recordId"),
        pl.col("Question Group"),
    ])


def _drop_straight_lined_blocks(ss_long_df: pl.DataFrame, sl_df: pl.DataFrame) -> pl.DataFrame:
    """Anti-join away the specific question blocks that were straight-lined."""
    return (
        _with_question_group(ss_long_df)
        .join(_straight_liner_keys(sl_df), on=_SL_JOIN_KEYS, how="anti")
        .drop("Question Group")
    )


print("\n--- Straight-lining Checks on TRAITS ---")
sl_report_traits, sl_traits_df = check_straight_liners(ss_all, max_score=5)
# Bare expression: shown as the cell output in an interactive session
sl_traits_df

# %%
if sl_traits_df is not None and not sl_traits_df.is_empty():
    sl_ids = sl_traits_df.select(pl.col("Record ID").unique()).to_series().to_list()
    n_sl_groups = sl_traits_df.height
    print(f"\nExcluding {n_sl_groups} straight-lined question blocks from {len(sl_ids)} respondents.")

    ss_long_clean = _drop_straight_lined_blocks(ss_long, sl_traits_df)

    # Re-plot with suffix in title
    print("Re-plotting traits (Cleaned)...")
    for trait in ss_long_clean.select("Description").unique().to_series().to_list():
        trait_d = ss_long_clean.filter(pl.col("Description") == trait)
        # Modify title to create unique filename (and display title)
        title_clean = trait.replace(":", " ↔ ") + " (Excl. Straight-Liners)"
        S.plot_speaking_style_trait_scores(trait_d, title=title_clean, height=550, color_gender=True)
else:
    print("No straight-liners found on traits.")

# %% Compare All vs Cleaned
if sl_traits_df is not None and not sl_traits_df.is_empty():
    print("Generating Comparison Plots (All vs Cleaned)...")

    # Always apply the per-question-group filtering here to ensure consistency
    # (Matches the logic used in the re-plotting section above)
    print("Applying filter to remove straight-lined question blocks...")
    ss_long_clean = _drop_straight_lined_blocks(ss_long, sl_traits_df)
    sl_ids = sl_traits_df.select(pl.col("Record ID").unique()).to_series().to_list()

    # --- Verification Prints ---
    rows_removed = ss_long.height - ss_long_clean.height
    print("\n--- Verification of Filter ---")
    print(f"Original Row Count: {ss_long.height}")
    print(f"Number of Straight-Liner Question Blocks: {sl_traits_df.height}")
    print(f"Sample IDs affected: {sl_ids[:5]}")
    print(f"Cleaned Row Count: {ss_long_clean.height}")
    print(f"Rows Removed: {rows_removed}")

    # Verify removal: an inner join on the same keys counts the rows that match
    # a straight-lined block, which should equal the number of rows removed.
    should_be_removed = (
        _with_question_group(ss_long)
        .join(_straight_liner_keys(sl_traits_df), on=_SL_JOIN_KEYS, how="inner")
        .height
    )
    print(f"Discrepancy Check (Should be 0): {rows_removed - should_be_removed}")

    # Show what was removed (the straight lining behavior)
    print("\nSample of Straight-Liner Data (Values that caused removal):")
    print(sl_traits_df.head(5))
    print("-" * 30 + "\n")
    # ---------------------------

    for trait in ss_long.select("Description").unique().to_series().to_list():
        # Get data for this trait from both datasets
        trait_d_all = ss_long.filter(pl.col("Description") == trait)
        trait_d_clean = ss_long_clean.filter(pl.col("Description") == trait)

        # Plot comparison
        title_comp = trait.replace(":", " ↔ ") + " (Impact of Straight-Liners)"

        S.plot_speaking_style_trait_scores_comparison(
            trait_d_all,
            trait_d_clean,
            title=title_comp,
            height=600,  # Slightly taller for grouped bars
        )