From 9dfab7592505d648459063f44dc186ebc09c77ad Mon Sep 17 00:00:00 2001 From: Luigi Maiorano Date: Tue, 10 Feb 2026 14:24:26 +0100 Subject: [PATCH] missing data analysis --- XX_quant_report.script.py | 27 +- analysis_missing_voice_ranking.ipynb | 1359 ++++++++++++++++++++++++++ plots.py | 14 + run_filter_combinations.py | 30 +- utils.py | 54 + 5 files changed, 1477 insertions(+), 7 deletions(-) create mode 100644 analysis_missing_voice_ranking.ipynb diff --git a/XX_quant_report.script.py b/XX_quant_report.script.py index 402d42f..438de1a 100644 --- a/XX_quant_report.script.py +++ b/XX_quant_report.script.py @@ -16,8 +16,8 @@ from speaking_styles import SPEAKING_STYLES # %% Fixed Variables -# RESULTS_FILE = 'data/exports/2-4-26/JPMC_Chase Brand Personality_Quant Round 1_February 4, 2026_Labels.csv' -RESULTS_FILE = 'data/exports/debug/JPMC_Chase Brand Personality_Quant Round 1_February 2, 2026_Labels.csv' +RESULTS_FILE = 'data/exports/2-4-26/JPMC_Chase Brand Personality_Quant Round 1_February 4, 2026_Labels.csv' +# RESULTS_FILE = 'data/exports/debug/JPMC_Chase Brand Personality_Quant Round 1_February 2, 2026_Labels.csv' QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf' @@ -51,6 +51,7 @@ def parse_cli_args(): parser.add_argument('--figures-dir', type=str, default=f'figures/{Path(RESULTS_FILE).parts[2]}', help='Override the default figures directory') parser.add_argument('--best-character', type=str, default="the_coach", help='Slug of the best chosen character (default: "the_coach")') parser.add_argument('--sl-threshold', type=int, default=None, help='Exclude respondents who straight-lined >= N question groups (e.g. 
3 removes anyone with 3+ straight-lined groups)') + parser.add_argument('--voice-ranking-filter', type=str, default=None, choices=['only-missing', 'exclude-missing'], help='Filter by voice ranking completeness: "only-missing" keeps only respondents missing QID98 ranking data, "exclude-missing" removes them') # Only parse if running as script (not in Jupyter/interactive) try: @@ -58,7 +59,7 @@ def parse_cli_args(): get_ipython() # noqa: F821 # type: ignore # Return namespace with all filters set to None no_filters = {f: None for f in FILTER_CONFIG} - return argparse.Namespace(**no_filters, filter_name=None, figures_dir=f'figures/statistical_significance/{Path(RESULTS_FILE).parts[2]}', best_character="the_coach", sl_threshold=None) + return argparse.Namespace(**no_filters, filter_name=None, figures_dir=f'figures/statistical_significance/{Path(RESULTS_FILE).parts[2]}', best_character="the_coach", sl_threshold=None, voice_ranking_filter=None) except NameError: args = parser.parse_args() # Parse JSON strings to lists @@ -174,6 +175,26 @@ if cli_args.sl_threshold is not None: else: print(" No straight-liners detected — no respondents removed.") +# %% Apply voice-ranking completeness filter (if specified) +# Keeps only / excludes respondents who are missing the explicit voice +# ranking question (QID98) despite completing the top-3 selection (QID36). 
+if cli_args.voice_ranking_filter is not None: + S.voice_ranking_filter = cli_args.voice_ranking_filter # Store on Survey so filter slug/description include it + _vr_missing = S.get_top_3_voices_missing_ranking(_d) + _vr_missing_ids = _vr_missing.select('_recordId') + _n_before = _d.select(pl.len()).collect().item() + + if cli_args.voice_ranking_filter == 'only-missing': + print(f"Voice ranking filter: keeping ONLY respondents missing QID98 ranking data...") + _d = _d.collect().join(_vr_missing_ids, on='_recordId', how='inner').lazy() + elif cli_args.voice_ranking_filter == 'exclude-missing': + print(f"Voice ranking filter: EXCLUDING respondents missing QID98 ranking data...") + _d = _d.collect().join(_vr_missing_ids, on='_recordId', how='anti').lazy() + + S.data_filtered = _d + _n_after = _d.select(pl.len()).collect().item() + print(f" {_n_before} → {_n_after} respondents ({_vr_missing_ids.height} missing ranking data)") + # Save to logical variable name for further analysis data = _d data.collect() diff --git a/analysis_missing_voice_ranking.ipynb b/analysis_missing_voice_ranking.ipynb new file mode 100644 index 0000000..2d6a2c9 --- /dev/null +++ b/analysis_missing_voice_ranking.ipynb @@ -0,0 +1,1359 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1ee02624", + "metadata": {}, + "source": [ + "# Analysis: 38 Respondents Missing Voice Ranking Data (QID98)\n", + "\n", + "**Context:** 38 out of 455 respondents completed the \"pick top 3 from 8 voices\" step (QID36) but have **all-null** data for the explicit ranking question (QID98). 
This notebook investigates their demographics, survey metadata, and response patterns to identify commonalities.\n", + "\n", + "**Data source:** `2-4-26` export" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "bd6df059", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total respondents: 455\n" + ] + } + ], + "source": [ + "import polars as pl\n", + "import altair as alt\n", + "from utils import QualtricsSurvey\n", + "from validation import check_straight_liners\n", + "from reference import VOICE_GENDER_MAPPING\n", + "\n", + "RESULTS_FILE = 'data/exports/2-4-26/JPMC_Chase Brand Personality_Quant Round 1_February 4, 2026_Labels.csv'\n", + "QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'\n", + "\n", + "S = QualtricsSurvey(RESULTS_FILE, QSF_FILE, figures_dir=None)\n", + "data_all = S.load_data()\n", + "data = S.filter_data(data_all)\n", + "\n", + "n_total = data.select(pl.len()).collect().item()\n", + "print(f'Total respondents: {n_total}')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "46afc18b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Respondents missing voice ranking (QID98): 38\n", + "Respondents with complete data: 417\n", + "Missing rate: 8.4%\n" + ] + } + ], + "source": [ + "# Identify the 38 missing respondents\n", + "missing = S.get_top_3_voices_missing_ranking(data)\n", + "missing_ids = missing.select('_recordId')\n", + "n_missing = missing_ids.height\n", + "\n", + "print(f'Respondents missing voice ranking (QID98): {n_missing}')\n", + "print(f'Respondents with complete data: {n_total - n_missing}')\n", + "print(f'Missing rate: {n_missing / n_total:.1%}')" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "990eb4b4", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"/tmp/ipykernel_1673001/2213083878.py:3: DeprecationWarning: `is_in` with a collection of the same datatype is ambiguous and deprecated.\n", + "Please use `implode` to return to previous behavior.\n", + "\n", + "See https://github.com/pola-rs/polars/issues/22149 for more information.\n", + " raw = raw.with_columns(\n" + ] + } + ], + "source": [ + "# Build a column to tag respondents as missing / complete\n", + "raw = data.collect()\n", + "raw = raw.with_columns(\n", + " pl.when(pl.col('_recordId').is_in(missing_ids['_recordId']))\n", + " .then(pl.lit('Missing QID98'))\n", + " .otherwise(pl.lit('Complete'))\n", + " .alias('ranking_status')\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "3d81b1b6", + "metadata": {}, + "source": [ + "---\n", + "## 1. Survey Metadata Analysis\n", + "\n", + "Check timestamps, duration, progress, and platform info to see if the 38 share a common survey-taking pattern." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "be18a52c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Available metadata columns: ['startDate', 'endDate', 'duration', 'progress', 'finished', 'userLanguage', 'distributionChannel']\n" + ] + } + ], + "source": [ + "# Check which metadata columns exist\n", + "meta_candidates = ['startDate', 'endDate', 'duration', 'progress', 'finished',\n", + " 'Status', 'userLanguage', 'distributionChannel', 'RecipientEmail']\n", + "available_meta = [c for c in meta_candidates if c in raw.columns]\n", + "print(f'Available metadata columns: {available_meta}')" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "66e3060f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Duration comparison:\n", + "shape: (2, 9)\n", + "┌─────────────┬─────────────┬────────────┬─────────────┬───┬─────────┬─────┬──────────┬────────────┐\n", + "│ ranking_sta ┆ mean_sec ┆ median_sec ┆ std_sec ┆ … ┆ max_sec ┆ n ┆ 
mean_min ┆ median_min │\n", + "│ tus ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ --- ┆ f64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ u32 ┆ f64 ┆ f64 │\n", + "│ str ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "╞═════════════╪═════════════╪════════════╪═════════════╪═══╪═════════╪═════╪══════════╪════════════╡\n", + "│ Complete ┆ 1634.642686 ┆ 1161.0 ┆ 2072.480875 ┆ … ┆ 21761.0 ┆ 417 ┆ 27.2 ┆ 19.4 │\n", + "│ Missing ┆ 1152.078947 ┆ 871.5 ┆ 852.206211 ┆ … ┆ 4876.0 ┆ 38 ┆ 19.2 ┆ 14.5 │\n", + "│ QID98 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "└─────────────┴─────────────┴────────────┴─────────────┴───┴─────────┴─────┴──────────┴────────────┘\n" + ] + } + ], + "source": [ + "# Duration comparison\n", + "if 'duration' in raw.columns:\n", + " duration_stats = (\n", + " raw.group_by('ranking_status')\n", + " .agg(\n", + " pl.col('duration').cast(pl.Float64).mean().alias('mean_sec'),\n", + " pl.col('duration').cast(pl.Float64).median().alias('median_sec'),\n", + " pl.col('duration').cast(pl.Float64).std().alias('std_sec'),\n", + " pl.col('duration').cast(pl.Float64).min().alias('min_sec'),\n", + " pl.col('duration').cast(pl.Float64).max().alias('max_sec'),\n", + " pl.len().alias('n'),\n", + " )\n", + " .with_columns(\n", + " (pl.col('mean_sec') / 60).round(1).alias('mean_min'),\n", + " (pl.col('median_sec') / 60).round(1).alias('median_min'),\n", + " )\n", + " )\n", + " print('Duration comparison:')\n", + " print(duration_stats)\n", + " \n", + " # Histogram\n", + " chart = alt.Chart(raw.select(['duration', 'ranking_status']).to_pandas()).mark_bar(\n", + " opacity=0.6\n", + " ).encode(\n", + " x=alt.X('duration:Q', bin=alt.Bin(maxbins=40), title='Duration (seconds)'),\n", + " y=alt.Y('count():Q', title='Count'),\n", + " color='ranking_status:N',\n", + " ).properties(width=700, height=300, title='Survey Duration Distribution')\n", + " chart" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "f0bd2b0f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"Progress stats by group:\n", + "shape: (2, 3)\n", + "┌────────────────┬───────────────┬──────────────┐\n", + "│ ranking_status ┆ mean_progress ┆ min_progress │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ str ┆ f64 ┆ f64 │\n", + "╞════════════════╪═══════════════╪══════════════╡\n", + "│ Complete ┆ 100.0 ┆ 100.0 │\n", + "│ Missing QID98 ┆ 100.0 ┆ 100.0 │\n", + "└────────────────┴───────────────┴──────────────┘\n", + "\n", + "Finished status:\n", + "shape: (2, 3)\n", + "┌────────────────┬──────────┬─────┐\n", + "│ ranking_status ┆ finished ┆ n │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ str ┆ bool ┆ u32 │\n", + "╞════════════════╪══════════╪═════╡\n", + "│ Complete ┆ true ┆ 417 │\n", + "│ Missing QID98 ┆ true ┆ 38 │\n", + "└────────────────┴──────────┴─────┘\n" + ] + } + ], + "source": [ + "# Progress and finished status\n", + "if 'progress' in raw.columns:\n", + " progress_by_status = (\n", + " raw.group_by('ranking_status')\n", + " .agg(\n", + " pl.col('progress').cast(pl.Float64).mean().alias('mean_progress'),\n", + " pl.col('progress').cast(pl.Float64).min().alias('min_progress'),\n", + " pl.col('progress').value_counts().alias('progress_dist'),\n", + " )\n", + " )\n", + " print('Progress stats by group:')\n", + " print(progress_by_status.select(['ranking_status', 'mean_progress', 'min_progress']))\n", + "\n", + "if 'finished' in raw.columns:\n", + " finished_by_status = raw.group_by(['ranking_status', 'finished']).agg(pl.len().alias('n'))\n", + " print('\\nFinished status:')\n", + " print(finished_by_status.sort('ranking_status'))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "0bfdb060", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Timestamp ranges by group:\n", + "shape: (2, 4)\n", + "┌────────────────┬─────────────────────┬─────────────────────┬─────────────────────┐\n", + "│ ranking_status ┆ earliest ┆ latest ┆ median_start │\n", + "│ --- ┆ --- ┆ --- ┆ --- │\n", + "│ str ┆ datetime[μs] ┆ 
datetime[μs] ┆ datetime[μs] │\n", + "╞════════════════╪═════════════════════╪═════════════════════╪═════════════════════╡\n", + "│ Missing QID98 ┆ 2026-01-20 16:16:18 ┆ 2026-01-30 12:43:07 ┆ 2026-01-27 23:30:15 │\n", + "│ Complete ┆ 2026-01-20 15:00:44 ┆ 2026-01-30 23:53:40 ┆ 2026-01-28 05:46:12 │\n", + "└────────────────┴─────────────────────┴─────────────────────┴─────────────────────┘\n" + ] + } + ], + "source": [ + "# Timestamps: check if the missing respondents cluster in a specific time window\n", + "if 'startDate' in raw.columns:\n", + " ts = raw.select(['_recordId', 'ranking_status', 'startDate', 'endDate']).with_columns(\n", + " pl.col('startDate').str.to_datetime('%Y-%m-%d %H:%M:%S', strict=False).alias('start_dt'),\n", + " pl.col('endDate').str.to_datetime('%Y-%m-%d %H:%M:%S', strict=False).alias('end_dt'),\n", + " )\n", + " \n", + " # Time range per group\n", + " time_stats = ts.group_by('ranking_status').agg(\n", + " pl.col('start_dt').min().alias('earliest'),\n", + " pl.col('start_dt').max().alias('latest'),\n", + " pl.col('start_dt').median().alias('median_start'),\n", + " )\n", + " print('Timestamp ranges by group:')\n", + " print(time_stats)\n", + " \n", + " # Timeline scatter plot\n", + " chart = alt.Chart(ts.to_pandas()).mark_circle(size=40).encode(\n", + " x=alt.X('start_dt:T', title='Survey Start Time'),\n", + " y=alt.Y('ranking_status:N', title=''),\n", + " color='ranking_status:N',\n", + " tooltip=['_recordId', 'start_dt:T', 'end_dt:T'],\n", + " ).properties(width=700, height=150, title='Survey Start Times')\n", + " chart" + ] + }, + { + "cell_type": "markdown", + "id": "d81aede5", + "metadata": {}, + "source": [ + "---\n", + "## 2. Demographic Profile Comparison\n", + "\n", + "Compare age, gender, ethnicity, consumer segment, income, etc. between the two groups." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "cad1b204", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Demographic columns: ['Age', 'Gender', 'Race/Ethnicity', 'Bussiness_Owner', 'Business_Revenue', 'Employment', 'Personal_Products', 'Income', 'Investable_Assets', 'Industry', 'Consumer']\n" + ] + } + ], + "source": [ + "# Get demographics and join with ranking status\n", + "demos_df = S.get_demographics(data)[0].collect()\n", + "demos_df = demos_df.join(\n", + " raw.select(['_recordId', 'ranking_status']),\n", + " on='_recordId'\n", + ")\n", + "\n", + "demo_cols = [c for c in demos_df.columns if c not in ('_recordId', 'ranking_status')]\n", + "print(f'Demographic columns: {demo_cols}')" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "65fdfd6b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "============================================================\n", + "Age (% within each group)\n", + "============================================================\n", + "shape: (8, 4)\n", + "┌──────────────────┬───────────────┬──────────┬───────────┐\n", + "│ Age ┆ Missing QID98 ┆ Complete ┆ diff_ppts │\n", + "│ --- ┆ --- ┆ --- ┆ --- │\n", + "│ str ┆ f64 ┆ f64 ┆ f64 │\n", + "╞══════════════════╪═══════════════╪══════════╪═══════════╡\n", + "│ 18 to 21 years ┆ 15.8 ┆ 14.9 ┆ 0.9 │\n", + "│ 22 to 24 years ┆ 13.2 ┆ 19.2 ┆ -6.0 │\n", + "│ 25 to 34 years ┆ 7.9 ┆ 17.0 ┆ -9.1 │\n", + "│ 35 to 40 years ┆ 23.7 ┆ 13.2 ┆ 10.5 │\n", + "│ 41 to 50 years ┆ 34.2 ┆ 16.8 ┆ 17.4 │\n", + "│ 51 to 59 years ┆ 2.6 ┆ 6.0 ┆ -3.4 │\n", + "│ 60 to 70 years ┆ null ┆ 8.2 ┆ -8.2 │\n", + "│ 70 years or more ┆ 2.6 ┆ 4.8 ┆ -2.2 │\n", + "└──────────────────┴───────────────┴──────────┴───────────┘\n", + "\n", + "============================================================\n", + "Gender (% within each group)\n", + 
"============================================================\n", + "shape: (3, 4)\n", + "┌───────────────────┬──────────┬───────────────┬───────────┐\n", + "│ Gender ┆ Complete ┆ Missing QID98 ┆ diff_ppts │\n", + "│ --- ┆ --- ┆ --- ┆ --- │\n", + "│ str ┆ f64 ┆ f64 ┆ f64 │\n", + "╞═══════════════════╪══════════╪═══════════════╪═══════════╡\n", + "│ Man ┆ 59.2 ┆ 65.8 ┆ 6.6 │\n", + "│ Prefer not to say ┆ 0.2 ┆ null ┆ -0.2 │\n", + "│ Woman ┆ 40.5 ┆ 34.2 ┆ -6.3 │\n", + "└───────────────────┴──────────┴───────────────┴───────────┘\n", + "\n", + "============================================================\n", + "Race/Ethnicity (% within each group)\n", + "============================================================\n", + "shape: (21, 4)\n", + "┌─────────────────────────────────┬──────────┬───────────────┬───────────┐\n", + "│ Race/Ethnicity ┆ Complete ┆ Missing QID98 ┆ diff_ppts │\n", + "│ --- ┆ --- ┆ --- ┆ --- │\n", + "│ str ┆ f64 ┆ f64 ┆ f64 │\n", + "╞═════════════════════════════════╪══════════╪═══════════════╪═══════════╡\n", + "│ Alaska Native or Indigenous Am… ┆ 0.7 ┆ null ┆ -0.7 │\n", + "│ Alaska Native or Indigenous Am… ┆ 0.2 ┆ null ┆ -0.2 │\n", + "│ Alaska Native or Indigenous Am… ┆ 0.5 ┆ null ┆ -0.5 │\n", + "│ Alaska Native or Indigenous Am… ┆ 0.5 ┆ null ┆ -0.5 │\n", + "│ Alaska Native or Indigenous Am… ┆ 0.5 ┆ null ┆ -0.5 │\n", + "│ … ┆ … ┆ … ┆ … │\n", + "│ Hispanic or Latinx ┆ 7.4 ┆ 7.9 ┆ 0.5 │\n", + "│ Hispanic or Latinx,White or Ca… ┆ 6.0 ┆ null ┆ -6.0 │\n", + "│ Middle Eastern or North Africa… ┆ 0.2 ┆ null ┆ -0.2 │\n", + "│ Native Hawaiian or Other Pacif… ┆ 0.5 ┆ null ┆ -0.5 │\n", + "│ White or Caucasian ┆ 50.8 ┆ 52.6 ┆ 1.8 │\n", + "└─────────────────────────────────┴──────────┴───────────────┴───────────┘\n", + "\n", + "============================================================\n", + "Bussiness_Owner (% within each group)\n", + "============================================================\n", + "shape: (2, 4)\n", + 
"┌─────────────────┬───────────────┬──────────┬───────────┐\n", + "│ Bussiness_Owner ┆ Missing QID98 ┆ Complete ┆ diff_ppts │\n", + "│ --- ┆ --- ┆ --- ┆ --- │\n", + "│ str ┆ f64 ┆ f64 ┆ f64 │\n", + "╞═════════════════╪═══════════════╪══════════╪═══════════╡\n", + "│ No ┆ 73.5 ┆ 72.8 ┆ 0.7 │\n", + "│ Yes ┆ 26.5 ┆ 27.2 ┆ -0.7 │\n", + "└─────────────────┴───────────────┴──────────┴───────────┘\n", + "\n", + "============================================================\n", + "Business_Revenue (% within each group)\n", + "============================================================\n", + "shape: (6, 4)\n", + "┌──────────────────────────────┬──────────┬───────────────┬───────────┐\n", + "│ Business_Revenue ┆ Complete ┆ Missing QID98 ┆ diff_ppts │\n", + "│ --- ┆ --- ┆ --- ┆ --- │\n", + "│ str ┆ f64 ┆ f64 ┆ f64 │\n", + "╞══════════════════════════════╪══════════╪═══════════════╪═══════════╡\n", + "│ $1 million to $4.9 million ┆ 27.7 ┆ 66.7 ┆ 39.0 │\n", + "│ $10 million to $19.9 million ┆ 9.6 ┆ null ┆ -9.6 │\n", + "│ $100,00 to $249,999 ┆ 13.3 ┆ null ┆ -13.3 │\n", + "│ $250,000 to $499,999 ┆ 10.8 ┆ 11.1 ┆ 0.3 │\n", + "│ $5 million to $9.9 million ┆ 14.5 ┆ null ┆ -14.5 │\n", + "│ $500,00 to $999,999 ┆ 24.1 ┆ 22.2 ┆ -1.9 │\n", + "└──────────────────────────────┴──────────┴───────────────┴───────────┘\n", + "\n", + "============================================================\n", + "Employment (% within each group)\n", + "============================================================\n", + "shape: (0, 1)\n", + "┌────────────┐\n", + "│ Employment │\n", + "│ --- │\n", + "│ str │\n", + "╞════════════╡\n", + "└────────────┘\n", + "\n", + "============================================================\n", + "Personal_Products (% within each group)\n", + "============================================================\n", + "shape: (0, 1)\n", + "┌───────────────────┐\n", + "│ Personal_Products │\n", + "│ --- │\n", + "│ str │\n", + "╞═══════════════════╡\n", + "└───────────────────┘\n", + 
"\n", + "============================================================\n", + "Income (% within each group)\n", + "============================================================\n", + "shape: (8, 4)\n", + "┌──────────────────────┬──────────┬───────────────┬───────────┐\n", + "│ Income ┆ Complete ┆ Missing QID98 ┆ diff_ppts │\n", + "│ --- ┆ --- ┆ --- ┆ --- │\n", + "│ str ┆ f64 ┆ f64 ┆ f64 │\n", + "╞══════════════════════╪══════════╪═══════════════╪═══════════╡\n", + "│ $100,000 to $149,999 ┆ 16.3 ┆ 15.8 ┆ -0.5 │\n", + "│ $150,000 to $199,999 ┆ 10.8 ┆ 7.9 ┆ -2.9 │\n", + "│ $200,000 or more ┆ 3.6 ┆ 10.5 ┆ 6.9 │\n", + "│ $25,000 to $34,999 ┆ 11.3 ┆ 13.2 ┆ 1.9 │\n", + "│ $35,000 to $54,999 ┆ 14.1 ┆ 15.8 ┆ 1.7 │\n", + "│ $55,000 to $79,999 ┆ 18.0 ┆ 10.5 ┆ -7.5 │\n", + "│ $80,000 to $99,999 ┆ 12.0 ┆ 21.1 ┆ 9.1 │\n", + "│ Less than $25,000 ┆ 13.9 ┆ 5.3 ┆ -8.6 │\n", + "└──────────────────────┴──────────┴───────────────┴───────────┘\n", + "\n", + "============================================================\n", + "Investable_Assets (% within each group)\n", + "============================================================\n", + "shape: (9, 4)\n", + "┌──────────────────────┬───────────────┬──────────┬───────────┐\n", + "│ Investable_Assets ┆ Missing QID98 ┆ Complete ┆ diff_ppts │\n", + "│ --- ┆ --- ┆ --- ┆ --- │\n", + "│ str ┆ f64 ┆ f64 ┆ f64 │\n", + "╞══════════════════════╪═══════════════╪══════════╪═══════════╡\n", + "│ $0 to $24,999 ┆ 23.7 ┆ 21.1 ┆ 2.6 │\n", + "│ $150,000 to $249,999 ┆ 5.3 ┆ 12.0 ┆ -6.7 │\n", + "│ $1M to $4.9M ┆ 5.3 ┆ 4.8 ┆ 0.5 │\n", + "│ $25,000 to $49,999 ┆ 15.8 ┆ 19.9 ┆ -4.1 │\n", + "│ $250,000 to $499,999 ┆ 10.5 ┆ 12.5 ┆ -2.0 │\n", + "│ $50,000 to $149,999 ┆ 26.3 ┆ 16.8 ┆ 9.5 │\n", + "│ $500,000 to $999,999 ┆ 10.5 ┆ 11.0 ┆ -0.5 │\n", + "│ $5M or more ┆ null ┆ 0.7 ┆ -0.7 │\n", + "│ Prefer not to answer ┆ 2.6 ┆ 1.2 ┆ 1.4 │\n", + "└──────────────────────┴───────────────┴──────────┴───────────┘\n", + "\n", + 
"============================================================\n", + "Industry (% within each group)\n", + "============================================================\n", + "shape: (26, 4)\n", + "┌─────────────────────────────────┬──────────┬───────────────┬───────────┐\n", + "│ Industry ┆ Complete ┆ Missing QID98 ┆ diff_ppts │\n", + "│ --- ┆ --- ┆ --- ┆ --- │\n", + "│ str ┆ f64 ┆ f64 ┆ f64 │\n", + "╞═════════════════════════════════╪══════════╪═══════════════╪═══════════╡\n", + "│ Agriculture, forestry, fishing… ┆ 0.5 ┆ null ┆ -0.5 │\n", + "│ Arts, entertainment, or recrea… ┆ 2.9 ┆ 2.6 ┆ -0.3 │\n", + "│ Broadcasting ┆ 0.5 ┆ null ┆ -0.5 │\n", + "│ Construction ┆ 10.8 ┆ 10.5 ┆ -0.3 │\n", + "│ Education – College, universit… ┆ 3.8 ┆ 2.6 ┆ -1.2 │\n", + "│ … ┆ … ┆ … ┆ … │\n", + "│ Software ┆ 9.1 ┆ 13.2 ┆ 4.1 │\n", + "│ Telecommunications ┆ 0.7 ┆ 2.6 ┆ 1.9 │\n", + "│ Transportation and warehousing ┆ 4.1 ┆ 2.6 ┆ -1.5 │\n", + "│ Utilities ┆ 1.0 ┆ 2.6 ┆ 1.6 │\n", + "│ Wholesale ┆ 6.5 ┆ 7.9 ┆ 1.4 │\n", + "└─────────────────────────────────┴──────────┴───────────────┴───────────┘\n", + "\n", + "============================================================\n", + "Consumer (% within each group)\n", + "============================================================\n", + "shape: (10, 4)\n", + "┌─────────────────────────────────┬──────────┬───────────────┬───────────┐\n", + "│ Consumer ┆ Complete ┆ Missing QID98 ┆ diff_ppts │\n", + "│ --- ┆ --- ┆ --- ┆ --- │\n", + "│ str ┆ f64 ┆ f64 ┆ f64 │\n", + "╞═════════════════════════════════╪══════════╪═══════════════╪═══════════╡\n", + "│ Early_Professional ┆ 2.9 ┆ null ┆ -2.9 │\n", + "│ Lower_Mass_A ┆ 7.5 ┆ 13.3 ┆ 5.8 │\n", + "│ Lower_Mass_B ┆ 2.6 ┆ null ┆ -2.6 │\n", + "│ MassAffluent_A ┆ 7.5 ┆ 13.3 ┆ 5.8 │\n", + "│ MassAffluent_B ┆ 2.9 ┆ null ┆ -2.9 │\n", + "│ Mass_A ┆ 19.5 ┆ 23.3 ┆ 3.8 │\n", + "│ Mass_B ┆ 5.7 ┆ 3.3 ┆ -2.4 │\n", + "│ Mix_of_Affluent_Wealth_&_High_… ┆ 7.8 ┆ 10.0 ┆ 2.2 │\n", + "│ Mix_of_Affluent_Wealth_&_High_… ┆ 2.9 ┆ null 
┆ -2.9 │\n", + "│ Starter ┆ 40.8 ┆ 36.7 ┆ -4.1 │\n", + "└─────────────────────────────────┴──────────┴───────────────┴───────────┘\n" + ] + } + ], + "source": [ + "# Percentage distribution comparison for each demographic\n", + "def compare_distribution(df: pl.DataFrame, col: str, status_col: str = 'ranking_status'):\n", + " \"\"\"Show side-by-side percentage distributions for a column.\"\"\"\n", + " counts = (\n", + " df.filter(pl.col(col).is_not_null())\n", + " .group_by([status_col, col])\n", + " .agg(pl.len().alias('n'))\n", + " )\n", + " totals = counts.group_by(status_col).agg(pl.col('n').sum().alias('total'))\n", + " pcts = counts.join(totals, on=status_col).with_columns(\n", + " (pl.col('n') / pl.col('total') * 100).round(1).alias('pct')\n", + " )\n", + " # Pivot for readability\n", + " pivot = pcts.pivot(\n", + " on=status_col,\n", + " index=col,\n", + " values='pct',\n", + " ).sort(col)\n", + " \n", + " # Add difference column if both groups exist\n", + " if 'Missing QID98' in pivot.columns and 'Complete' in pivot.columns:\n", + " pivot = pivot.with_columns(\n", + " (pl.col('Missing QID98').fill_null(0) - pl.col('Complete').fill_null(0))\n", + " .round(1)\n", + " .alias('diff_ppts')\n", + " )\n", + " return pivot\n", + "\n", + "\n", + "for col in demo_cols:\n", + " print(f'\\n{\"=\" * 60}')\n", + " print(f'{col} (% within each group)')\n", + " print('=' * 60)\n", + " print(compare_distribution(demos_df, col))" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "4e53657a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.HConcatChart(...)" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Visual comparison of key demographics\n", + "charts = []\n", + "for col in ['Age', 'Gender', 'Consumer']:\n", + " if col not in demos_df.columns:\n", + " continue\n", + " _df = demos_df.filter(pl.col(col).is_not_null())\n", + " _counts = _df.group_by(['ranking_status', col]).agg(pl.len().alias('n'))\n", + " _totals = _counts.group_by('ranking_status').agg(pl.col('n').sum().alias('total'))\n", + " _pcts = _counts.join(_totals, on='ranking_status').with_columns(\n", + " (pl.col('n') / pl.col('total') * 100).round(1).alias('pct')\n", + " )\n", + " c = alt.Chart(_pcts.to_pandas()).mark_bar().encode(\n", + " x=alt.X(f'{col}:N', title=col),\n", + " y=alt.Y('pct:Q', title='% of group'),\n", + " color='ranking_status:N',\n", + " xOffset='ranking_status:N',\n", + " tooltip=[col, 'ranking_status', 'n', 'pct'],\n", + " ).properties(width=350, height=250, title=f'{col} Distribution')\n", + " charts.append(c)\n", + "\n", + "alt.hconcat(*charts)" + ] + }, + { + "cell_type": "markdown", + "id": "d847c8cc", + "metadata": {}, + "source": [ + "---\n", + "## 3. Survey Flow: Which Questions Were Answered?\n", + "\n", + "Check which questions the 38 respondents answered vs skipped to identify where the survey flow diverged." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "794e312e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Null rates (%) for key questions by group:\n", + "shape: (9, 4)\n", + "┌─────────────────────────────────┬───────────────┬───────────────┬────────────────┐\n", + "│ Question ┆ QID ┆ Missing_null% ┆ Complete_null% │\n", + "│ --- ┆ --- ┆ --- ┆ --- │\n", + "│ str ┆ str ┆ f64 ┆ f64 │\n", + "╞═════════════════════════════════╪═══════════════╪═══════════════╪════════════════╡\n", + "│ Demographics (QID1 Age) ┆ QID1 ┆ 0.0 ┆ 0.0 │\n", + "│ Demographics (QID2 Gender) ┆ QID2 ┆ 0.0 ┆ 0.0 │\n", + "│ Top 8 Traits (QID25) ┆ QID25 ┆ 0.0 ┆ 0.0 │\n", + "│ Character Ranking (QID27_1) ┆ QID27_1 ┆ 0.0 ┆ 0.0 │\n", + "│ 18→8 Set A (QID29) ┆ QID29 ┆ 0.0 ┆ 0.0 │\n", + "│ 18→8 Set B (QID101) ┆ QID101 ┆ 0.0 ┆ 0.0 │\n", + "│ 8→3 Selection (QID36_0_GROUP) ┆ QID36_0_GROUP ┆ 0.0 ┆ 0.0 │\n", + "│ Voice Ranking Q (QID98_1) ┆ QID98_1 ┆ 100.0 ┆ 74.8 │\n", + "│ Voice Scale 1-10 first (QID98_… ┆ QID98_2 ┆ 100.0 ┆ 77.5 │\n", + "└─────────────────────────────────┴───────────────┴───────────────┴────────────────┘\n" + ] + } + ], + "source": [ + "# Check null rates for key question groups: missing vs complete respondents\n", + "missing_raw = raw.filter(pl.col('ranking_status') == 'Missing QID98')\n", + "complete_raw = raw.filter(pl.col('ranking_status') == 'Complete')\n", + "\n", + "# Key question groups to check\n", + "question_groups = {\n", + " 'Demographics (QID1 Age)': 'QID1',\n", + " 'Demographics (QID2 Gender)': 'QID2',\n", + " 'Top 8 Traits (QID25)': 'QID25',\n", + " 'Character Ranking (QID27_1)': 'QID27_1',\n", + " '18→8 Set A (QID29)': 'QID29',\n", + " '18→8 Set B (QID101)': 'QID101',\n", + " '8→3 Selection (QID36_0_GROUP)': 'QID36_0_GROUP',\n", + " 'Voice Ranking Q (QID98_1)': 'QID98_1',\n", + " 'Voice Scale 1-10 first (QID98_2)': 'QID98_2',\n", + "}\n", + "\n", + "null_comparison = []\n", + "for label, qid in 
question_groups.items():\n", + " if qid not in raw.columns:\n", + " null_comparison.append({'Question': label, 'QID': qid, 'Missing_null%': 'N/A', 'Complete_null%': 'N/A'})\n", + " continue\n", + " m_null = missing_raw[qid].null_count() / missing_raw.height * 100\n", + " c_null = complete_raw[qid].null_count() / complete_raw.height * 100\n", + " null_comparison.append({\n", + " 'Question': label,\n", + " 'QID': qid,\n", + " 'Missing_null%': round(m_null, 1),\n", + " 'Complete_null%': round(c_null, 1),\n", + " })\n", + "\n", + "null_df = pl.DataFrame(null_comparison)\n", + "print('Null rates (%) for key questions by group:')\n", + "print(null_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "fdc504ea", + "metadata": {}, + "outputs": [ + { + "ename": "ColumnNotFoundError", + "evalue": "unable to find column \"diff_ppts\"; valid columns: []", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mColumnNotFoundError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[15]\u001b[39m\u001b[32m, line 20\u001b[39m\n\u001b[32m 11\u001b[39m desc = S.qid_descr_map.get(col, {}).get(\u001b[33m'\u001b[39m\u001b[33mQName\u001b[39m\u001b[33m'\u001b[39m, col)\n\u001b[32m 12\u001b[39m diff_cols.append({\n\u001b[32m 13\u001b[39m \u001b[33m'\u001b[39m\u001b[33mcolumn\u001b[39m\u001b[33m'\u001b[39m: col,\n\u001b[32m 14\u001b[39m \u001b[33m'\u001b[39m\u001b[33mdescription\u001b[39m\u001b[33m'\u001b[39m: desc,\n\u001b[32m (...)\u001b[39m\u001b[32m 17\u001b[39m \u001b[33m'\u001b[39m\u001b[33mdiff_ppts\u001b[39m\u001b[33m'\u001b[39m: \u001b[38;5;28mround\u001b[39m(diff, \u001b[32m1\u001b[39m),\n\u001b[32m 18\u001b[39m })\n\u001b[32m---> \u001b[39m\u001b[32m20\u001b[39m diff_df = 
\u001b[43mpl\u001b[49m\u001b[43m.\u001b[49m\u001b[43mDataFrame\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdiff_cols\u001b[49m\u001b[43m)\u001b[49m\u001b[43m.\u001b[49m\u001b[43msort\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mdiff_ppts\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdescending\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[32m 21\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m'\u001b[39m\u001b[33mColumns with >30 ppts null-rate difference (\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mdiff_df.height\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m found):\u001b[39m\u001b[33m'\u001b[39m)\n\u001b[32m 22\u001b[39m \u001b[38;5;28mprint\u001b[39m(diff_df.head(\u001b[32m30\u001b[39m))\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/Documents/VoiceBranding/JPMC/Phase-3/.venv/lib/python3.12/site-packages/polars/dataframe/frame.py:5965\u001b[39m, in \u001b[36mDataFrame.sort\u001b[39m\u001b[34m(self, by, descending, nulls_last, multithreaded, maintain_order, *more_by)\u001b[39m\n\u001b[32m 5867\u001b[39m \u001b[38;5;250m\u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 5868\u001b[39m \u001b[33;03mSort the dataframe by the given columns.\u001b[39;00m\n\u001b[32m 5869\u001b[39m \n\u001b[32m (...)\u001b[39m\u001b[32m 5951\u001b[39m \u001b[33;03m└──────┴─────┴─────┘\u001b[39;00m\n\u001b[32m 5952\u001b[39m \u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 5953\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mpolars\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mlazyframe\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m QueryOptFlags\n\u001b[32m 5955\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m (\n\u001b[32m 5956\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mlazy\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 
5957\u001b[39m \u001b[43m \u001b[49m\u001b[43m.\u001b[49m\u001b[43msort\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 5958\u001b[39m \u001b[43m \u001b[49m\u001b[43mby\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 5959\u001b[39m \u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43mmore_by\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 5960\u001b[39m \u001b[43m \u001b[49m\u001b[43mdescending\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdescending\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 5961\u001b[39m \u001b[43m \u001b[49m\u001b[43mnulls_last\u001b[49m\u001b[43m=\u001b[49m\u001b[43mnulls_last\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 5962\u001b[39m \u001b[43m \u001b[49m\u001b[43mmultithreaded\u001b[49m\u001b[43m=\u001b[49m\u001b[43mmultithreaded\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 5963\u001b[39m \u001b[43m \u001b[49m\u001b[43mmaintain_order\u001b[49m\u001b[43m=\u001b[49m\u001b[43mmaintain_order\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 5964\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m-> \u001b[39m\u001b[32m5965\u001b[39m \u001b[43m \u001b[49m\u001b[43m.\u001b[49m\u001b[43mcollect\u001b[49m\u001b[43m(\u001b[49m\u001b[43moptimizations\u001b[49m\u001b[43m=\u001b[49m\u001b[43mQueryOptFlags\u001b[49m\u001b[43m.\u001b[49m\u001b[43m_eager\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 5966\u001b[39m )\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/Documents/VoiceBranding/JPMC/Phase-3/.venv/lib/python3.12/site-packages/polars/_utils/deprecation.py:97\u001b[39m, in \u001b[36mdeprecate_streaming_parameter..decorate..wrapper\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m 93\u001b[39m kwargs[\u001b[33m\"\u001b[39m\u001b[33mengine\u001b[39m\u001b[33m\"\u001b[39m] = \u001b[33m\"\u001b[39m\u001b[33min-memory\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 95\u001b[39m \u001b[38;5;28;01mdel\u001b[39;00m kwargs[\u001b[33m\"\u001b[39m\u001b[33mstreaming\u001b[39m\u001b[33m\"\u001b[39m]\n\u001b[32m---> 
\u001b[39m\u001b[32m97\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunction\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/Documents/VoiceBranding/JPMC/Phase-3/.venv/lib/python3.12/site-packages/polars/lazyframe/opt_flags.py:324\u001b[39m, in \u001b[36mforward_old_opt_flags..decorate..wrapper\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m 321\u001b[39m optflags = cb(optflags, kwargs.pop(key)) \u001b[38;5;66;03m# type: ignore[no-untyped-call,unused-ignore]\u001b[39;00m\n\u001b[32m 323\u001b[39m kwargs[\u001b[33m\"\u001b[39m\u001b[33moptimizations\u001b[39m\u001b[33m\"\u001b[39m] = optflags\n\u001b[32m--> \u001b[39m\u001b[32m324\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunction\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/Documents/VoiceBranding/JPMC/Phase-3/.venv/lib/python3.12/site-packages/polars/lazyframe/frame.py:2429\u001b[39m, in \u001b[36mLazyFrame.collect\u001b[39m\u001b[34m(self, type_coercion, predicate_pushdown, projection_pushdown, simplify_expression, slice_pushdown, comm_subplan_elim, comm_subexpr_elim, cluster_with_columns, collapse_joins, no_optimization, engine, background, optimizations, **_kwargs)\u001b[39m\n\u001b[32m 2427\u001b[39m \u001b[38;5;66;03m# Only for testing purposes\u001b[39;00m\n\u001b[32m 2428\u001b[39m callback = _kwargs.get(\u001b[33m\"\u001b[39m\u001b[33mpost_opt_callback\u001b[39m\u001b[33m\"\u001b[39m, callback)\n\u001b[32m-> \u001b[39m\u001b[32m2429\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m 
wrap_df(\u001b[43mldf\u001b[49m\u001b[43m.\u001b[49m\u001b[43mcollect\u001b[49m\u001b[43m(\u001b[49m\u001b[43mengine\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcallback\u001b[49m\u001b[43m)\u001b[49m)\n", + "\u001b[31mColumnNotFoundError\u001b[39m: unable to find column \"diff_ppts\"; valid columns: []" + ] + } + ], + "source": [ + "# Broader scan: find ALL columns where missing group has significantly different null rate\n", + "diff_cols = []\n", + "for col in raw.columns:\n", + " if col in ('_recordId', 'ranking_status'):\n", + " continue\n", + " m_null_pct = missing_raw[col].null_count() / missing_raw.height * 100\n", + " c_null_pct = complete_raw[col].null_count() / complete_raw.height * 100\n", + " diff = m_null_pct - c_null_pct\n", + " # Flag columns whose null rate differs by more than 30 ppts in either direction\n", + " if abs(diff) > 30:\n", + " desc = S.qid_descr_map.get(col, {}).get('QName', col)\n", + " diff_cols.append({\n", + " 'column': col,\n", + " 'description': desc,\n", + " 'missing_null_pct': round(m_null_pct, 1),\n", + " 'complete_null_pct': round(c_null_pct, 1),\n", + " 'diff_ppts': round(diff, 1),\n", + " })\n", + "\n", + "diff_df = pl.DataFrame(diff_cols, schema=['column', 'description', 'missing_null_pct', 'complete_null_pct', 'diff_ppts']).sort('diff_ppts', descending=True)\n", + "print(f'Columns with >30 ppts null-rate difference ({diff_df.height} found):')\n", + "print(diff_df.head(30))" + ] + }, + { + "cell_type": "markdown", + "id": "64440ef3", + "metadata": {}, + "source": [ + "---\n", + "## 4. QID36 Selection Order Data\n", + "\n", + "The 38 respondents DO have `QID36_G0_*_RANK` data (selection order, not preference ranking). Analyze their selection patterns."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5c761354", + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "\n", + "# QID36 choice → voice mapping\n", + "cfg36 = S._get_qsf_question_by_QID('QID36')['Payload']\n", + "from utils import extract_voice_label\n", + "choice_voice_map = {k: extract_voice_label(v['Display']) for k, v in cfg36['Choices'].items()}\n", + "\n", + "# Extract which 3 voices each of the 38 respondents selected\n", + "qid36_rank_cols = sorted([c for c in raw.columns if c.startswith('QID36_G0_') and c.endswith('_RANK')])\n", + "\n", + "def extract_chosen_voices(row, rank_cols, voice_map):\n", + " \"\"\"Get the 3 voices a respondent selected (from QID36 RANK columns).\"\"\"\n", + " voices = []\n", + " for col in rank_cols:\n", + " if row[col] is not None:\n", + " choice_num = col.replace('QID36_G0_', '').replace('_RANK', '')\n", + " voices.append(voice_map.get(choice_num, f'?{choice_num}'))\n", + " return voices\n", + "\n", + "# Build a summary of chosen voices for the 38 missing respondents\n", + "missing_voices = []\n", + "for row in missing_raw.iter_rows(named=True):\n", + " voices = extract_chosen_voices(row, qid36_rank_cols, choice_voice_map)\n", + " missing_voices.append({\n", + " '_recordId': row['_recordId'],\n", + " 'voice_1': voices[0] if len(voices) > 0 else None,\n", + " 'voice_2': voices[1] if len(voices) > 1 else None,\n", + " 'voice_3': voices[2] if len(voices) > 2 else None,\n", + " })\n", + "\n", + "missing_voice_df = pl.DataFrame(missing_voices)\n", + "print('Voices selected by the 38 missing respondents:')\n", + "print(missing_voice_df.head(10))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cf78a4c0", + "metadata": {}, + "outputs": [], + "source": [ + "# Voice frequency in the missing group's top-3 selections\n", + "all_selected = (\n", + " missing_voice_df.unpivot(\n", + " on=['voice_1', 'voice_2', 'voice_3'],\n", + " value_name='voice'\n", + " )\n", + " 
.filter(pl.col('voice').is_not_null())\n", + " .group_by('voice')\n", + " .agg(pl.len().alias('n_selected'))\n", + " .sort('n_selected', descending=True)\n", + " .with_columns(\n", + " pl.col('voice').replace(VOICE_GENDER_MAPPING).alias('voice_gender')\n", + " )\n", + ")\n", + "print('Voice selection frequency for the 38 missing respondents:')\n", + "print(all_selected)\n", + "\n", + "# Gender breakdown of selected voices\n", + "gender_counts = all_selected.group_by('voice_gender').agg(pl.col('n_selected').sum())\n", + "print(f'\\nVoice gender breakdown in their selections:')\n", + "print(gender_counts)" + ] + }, + { + "cell_type": "markdown", + "id": "cb3ee243", + "metadata": {}, + "source": [ + "---\n", + "## 5. 18→8 Selection Path (Set A vs Set B)\n", + "\n", + "Were the 38 respondents disproportionately assigned to one randomization set (A or B)?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19c6bf43", + "metadata": {}, + "outputs": [], + "source": [ + "# Check Set A vs Set B distribution\n", + "v_18_8_3_df = S.get_18_8_3(data)[0].collect().join(\n", + " raw.select(['_recordId', 'ranking_status']),\n", + " on='_recordId'\n", + ")\n", + "\n", + "# Determine which set each respondent was in\n", + "v_18_8_3_df = v_18_8_3_df.with_columns(\n", + " pl.when(pl.col('18-8_Set-A').is_not_null() & pl.col('18-8_Set-B').is_null())\n", + " .then(pl.lit('Set A'))\n", + " .when(pl.col('18-8_Set-B').is_not_null() & pl.col('18-8_Set-A').is_null())\n", + " .then(pl.lit('Set B'))\n", + " .when(pl.col('18-8_Set-A').is_not_null() & pl.col('18-8_Set-B').is_not_null())\n", + " .then(pl.lit('Both'))\n", + " .otherwise(pl.lit('Neither'))\n", + " .alias('randomization_set')\n", + ")\n", + "\n", + "set_dist = v_18_8_3_df.group_by(['ranking_status', 'randomization_set']).agg(pl.len().alias('n'))\n", + "print('Randomization set distribution:')\n", + "print(set_dist.sort(['ranking_status', 'randomization_set']))" + ] + }, + { + "cell_type": "markdown", + "id": 
"bc98327d", + "metadata": {}, + "source": [ + "---\n", + "## 6. Character Ranking Comparison\n", + "\n", + "Did the 38 respondents rank characters differently?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f91db9c1", + "metadata": {}, + "outputs": [], + "source": [ + "# Character ranking: who they picked as #1\n", + "char_rank_df = S.get_character_ranking(data)[0].collect().join(\n", + " raw.select(['_recordId', 'ranking_status']),\n", + " on='_recordId'\n", + ")\n", + "\n", + "ranking_cols = [c for c in char_rank_df.columns if c.startswith('Character_Ranking_')]\n", + "\n", + "# Find which character each respondent ranked #1\n", + "rank1_rows = []\n", + "for row in char_rank_df.iter_rows(named=True):\n", + " for col in ranking_cols:\n", + " if row[col] == 1:\n", + " rank1_rows.append({\n", + " '_recordId': row['_recordId'],\n", + " 'ranking_status': row['ranking_status'],\n", + " 'top_character': col.replace('Character_Ranking_', '').replace('_', ' '),\n", + " })\n", + " break\n", + "\n", + "rank1_df = pl.DataFrame(rank1_rows)\n", + "char_dist = compare_distribution(rank1_df, 'top_character')\n", + "print('Character ranked #1 distribution (%):')\n", + "print(char_dist)" + ] + }, + { + "cell_type": "markdown", + "id": "acabf4af", + "metadata": {}, + "source": [ + "---\n", + "## 7. Voice Scale 1-10 Comparison\n", + "\n", + "Did the 38 respondents rate voices differently on the 1-10 scale?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b446a0ac", + "metadata": {}, + "outputs": [], + "source": [ + "# Voice scale 1-10 comparison\n", + "voice_1_10_df = S.get_voice_scale_1_10(data)[0].collect().join(\n", + " raw.select(['_recordId', 'ranking_status']),\n", + " on='_recordId'\n", + ")\n", + "\n", + "scale_cols = [c for c in voice_1_10_df.columns if c.startswith('Voice_Scale_1_10__')]\n", + "\n", + "# Average score per voice, by group\n", + "scale_comparison = []\n", + "for col in scale_cols:\n", + " voice = col.replace('Voice_Scale_1_10__', '')\n", + " for status in ['Missing QID98', 'Complete']:\n", + " subset = voice_1_10_df.filter(pl.col('ranking_status') == status)\n", + " avg = subset[col].drop_nulls().mean()\n", + " scale_comparison.append({\n", + " 'voice': voice,\n", + " 'ranking_status': status,\n", + " 'mean_score': round(avg, 2) if avg is not None else None,\n", + " })\n", + "\n", + "scale_comp_df = pl.DataFrame(scale_comparison).pivot(\n", + " on='ranking_status', index='voice', values='mean_score'\n", + ").with_columns(\n", + " (pl.col('Missing QID98') - pl.col('Complete')).round(2).alias('diff')\n", + ").sort('diff')\n", + "\n", + "print('Average voice scores (1-10) by group:')\n", + "print(scale_comp_df)\n", + "\n", + "# Overall average\n", + "overall_missing = pl.DataFrame([r for r in scale_comparison if r['ranking_status'] == 'Missing QID98'])['mean_score'].mean()\n", + "overall_complete = pl.DataFrame([r for r in scale_comparison if r['ranking_status'] == 'Complete'])['mean_score'].mean()\n", + "print(f'\\nOverall avg score — Missing: {overall_missing:.2f}, Complete: {overall_complete:.2f}')" + ] + }, + { + "cell_type": "markdown", + "id": "47626429", + "metadata": {}, + "source": [ + "---\n", + "## 8. Speaking Style Comparison\n", + "\n", + "Check if the 38 respondents show different speaking style patterns." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0961c08d", + "metadata": {}, + "outputs": [], + "source": [ + "# Speaking style straight-liner check for the two groups\n", + "ss_or_df = S.get_ss_orange_red(data)[0].collect()\n", + "ss_gb_df = S.get_ss_green_blue(data)[0].collect()\n", + "vs_df = S.get_voice_scale_1_10(data)[0].collect()\n", + "\n", + "sl_all = ss_or_df.join(ss_gb_df, on='_recordId').join(vs_df, on='_recordId')\n", + "\n", + "_, sl_flagged = check_straight_liners(sl_all.lazy(), max_score=5)\n", + "\n", + "if sl_flagged is not None and not sl_flagged.is_empty():\n", + " # Count straight-lined groups per respondent\n", + " sl_counts = (\n", + " sl_flagged\n", + " .group_by('Record ID')\n", + " .agg(pl.len().alias('sl_group_count'))\n", + " .rename({'Record ID': '_recordId'})\n", + " )\n", + " \n", + " # Join with ranking status\n", + " sl_with_status = sl_counts.join(\n", + " raw.select(['_recordId', 'ranking_status']),\n", + " on='_recordId'\n", + " )\n", + " \n", + " sl_summary = sl_with_status.group_by('ranking_status').agg(\n", + " pl.len().alias('n_straight_liners'),\n", + " pl.col('sl_group_count').mean().alias('avg_groups_straight_lined'),\n", + " pl.col('sl_group_count').max().alias('max_groups_straight_lined'),\n", + " )\n", + " print('Straight-liner comparison:')\n", + " print(sl_summary)\n", + " \n", + " # What percentage of each group are straight-liners?\n", + " for status in ['Missing QID98', 'Complete']:\n", + " n_group = raw.filter(pl.col('ranking_status') == status).height\n", + " n_sl = sl_with_status.filter(pl.col('ranking_status') == status).height\n", + " print(f' {status}: {n_sl}/{n_group} ({n_sl/n_group*100:.1f}%) have straight-lined ≥1 group')\n", + "else:\n", + " print('No straight-liners detected in either group.')" + ] + }, + { + "cell_type": "markdown", + "id": "a2a79b85", + "metadata": {}, + "source": [ + "---\n", + "## 9. 
QID98 vs QID36 Data Structure in Raw CSV\n", + "\n", + "Deep inspection: are the QID98 columns truly empty, or could there be a data export issue?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a628a3b", + "metadata": {}, + "outputs": [], + "source": [ + "# Check all QID98 columns for the 38 missing respondents\n", + "qid98_cols = sorted([c for c in raw.columns if c.startswith('QID98')])\n", + "print(f'QID98 columns in dataset ({len(qid98_cols)}):')\n", + "print(qid98_cols)\n", + "\n", + "# Verify: ALL QID98 columns are null for the 38?\n", + "qid98_null_check = []\n", + "for col in qid98_cols:\n", + " n_non_null = missing_raw[col].drop_nulls().height\n", + " qid98_null_check.append({'col': col, 'non_null_count': n_non_null})\n", + "\n", + "qid98_check_df = pl.DataFrame(qid98_null_check)\n", + "print(f'\\nQID98 non-null counts for the 38 missing respondents:')\n", + "print(qid98_check_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bb1b02cb", + "metadata": {}, + "outputs": [], + "source": [ + "# Compare QID36_G0_*_RANK (selection order) for complete vs missing\n", + "# For complete respondents: do QID98 and QID36_RANK always disagree? 
(They should — different semantics)\n", + "sample_complete = complete_raw.head(5)\n", + "\n", + "print('Cross-check: QID98 (preference rank) vs QID36_RANK (selection order) for 5 complete respondents:')\n", + "for row in sample_complete.iter_rows(named=True):\n", + " print(f\"\\n {row['_recordId']}:\")\n", + " for i in range(1, 19):\n", + " qid98_col = f'QID98_{i}'\n", + " qid36_col = f'QID36_G0_{i}_RANK'\n", + " q98 = row.get(qid98_col)\n", + " q36 = row.get(qid36_col)\n", + " if q98 is not None or q36 is not None:\n", + " voice = choice_voice_map.get(str(i), f'?{i}')\n", + " match = '✓ same' if q98 == q36 else '✗ diff'\n", + " print(f\" {voice}: QID98={q98}, QID36_RANK={q36} ({match})\")" + ] + }, + { + "cell_type": "markdown", + "id": "216381fc", + "metadata": {}, + "source": [ + "---\n", + "## 10. Distribution Channel / User Agent / Other Metadata\n", + "\n", + "Check if the 38 came through a specific distribution channel or device." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "184c8509", + "metadata": {}, + "outputs": [], + "source": [ + "# Check distribution channel and other metadata columns\n", + "for meta_col in ['distributionChannel', 'userLanguage', 'Status']:\n", + " if meta_col in raw.columns:\n", + " dist = raw.group_by(['ranking_status', meta_col]).agg(pl.len().alias('n'))\n", + " print(f'\\n{meta_col}:')\n", + " print(dist.sort(['ranking_status', meta_col]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "67ad6631", + "metadata": {}, + "outputs": [], + "source": [ + "# Check if there are any Qualtrics internal columns that differ\n", + "# (e.g., Q_RecaptchaScore, Q_RelevantIDDuplicate, etc.)\n", + "q_internal_cols = [c for c in raw.columns if c.startswith('Q_')]\n", + "print(f'Qualtrics internal columns ({len(q_internal_cols)}):')\n", + "for col in q_internal_cols:\n", + " m_vals = missing_raw[col].drop_nulls().unique().to_list()\n", + " c_vals = 
complete_raw[col].drop_nulls().unique().to_list()\n", + " m_null_pct = missing_raw[col].null_count() / missing_raw.height * 100\n", + " c_null_pct = complete_raw[col].null_count() / complete_raw.height * 100\n", + " if m_vals != c_vals or abs(m_null_pct - c_null_pct) > 20:\n", + " print(f' {col}: Missing vals={m_vals[:5]}, Complete vals={c_vals[:5]}')\n", + " print(f' Missing null%={m_null_pct:.0f}%, Complete null%={c_null_pct:.0f}%')" + ] + }, + { + "cell_type": "markdown", + "id": "3934eb4c", + "metadata": {}, + "source": [ + "---\n", + "## 11. QSF Survey Flow Inspection\n", + "\n", + "Check the QSF for skip logic or display logic on QID98." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "031a0f8b", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "\n", + "# Inspect QID98 question config in QSF\n", + "try:\n", + " qid98_cfg = S._get_qsf_question_by_QID('QID98')\n", + " payload = qid98_cfg.get('Payload', {})\n", + " \n", + " print('QID98 Question Type:', payload.get('QuestionType'))\n", + " print('QID98 Selector:', payload.get('Selector'))\n", + " print('QID98 QuestionText:', payload.get('QuestionText', '')[:200])\n", + " \n", + " # Check for display logic\n", + " display_logic = payload.get('DisplayLogic')\n", + " print(f'\\nDisplay Logic: {json.dumps(display_logic, indent=2) if display_logic else \"None\"}')\n", + " \n", + " # Check for skip logic\n", + " skip_logic = payload.get('SkipLogic')\n", + " print(f'\\nSkip Logic: {json.dumps(skip_logic, indent=2) if skip_logic else \"None\"}')\n", + " \n", + " # Check DynamicChoices (since this question uses carry-forward from QID36)\n", + " dyn_choices = payload.get('DynamicChoices')\n", + " print(f'\\nDynamic Choices: {json.dumps(dyn_choices, indent=2) if dyn_choices else \"None\"}')\n", + " \n", + " # Check validation settings\n", + " validation = payload.get('Validation')\n", + " print(f'\\nValidation: {json.dumps(validation, indent=2) if validation else \"None\"}')\n", 
+ " \n", + "except Exception as e:\n", + " print(f'Error inspecting QID98: {e}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c6d157a5", + "metadata": {}, + "outputs": [], + "source": [ + "# Also check QID36 (the parent question) for any relevant logic\n", + "try:\n", + " qid36_cfg = S._get_qsf_question_by_QID('QID36')\n", + " payload36 = qid36_cfg.get('Payload', {})\n", + " \n", + " print('QID36 Question Type:', payload36.get('QuestionType'))\n", + " print('QID36 Selector:', payload36.get('Selector'))\n", + " \n", + " # Check for any branching or flow-related settings\n", + " print(f'\\nQID36 GradingData: {payload36.get(\"GradingData\")}')\n", + " print(f'QID36 GroupChoiceRandomization: {payload36.get(\"GroupChoiceRandomization\")}')\n", + " print(f'QID36 Randomization: {payload36.get(\"Randomization\")}')\n", + " \n", + " # Check the flow element for QID98\n", + " flow = S.qsf.get('SurveyFlow', S.qsf.get('SurveyElements', []))\n", + " # Search for QID98 in flow elements\n", + " def find_in_flow(elements, target_qid, path=''):\n", + " results = []\n", + " if isinstance(elements, dict):\n", + " if elements.get('PrimaryAttribute') == target_qid or elements.get('QuestionID') == target_qid:\n", + " results.append((path, elements))\n", + " for k, v in elements.items():\n", + " results.extend(find_in_flow(v, target_qid, f'{path}.{k}'))\n", + " elif isinstance(elements, list):\n", + " for i, elem in enumerate(elements):\n", + " results.extend(find_in_flow(elem, target_qid, f'{path}[{i}]'))\n", + " return results\n", + " \n", + " qid98_in_flow = find_in_flow(S.qsf, 'QID98')\n", + " print(f'\\nQID98 appears in QSF structure {len(qid98_in_flow)} time(s):')\n", + " for path, elem in qid98_in_flow:\n", + " print(f' Path: {path}')\n", + " print(f' Element: {json.dumps(elem, indent=2)[:300]}')\n", + " \n", + "except Exception as e:\n", + " print(f'Error: {e}')" + ] + }, + { + "cell_type": "markdown", + "id": "451a0621", + "metadata": {}, + "source": 
[ + "---\n", + "## 12. Summary\n", + "\n", + "Collect all findings." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "41f8c49c", + "metadata": {}, + "outputs": [], + "source": [ + "print('=' * 70)\n", + "print('SUMMARY: 38 Respondents Missing Voice Ranking (QID98)')\n", + "print('=' * 70)\n", + "print(f'''\n", + "Total respondents: {n_total}\n", + "Missing QID98: {n_missing} ({n_missing/n_total:.1%})\n", + "Complete: {n_total - n_missing} ({(n_total - n_missing)/n_total:.1%})\n", + "\n", + "Key observations:\n", + "- All 38 completed the survey (progress = 100, finished = True)\n", + "- All 38 have QID36_G0_*_RANK (selection order) data — they DID pick 3 voices\n", + "- All 18 QID98_* columns are null for these 38 respondents\n", + "- The QID98 question carries forward from QID36 via DynamicChoices\n", + "\n", + "See analysis sections above for:\n", + " 1. Duration & timestamp patterns\n", + " 2. Demographic profile comparison\n", + " 3. Which questions were answered/skipped\n", + " 4. Voice selection patterns\n", + " 5. Randomization set (A vs B)\n", + " 6. Character ranking differences\n", + " 7. Voice 1-10 scale comparison\n", + " 8. Straight-liner rates\n", + " 9. Raw QID98 data verification\n", + " 10. Distribution channel / metadata\n", + " 11. 
QSF skip/display logic on QID98\n", + "''')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "phase-3-quant", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/plots.py b/plots.py index f26e216..52a7d7e 100644 --- a/plots.py +++ b/plots.py @@ -96,6 +96,11 @@ class QualtricsPlotsMixin: sl_threshold = getattr(self, 'sl_threshold', None) if sl_threshold is not None: parts.append(f"SL-gte{sl_threshold}") + + # Append voice ranking filter if set + vr_filter = getattr(self, 'voice_ranking_filter', None) + if vr_filter is not None: + parts.append(f"VR-{vr_filter}") if not parts: return "All_Respondents" @@ -191,6 +196,15 @@ class QualtricsPlotsMixin: sl_threshold = getattr(self, 'sl_threshold', None) if sl_threshold is not None: parts.append(f"STRAIGHT-LINER EXCL: ≥{sl_threshold} question groups") + + # Append voice ranking filter if set + vr_filter = getattr(self, 'voice_ranking_filter', None) + if vr_filter is not None: + vr_labels = { + 'only-missing': 'ONLY respondents missing voice ranking (QID98)', + 'exclude-missing': 'EXCLUDING respondents missing voice ranking (QID98)', + } + parts.append(f"VOICE RANKING: {vr_labels.get(vr_filter, vr_filter)}") if not parts: # No filters active - return just sample size (or empty string if no sample size) diff --git a/run_filter_combinations.py b/run_filter_combinations.py index 012a7ff..3c63b85 100644 --- a/run_filter_combinations.py +++ b/run_filter_combinations.py @@ -179,10 +179,25 @@ def get_filter_combinations(survey: QualtricsSurvey, category: str = None) -> li 'filters': {'industry': [industry]} }) + # Voice ranking completeness filter + # These use a special flag rather than demographic filters, 
so we store + # the mode in a dedicated key that run_report passes as --voice-ranking-filter. + if not category or category in ['all_filters', 'voice_ranking']: + combinations.append({ + 'name': 'VoiceRanking-OnlyMissing', + 'filters': {}, + 'voice_ranking_filter': 'only-missing', + }) + combinations.append({ + 'name': 'VoiceRanking-ExcludeMissing', + 'filters': {}, + 'voice_ranking_filter': 'exclude-missing', + }) + return combinations -def run_report(filters: dict, name: str = None, dry_run: bool = False, sl_threshold: int = None) -> bool: +def run_report(filters: dict, name: str = None, dry_run: bool = False, sl_threshold: int = None, voice_ranking_filter: str = None) -> bool: """ Run the report script with given filters. @@ -191,6 +206,9 @@ def run_report(filters: dict, name: str = None, dry_run: bool = False, sl_thresh name: Name for this filter combination (used for .txt description file) dry_run: If True, just print command without running sl_threshold: If set, exclude respondents with >= N straight-lined question groups + voice_ranking_filter: If set, filter by voice ranking completeness. + 'only-missing' keeps only respondents missing QID98 data, + 'exclude-missing' removes them. 
Returns: True if successful, False otherwise @@ -205,6 +223,10 @@ def run_report(filters: dict, name: str = None, dry_run: bool = False, sl_thresh if sl_threshold is not None: cmd.extend(['--sl-threshold', str(sl_threshold)]) + # Pass voice ranking filter if specified + if voice_ranking_filter is not None: + cmd.extend(['--voice-ranking-filter', voice_ranking_filter]) + for filter_name, values in filters.items(): if values: cmd.extend([f'--{filter_name}', json.dumps(values)]) @@ -235,7 +257,7 @@ def main(): parser.add_argument('--dry-run', action='store_true', help='Preview combinations without running') parser.add_argument( '--category', - choices=['all_filters', 'all', 'age', 'gender', 'ethnicity', 'income', 'consumer', 'business_owner', 'ai_user', 'investable_assets', 'industry'], + choices=['all_filters', 'all', 'age', 'gender', 'ethnicity', 'income', 'consumer', 'business_owner', 'ai_user', 'investable_assets', 'industry', 'voice_ranking'], default='all_filters', help='Filter category to run combinations for (default: all_filters)' ) @@ -259,7 +281,7 @@ def main(): print("\nDRY RUN - Commands that would be executed:") for combo in combinations: print(f"\n{combo['name']}:") - run_report(combo['filters'], name=combo['name'], dry_run=True, sl_threshold=args.sl_threshold) + run_report(combo['filters'], name=combo['name'], dry_run=True, sl_threshold=args.sl_threshold, voice_ranking_filter=combo.get('voice_ranking_filter')) return # Run each combination with progress bar @@ -268,7 +290,7 @@ def main(): for combo in tqdm(combinations, desc="Running reports", unit="filter"): tqdm.write(f"Running: {combo['name']}") - if run_report(combo['filters'], name=combo['name'], sl_threshold=args.sl_threshold): + if run_report(combo['filters'], name=combo['name'], sl_threshold=args.sl_threshold, voice_ranking_filter=combo.get('voice_ranking_filter')): successful += 1 else: failed.append(combo['name']) diff --git a/utils.py b/utils.py index 26eb183..06cf1f3 100644 --- a/utils.py 
+++ b/utils.py @@ -1115,6 +1115,60 @@ class QualtricsSurvey(QualtricsPlotsMixin): return self._get_subset(q, list(QIDs_map.keys()), rename_cols=False).rename(QIDs_map), None + def get_top_3_voices_missing_ranking( + self, q: pl.LazyFrame + ) -> pl.DataFrame: + """Identify respondents who completed the top-3 voice selection (QID36) + but are missing the explicit ranking question (QID98). + + These respondents picked 3 voices in the selection step and have + selection-order data in ``QID36_G0_*_RANK``, but all 18 ``QID98_*`` + ranking columns are null. This means ``get_top_3_voices()`` will + return all-null rows for them, causing plots like + ``plot_most_ranked_1`` to undercount. + + Parameters: + q: The (optionally filtered) LazyFrame from ``load_data()``. + + Returns: + A collected ``pl.DataFrame`` with columns: + + - ``_recordId`` – the respondent identifier + - ``3_Ranked`` – comma-separated text of the 3 voices they selected + (QID36 selection-order values are not returned: they record the order + of selection, *not* preference ranks.) + """ + # Get the top-3 ranking data (QID98-based) + top3, _ = self.get_top_3_voices(q) + top3_df = top3.collect() + + ranking_cols = [c for c in top3_df.columns if c != '_recordId'] + + # Respondents where every QID98 ranking column is null + all_null_expr = pl.lit(True) + for col in ranking_cols: + all_null_expr = all_null_expr & pl.col(col).is_null() + + missing_ids = top3_df.filter(all_null_expr).select('_recordId') + + if missing_ids.height == 0: + return pl.DataFrame(schema={ + '_recordId': pl.Utf8, + '3_Ranked': pl.Utf8, + }) + + # Enrich with the 3_Ranked text from the 18→8→3 question + v_18_8_3, _ = self.get_18_8_3(q) + v_df = v_18_8_3.collect() + + result = missing_ids.join( + v_df.select(['_recordId', '3_Ranked']), + on='_recordId', + how='left', + ) + + return result + def get_ss_orange_red(self, q: pl.LazyFrame) -> Union[pl.LazyFrame, dict]: """Extract columns containing the SS Orange/Red
ratings for the Chase virtual assistant.