From 9dfab7592505d648459063f44dc186ebc09c77ad Mon Sep 17 00:00:00 2001
From: Luigi Maiorano <luigi.maiorano@qumo.io>
Date: Tue, 10 Feb 2026 14:24:26 +0100
Subject: [PATCH] missing data analysis

---
 XX_quant_report.script.py            |   27 +-
 analysis_missing_voice_ranking.ipynb | 1359 ++++++++++++++++++++++++++
 plots.py                             |   14 +
 run_filter_combinations.py           |   30 +-
 utils.py                             |   54 +
 5 files changed, 1477 insertions(+), 7 deletions(-)
 create mode 100644 analysis_missing_voice_ranking.ipynb

diff --git a/XX_quant_report.script.py b/XX_quant_report.script.py
index 402d42f..438de1a 100644
--- a/XX_quant_report.script.py
+++ b/XX_quant_report.script.py
@@ -16,8 +16,8 @@ from speaking_styles import SPEAKING_STYLES
 
 # %% Fixed Variables
 
-# RESULTS_FILE = 'data/exports/2-4-26/JPMC_Chase Brand Personality_Quant Round 1_February 4, 2026_Labels.csv'
-RESULTS_FILE = 'data/exports/debug/JPMC_Chase Brand Personality_Quant Round 1_February 2, 2026_Labels.csv'
+RESULTS_FILE = 'data/exports/2-4-26/JPMC_Chase Brand Personality_Quant Round 1_February 4, 2026_Labels.csv'
+# RESULTS_FILE = 'data/exports/debug/JPMC_Chase Brand Personality_Quant Round 1_February 2, 2026_Labels.csv'
 QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'
 
 
@@ -51,6 +51,7 @@ def parse_cli_args():
     parser.add_argument('--figures-dir', type=str, default=f'figures/{Path(RESULTS_FILE).parts[2]}', help='Override the default figures directory')
     parser.add_argument('--best-character', type=str, default="the_coach", help='Slug of the best chosen character (default: "the_coach")')
     parser.add_argument('--sl-threshold', type=int, default=None, help='Exclude respondents who straight-lined >= N question groups (e.g. 3 removes anyone with 3+ straight-lined groups)')
+    parser.add_argument('--voice-ranking-filter', type=str, default=None, choices=['only-missing', 'exclude-missing'], help='Filter by voice ranking completeness: "only-missing" keeps only respondents missing QID98 ranking data, "exclude-missing" removes them')
     
     # Only parse if running as script (not in Jupyter/interactive)
     try:
@@ -58,7 +59,7 @@ def parse_cli_args():
         get_ipython()  # noqa: F821 # type: ignore
         # Return namespace with all filters set to None
         no_filters = {f: None for f in FILTER_CONFIG}
-        return argparse.Namespace(**no_filters, filter_name=None, figures_dir=f'figures/statistical_significance/{Path(RESULTS_FILE).parts[2]}', best_character="the_coach", sl_threshold=None)
+        return argparse.Namespace(**no_filters, filter_name=None, figures_dir=f'figures/statistical_significance/{Path(RESULTS_FILE).parts[2]}', best_character="the_coach", sl_threshold=None, voice_ranking_filter=None)
     except NameError:
         args = parser.parse_args()
         # Parse JSON strings to lists
@@ -174,6 +175,26 @@ if cli_args.sl_threshold is not None:
     else:
         print("  No straight-liners detected — no respondents removed.")
 
+# %% Apply voice-ranking completeness filter (if specified)
+# Keeps only / excludes respondents who are missing the explicit voice
+# ranking question (QID98) despite completing the top-3 selection (QID36).
+if cli_args.voice_ranking_filter is not None:
+    S.voice_ranking_filter = cli_args.voice_ranking_filter  # Store on Survey so filter slug/description include it
+    _vr_missing = S.get_top_3_voices_missing_ranking(_d)
+    _vr_missing_ids = _vr_missing.select('_recordId')
+    _n_before = _d.select(pl.len()).collect().item()
+
+    if cli_args.voice_ranking_filter == 'only-missing':
+        print(f"Voice ranking filter: keeping ONLY respondents missing QID98 ranking data...")
+        _d = _d.collect().join(_vr_missing_ids, on='_recordId', how='inner').lazy()
+    elif cli_args.voice_ranking_filter == 'exclude-missing':
+        print(f"Voice ranking filter: EXCLUDING respondents missing QID98 ranking data...")
+        _d = _d.collect().join(_vr_missing_ids, on='_recordId', how='anti').lazy()
+
+    S.data_filtered = _d
+    _n_after = _d.select(pl.len()).collect().item()
+    print(f"  {_n_before} → {_n_after} respondents ({_vr_missing_ids.height} missing ranking data)")
+
 # Save to logical variable name for further analysis
 data = _d
 data.collect()
diff --git a/analysis_missing_voice_ranking.ipynb b/analysis_missing_voice_ranking.ipynb
new file mode 100644
index 0000000..2d6a2c9
--- /dev/null
+++ b/analysis_missing_voice_ranking.ipynb
@@ -0,0 +1,1359 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "1ee02624",
+   "metadata": {},
+   "source": [
+    "# Analysis: 38 Respondents Missing Voice Ranking Data (QID98)\n",
+    "\n",
+    "**Context:** 38 out of 455 respondents completed the \"pick top 3 from 8 voices\" step (QID36) but have **all-null** data for the explicit ranking question (QID98). This notebook investigates their demographics, survey metadata, and response patterns to identify commonalities.\n",
+    "\n",
+    "**Data source:** `2-4-26` export"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "bd6df059",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Total respondents: 455\n"
+     ]
+    }
+   ],
+   "source": [
+    "import polars as pl\n",
+    "import altair as alt\n",
+    "from utils import QualtricsSurvey\n",
+    "from validation import check_straight_liners\n",
+    "from reference import VOICE_GENDER_MAPPING\n",
+    "\n",
+    "RESULTS_FILE = 'data/exports/2-4-26/JPMC_Chase Brand Personality_Quant Round 1_February 4, 2026_Labels.csv'\n",
+    "QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'\n",
+    "\n",
+    "S = QualtricsSurvey(RESULTS_FILE, QSF_FILE, figures_dir=None)\n",
+    "data_all = S.load_data()\n",
+    "data = S.filter_data(data_all)\n",
+    "\n",
+    "n_total = data.select(pl.len()).collect().item()\n",
+    "print(f'Total respondents: {n_total}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "46afc18b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Respondents missing voice ranking (QID98): 38\n",
+      "Respondents with complete data: 417\n",
+      "Missing rate: 8.4%\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Identify the 38 missing respondents\n",
+    "missing = S.get_top_3_voices_missing_ranking(data)\n",
+    "missing_ids = missing.select('_recordId')\n",
+    "n_missing = missing_ids.height\n",
+    "\n",
+    "print(f'Respondents missing voice ranking (QID98): {n_missing}')\n",
+    "print(f'Respondents with complete data: {n_total - n_missing}')\n",
+    "print(f'Missing rate: {n_missing / n_total:.1%}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "990eb4b4",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/tmp/ipykernel_1673001/2213083878.py:3: DeprecationWarning: `is_in` with a collection of the same datatype is ambiguous and deprecated.\n",
+      "Please use `implode` to return to previous behavior.\n",
+      "\n",
+      "See https://github.com/pola-rs/polars/issues/22149 for more information.\n",
+      "  raw = raw.with_columns(\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Build a column to tag respondents as missing / complete\n",
+    "raw = data.collect()\n",
+    "raw = raw.with_columns(\n",
+    "    pl.when(pl.col('_recordId').is_in(missing_ids['_recordId']))\n",
+    "    .then(pl.lit('Missing QID98'))\n",
+    "    .otherwise(pl.lit('Complete'))\n",
+    "    .alias('ranking_status')\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3d81b1b6",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## 1. Survey Metadata Analysis\n",
+    "\n",
+    "Check timestamps, duration, progress, and platform info to see if the 38 share a common survey-taking pattern."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "be18a52c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Available metadata columns: ['startDate', 'endDate', 'duration', 'progress', 'finished', 'userLanguage', 'distributionChannel']\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Check which metadata columns exist\n",
+    "meta_candidates = ['startDate', 'endDate', 'duration', 'progress', 'finished',\n",
+    "                   'Status', 'userLanguage', 'distributionChannel', 'RecipientEmail']\n",
+    "available_meta = [c for c in meta_candidates if c in raw.columns]\n",
+    "print(f'Available metadata columns: {available_meta}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "66e3060f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Duration comparison:\n",
+      "shape: (2, 9)\n",
+      "┌─────────────┬─────────────┬────────────┬─────────────┬───┬─────────┬─────┬──────────┬────────────┐\n",
+      "│ ranking_sta ┆ mean_sec    ┆ median_sec ┆ std_sec     ┆ … ┆ max_sec ┆ n   ┆ mean_min ┆ median_min │\n",
+      "│ tus         ┆ ---         ┆ ---        ┆ ---         ┆   ┆ ---     ┆ --- ┆ ---      ┆ ---        │\n",
+      "│ ---         ┆ f64         ┆ f64        ┆ f64         ┆   ┆ f64     ┆ u32 ┆ f64      ┆ f64        │\n",
+      "│ str         ┆             ┆            ┆             ┆   ┆         ┆     ┆          ┆            │\n",
+      "╞═════════════╪═════════════╪════════════╪═════════════╪═══╪═════════╪═════╪══════════╪════════════╡\n",
+      "│ Complete    ┆ 1634.642686 ┆ 1161.0     ┆ 2072.480875 ┆ … ┆ 21761.0 ┆ 417 ┆ 27.2     ┆ 19.4       │\n",
+      "│ Missing     ┆ 1152.078947 ┆ 871.5      ┆ 852.206211  ┆ … ┆ 4876.0  ┆ 38  ┆ 19.2     ┆ 14.5       │\n",
+      "│ QID98       ┆             ┆            ┆             ┆   ┆         ┆     ┆          ┆            │\n",
+      "└─────────────┴─────────────┴────────────┴─────────────┴───┴─────────┴─────┴──────────┴────────────┘\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Duration comparison\n",
+    "if 'duration' in raw.columns:\n",
+    "    duration_stats = (\n",
+    "        raw.group_by('ranking_status')\n",
+    "        .agg(\n",
+    "            pl.col('duration').cast(pl.Float64).mean().alias('mean_sec'),\n",
+    "            pl.col('duration').cast(pl.Float64).median().alias('median_sec'),\n",
+    "            pl.col('duration').cast(pl.Float64).std().alias('std_sec'),\n",
+    "            pl.col('duration').cast(pl.Float64).min().alias('min_sec'),\n",
+    "            pl.col('duration').cast(pl.Float64).max().alias('max_sec'),\n",
+    "            pl.len().alias('n'),\n",
+    "        )\n",
+    "        .with_columns(\n",
+    "            (pl.col('mean_sec') / 60).round(1).alias('mean_min'),\n",
+    "            (pl.col('median_sec') / 60).round(1).alias('median_min'),\n",
+    "        )\n",
+    "    )\n",
+    "    print('Duration comparison:')\n",
+    "    print(duration_stats)\n",
+    "    \n",
+    "    # Histogram\n",
+    "    chart = alt.Chart(raw.select(['duration', 'ranking_status']).to_pandas()).mark_bar(\n",
+    "        opacity=0.6\n",
+    "    ).encode(\n",
+    "        x=alt.X('duration:Q', bin=alt.Bin(maxbins=40), title='Duration (seconds)'),\n",
+    "        y=alt.Y('count():Q', title='Count'),\n",
+    "        color='ranking_status:N',\n",
+    "    ).properties(width=700, height=300, title='Survey Duration Distribution')\n",
+    "    chart"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "f0bd2b0f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Progress stats by group:\n",
+      "shape: (2, 3)\n",
+      "┌────────────────┬───────────────┬──────────────┐\n",
+      "│ ranking_status ┆ mean_progress ┆ min_progress │\n",
+      "│ ---            ┆ ---           ┆ ---          │\n",
+      "│ str            ┆ f64           ┆ f64          │\n",
+      "╞════════════════╪═══════════════╪══════════════╡\n",
+      "│ Complete       ┆ 100.0         ┆ 100.0        │\n",
+      "│ Missing QID98  ┆ 100.0         ┆ 100.0        │\n",
+      "└────────────────┴───────────────┴──────────────┘\n",
+      "\n",
+      "Finished status:\n",
+      "shape: (2, 3)\n",
+      "┌────────────────┬──────────┬─────┐\n",
+      "│ ranking_status ┆ finished ┆ n   │\n",
+      "│ ---            ┆ ---      ┆ --- │\n",
+      "│ str            ┆ bool     ┆ u32 │\n",
+      "╞════════════════╪══════════╪═════╡\n",
+      "│ Complete       ┆ true     ┆ 417 │\n",
+      "│ Missing QID98  ┆ true     ┆ 38  │\n",
+      "└────────────────┴──────────┴─────┘\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Progress and finished status\n",
+    "if 'progress' in raw.columns:\n",
+    "    progress_by_status = (\n",
+    "        raw.group_by('ranking_status')\n",
+    "        .agg(\n",
+    "            pl.col('progress').cast(pl.Float64).mean().alias('mean_progress'),\n",
+    "            pl.col('progress').cast(pl.Float64).min().alias('min_progress'),\n",
+    "            pl.col('progress').value_counts().alias('progress_dist'),\n",
+    "        )\n",
+    "    )\n",
+    "    print('Progress stats by group:')\n",
+    "    print(progress_by_status.select(['ranking_status', 'mean_progress', 'min_progress']))\n",
+    "\n",
+    "if 'finished' in raw.columns:\n",
+    "    finished_by_status = raw.group_by(['ranking_status', 'finished']).agg(pl.len().alias('n'))\n",
+    "    print('\\nFinished status:')\n",
+    "    print(finished_by_status.sort('ranking_status'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "0bfdb060",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Timestamp ranges by group:\n",
+      "shape: (2, 4)\n",
+      "┌────────────────┬─────────────────────┬─────────────────────┬─────────────────────┐\n",
+      "│ ranking_status ┆ earliest            ┆ latest              ┆ median_start        │\n",
+      "│ ---            ┆ ---                 ┆ ---                 ┆ ---                 │\n",
+      "│ str            ┆ datetime[μs]        ┆ datetime[μs]        ┆ datetime[μs]        │\n",
+      "╞════════════════╪═════════════════════╪═════════════════════╪═════════════════════╡\n",
+      "│ Missing QID98  ┆ 2026-01-20 16:16:18 ┆ 2026-01-30 12:43:07 ┆ 2026-01-27 23:30:15 │\n",
+      "│ Complete       ┆ 2026-01-20 15:00:44 ┆ 2026-01-30 23:53:40 ┆ 2026-01-28 05:46:12 │\n",
+      "└────────────────┴─────────────────────┴─────────────────────┴─────────────────────┘\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Timestamps: check if the missing respondents cluster in a specific time window\n",
+    "if 'startDate' in raw.columns:\n",
+    "    ts = raw.select(['_recordId', 'ranking_status', 'startDate', 'endDate']).with_columns(\n",
+    "        pl.col('startDate').str.to_datetime('%Y-%m-%d %H:%M:%S', strict=False).alias('start_dt'),\n",
+    "        pl.col('endDate').str.to_datetime('%Y-%m-%d %H:%M:%S', strict=False).alias('end_dt'),\n",
+    "    )\n",
+    "    \n",
+    "    # Time range per group\n",
+    "    time_stats = ts.group_by('ranking_status').agg(\n",
+    "        pl.col('start_dt').min().alias('earliest'),\n",
+    "        pl.col('start_dt').max().alias('latest'),\n",
+    "        pl.col('start_dt').median().alias('median_start'),\n",
+    "    )\n",
+    "    print('Timestamp ranges by group:')\n",
+    "    print(time_stats)\n",
+    "    \n",
+    "    # Timeline scatter plot\n",
+    "    chart = alt.Chart(ts.to_pandas()).mark_circle(size=40).encode(\n",
+    "        x=alt.X('start_dt:T', title='Survey Start Time'),\n",
+    "        y=alt.Y('ranking_status:N', title=''),\n",
+    "        color='ranking_status:N',\n",
+    "        tooltip=['_recordId', 'start_dt:T', 'end_dt:T'],\n",
+    "    ).properties(width=700, height=150, title='Survey Start Times')\n",
+    "    chart"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d81aede5",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## 2. Demographic Profile Comparison\n",
+    "\n",
+    "Compare age, gender, ethnicity, consumer segment, income, etc. between the two groups."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "cad1b204",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Demographic columns: ['Age', 'Gender', 'Race/Ethnicity', 'Bussiness_Owner', 'Business_Revenue', 'Employment', 'Personal_Products', 'Income', 'Investable_Assets', 'Industry', 'Consumer']\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Get demographics and join with ranking status\n",
+    "demos_df = S.get_demographics(data)[0].collect()\n",
+    "demos_df = demos_df.join(\n",
+    "    raw.select(['_recordId', 'ranking_status']),\n",
+    "    on='_recordId'\n",
+    ")\n",
+    "\n",
+    "demo_cols = [c for c in demos_df.columns if c not in ('_recordId', 'ranking_status')]\n",
+    "print(f'Demographic columns: {demo_cols}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "65fdfd6b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "============================================================\n",
+      "Age  (% within each group)\n",
+      "============================================================\n",
+      "shape: (8, 4)\n",
+      "┌──────────────────┬───────────────┬──────────┬───────────┐\n",
+      "│ Age              ┆ Missing QID98 ┆ Complete ┆ diff_ppts │\n",
+      "│ ---              ┆ ---           ┆ ---      ┆ ---       │\n",
+      "│ str              ┆ f64           ┆ f64      ┆ f64       │\n",
+      "╞══════════════════╪═══════════════╪══════════╪═══════════╡\n",
+      "│ 18 to 21 years   ┆ 15.8          ┆ 14.9     ┆ 0.9       │\n",
+      "│ 22 to 24 years   ┆ 13.2          ┆ 19.2     ┆ -6.0      │\n",
+      "│ 25 to 34 years   ┆ 7.9           ┆ 17.0     ┆ -9.1      │\n",
+      "│ 35 to 40 years   ┆ 23.7          ┆ 13.2     ┆ 10.5      │\n",
+      "│ 41 to 50 years   ┆ 34.2          ┆ 16.8     ┆ 17.4      │\n",
+      "│ 51 to 59 years   ┆ 2.6           ┆ 6.0      ┆ -3.4      │\n",
+      "│ 60 to 70 years   ┆ null          ┆ 8.2      ┆ -8.2      │\n",
+      "│ 70 years or more ┆ 2.6           ┆ 4.8      ┆ -2.2      │\n",
+      "└──────────────────┴───────────────┴──────────┴───────────┘\n",
+      "\n",
+      "============================================================\n",
+      "Gender  (% within each group)\n",
+      "============================================================\n",
+      "shape: (3, 4)\n",
+      "┌───────────────────┬──────────┬───────────────┬───────────┐\n",
+      "│ Gender            ┆ Complete ┆ Missing QID98 ┆ diff_ppts │\n",
+      "│ ---               ┆ ---      ┆ ---           ┆ ---       │\n",
+      "│ str               ┆ f64      ┆ f64           ┆ f64       │\n",
+      "╞═══════════════════╪══════════╪═══════════════╪═══════════╡\n",
+      "│ Man               ┆ 59.2     ┆ 65.8          ┆ 6.6       │\n",
+      "│ Prefer not to say ┆ 0.2      ┆ null          ┆ -0.2      │\n",
+      "│ Woman             ┆ 40.5     ┆ 34.2          ┆ -6.3      │\n",
+      "└───────────────────┴──────────┴───────────────┴───────────┘\n",
+      "\n",
+      "============================================================\n",
+      "Race/Ethnicity  (% within each group)\n",
+      "============================================================\n",
+      "shape: (21, 4)\n",
+      "┌─────────────────────────────────┬──────────┬───────────────┬───────────┐\n",
+      "│ Race/Ethnicity                  ┆ Complete ┆ Missing QID98 ┆ diff_ppts │\n",
+      "│ ---                             ┆ ---      ┆ ---           ┆ ---       │\n",
+      "│ str                             ┆ f64      ┆ f64           ┆ f64       │\n",
+      "╞═════════════════════════════════╪══════════╪═══════════════╪═══════════╡\n",
+      "│ Alaska Native or Indigenous Am… ┆ 0.7      ┆ null          ┆ -0.7      │\n",
+      "│ Alaska Native or Indigenous Am… ┆ 0.2      ┆ null          ┆ -0.2      │\n",
+      "│ Alaska Native or Indigenous Am… ┆ 0.5      ┆ null          ┆ -0.5      │\n",
+      "│ Alaska Native or Indigenous Am… ┆ 0.5      ┆ null          ┆ -0.5      │\n",
+      "│ Alaska Native or Indigenous Am… ┆ 0.5      ┆ null          ┆ -0.5      │\n",
+      "│ …                               ┆ …        ┆ …             ┆ …         │\n",
+      "│ Hispanic or Latinx              ┆ 7.4      ┆ 7.9           ┆ 0.5       │\n",
+      "│ Hispanic or Latinx,White or Ca… ┆ 6.0      ┆ null          ┆ -6.0      │\n",
+      "│ Middle Eastern or North Africa… ┆ 0.2      ┆ null          ┆ -0.2      │\n",
+      "│ Native Hawaiian or Other Pacif… ┆ 0.5      ┆ null          ┆ -0.5      │\n",
+      "│ White or Caucasian              ┆ 50.8     ┆ 52.6          ┆ 1.8       │\n",
+      "└─────────────────────────────────┴──────────┴───────────────┴───────────┘\n",
+      "\n",
+      "============================================================\n",
+      "Bussiness_Owner  (% within each group)\n",
+      "============================================================\n",
+      "shape: (2, 4)\n",
+      "┌─────────────────┬───────────────┬──────────┬───────────┐\n",
+      "│ Bussiness_Owner ┆ Missing QID98 ┆ Complete ┆ diff_ppts │\n",
+      "│ ---             ┆ ---           ┆ ---      ┆ ---       │\n",
+      "│ str             ┆ f64           ┆ f64      ┆ f64       │\n",
+      "╞═════════════════╪═══════════════╪══════════╪═══════════╡\n",
+      "│ No              ┆ 73.5          ┆ 72.8     ┆ 0.7       │\n",
+      "│ Yes             ┆ 26.5          ┆ 27.2     ┆ -0.7      │\n",
+      "└─────────────────┴───────────────┴──────────┴───────────┘\n",
+      "\n",
+      "============================================================\n",
+      "Business_Revenue  (% within each group)\n",
+      "============================================================\n",
+      "shape: (6, 4)\n",
+      "┌──────────────────────────────┬──────────┬───────────────┬───────────┐\n",
+      "│ Business_Revenue             ┆ Complete ┆ Missing QID98 ┆ diff_ppts │\n",
+      "│ ---                          ┆ ---      ┆ ---           ┆ ---       │\n",
+      "│ str                          ┆ f64      ┆ f64           ┆ f64       │\n",
+      "╞══════════════════════════════╪══════════╪═══════════════╪═══════════╡\n",
+      "│ $1 million to $4.9 million   ┆ 27.7     ┆ 66.7          ┆ 39.0      │\n",
+      "│ $10 million to $19.9 million ┆ 9.6      ┆ null          ┆ -9.6      │\n",
+      "│ $100,00 to $249,999          ┆ 13.3     ┆ null          ┆ -13.3     │\n",
+      "│ $250,000 to $499,999         ┆ 10.8     ┆ 11.1          ┆ 0.3       │\n",
+      "│ $5 million to $9.9 million   ┆ 14.5     ┆ null          ┆ -14.5     │\n",
+      "│ $500,00 to $999,999          ┆ 24.1     ┆ 22.2          ┆ -1.9      │\n",
+      "└──────────────────────────────┴──────────┴───────────────┴───────────┘\n",
+      "\n",
+      "============================================================\n",
+      "Employment  (% within each group)\n",
+      "============================================================\n",
+      "shape: (0, 1)\n",
+      "┌────────────┐\n",
+      "│ Employment │\n",
+      "│ ---        │\n",
+      "│ str        │\n",
+      "╞════════════╡\n",
+      "└────────────┘\n",
+      "\n",
+      "============================================================\n",
+      "Personal_Products  (% within each group)\n",
+      "============================================================\n",
+      "shape: (0, 1)\n",
+      "┌───────────────────┐\n",
+      "│ Personal_Products │\n",
+      "│ ---               │\n",
+      "│ str               │\n",
+      "╞═══════════════════╡\n",
+      "└───────────────────┘\n",
+      "\n",
+      "============================================================\n",
+      "Income  (% within each group)\n",
+      "============================================================\n",
+      "shape: (8, 4)\n",
+      "┌──────────────────────┬──────────┬───────────────┬───────────┐\n",
+      "│ Income               ┆ Complete ┆ Missing QID98 ┆ diff_ppts │\n",
+      "│ ---                  ┆ ---      ┆ ---           ┆ ---       │\n",
+      "│ str                  ┆ f64      ┆ f64           ┆ f64       │\n",
+      "╞══════════════════════╪══════════╪═══════════════╪═══════════╡\n",
+      "│ $100,000 to $149,999 ┆ 16.3     ┆ 15.8          ┆ -0.5      │\n",
+      "│ $150,000 to $199,999 ┆ 10.8     ┆ 7.9           ┆ -2.9      │\n",
+      "│ $200,000 or more     ┆ 3.6      ┆ 10.5          ┆ 6.9       │\n",
+      "│ $25,000 to $34,999   ┆ 11.3     ┆ 13.2          ┆ 1.9       │\n",
+      "│ $35,000 to $54,999   ┆ 14.1     ┆ 15.8          ┆ 1.7       │\n",
+      "│ $55,000 to $79,999   ┆ 18.0     ┆ 10.5          ┆ -7.5      │\n",
+      "│ $80,000 to $99,999   ┆ 12.0     ┆ 21.1          ┆ 9.1       │\n",
+      "│ Less than $25,000    ┆ 13.9     ┆ 5.3           ┆ -8.6      │\n",
+      "└──────────────────────┴──────────┴───────────────┴───────────┘\n",
+      "\n",
+      "============================================================\n",
+      "Investable_Assets  (% within each group)\n",
+      "============================================================\n",
+      "shape: (9, 4)\n",
+      "┌──────────────────────┬───────────────┬──────────┬───────────┐\n",
+      "│ Investable_Assets    ┆ Missing QID98 ┆ Complete ┆ diff_ppts │\n",
+      "│ ---                  ┆ ---           ┆ ---      ┆ ---       │\n",
+      "│ str                  ┆ f64           ┆ f64      ┆ f64       │\n",
+      "╞══════════════════════╪═══════════════╪══════════╪═══════════╡\n",
+      "│ $0 to $24,999        ┆ 23.7          ┆ 21.1     ┆ 2.6       │\n",
+      "│ $150,000 to $249,999 ┆ 5.3           ┆ 12.0     ┆ -6.7      │\n",
+      "│ $1M to $4.9M         ┆ 5.3           ┆ 4.8      ┆ 0.5       │\n",
+      "│ $25,000 to $49,999   ┆ 15.8          ┆ 19.9     ┆ -4.1      │\n",
+      "│ $250,000 to $499,999 ┆ 10.5          ┆ 12.5     ┆ -2.0      │\n",
+      "│ $50,000 to $149,999  ┆ 26.3          ┆ 16.8     ┆ 9.5       │\n",
+      "│ $500,000 to $999,999 ┆ 10.5          ┆ 11.0     ┆ -0.5      │\n",
+      "│ $5M or more          ┆ null          ┆ 0.7      ┆ -0.7      │\n",
+      "│ Prefer not to answer ┆ 2.6           ┆ 1.2      ┆ 1.4       │\n",
+      "└──────────────────────┴───────────────┴──────────┴───────────┘\n",
+      "\n",
+      "============================================================\n",
+      "Industry  (% within each group)\n",
+      "============================================================\n",
+      "shape: (26, 4)\n",
+      "┌─────────────────────────────────┬──────────┬───────────────┬───────────┐\n",
+      "│ Industry                        ┆ Complete ┆ Missing QID98 ┆ diff_ppts │\n",
+      "│ ---                             ┆ ---      ┆ ---           ┆ ---       │\n",
+      "│ str                             ┆ f64      ┆ f64           ┆ f64       │\n",
+      "╞═════════════════════════════════╪══════════╪═══════════════╪═══════════╡\n",
+      "│ Agriculture, forestry, fishing… ┆ 0.5      ┆ null          ┆ -0.5      │\n",
+      "│ Arts, entertainment, or recrea… ┆ 2.9      ┆ 2.6           ┆ -0.3      │\n",
+      "│ Broadcasting                    ┆ 0.5      ┆ null          ┆ -0.5      │\n",
+      "│ Construction                    ┆ 10.8     ┆ 10.5          ┆ -0.3      │\n",
+      "│ Education – College, universit… ┆ 3.8      ┆ 2.6           ┆ -1.2      │\n",
+      "│ …                               ┆ …        ┆ …             ┆ …         │\n",
+      "│ Software                        ┆ 9.1      ┆ 13.2          ┆ 4.1       │\n",
+      "│ Telecommunications              ┆ 0.7      ┆ 2.6           ┆ 1.9       │\n",
+      "│ Transportation and warehousing  ┆ 4.1      ┆ 2.6           ┆ -1.5      │\n",
+      "│ Utilities                       ┆ 1.0      ┆ 2.6           ┆ 1.6       │\n",
+      "│ Wholesale                       ┆ 6.5      ┆ 7.9           ┆ 1.4       │\n",
+      "└─────────────────────────────────┴──────────┴───────────────┴───────────┘\n",
+      "\n",
+      "============================================================\n",
+      "Consumer  (% within each group)\n",
+      "============================================================\n",
+      "shape: (10, 4)\n",
+      "┌─────────────────────────────────┬──────────┬───────────────┬───────────┐\n",
+      "│ Consumer                        ┆ Complete ┆ Missing QID98 ┆ diff_ppts │\n",
+      "│ ---                             ┆ ---      ┆ ---           ┆ ---       │\n",
+      "│ str                             ┆ f64      ┆ f64           ┆ f64       │\n",
+      "╞═════════════════════════════════╪══════════╪═══════════════╪═══════════╡\n",
+      "│ Early_Professional              ┆ 2.9      ┆ null          ┆ -2.9      │\n",
+      "│ Lower_Mass_A                    ┆ 7.5      ┆ 13.3          ┆ 5.8       │\n",
+      "│ Lower_Mass_B                    ┆ 2.6      ┆ null          ┆ -2.6      │\n",
+      "│ MassAffluent_A                  ┆ 7.5      ┆ 13.3          ┆ 5.8       │\n",
+      "│ MassAffluent_B                  ┆ 2.9      ┆ null          ┆ -2.9      │\n",
+      "│ Mass_A                          ┆ 19.5     ┆ 23.3          ┆ 3.8       │\n",
+      "│ Mass_B                          ┆ 5.7      ┆ 3.3           ┆ -2.4      │\n",
+      "│ Mix_of_Affluent_Wealth_&_High_… ┆ 7.8      ┆ 10.0          ┆ 2.2       │\n",
+      "│ Mix_of_Affluent_Wealth_&_High_… ┆ 2.9      ┆ null          ┆ -2.9      │\n",
+      "│ Starter                         ┆ 40.8     ┆ 36.7          ┆ -4.1      │\n",
+      "└─────────────────────────────────┴──────────┴───────────────┴───────────┘\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Percentage distribution comparison for each demographic\n",
+    "def compare_distribution(df: pl.DataFrame, col: str, status_col: str = 'ranking_status'):\n",
+    "    \"\"\"Show side-by-side percentage distributions for a column.\"\"\"\n",
+    "    counts = (\n",
+    "        df.filter(pl.col(col).is_not_null())\n",
+    "        .group_by([status_col, col])\n",
+    "        .agg(pl.len().alias('n'))\n",
+    "    )\n",
+    "    totals = counts.group_by(status_col).agg(pl.col('n').sum().alias('total'))\n",
+    "    pcts = counts.join(totals, on=status_col).with_columns(\n",
+    "        (pl.col('n') / pl.col('total') * 100).round(1).alias('pct')\n",
+    "    )\n",
+    "    # Pivot for readability\n",
+    "    pivot = pcts.pivot(\n",
+    "        on=status_col,\n",
+    "        index=col,\n",
+    "        values='pct',\n",
+    "    ).sort(col)\n",
+    "    \n",
+    "    # Add difference column if both groups exist\n",
+    "    if 'Missing QID98' in pivot.columns and 'Complete' in pivot.columns:\n",
+    "        pivot = pivot.with_columns(\n",
+    "            (pl.col('Missing QID98').fill_null(0) - pl.col('Complete').fill_null(0))\n",
+    "            .round(1)\n",
+    "            .alias('diff_ppts')\n",
+    "        )\n",
+    "    return pivot\n",
+    "\n",
+    "\n",
+    "for col in demo_cols:\n",
+    "    print(f'\\n{\"=\" * 60}')\n",
+    "    print(f'{col}  (% within each group)')\n",
+    "    print('=' * 60)\n",
+    "    print(compare_distribution(demos_df, col))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "4e53657a",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "\n",
+       "<style>\n",
+       "  #altair-viz-33b185d428554ec09aa026007a2a6751.vega-embed {\n",
+       "    width: 100%;\n",
+       "    display: flex;\n",
+       "  }\n",
+       "\n",
+       "  #altair-viz-33b185d428554ec09aa026007a2a6751.vega-embed details,\n",
+       "  #altair-viz-33b185d428554ec09aa026007a2a6751.vega-embed details summary {\n",
+       "    position: relative;\n",
+       "  }\n",
+       "</style>\n",
+       "<div id=\"altair-viz-33b185d428554ec09aa026007a2a6751\"></div>\n",
+       "<script type=\"text/javascript\">\n",
+       "  var VEGA_DEBUG = (typeof VEGA_DEBUG == \"undefined\") ? {} : VEGA_DEBUG;\n",
+       "  (function(spec, embedOpt){\n",
+       "    let outputDiv = document.currentScript.previousElementSibling;\n",
+       "    if (outputDiv.id !== \"altair-viz-33b185d428554ec09aa026007a2a6751\") {\n",
+       "      outputDiv = document.getElementById(\"altair-viz-33b185d428554ec09aa026007a2a6751\");\n",
+       "    }\n",
+       "\n",
+       "    const paths = {\n",
+       "      \"vega\": \"https://cdn.jsdelivr.net/npm/vega@6?noext\",\n",
+       "      \"vega-lib\": \"https://cdn.jsdelivr.net/npm/vega-lib?noext\",\n",
+       "      \"vega-lite\": \"https://cdn.jsdelivr.net/npm/vega-lite@6.1.0?noext\",\n",
+       "      \"vega-embed\": \"https://cdn.jsdelivr.net/npm/vega-embed@7?noext\",\n",
+       "    };\n",
+       "\n",
+       "    function maybeLoadScript(lib, version) {\n",
+       "      var key = `${lib.replace(\"-\", \"\")}_version`;\n",
+       "      return (VEGA_DEBUG[key] == version) ?\n",
+       "        Promise.resolve(paths[lib]) :\n",
+       "        new Promise(function(resolve, reject) {\n",
+       "          var s = document.createElement('script');\n",
+       "          document.getElementsByTagName(\"head\")[0].appendChild(s);\n",
+       "          s.async = true;\n",
+       "          s.onload = () => {\n",
+       "            VEGA_DEBUG[key] = version;\n",
+       "            return resolve(paths[lib]);\n",
+       "          };\n",
+       "          s.onerror = () => reject(`Error loading script: ${paths[lib]}`);\n",
+       "          s.src = paths[lib];\n",
+       "        });\n",
+       "    }\n",
+       "\n",
+       "    function showError(err) {\n",
+       "      outputDiv.innerHTML = `<div class=\"error\" style=\"color:red;\">${err}</div>`;\n",
+       "      throw err;\n",
+       "    }\n",
+       "\n",
+       "    function displayChart(vegaEmbed) {\n",
+       "      vegaEmbed(outputDiv, spec, embedOpt)\n",
+       "        .catch(err => showError(`Javascript Error: ${err.message}<br>This usually means there's a typo in your chart specification. See the javascript console for the full traceback.`));\n",
+       "    }\n",
+       "\n",
+       "    if(typeof define === \"function\" && define.amd) {\n",
+       "      requirejs.config({paths});\n",
+       "      let deps = [\"vega-embed\"];\n",
+       "      require(deps, displayChart, err => showError(`Error loading script: ${err.message}`));\n",
+       "    } else {\n",
+       "      maybeLoadScript(\"vega\", \"6\")\n",
+       "        .then(() => maybeLoadScript(\"vega-lite\", \"6.1.0\"))\n",
+       "        .then(() => maybeLoadScript(\"vega-embed\", \"7\"))\n",
+       "        .catch(showError)\n",
+       "        .then(() => displayChart(vegaEmbed));\n",
+       "    }\n",
+       "  })({\"config\": {\"view\": {\"continuousWidth\": 1000, \"continuousHeight\": 500, \"strokeWidth\": 0}, \"background\": \"white\", \"axis\": {\"grid\": true, \"gridColor\": \"lightgray\", \"labelFontSize\": 11, \"titleFontSize\": 12, \"labelColor\": \"black\", \"titleColor\": \"black\", \"labelLimit\": 200}, \"axisX\": {\"labelAngle\": -45, \"labelLimit\": 200}, \"axisY\": {\"labelAngle\": 0}, \"legend\": {\"orient\": \"top\", \"direction\": \"horizontal\", \"titleFontSize\": 11, \"labelFontSize\": 11}, \"title\": {\"fontSize\": 14, \"color\": \"black\", \"anchor\": \"start\", \"subtitleFontSize\": 10, \"subtitleColor\": \"gray\"}, \"bar\": {\"color\": \"#0077B6\"}}, \"hconcat\": [{\"data\": {\"name\": \"data-d7cb1f75d6582acb8f0191d3a681a0be\"}, \"mark\": {\"type\": \"bar\"}, \"encoding\": {\"color\": {\"field\": \"ranking_status\", \"type\": \"nominal\"}, \"tooltip\": [{\"field\": \"Age\", \"type\": \"nominal\"}, {\"field\": \"ranking_status\", \"type\": \"nominal\"}, {\"field\": \"n\", \"type\": \"quantitative\"}, {\"field\": \"pct\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"Age\", \"title\": \"Age\", \"type\": \"nominal\"}, \"xOffset\": {\"field\": \"ranking_status\", \"type\": \"nominal\"}, \"y\": {\"field\": \"pct\", \"title\": \"% of group\", \"type\": \"quantitative\"}}, \"height\": 250, \"title\": \"Age Distribution\", \"width\": 350}, {\"data\": {\"name\": \"data-082eb1086db857c3196b340adc20f9e1\"}, \"mark\": {\"type\": \"bar\"}, \"encoding\": {\"color\": {\"field\": \"ranking_status\", \"type\": \"nominal\"}, \"tooltip\": [{\"field\": \"Gender\", \"type\": \"nominal\"}, {\"field\": \"ranking_status\", \"type\": \"nominal\"}, {\"field\": \"n\", \"type\": \"quantitative\"}, {\"field\": \"pct\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"Gender\", \"title\": \"Gender\", \"type\": \"nominal\"}, \"xOffset\": {\"field\": \"ranking_status\", \"type\": \"nominal\"}, \"y\": {\"field\": \"pct\", \"title\": \"% of group\", \"type\": \"quantitative\"}}, \"height\": 250, \"title\": \"Gender Distribution\", \"width\": 350}, {\"data\": {\"name\": \"data-7f8e015301c6b6899af5f6dda6cb2974\"}, \"mark\": {\"type\": \"bar\"}, \"encoding\": {\"color\": {\"field\": \"ranking_status\", \"type\": \"nominal\"}, \"tooltip\": [{\"field\": \"Consumer\", \"type\": \"nominal\"}, {\"field\": \"ranking_status\", \"type\": \"nominal\"}, {\"field\": \"n\", \"type\": \"quantitative\"}, {\"field\": \"pct\", \"type\": \"quantitative\"}], \"x\": {\"field\": \"Consumer\", \"title\": \"Consumer\", \"type\": \"nominal\"}, \"xOffset\": {\"field\": \"ranking_status\", \"type\": \"nominal\"}, \"y\": {\"field\": \"pct\", \"title\": \"% of group\", \"type\": \"quantitative\"}}, \"height\": 250, \"title\": \"Consumer Distribution\", \"width\": 350}], \"$schema\": \"https://vega.github.io/schema/vega-lite/v6.1.0.json\", \"datasets\": {\"data-d7cb1f75d6582acb8f0191d3a681a0be\": [{\"ranking_status\": \"Missing QID98\", \"Age\": \"22 to 24 years\", \"n\": 5, \"total\": 38, \"pct\": 13.2}, {\"ranking_status\": \"Complete\", \"Age\": \"60 to 70 years\", \"n\": 34, \"total\": 417, \"pct\": 8.2}, {\"ranking_status\": \"Complete\", \"Age\": \"25 to 34 years\", \"n\": 71, \"total\": 417, \"pct\": 17.0}, {\"ranking_status\": \"Complete\", \"Age\": \"18 to 21 years\", \"n\": 62, \"total\": 417, \"pct\": 14.9}, {\"ranking_status\": \"Missing QID98\", \"Age\": \"51 to 59 years\", \"n\": 1, \"total\": 38, \"pct\": 2.6}, {\"ranking_status\": \"Complete\", \"Age\": \"51 to 59 years\", \"n\": 25, \"total\": 417, \"pct\": 6.0}, {\"ranking_status\": \"Complete\", \"Age\": \"35 to 40 years\", \"n\": 55, \"total\": 417, \"pct\": 13.2}, {\"ranking_status\": \"Complete\", \"Age\": \"22 to 24 years\", \"n\": 80, \"total\": 417, \"pct\": 19.2}, {\"ranking_status\": \"Missing QID98\", \"Age\": \"41 to 50 years\", \"n\": 13, \"total\": 38, \"pct\": 34.2}, {\"ranking_status\": \"Missing QID98\", \"Age\": \"70 years or more\", \"n\": 1, \"total\": 38, \"pct\": 2.6}, {\"ranking_status\": \"Complete\", \"Age\": \"70 years or more\", \"n\": 20, \"total\": 417, \"pct\": 4.8}, {\"ranking_status\": \"Complete\", \"Age\": \"41 to 50 years\", \"n\": 70, \"total\": 417, \"pct\": 16.8}, {\"ranking_status\": \"Missing QID98\", \"Age\": \"25 to 34 years\", \"n\": 3, \"total\": 38, \"pct\": 7.9}, {\"ranking_status\": \"Missing QID98\", \"Age\": \"18 to 21 years\", \"n\": 6, \"total\": 38, \"pct\": 15.8}, {\"ranking_status\": \"Missing QID98\", \"Age\": \"35 to 40 years\", \"n\": 9, \"total\": 38, \"pct\": 23.7}], \"data-082eb1086db857c3196b340adc20f9e1\": [{\"ranking_status\": \"Complete\", \"Gender\": \"Prefer not to say\", \"n\": 1, \"total\": 417, \"pct\": 0.2}, {\"ranking_status\": \"Missing QID98\", \"Gender\": \"Woman\", \"n\": 13, \"total\": 38, \"pct\": 34.2}, {\"ranking_status\": \"Missing QID98\", \"Gender\": \"Man\", \"n\": 25, \"total\": 38, \"pct\": 65.8}, {\"ranking_status\": \"Complete\", \"Gender\": \"Man\", \"n\": 247, \"total\": 417, \"pct\": 59.2}, {\"ranking_status\": \"Complete\", \"Gender\": \"Woman\", \"n\": 169, \"total\": 417, \"pct\": 40.5}], \"data-7f8e015301c6b6899af5f6dda6cb2974\": [{\"ranking_status\": \"Complete\", \"Consumer\": \"Starter\", \"n\": 142, \"total\": 348, \"pct\": 40.8}, {\"ranking_status\": \"Missing QID98\", \"Consumer\": \"Starter\", \"n\": 11, \"total\": 30, \"pct\": 36.7}, {\"ranking_status\": \"Complete\", \"Consumer\": \"Mass_B\", \"n\": 20, \"total\": 348, \"pct\": 5.7}, {\"ranking_status\": \"Complete\", \"Consumer\": \"Early_Professional\", \"n\": 10, \"total\": 348, \"pct\": 2.9}, {\"ranking_status\": \"Complete\", \"Consumer\": \"Mix_of_Affluent_Wealth_&_High_Net_Woth_A\", \"n\": 27, \"total\": 348, \"pct\": 7.8}, {\"ranking_status\": \"Complete\", \"Consumer\": \"Mass_A\", \"n\": 68, \"total\": 348, \"pct\": 19.5}, {\"ranking_status\": \"Missing QID98\", \"Consumer\": \"MassAffluent_A\", \"n\": 4, \"total\": 30, \"pct\": 13.3}, {\"ranking_status\": \"Complete\", \"Consumer\": \"Lower_Mass_A\", \"n\": 26, \"total\": 348, \"pct\": 7.5}, {\"ranking_status\": \"Missing QID98\", \"Consumer\": \"Mass_B\", \"n\": 1, \"total\": 30, \"pct\": 3.3}, {\"ranking_status\": \"Missing QID98\", \"Consumer\": \"Lower_Mass_A\", \"n\": 4, \"total\": 30, \"pct\": 13.3}, {\"ranking_status\": \"Complete\", \"Consumer\": \"MassAffluent_A\", \"n\": 26, \"total\": 348, \"pct\": 7.5}, {\"ranking_status\": \"Missing QID98\", \"Consumer\": \"Mix_of_Affluent_Wealth_&_High_Net_Woth_A\", \"n\": 3, \"total\": 30, \"pct\": 10.0}, {\"ranking_status\": \"Complete\", \"Consumer\": \"Mix_of_Affluent_Wealth_&_High_Net_Woth_B\", \"n\": 10, \"total\": 348, \"pct\": 2.9}, {\"ranking_status\": \"Complete\", \"Consumer\": \"Lower_Mass_B\", \"n\": 9, \"total\": 348, \"pct\": 2.6}, {\"ranking_status\": \"Complete\", \"Consumer\": \"MassAffluent_B\", \"n\": 10, \"total\": 348, \"pct\": 2.9}, {\"ranking_status\": \"Missing QID98\", \"Consumer\": \"Mass_A\", \"n\": 7, \"total\": 30, \"pct\": 23.3}]}}, {\"mode\": \"vega-lite\"});\n",
+       "</script>"
+      ],
+      "text/plain": [
+       "alt.HConcatChart(...)"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Visual comparison of key demographics\n",
+    "charts = []\n",
+    "for col in ['Age', 'Gender', 'Consumer']:\n",
+    "    if col not in demos_df.columns:\n",
+    "        continue\n",
+    "    _df = demos_df.filter(pl.col(col).is_not_null())\n",
+    "    _counts = _df.group_by(['ranking_status', col]).agg(pl.len().alias('n'))\n",
+    "    _totals = _counts.group_by('ranking_status').agg(pl.col('n').sum().alias('total'))\n",
+    "    _pcts = _counts.join(_totals, on='ranking_status').with_columns(\n",
+    "        (pl.col('n') / pl.col('total') * 100).round(1).alias('pct')\n",
+    "    )\n",
+    "    c = alt.Chart(_pcts.to_pandas()).mark_bar().encode(\n",
+    "        x=alt.X(f'{col}:N', title=col),\n",
+    "        y=alt.Y('pct:Q', title='% of group'),\n",
+    "        color='ranking_status:N',\n",
+    "        xOffset='ranking_status:N',\n",
+    "        tooltip=[col, 'ranking_status', 'n', 'pct'],\n",
+    "    ).properties(width=350, height=250, title=f'{col} Distribution')\n",
+    "    charts.append(c)\n",
+    "\n",
+    "alt.hconcat(*charts)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d847c8cc",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## 3. Survey Flow: Which Questions Were Answered?\n",
+    "\n",
+    "Check which questions the 38 respondents answered vs skipped to identify where the survey flow diverged."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "794e312e",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Null rates (%) for key questions by group:\n",
+      "shape: (9, 4)\n",
+      "┌─────────────────────────────────┬───────────────┬───────────────┬────────────────┐\n",
+      "│ Question                        ┆ QID           ┆ Missing_null% ┆ Complete_null% │\n",
+      "│ ---                             ┆ ---           ┆ ---           ┆ ---            │\n",
+      "│ str                             ┆ str           ┆ f64           ┆ f64            │\n",
+      "╞═════════════════════════════════╪═══════════════╪═══════════════╪════════════════╡\n",
+      "│ Demographics (QID1 Age)         ┆ QID1          ┆ 0.0           ┆ 0.0            │\n",
+      "│ Demographics (QID2 Gender)      ┆ QID2          ┆ 0.0           ┆ 0.0            │\n",
+      "│ Top 8 Traits (QID25)            ┆ QID25         ┆ 0.0           ┆ 0.0            │\n",
+      "│ Character Ranking (QID27_1)     ┆ QID27_1       ┆ 0.0           ┆ 0.0            │\n",
+      "│ 18→8 Set A (QID29)              ┆ QID29         ┆ 0.0           ┆ 0.0            │\n",
+      "│ 18→8 Set B (QID101)             ┆ QID101        ┆ 0.0           ┆ 0.0            │\n",
+      "│ 8→3 Selection (QID36_0_GROUP)   ┆ QID36_0_GROUP ┆ 0.0           ┆ 0.0            │\n",
+      "│ Voice Ranking Q (QID98_1)       ┆ QID98_1       ┆ 100.0         ┆ 74.8           │\n",
+      "│ Voice Scale 1-10 first (QID98_… ┆ QID98_2       ┆ 100.0         ┆ 77.5           │\n",
+      "└─────────────────────────────────┴───────────────┴───────────────┴────────────────┘\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Check null rates for key question groups: missing vs complete respondents\n",
+    "missing_raw = raw.filter(pl.col('ranking_status') == 'Missing QID98')\n",
+    "complete_raw = raw.filter(pl.col('ranking_status') == 'Complete')\n",
+    "\n",
+    "# Key question groups to check\n",
+    "question_groups = {\n",
+    "    'Demographics (QID1 Age)': 'QID1',\n",
+    "    'Demographics (QID2 Gender)': 'QID2',\n",
+    "    'Top 8 Traits (QID25)': 'QID25',\n",
+    "    'Character Ranking (QID27_1)': 'QID27_1',\n",
+    "    '18→8 Set A (QID29)': 'QID29',\n",
+    "    '18→8 Set B (QID101)': 'QID101',\n",
+    "    '8→3 Selection (QID36_0_GROUP)': 'QID36_0_GROUP',\n",
+    "    'Voice Ranking Q (QID98_1)': 'QID98_1',\n",
+    "    'Voice Scale 1-10 first (QID98_2)': 'QID98_2',\n",
+    "}\n",
+    "\n",
+    "null_comparison = []\n",
+    "for label, qid in question_groups.items():\n",
+    "    if qid not in raw.columns:\n",
+    "        null_comparison.append({'Question': label, 'QID': qid, 'Missing_null%': 'N/A', 'Complete_null%': 'N/A'})\n",
+    "        continue\n",
+    "    m_null = missing_raw[qid].null_count() / missing_raw.height * 100\n",
+    "    c_null = complete_raw[qid].null_count() / complete_raw.height * 100\n",
+    "    null_comparison.append({\n",
+    "        'Question': label,\n",
+    "        'QID': qid,\n",
+    "        'Missing_null%': round(m_null, 1),\n",
+    "        'Complete_null%': round(c_null, 1),\n",
+    "    })\n",
+    "\n",
+    "null_df = pl.DataFrame(null_comparison)\n",
+    "print('Null rates (%) for key questions by group:')\n",
+    "print(null_df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "fdc504ea",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "ColumnNotFoundError",
+     "evalue": "unable to find column \"diff_ppts\"; valid columns: []",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
+      "\u001b[31mColumnNotFoundError\u001b[39m                       Traceback (most recent call last)",
+      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[15]\u001b[39m\u001b[32m, line 20\u001b[39m\n\u001b[32m     11\u001b[39m         desc = S.qid_descr_map.get(col, {}).get(\u001b[33m'\u001b[39m\u001b[33mQName\u001b[39m\u001b[33m'\u001b[39m, col)\n\u001b[32m     12\u001b[39m         diff_cols.append({\n\u001b[32m     13\u001b[39m             \u001b[33m'\u001b[39m\u001b[33mcolumn\u001b[39m\u001b[33m'\u001b[39m: col,\n\u001b[32m     14\u001b[39m             \u001b[33m'\u001b[39m\u001b[33mdescription\u001b[39m\u001b[33m'\u001b[39m: desc,\n\u001b[32m   (...)\u001b[39m\u001b[32m     17\u001b[39m             \u001b[33m'\u001b[39m\u001b[33mdiff_ppts\u001b[39m\u001b[33m'\u001b[39m: \u001b[38;5;28mround\u001b[39m(diff, \u001b[32m1\u001b[39m),\n\u001b[32m     18\u001b[39m         })\n\u001b[32m---> \u001b[39m\u001b[32m20\u001b[39m diff_df = \u001b[43mpl\u001b[49m\u001b[43m.\u001b[49m\u001b[43mDataFrame\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdiff_cols\u001b[49m\u001b[43m)\u001b[49m\u001b[43m.\u001b[49m\u001b[43msort\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mdiff_ppts\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdescending\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[32m     21\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m'\u001b[39m\u001b[33mColumns with >30 ppts null-rate difference (\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mdiff_df.height\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m found):\u001b[39m\u001b[33m'\u001b[39m)\n\u001b[32m     22\u001b[39m \u001b[38;5;28mprint\u001b[39m(diff_df.head(\u001b[32m30\u001b[39m))\n",
+      "\u001b[36mFile \u001b[39m\u001b[32m~/Documents/VoiceBranding/JPMC/Phase-3/.venv/lib/python3.12/site-packages/polars/dataframe/frame.py:5965\u001b[39m, in \u001b[36mDataFrame.sort\u001b[39m\u001b[34m(self, by, descending, nulls_last, multithreaded, maintain_order, *more_by)\u001b[39m\n\u001b[32m   5867\u001b[39m \u001b[38;5;250m\u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m   5868\u001b[39m \u001b[33;03mSort the dataframe by the given columns.\u001b[39;00m\n\u001b[32m   5869\u001b[39m \n\u001b[32m   (...)\u001b[39m\u001b[32m   5951\u001b[39m \u001b[33;03m└──────┴─────┴─────┘\u001b[39;00m\n\u001b[32m   5952\u001b[39m \u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m   5953\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mpolars\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mlazyframe\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m QueryOptFlags\n\u001b[32m   5955\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m (\n\u001b[32m   5956\u001b[39m     \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mlazy\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m   5957\u001b[39m \u001b[43m    \u001b[49m\u001b[43m.\u001b[49m\u001b[43msort\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m   5958\u001b[39m \u001b[43m        \u001b[49m\u001b[43mby\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   5959\u001b[39m \u001b[43m        \u001b[49m\u001b[43m*\u001b[49m\u001b[43mmore_by\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   5960\u001b[39m \u001b[43m        \u001b[49m\u001b[43mdescending\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdescending\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   5961\u001b[39m \u001b[43m        \u001b[49m\u001b[43mnulls_last\u001b[49m\u001b[43m=\u001b[49m\u001b[43mnulls_last\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   5962\u001b[39m \u001b[43m        \u001b[49m\u001b[43mmultithreaded\u001b[49m\u001b[43m=\u001b[49m\u001b[43mmultithreaded\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   5963\u001b[39m \u001b[43m        \u001b[49m\u001b[43mmaintain_order\u001b[49m\u001b[43m=\u001b[49m\u001b[43mmaintain_order\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m   5964\u001b[39m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m-> \u001b[39m\u001b[32m5965\u001b[39m \u001b[43m    \u001b[49m\u001b[43m.\u001b[49m\u001b[43mcollect\u001b[49m\u001b[43m(\u001b[49m\u001b[43moptimizations\u001b[49m\u001b[43m=\u001b[49m\u001b[43mQueryOptFlags\u001b[49m\u001b[43m.\u001b[49m\u001b[43m_eager\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m   5966\u001b[39m )\n",
+      "\u001b[36mFile \u001b[39m\u001b[32m~/Documents/VoiceBranding/JPMC/Phase-3/.venv/lib/python3.12/site-packages/polars/_utils/deprecation.py:97\u001b[39m, in \u001b[36mdeprecate_streaming_parameter.<locals>.decorate.<locals>.wrapper\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m     93\u001b[39m         kwargs[\u001b[33m\"\u001b[39m\u001b[33mengine\u001b[39m\u001b[33m\"\u001b[39m] = \u001b[33m\"\u001b[39m\u001b[33min-memory\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m     95\u001b[39m     \u001b[38;5;28;01mdel\u001b[39;00m kwargs[\u001b[33m\"\u001b[39m\u001b[33mstreaming\u001b[39m\u001b[33m\"\u001b[39m]\n\u001b[32m---> \u001b[39m\u001b[32m97\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunction\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
+      "\u001b[36mFile \u001b[39m\u001b[32m~/Documents/VoiceBranding/JPMC/Phase-3/.venv/lib/python3.12/site-packages/polars/lazyframe/opt_flags.py:324\u001b[39m, in \u001b[36mforward_old_opt_flags.<locals>.decorate.<locals>.wrapper\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m    321\u001b[39m         optflags = cb(optflags, kwargs.pop(key))  \u001b[38;5;66;03m# type: ignore[no-untyped-call,unused-ignore]\u001b[39;00m\n\u001b[32m    323\u001b[39m kwargs[\u001b[33m\"\u001b[39m\u001b[33moptimizations\u001b[39m\u001b[33m\"\u001b[39m] = optflags\n\u001b[32m--> \u001b[39m\u001b[32m324\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunction\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
+      "\u001b[36mFile \u001b[39m\u001b[32m~/Documents/VoiceBranding/JPMC/Phase-3/.venv/lib/python3.12/site-packages/polars/lazyframe/frame.py:2429\u001b[39m, in \u001b[36mLazyFrame.collect\u001b[39m\u001b[34m(self, type_coercion, predicate_pushdown, projection_pushdown, simplify_expression, slice_pushdown, comm_subplan_elim, comm_subexpr_elim, cluster_with_columns, collapse_joins, no_optimization, engine, background, optimizations, **_kwargs)\u001b[39m\n\u001b[32m   2427\u001b[39m \u001b[38;5;66;03m# Only for testing purposes\u001b[39;00m\n\u001b[32m   2428\u001b[39m callback = _kwargs.get(\u001b[33m\"\u001b[39m\u001b[33mpost_opt_callback\u001b[39m\u001b[33m\"\u001b[39m, callback)\n\u001b[32m-> \u001b[39m\u001b[32m2429\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m wrap_df(\u001b[43mldf\u001b[49m\u001b[43m.\u001b[49m\u001b[43mcollect\u001b[49m\u001b[43m(\u001b[49m\u001b[43mengine\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcallback\u001b[49m\u001b[43m)\u001b[49m)\n",
+      "\u001b[31mColumnNotFoundError\u001b[39m: unable to find column \"diff_ppts\"; valid columns: []"
+     ]
+    }
+   ],
+   "source": [
+    "# Broader scan: find ALL columns where missing group has significantly different null rate\n",
+    "diff_cols = []\n",
+    "for col in raw.columns:\n",
+    "    if col in ('_recordId', 'ranking_status'):\n",
+    "        continue\n",
+    "    m_null_pct = missing_raw[col].null_count() / missing_raw.height * 100\n",
+    "    c_null_pct = complete_raw[col].null_count() / complete_raw.height * 100\n",
+    "    diff = m_null_pct - c_null_pct\n",
+    "    # Flag columns where missing group has 30+ ppts more nulls\n",
+    "    if abs(diff) > 30:\n",
+    "        desc = S.qid_descr_map.get(col, {}).get('QName', col)\n",
+    "        diff_cols.append({\n",
+    "            'column': col,\n",
+    "            'description': desc,\n",
+    "            'missing_null_pct': round(m_null_pct, 1),\n",
+    "            'complete_null_pct': round(c_null_pct, 1),\n",
+    "            'diff_ppts': round(diff, 1),\n",
+    "        })\n",
+    "\n",
+    "diff_df = pl.DataFrame(diff_cols).sort('diff_ppts', descending=True)\n",
+    "print(f'Columns with >30 ppts null-rate difference ({diff_df.height} found):')\n",
+    "print(diff_df.head(30))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "64440ef3",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## 4. QID36 Selection Order Data\n",
+    "\n",
+    "The 38 respondents DO have `QID36_G0_*_RANK` data (selection order, not preference ranking). Analyze their selection patterns."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5c761354",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import re\n",
+    "\n",
+    "# QID36 choice → voice mapping\n",
+    "cfg36 = S._get_qsf_question_by_QID('QID36')['Payload']\n",
+    "from utils import extract_voice_label\n",
+    "choice_voice_map = {k: extract_voice_label(v['Display']) for k, v in cfg36['Choices'].items()}\n",
+    "\n",
+    "# Extract which 3 voices each of the 38 respondents selected\n",
+    "qid36_rank_cols = sorted([c for c in raw.columns if c.startswith('QID36_G0_') and c.endswith('_RANK')])\n",
+    "\n",
+    "def extract_chosen_voices(row, rank_cols, voice_map):\n",
+    "    \"\"\"Get the 3 voices a respondent selected (from QID36 RANK columns).\"\"\"\n",
+    "    voices = []\n",
+    "    for col in rank_cols:\n",
+    "        if row[col] is not None:\n",
+    "            choice_num = col.replace('QID36_G0_', '').replace('_RANK', '')\n",
+    "            voices.append(voice_map.get(choice_num, f'?{choice_num}'))\n",
+    "    return voices\n",
+    "\n",
+    "# Build a summary of chosen voices for the 38 missing respondents\n",
+    "missing_voices = []\n",
+    "for row in missing_raw.iter_rows(named=True):\n",
+    "    voices = extract_chosen_voices(row, qid36_rank_cols, choice_voice_map)\n",
+    "    missing_voices.append({\n",
+    "        '_recordId': row['_recordId'],\n",
+    "        'voice_1': voices[0] if len(voices) > 0 else None,\n",
+    "        'voice_2': voices[1] if len(voices) > 1 else None,\n",
+    "        'voice_3': voices[2] if len(voices) > 2 else None,\n",
+    "    })\n",
+    "\n",
+    "missing_voice_df = pl.DataFrame(missing_voices)\n",
+    "print('Voices selected by the 38 missing respondents:')\n",
+    "print(missing_voice_df.head(10))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cf78a4c0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Voice frequency in the missing group's top-3 selections\n",
+    "all_selected = (\n",
+    "    missing_voice_df.unpivot(\n",
+    "        on=['voice_1', 'voice_2', 'voice_3'],\n",
+    "        value_name='voice'\n",
+    "    )\n",
+    "    .filter(pl.col('voice').is_not_null())\n",
+    "    .group_by('voice')\n",
+    "    .agg(pl.len().alias('n_selected'))\n",
+    "    .sort('n_selected', descending=True)\n",
+    "    .with_columns(\n",
+    "        pl.col('voice').replace(VOICE_GENDER_MAPPING).alias('voice_gender')\n",
+    "    )\n",
+    ")\n",
+    "print('Voice selection frequency for the 38 missing respondents:')\n",
+    "print(all_selected)\n",
+    "\n",
+    "# Gender breakdown of selected voices\n",
+    "gender_counts = all_selected.group_by('voice_gender').agg(pl.col('n_selected').sum())\n",
+    "print(f'\\nVoice gender breakdown in their selections:')\n",
+    "print(gender_counts)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cb3ee243",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## 5. 18→8 Selection Path (Set A vs Set B)\n",
+    "\n",
+    "Were the 38 respondents disproportionately assigned to one randomization set (A or B)?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "19c6bf43",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Check Set A vs Set B distribution\n",
+    "v_18_8_3_df = S.get_18_8_3(data)[0].collect().join(\n",
+    "    raw.select(['_recordId', 'ranking_status']),\n",
+    "    on='_recordId'\n",
+    ")\n",
+    "\n",
+    "# Determine which set each respondent was in\n",
+    "v_18_8_3_df = v_18_8_3_df.with_columns(\n",
+    "    pl.when(pl.col('18-8_Set-A').is_not_null() & pl.col('18-8_Set-B').is_null())\n",
+    "    .then(pl.lit('Set A'))\n",
+    "    .when(pl.col('18-8_Set-B').is_not_null() & pl.col('18-8_Set-A').is_null())\n",
+    "    .then(pl.lit('Set B'))\n",
+    "    .when(pl.col('18-8_Set-A').is_not_null() & pl.col('18-8_Set-B').is_not_null())\n",
+    "    .then(pl.lit('Both'))\n",
+    "    .otherwise(pl.lit('Neither'))\n",
+    "    .alias('randomization_set')\n",
+    ")\n",
+    "\n",
+    "set_dist = v_18_8_3_df.group_by(['ranking_status', 'randomization_set']).agg(pl.len().alias('n'))\n",
+    "print('Randomization set distribution:')\n",
+    "print(set_dist.sort(['ranking_status', 'randomization_set']))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "bc98327d",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## 6. Character Ranking Comparison\n",
+    "\n",
+    "Did the 38 respondents rank characters differently?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f91db9c1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Character ranking: who they picked as #1\n",
+    "char_rank_df = S.get_character_ranking(data)[0].collect().join(\n",
+    "    raw.select(['_recordId', 'ranking_status']),\n",
+    "    on='_recordId'\n",
+    ")\n",
+    "\n",
+    "ranking_cols = [c for c in char_rank_df.columns if c.startswith('Character_Ranking_')]\n",
+    "\n",
+    "# Find which character each respondent ranked #1\n",
+    "rank1_rows = []\n",
+    "for row in char_rank_df.iter_rows(named=True):\n",
+    "    for col in ranking_cols:\n",
+    "        if row[col] == 1:\n",
+    "            rank1_rows.append({\n",
+    "                '_recordId': row['_recordId'],\n",
+    "                'ranking_status': row['ranking_status'],\n",
+    "                'top_character': col.replace('Character_Ranking_', '').replace('_', ' '),\n",
+    "            })\n",
+    "            break\n",
+    "\n",
+    "rank1_df = pl.DataFrame(rank1_rows)\n",
+    "char_dist = compare_distribution(rank1_df, 'top_character')\n",
+    "print('Character ranked #1 distribution (%):')\n",
+    "print(char_dist)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "acabf4af",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## 7. Voice Scale 1-10 Comparison\n",
+    "\n",
+    "Did the 38 respondents rate voices differently on the 1-10 scale?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b446a0ac",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Voice scale 1-10 comparison\n",
+    "voice_1_10_df = S.get_voice_scale_1_10(data)[0].collect().join(\n",
+    "    raw.select(['_recordId', 'ranking_status']),\n",
+    "    on='_recordId'\n",
+    ")\n",
+    "\n",
+    "scale_cols = [c for c in voice_1_10_df.columns if c.startswith('Voice_Scale_1_10__')]\n",
+    "\n",
+    "# Average score per voice, by group\n",
+    "scale_comparison = []\n",
+    "for col in scale_cols:\n",
+    "    voice = col.replace('Voice_Scale_1_10__', '')\n",
+    "    for status in ['Missing QID98', 'Complete']:\n",
+    "        subset = voice_1_10_df.filter(pl.col('ranking_status') == status)\n",
+    "        avg = subset[col].drop_nulls().mean()\n",
+    "        scale_comparison.append({\n",
+    "            'voice': voice,\n",
+    "            'ranking_status': status,\n",
+    "            'mean_score': round(avg, 2) if avg is not None else None,\n",
+    "        })\n",
+    "\n",
+    "scale_comp_df = pl.DataFrame(scale_comparison).pivot(\n",
+    "    on='ranking_status', index='voice', values='mean_score'\n",
+    ").with_columns(\n",
+    "    (pl.col('Missing QID98') - pl.col('Complete')).round(2).alias('diff')\n",
+    ").sort('diff')\n",
+    "\n",
+    "print('Average voice scores (1-10) by group:')\n",
+    "print(scale_comp_df)\n",
+    "\n",
+    "# Overall average\n",
+    "overall_missing = pl.DataFrame([r for r in scale_comparison if r['ranking_status'] == 'Missing QID98'])['mean_score'].mean()\n",
+    "overall_complete = pl.DataFrame([r for r in scale_comparison if r['ranking_status'] == 'Complete'])['mean_score'].mean()\n",
+    "print(f'\\nOverall avg score — Missing: {overall_missing:.2f}, Complete: {overall_complete:.2f}')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "47626429",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## 8. Speaking Style Comparison\n",
+    "\n",
+    "Check if the 38 respondents show different speaking style patterns."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0961c08d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Speaking style straight-liner check for the two groups\n",
+    "ss_or_df = S.get_ss_orange_red(data)[0].collect()\n",
+    "ss_gb_df = S.get_ss_green_blue(data)[0].collect()\n",
+    "vs_df = S.get_voice_scale_1_10(data)[0].collect()\n",
+    "\n",
+    "sl_all = ss_or_df.join(ss_gb_df, on='_recordId').join(vs_df, on='_recordId')\n",
+    "\n",
+    "_, sl_flagged = check_straight_liners(sl_all.lazy(), max_score=5)\n",
+    "\n",
+    "if sl_flagged is not None and not sl_flagged.is_empty():\n",
+    "    # Count straight-lined groups per respondent\n",
+    "    sl_counts = (\n",
+    "        sl_flagged\n",
+    "        .group_by('Record ID')\n",
+    "        .agg(pl.len().alias('sl_group_count'))\n",
+    "        .rename({'Record ID': '_recordId'})\n",
+    "    )\n",
+    "    \n",
+    "    # Join with ranking status\n",
+    "    sl_with_status = sl_counts.join(\n",
+    "        raw.select(['_recordId', 'ranking_status']),\n",
+    "        on='_recordId'\n",
+    "    )\n",
+    "    \n",
+    "    sl_summary = sl_with_status.group_by('ranking_status').agg(\n",
+    "        pl.len().alias('n_straight_liners'),\n",
+    "        pl.col('sl_group_count').mean().alias('avg_groups_straight_lined'),\n",
+    "        pl.col('sl_group_count').max().alias('max_groups_straight_lined'),\n",
+    "    )\n",
+    "    print('Straight-liner comparison:')\n",
+    "    print(sl_summary)\n",
+    "    \n",
+    "    # What percentage of each group are straight-liners?\n",
+    "    for status in ['Missing QID98', 'Complete']:\n",
+    "        n_group = raw.filter(pl.col('ranking_status') == status).height\n",
+    "        n_sl = sl_with_status.filter(pl.col('ranking_status') == status).height\n",
+    "        print(f'  {status}: {n_sl}/{n_group} ({n_sl/n_group*100:.1f}%) have straight-lined ≥1 group')\n",
+    "else:\n",
+    "    print('No straight-liners detected in either group.')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a2a79b85",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## 9. QID98 vs QID36 Data Structure in Raw CSV\n",
+    "\n",
+    "Deep inspection: are the QID98 columns truly empty, or could there be a data export issue?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3a628a3b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Check all QID98 columns for the 38 missing respondents\n",
+    "qid98_cols = sorted([c for c in raw.columns if c.startswith('QID98')])\n",
+    "print(f'QID98 columns in dataset ({len(qid98_cols)}):')\n",
+    "print(qid98_cols)\n",
+    "\n",
+    "# Verify: ALL QID98 columns are null for the 38?\n",
+    "qid98_null_check = []\n",
+    "for col in qid98_cols:\n",
+    "    n_non_null = missing_raw[col].drop_nulls().height\n",
+    "    qid98_null_check.append({'col': col, 'non_null_count': n_non_null})\n",
+    "\n",
+    "qid98_check_df = pl.DataFrame(qid98_null_check)\n",
+    "print(f'\\nQID98 non-null counts for the 38 missing respondents:')\n",
+    "print(qid98_check_df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bb1b02cb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Compare QID36_G0_*_RANK (selection order) for complete vs missing\n",
+    "# For complete respondents: do QID98 and QID36_RANK always disagree? (They should — different semantics)\n",
+    "sample_complete = complete_raw.head(5)\n",
+    "\n",
+    "print('Cross-check: QID98 (preference rank) vs QID36_RANK (selection order) for 5 complete respondents:')\n",
+    "for row in sample_complete.iter_rows(named=True):\n",
+    "    print(f\"\\n  {row['_recordId']}:\")\n",
+    "    for i in range(1, 19):\n",
+    "        qid98_col = f'QID98_{i}'\n",
+    "        qid36_col = f'QID36_G0_{i}_RANK'\n",
+    "        q98 = row.get(qid98_col)\n",
+    "        q36 = row.get(qid36_col)\n",
+    "        if q98 is not None or q36 is not None:\n",
+    "            voice = choice_voice_map.get(str(i), f'?{i}')\n",
+    "            match = '✓ same' if q98 == q36 else '✗ diff'\n",
+    "            print(f\"    {voice}: QID98={q98}, QID36_RANK={q36}  ({match})\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "216381fc",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## 10. Distribution Channel / User Agent / Other Metadata\n",
+    "\n",
+    "Check if the 38 came through a specific distribution channel or device."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "184c8509",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Check distribution channel and other metadata columns\n",
+    "for meta_col in ['distributionChannel', 'userLanguage', 'Status']:\n",
+    "    if meta_col in raw.columns:\n",
+    "        dist = raw.group_by(['ranking_status', meta_col]).agg(pl.len().alias('n'))\n",
+    "        print(f'\\n{meta_col}:')\n",
+    "        print(dist.sort(['ranking_status', meta_col]))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "67ad6631",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Check if there are any Qualtrics internal columns that differ\n",
+    "# (e.g., Q_RecaptchaScore, Q_RelevantIDDuplicate, etc.)\n",
+    "q_internal_cols = [c for c in raw.columns if c.startswith('Q_')]\n",
+    "print(f'Qualtrics internal columns ({len(q_internal_cols)}):')\n",
+    "for col in q_internal_cols:\n",
+    "    m_vals = missing_raw[col].drop_nulls().unique().to_list()\n",
+    "    c_vals = complete_raw[col].drop_nulls().unique().to_list()\n",
+    "    m_null_pct = missing_raw[col].null_count() / missing_raw.height * 100\n",
+    "    c_null_pct = complete_raw[col].null_count() / complete_raw.height * 100\n",
+    "    if m_vals != c_vals or abs(m_null_pct - c_null_pct) > 20:\n",
+    "        print(f'  {col}: Missing vals={m_vals[:5]}, Complete vals={c_vals[:5]}')\n",
+    "        print(f'         Missing null%={m_null_pct:.0f}%, Complete null%={c_null_pct:.0f}%')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3934eb4c",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## 11. QSF Survey Flow Inspection\n",
+    "\n",
+    "Check the QSF for skip logic or display logic on QID98."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "031a0f8b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "\n",
+    "# Inspect QID98 question config in QSF\n",
+    "try:\n",
+    "    qid98_cfg = S._get_qsf_question_by_QID('QID98')\n",
+    "    payload = qid98_cfg.get('Payload', {})\n",
+    "    \n",
+    "    print('QID98 Question Type:', payload.get('QuestionType'))\n",
+    "    print('QID98 Selector:', payload.get('Selector'))\n",
+    "    print('QID98 QuestionText:', payload.get('QuestionText', '')[:200])\n",
+    "    \n",
+    "    # Check for display logic\n",
+    "    display_logic = payload.get('DisplayLogic')\n",
+    "    print(f'\\nDisplay Logic: {json.dumps(display_logic, indent=2) if display_logic else \"None\"}')\n",
+    "    \n",
+    "    # Check for skip logic\n",
+    "    skip_logic = payload.get('SkipLogic')\n",
+    "    print(f'\\nSkip Logic: {json.dumps(skip_logic, indent=2) if skip_logic else \"None\"}')\n",
+    "    \n",
+    "    # Check DynamicChoices (since this question uses carry-forward from QID36)\n",
+    "    dyn_choices = payload.get('DynamicChoices')\n",
+    "    print(f'\\nDynamic Choices: {json.dumps(dyn_choices, indent=2) if dyn_choices else \"None\"}')\n",
+    "    \n",
+    "    # Check validation settings\n",
+    "    validation = payload.get('Validation')\n",
+    "    print(f'\\nValidation: {json.dumps(validation, indent=2) if validation else \"None\"}')\n",
+    "    \n",
+    "except Exception as e:\n",
+    "    print(f'Error inspecting QID98: {e}')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c6d157a5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Also check QID36 (the parent question) for any relevant logic\n",
+    "try:\n",
+    "    qid36_cfg = S._get_qsf_question_by_QID('QID36')\n",
+    "    payload36 = qid36_cfg.get('Payload', {})\n",
+    "    \n",
+    "    print('QID36 Question Type:', payload36.get('QuestionType'))\n",
+    "    print('QID36 Selector:', payload36.get('Selector'))\n",
+    "    \n",
+    "    # Check for any branching or flow-related settings\n",
+    "    print(f'\\nQID36 GradingData: {payload36.get(\"GradingData\")}')\n",
+    "    print(f'QID36 GroupChoiceRandomization: {payload36.get(\"GroupChoiceRandomization\")}')\n",
+    "    print(f'QID36 Randomization: {payload36.get(\"Randomization\")}')\n",
+    "    \n",
+    "    # Check the flow element for QID98\n",
+    "    flow = S.qsf.get('SurveyFlow', S.qsf.get('SurveyElements', []))\n",
+    "    # Search for QID98 in flow elements\n",
+    "    def find_in_flow(elements, target_qid, path=''):\n",
+    "        results = []\n",
+    "        if isinstance(elements, dict):\n",
+    "            if elements.get('PrimaryAttribute') == target_qid or elements.get('QuestionID') == target_qid:\n",
+    "                results.append((path, elements))\n",
+    "            for k, v in elements.items():\n",
+    "                results.extend(find_in_flow(v, target_qid, f'{path}.{k}'))\n",
+    "        elif isinstance(elements, list):\n",
+    "            for i, elem in enumerate(elements):\n",
+    "                results.extend(find_in_flow(elem, target_qid, f'{path}[{i}]'))\n",
+    "        return results\n",
+    "    \n",
+    "    qid98_in_flow = find_in_flow(S.qsf, 'QID98')\n",
+    "    print(f'\\nQID98 appears in QSF structure {len(qid98_in_flow)} time(s):')\n",
+    "    for path, elem in qid98_in_flow:\n",
+    "        print(f'  Path: {path}')\n",
+    "        print(f'  Element: {json.dumps(elem, indent=2)[:300]}')\n",
+    "    \n",
+    "except Exception as e:\n",
+    "    print(f'Error: {e}')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "451a0621",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## 12. Summary\n",
+    "\n",
+    "Collect all findings."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "41f8c49c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print('=' * 70)\n",
+    "print('SUMMARY: 38 Respondents Missing Voice Ranking (QID98)')\n",
+    "print('=' * 70)\n",
+    "print(f'''\n",
+    "Total respondents:  {n_total}\n",
+    "Missing QID98:      {n_missing} ({n_missing/n_total:.1%})\n",
+    "Complete:           {n_total - n_missing} ({(n_total - n_missing)/n_total:.1%})\n",
+    "\n",
+    "Key observations:\n",
+    "- All 38 completed the survey (progress = 100, finished = True)\n",
+    "- All 38 have QID36_G0_*_RANK (selection order) data — they DID pick 3 voices\n",
+    "- All 18 QID98_* columns are null for these 38 respondents\n",
+    "- The QID98 question carries forward from QID36 via DynamicChoices\n",
+    "\n",
+    "See analysis sections above for:\n",
+    "  1. Duration & timestamp patterns\n",
+    "  2. Demographic profile comparison\n",
+    "  3. Which questions were answered/skipped\n",
+    "  4. Voice selection patterns\n",
+    "  5. Randomization set (A vs B)\n",
+    "  6. Character ranking differences\n",
+    "  7. Voice 1-10 scale comparison\n",
+    "  8. Straight-liner rates\n",
+    "  9. Raw QID98 data verification\n",
+    " 10. Distribution channel / metadata\n",
+    " 11. QSF skip/display logic on QID98\n",
+    "''')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "phase-3-quant",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/plots.py b/plots.py
index f26e216..52a7d7e 100644
--- a/plots.py
+++ b/plots.py
@@ -96,6 +96,11 @@ class QualtricsPlotsMixin:
         sl_threshold = getattr(self, 'sl_threshold', None)
         if sl_threshold is not None:
             parts.append(f"SL-gte{sl_threshold}")
+        
+        # Append voice ranking filter if set
+        vr_filter = getattr(self, 'voice_ranking_filter', None)
+        if vr_filter is not None:
+            parts.append(f"VR-{vr_filter}")
 
         if not parts:
             return "All_Respondents"
@@ -191,6 +196,15 @@ class QualtricsPlotsMixin:
         sl_threshold = getattr(self, 'sl_threshold', None)
         if sl_threshold is not None:
             parts.append(f"STRAIGHT-LINER EXCL: ≥{sl_threshold} question groups")
+        
+        # Append voice ranking filter if set
+        vr_filter = getattr(self, 'voice_ranking_filter', None)
+        if vr_filter is not None:
+            vr_labels = {
+                'only-missing': 'ONLY respondents missing voice ranking (QID98)',
+                'exclude-missing': 'EXCLUDING respondents missing voice ranking (QID98)',
+            }
+            parts.append(f"VOICE RANKING: {vr_labels.get(vr_filter, vr_filter)}")
 
         if not parts:
             # No filters active - return just sample size (or empty string if no sample size)
diff --git a/run_filter_combinations.py b/run_filter_combinations.py
index 012a7ff..3c63b85 100644
--- a/run_filter_combinations.py
+++ b/run_filter_combinations.py
@@ -179,10 +179,25 @@ def get_filter_combinations(survey: QualtricsSurvey, category: str = None) -> li
                 'filters': {'industry': [industry]}
             })
     
+    # Voice ranking completeness filter
+    # These use a special flag rather than demographic filters, so we store
+    # the mode in a dedicated key that run_report passes as --voice-ranking-filter.
+    if not category or category in ['all_filters', 'voice_ranking']:
+        combinations.append({
+            'name': 'VoiceRanking-OnlyMissing',
+            'filters': {},
+            'voice_ranking_filter': 'only-missing',
+        })
+        combinations.append({
+            'name': 'VoiceRanking-ExcludeMissing',
+            'filters': {},
+            'voice_ranking_filter': 'exclude-missing',
+        })
+    
     return combinations
 
 
-def run_report(filters: dict, name: str = None, dry_run: bool = False, sl_threshold: int = None) -> bool:
+def run_report(filters: dict, name: str = None, dry_run: bool = False, sl_threshold: int = None, voice_ranking_filter: str = None) -> bool:
     """
     Run the report script with given filters.
     
@@ -191,6 +206,9 @@ def run_report(filters: dict, name: str = None, dry_run: bool = False, sl_thresh
         name: Name for this filter combination (used for .txt description file)
         dry_run: If True, just print command without running
         sl_threshold: If set, exclude respondents with >= N straight-lined question groups
+        voice_ranking_filter: If set, filter by voice ranking completeness.
+            'only-missing' keeps only respondents missing QID98 data,
+            'exclude-missing' removes them.
         
     Returns:
         True if successful, False otherwise
@@ -205,6 +223,10 @@ def run_report(filters: dict, name: str = None, dry_run: bool = False, sl_thresh
     if sl_threshold is not None:
         cmd.extend(['--sl-threshold', str(sl_threshold)])
     
+    # Pass voice ranking filter if specified
+    if voice_ranking_filter is not None:
+        cmd.extend(['--voice-ranking-filter', voice_ranking_filter])
+    
     for filter_name, values in filters.items():
         if values:
             cmd.extend([f'--{filter_name}', json.dumps(values)])
@@ -235,7 +257,7 @@ def main():
     parser.add_argument('--dry-run', action='store_true', help='Preview combinations without running')
     parser.add_argument(
         '--category',
-        choices=['all_filters', 'all', 'age', 'gender', 'ethnicity', 'income', 'consumer', 'business_owner', 'ai_user', 'investable_assets', 'industry'],
+        choices=['all_filters', 'all', 'age', 'gender', 'ethnicity', 'income', 'consumer', 'business_owner', 'ai_user', 'investable_assets', 'industry', 'voice_ranking'],
         default='all_filters',
         help='Filter category to run combinations for (default: all_filters)'
     )
@@ -259,7 +281,7 @@ def main():
         print("\nDRY RUN - Commands that would be executed:")
         for combo in combinations:
             print(f"\n{combo['name']}:")
-            run_report(combo['filters'], name=combo['name'], dry_run=True, sl_threshold=args.sl_threshold)
+            run_report(combo['filters'], name=combo['name'], dry_run=True, sl_threshold=args.sl_threshold, voice_ranking_filter=combo.get('voice_ranking_filter'))
         return
     
     # Run each combination with progress bar
@@ -268,7 +290,7 @@ def main():
     
     for combo in tqdm(combinations, desc="Running reports", unit="filter"):
         tqdm.write(f"Running: {combo['name']}")
-        if run_report(combo['filters'], name=combo['name'], sl_threshold=args.sl_threshold):
+        if run_report(combo['filters'], name=combo['name'], sl_threshold=args.sl_threshold, voice_ranking_filter=combo.get('voice_ranking_filter')):
             successful += 1
         else:
             failed.append(combo['name'])
diff --git a/utils.py b/utils.py
index 26eb183..06cf1f3 100644
--- a/utils.py
+++ b/utils.py
@@ -1115,6 +1115,60 @@ class QualtricsSurvey(QualtricsPlotsMixin):
 
         return self._get_subset(q, list(QIDs_map.keys()), rename_cols=False).rename(QIDs_map), None
     
+    def get_top_3_voices_missing_ranking(
+        self, q: pl.LazyFrame
+    ) -> pl.DataFrame:
+        """Identify respondents who completed the top-3 voice selection (QID36)
+        but are missing the explicit ranking question (QID98).
+
+        These respondents picked 3 voices in the selection step and have
+        selection-order data in ``QID36_G0_*_RANK``, but all 18 ``QID98_*``
+        ranking columns are null.  This means ``get_top_3_voices()`` will
+        return all-null rows for them, causing plots like
+        ``plot_most_ranked_1`` to undercount.
+
+        Parameters:
+            q: The (optionally filtered) LazyFrame from ``load_data()``.
+
+        Returns:
+            A collected ``pl.DataFrame`` with columns:
+
+            - ``_recordId`` – the respondent identifier
+            - ``3_Ranked`` – comma-separated text of the 3 voices they selected
+            - ``qid36_rank_cols`` – dict-like column with their QID36 selection-
+              order values (for reference; these are *not* preference ranks)
+        """
+        # Get the top-3 ranking data (QID98-based)
+        top3, _ = self.get_top_3_voices(q)
+        top3_df = top3.collect()
+
+        ranking_cols = [c for c in top3_df.columns if c != '_recordId']
+
+        # Respondents where every QID98 ranking column is null
+        all_null_expr = pl.lit(True)
+        for col in ranking_cols:
+            all_null_expr = all_null_expr & pl.col(col).is_null()
+
+        missing_ids = top3_df.filter(all_null_expr).select('_recordId')
+
+        if missing_ids.height == 0:
+            return pl.DataFrame(schema={
+                '_recordId': pl.Utf8,
+                '3_Ranked': pl.Utf8,
+            })
+
+        # Enrich with the 3_Ranked text from the 18→8→3 question
+        v_18_8_3, _ = self.get_18_8_3(q)
+        v_df = v_18_8_3.collect()
+
+        result = missing_ids.join(
+            v_df.select(['_recordId', '3_Ranked']),
+            on='_recordId',
+            how='left',
+        )
+
+        return result
+
     
     def get_ss_orange_red(self, q: pl.LazyFrame) -> Union[pl.LazyFrame, dict]:
         """Extract columns containing the SS Orange/Red ratings for the Chase virtual assistant.