{ "cells": [ { "cell_type": "markdown", "id": "1ee02624", "metadata": {}, "source": [ "# Analysis: 38 Respondents Missing Voice Ranking Data (QID98)\n", "\n", "**Context:** 38 out of 455 respondents completed the \"pick top 3 from 8 voices\" step (QID36) but have **all-null** data for the explicit ranking question (QID98). This notebook investigates their demographics, survey metadata, and response patterns to identify commonalities.\n", "\n", "**Data source:** `2-4-26` export" ] }, { "cell_type": "code", "execution_count": 4, "id": "bd6df059", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Total respondents: 455\n" ] } ], "source": [ "import polars as pl\n", "import altair as alt\n", "from utils import QualtricsSurvey\n", "from validation import check_straight_liners\n", "from reference import VOICE_GENDER_MAPPING\n", "\n", "RESULTS_FILE = 'data/exports/2-4-26/JPMC_Chase Brand Personality_Quant Round 1_February 4, 2026_Labels.csv'\n", "QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'\n", "\n", "S = QualtricsSurvey(RESULTS_FILE, QSF_FILE, figures_dir=None)\n", "data_all = S.load_data()\n", "data = S.filter_data(data_all)\n", "\n", "n_total = data.select(pl.len()).collect().item()\n", "print(f'Total respondents: {n_total}')" ] }, { "cell_type": "code", "execution_count": 5, "id": "46afc18b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Respondents missing voice ranking (QID98): 38\n", "Respondents with complete data: 417\n", "Missing rate: 8.4%\n" ] } ], "source": [ "# Identify the 38 missing respondents\n", "missing = S.get_top_3_voices_missing_ranking(data)\n", "missing_ids = missing.select('_recordId')\n", "n_missing = missing_ids.height\n", "\n", "print(f'Respondents missing voice ranking (QID98): {n_missing}')\n", "print(f'Respondents with complete data: {n_total - n_missing}')\n", "print(f'Missing rate: {n_missing / n_total:.1%}')" ] }, { "cell_type": "code", "execution_count": 6, "id": "990eb4b4", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_1673001/2213083878.py:3: DeprecationWarning: `is_in` with a collection of the same datatype is ambiguous and deprecated.\n", "Please use `implode` to return to previous behavior.\n", "\n", "See https://github.com/pola-rs/polars/issues/22149 for more information.\n", " raw = raw.with_columns(\n" ] } ], "source": [ "# Build a column to tag respondents as missing / complete\n", "raw = data.collect()\n", "raw = raw.with_columns(\n", " pl.when(pl.col('_recordId').is_in(missing_ids['_recordId']))\n", " .then(pl.lit('Missing QID98'))\n", " .otherwise(pl.lit('Complete'))\n", " .alias('ranking_status')\n", ")" ] }, { "cell_type": "markdown", "id": "3d81b1b6", "metadata": {}, "source": [ "---\n", "## 1. Survey Metadata Analysis\n", "\n", "Check timestamps, duration, progress, and platform info to see if the 38 share a common survey-taking pattern." ] }, { "cell_type": "code", "execution_count": 7, "id": "be18a52c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Available metadata columns: ['startDate', 'endDate', 'duration', 'progress', 'finished', 'userLanguage', 'distributionChannel']\n" ] } ], "source": [ "# Check which metadata columns exist\n", "meta_candidates = ['startDate', 'endDate', 'duration', 'progress', 'finished',\n", " 'Status', 'userLanguage', 'distributionChannel', 'RecipientEmail']\n", "available_meta = [c for c in meta_candidates if c in raw.columns]\n", "print(f'Available metadata columns: {available_meta}')" ] }, { "cell_type": "code", "execution_count": 8, "id": "66e3060f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Duration comparison:\n", "shape: (2, 9)\n", "┌─────────────┬─────────────┬────────────┬─────────────┬───┬─────────┬─────┬──────────┬────────────┐\n", "│ ranking_sta ┆ mean_sec ┆ median_sec ┆ std_sec ┆ … ┆ max_sec ┆ n ┆ mean_min ┆ median_min │\n", "│ tus ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ --- ┆ f64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ u32 ┆ f64 ┆ f64 │\n", "│ str ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", "╞═════════════╪═════════════╪════════════╪═════════════╪═══╪═════════╪═════╪══════════╪════════════╡\n", "│ Complete ┆ 1634.642686 ┆ 1161.0 ┆ 2072.480875 ┆ … ┆ 21761.0 ┆ 417 ┆ 27.2 ┆ 19.4 │\n", "│ Missing ┆ 1152.078947 ┆ 871.5 ┆ 852.206211 ┆ … ┆ 4876.0 ┆ 38 ┆ 19.2 ┆ 14.5 │\n", "│ QID98 ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", "└─────────────┴─────────────┴────────────┴─────────────┴───┴─────────┴─────┴──────────┴────────────┘\n" ] } ], "source": [ "# Duration comparison\n", "if 'duration' in raw.columns:\n", " duration_stats = (\n", " raw.group_by('ranking_status')\n", " .agg(\n", " pl.col('duration').cast(pl.Float64).mean().alias('mean_sec'),\n", " pl.col('duration').cast(pl.Float64).median().alias('median_sec'),\n", " pl.col('duration').cast(pl.Float64).std().alias('std_sec'),\n", " pl.col('duration').cast(pl.Float64).min().alias('min_sec'),\n", " pl.col('duration').cast(pl.Float64).max().alias('max_sec'),\n", " pl.len().alias('n'),\n", " )\n", " .with_columns(\n", " (pl.col('mean_sec') / 60).round(1).alias('mean_min'),\n", " (pl.col('median_sec') / 60).round(1).alias('median_min'),\n", " )\n", " )\n", " print('Duration comparison:')\n", " print(duration_stats)\n", " \n", " # Histogram\n", " chart = alt.Chart(raw.select(['duration', 'ranking_status']).to_pandas()).mark_bar(\n", " opacity=0.6\n", " ).encode(\n", " x=alt.X('duration:Q', bin=alt.Bin(maxbins=40), title='Duration (seconds)'),\n", " y=alt.Y('count():Q', title='Count'),\n", " color='ranking_status:N',\n", " ).properties(width=700, height=300, title='Survey Duration Distribution')\n", " chart" ] }, { "cell_type": "code", "execution_count": 9, "id": "f0bd2b0f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Progress stats by group:\n", "shape: (2, 3)\n", "┌────────────────┬───────────────┬──────────────┐\n", "│ ranking_status ┆ mean_progress ┆ min_progress │\n", "│ --- ┆ --- ┆ --- │\n", "│ str ┆ f64 ┆ f64 │\n", "╞════════════════╪═══════════════╪══════════════╡\n", "│ Complete ┆ 100.0 ┆ 100.0 │\n", "│ Missing QID98 ┆ 100.0 ┆ 100.0 │\n", "└────────────────┴───────────────┴──────────────┘\n", "\n", "Finished status:\n", "shape: (2, 3)\n", "┌────────────────┬──────────┬─────┐\n", "│ ranking_status ┆ finished ┆ n │\n", "│ --- ┆ --- ┆ --- │\n", "│ str ┆ bool ┆ u32 │\n", "╞════════════════╪══════════╪═════╡\n", "│ Complete ┆ true ┆ 417 │\n", "│ Missing QID98 ┆ true ┆ 38 │\n", "└────────────────┴──────────┴─────┘\n" ] } ], "source": [ "# Progress and finished status\n", "if 'progress' in raw.columns:\n", " progress_by_status = (\n", " raw.group_by('ranking_status')\n", " .agg(\n", " pl.col('progress').cast(pl.Float64).mean().alias('mean_progress'),\n", " pl.col('progress').cast(pl.Float64).min().alias('min_progress'),\n", " pl.col('progress').value_counts().alias('progress_dist'),\n", " )\n", " )\n", " print('Progress stats by group:')\n", " print(progress_by_status.select(['ranking_status', 'mean_progress', 'min_progress']))\n", "\n", "if 'finished' in raw.columns:\n", " finished_by_status = raw.group_by(['ranking_status', 'finished']).agg(pl.len().alias('n'))\n", " print('\\nFinished status:')\n", " print(finished_by_status.sort('ranking_status'))" ] }, { "cell_type": "code", "execution_count": 10, "id": "0bfdb060", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Timestamp ranges by group:\n", "shape: (2, 4)\n", "┌────────────────┬─────────────────────┬─────────────────────┬─────────────────────┐\n", "│ ranking_status ┆ earliest ┆ latest ┆ median_start │\n", "│ --- ┆ --- ┆ --- ┆ --- │\n", "│ str ┆ datetime[μs] ┆ datetime[μs] ┆ datetime[μs] │\n", "╞════════════════╪═════════════════════╪═════════════════════╪═════════════════════╡\n", "│ Missing QID98 ┆ 2026-01-20 16:16:18 ┆ 2026-01-30 12:43:07 ┆ 2026-01-27 23:30:15 │\n", "│ Complete ┆ 2026-01-20 15:00:44 ┆ 2026-01-30 23:53:40 ┆ 2026-01-28 05:46:12 │\n", "└────────────────┴─────────────────────┴─────────────────────┴─────────────────────┘\n" ] } ], "source": [ "# Timestamps: check if the missing respondents cluster in a specific time window\n", "if 'startDate' in raw.columns:\n", " ts = raw.select(['_recordId', 'ranking_status', 'startDate', 'endDate']).with_columns(\n", " pl.col('startDate').str.to_datetime('%Y-%m-%d %H:%M:%S', strict=False).alias('start_dt'),\n", " pl.col('endDate').str.to_datetime('%Y-%m-%d %H:%M:%S', strict=False).alias('end_dt'),\n", " )\n", " \n", " # Time range per group\n", " time_stats = ts.group_by('ranking_status').agg(\n", " pl.col('start_dt').min().alias('earliest'),\n", " pl.col('start_dt').max().alias('latest'),\n", " pl.col('start_dt').median().alias('median_start'),\n", " )\n", " print('Timestamp ranges by group:')\n", " print(time_stats)\n", " \n", " # Timeline scatter plot\n", " chart = alt.Chart(ts.to_pandas()).mark_circle(size=40).encode(\n", " x=alt.X('start_dt:T', title='Survey Start Time'),\n", " y=alt.Y('ranking_status:N', title=''),\n", " color='ranking_status:N',\n", " tooltip=['_recordId', 'start_dt:T', 'end_dt:T'],\n", " ).properties(width=700, height=150, title='Survey Start Times')\n", " chart" ] }, { "cell_type": "markdown", "id": "d81aede5", "metadata": {}, "source": [ "---\n", "## 2. Demographic Profile Comparison\n", "\n", "Compare age, gender, ethnicity, consumer segment, income, etc. between the two groups." ] }, { "cell_type": "code", "execution_count": 11, "id": "cad1b204", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Demographic columns: ['Age', 'Gender', 'Race/Ethnicity', 'Bussiness_Owner', 'Business_Revenue', 'Employment', 'Personal_Products', 'Income', 'Investable_Assets', 'Industry', 'Consumer']\n" ] } ], "source": [ "# Get demographics and join with ranking status\n", "demos_df = S.get_demographics(data)[0].collect()\n", "demos_df = demos_df.join(\n", " raw.select(['_recordId', 'ranking_status']),\n", " on='_recordId'\n", ")\n", "\n", "demo_cols = [c for c in demos_df.columns if c not in ('_recordId', 'ranking_status')]\n", "print(f'Demographic columns: {demo_cols}')" ] }, { "cell_type": "code", "execution_count": 12, "id": "65fdfd6b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "============================================================\n", "Age (% within each group)\n", "============================================================\n", "shape: (8, 4)\n", "┌──────────────────┬───────────────┬──────────┬───────────┐\n", "│ Age ┆ Missing QID98 ┆ Complete ┆ diff_ppts │\n", "│ --- ┆ --- ┆ --- ┆ --- │\n", "│ str ┆ f64 ┆ f64 ┆ f64 │\n", "╞══════════════════╪═══════════════╪══════════╪═══════════╡\n", "│ 18 to 21 years ┆ 15.8 ┆ 14.9 ┆ 0.9 │\n", "│ 22 to 24 years ┆ 13.2 ┆ 19.2 ┆ -6.0 │\n", "│ 25 to 34 years ┆ 7.9 ┆ 17.0 ┆ -9.1 │\n", "│ 35 to 40 years ┆ 23.7 ┆ 13.2 ┆ 10.5 │\n", "│ 41 to 50 years ┆ 34.2 ┆ 16.8 ┆ 17.4 │\n", "│ 51 to 59 years ┆ 2.6 ┆ 6.0 ┆ -3.4 │\n", "│ 60 to 70 years ┆ null ┆ 8.2 ┆ -8.2 │\n", "│ 70 years or more ┆ 2.6 ┆ 4.8 ┆ -2.2 │\n", "└──────────────────┴───────────────┴──────────┴───────────┘\n", "\n", "============================================================\n", "Gender (% within each group)\n", "============================================================\n", "shape: (3, 4)\n", "┌───────────────────┬──────────┬───────────────┬───────────┐\n", "│ Gender ┆ Complete ┆ Missing QID98 ┆ diff_ppts │\n", "│ --- ┆ --- ┆ --- ┆ --- │\n", "│ str ┆ f64 ┆ f64 ┆ f64 │\n", "╞═══════════════════╪══════════╪═══════════════╪═══════════╡\n", "│ Man ┆ 59.2 ┆ 65.8 ┆ 6.6 │\n", "│ Prefer not to say ┆ 0.2 ┆ null ┆ -0.2 │\n", "│ Woman ┆ 40.5 ┆ 34.2 ┆ -6.3 │\n", "└───────────────────┴──────────┴───────────────┴───────────┘\n", "\n", "============================================================\n", "Race/Ethnicity (% within each group)\n", "============================================================\n", "shape: (21, 4)\n", "┌─────────────────────────────────┬──────────┬───────────────┬───────────┐\n", "│ Race/Ethnicity ┆ Complete ┆ Missing QID98 ┆ diff_ppts │\n", "│ --- ┆ --- ┆ --- ┆ --- │\n", "│ str ┆ f64 ┆ f64 ┆ f64 │\n", "╞═════════════════════════════════╪══════════╪═══════════════╪═══════════╡\n", "│ Alaska Native or Indigenous Am… ┆ 0.7 ┆ null ┆ -0.7 │\n", "│ Alaska Native or Indigenous Am… ┆ 0.2 ┆ null ┆ -0.2 │\n", "│ Alaska Native or Indigenous Am… ┆ 0.5 ┆ null ┆ -0.5 │\n", "│ Alaska Native or Indigenous Am… ┆ 0.5 ┆ null ┆ -0.5 │\n", "│ Alaska Native or Indigenous Am… ┆ 0.5 ┆ null ┆ -0.5 │\n", "│ … ┆ … ┆ … ┆ … │\n", "│ Hispanic or Latinx ┆ 7.4 ┆ 7.9 ┆ 0.5 │\n", "│ Hispanic or Latinx,White or Ca… ┆ 6.0 ┆ null ┆ -6.0 │\n", "│ Middle Eastern or North Africa… ┆ 0.2 ┆ null ┆ -0.2 │\n", "│ Native Hawaiian or Other Pacif… ┆ 0.5 ┆ null ┆ -0.5 │\n", "│ White or Caucasian ┆ 50.8 ┆ 52.6 ┆ 1.8 │\n", "└─────────────────────────────────┴──────────┴───────────────┴───────────┘\n", "\n", "============================================================\n", "Bussiness_Owner (% within each group)\n", "============================================================\n", "shape: (2, 4)\n", "┌─────────────────┬───────────────┬──────────┬───────────┐\n", "│ Bussiness_Owner ┆ Missing QID98 ┆ Complete ┆ diff_ppts │\n", "│ --- ┆ --- ┆ --- ┆ --- │\n", "│ str ┆ f64 ┆ f64 ┆ f64 │\n", "╞═════════════════╪═══════════════╪══════════╪═══════════╡\n", "│ No ┆ 73.5 ┆ 72.8 ┆ 0.7 │\n", "│ Yes ┆ 26.5 ┆ 27.2 ┆ -0.7 │\n", "└─────────────────┴───────────────┴──────────┴───────────┘\n", "\n", "============================================================\n", "Business_Revenue (% within each group)\n", "============================================================\n", "shape: (6, 4)\n", "┌──────────────────────────────┬──────────┬───────────────┬───────────┐\n", "│ Business_Revenue ┆ Complete ┆ Missing QID98 ┆ diff_ppts │\n", "│ --- ┆ --- ┆ --- ┆ --- │\n", "│ str ┆ f64 ┆ f64 ┆ f64 │\n", "╞══════════════════════════════╪══════════╪═══════════════╪═══════════╡\n", "│ $1 million to $4.9 million ┆ 27.7 ┆ 66.7 ┆ 39.0 │\n", "│ $10 million to $19.9 million ┆ 9.6 ┆ null ┆ -9.6 │\n", "│ $100,00 to $249,999 ┆ 13.3 ┆ null ┆ -13.3 │\n", "│ $250,000 to $499,999 ┆ 10.8 ┆ 11.1 ┆ 0.3 │\n", "│ $5 million to $9.9 million ┆ 14.5 ┆ null ┆ -14.5 │\n", "│ $500,00 to $999,999 ┆ 24.1 ┆ 22.2 ┆ -1.9 │\n", "└──────────────────────────────┴──────────┴───────────────┴───────────┘\n", "\n", "============================================================\n", "Employment (% within each group)\n", "============================================================\n", "shape: (0, 1)\n", "┌────────────┐\n", "│ Employment │\n", "│ --- │\n", "│ str │\n", "╞════════════╡\n", "└────────────┘\n", "\n", "============================================================\n", "Personal_Products (% within each group)\n", "============================================================\n", "shape: (0, 1)\n", "┌───────────────────┐\n", "│ Personal_Products │\n", "│ --- │\n", "│ str │\n", "╞═══════════════════╡\n", "└───────────────────┘\n", "\n", "============================================================\n", "Income (% within each group)\n", "============================================================\n", "shape: (8, 4)\n", "┌──────────────────────┬──────────┬───────────────┬───────────┐\n", "│ Income ┆ Complete ┆ Missing QID98 ┆ diff_ppts │\n", "│ --- ┆ --- ┆ --- ┆ --- │\n", "│ str ┆ f64 ┆ f64 ┆ f64 │\n", "╞══════════════════════╪══════════╪═══════════════╪═══════════╡\n", "│ $100,000 to $149,999 ┆ 16.3 ┆ 15.8 ┆ -0.5 │\n", "│ $150,000 to $199,999 ┆ 10.8 ┆ 7.9 ┆ -2.9 │\n", "│ $200,000 or more ┆ 3.6 ┆ 10.5 ┆ 6.9 │\n", "│ $25,000 to $34,999 ┆ 11.3 ┆ 13.2 ┆ 1.9 │\n", "│ $35,000 to $54,999 ┆ 14.1 ┆ 15.8 ┆ 1.7 │\n", "│ $55,000 to $79,999 ┆ 18.0 ┆ 10.5 ┆ -7.5 │\n", "│ $80,000 to $99,999 ┆ 12.0 ┆ 21.1 ┆ 9.1 │\n", "│ Less than $25,000 ┆ 13.9 ┆ 5.3 ┆ -8.6 │\n", "└──────────────────────┴──────────┴───────────────┴───────────┘\n", "\n", "============================================================\n", "Investable_Assets (% within each group)\n", "============================================================\n", "shape: (9, 4)\n", "┌──────────────────────┬───────────────┬──────────┬───────────┐\n", "│ Investable_Assets ┆ Missing QID98 ┆ Complete ┆ diff_ppts │\n", "│ --- ┆ --- ┆ --- ┆ --- │\n", "│ str ┆ f64 ┆ f64 ┆ f64 │\n", "╞══════════════════════╪═══════════════╪══════════╪═══════════╡\n", "│ $0 to $24,999 ┆ 23.7 ┆ 21.1 ┆ 2.6 │\n", "│ $150,000 to $249,999 ┆ 5.3 ┆ 12.0 ┆ -6.7 │\n", "│ $1M to $4.9M ┆ 5.3 ┆ 4.8 ┆ 0.5 │\n", "│ $25,000 to $49,999 ┆ 15.8 ┆ 19.9 ┆ -4.1 │\n", "│ $250,000 to $499,999 ┆ 10.5 ┆ 12.5 ┆ -2.0 │\n", "│ $50,000 to $149,999 ┆ 26.3 ┆ 16.8 ┆ 9.5 │\n", "│ $500,000 to $999,999 ┆ 10.5 ┆ 11.0 ┆ -0.5 │\n", "│ $5M or more ┆ null ┆ 0.7 ┆ -0.7 │\n", "│ Prefer not to answer ┆ 2.6 ┆ 1.2 ┆ 1.4 │\n", "└──────────────────────┴───────────────┴──────────┴───────────┘\n", "\n", "============================================================\n", "Industry (% within each group)\n", "============================================================\n", "shape: (26, 4)\n", "┌─────────────────────────────────┬──────────┬───────────────┬───────────┐\n", "│ Industry ┆ Complete ┆ Missing QID98 ┆ diff_ppts │\n", "│ --- ┆ --- ┆ --- ┆ --- │\n", "│ str ┆ f64 ┆ f64 ┆ f64 │\n", "╞═════════════════════════════════╪══════════╪═══════════════╪═══════════╡\n", "│ Agriculture, forestry, fishing… ┆ 0.5 ┆ null ┆ -0.5 │\n", "│ Arts, entertainment, or recrea… ┆ 2.9 ┆ 2.6 ┆ -0.3 │\n", "│ Broadcasting ┆ 0.5 ┆ null ┆ -0.5 │\n", "│ Construction ┆ 10.8 ┆ 10.5 ┆ -0.3 │\n", "│ Education – College, universit… ┆ 3.8 ┆ 2.6 ┆ -1.2 │\n", "│ … ┆ … ┆ … ┆ … │\n", "│ Software ┆ 9.1 ┆ 13.2 ┆ 4.1 │\n", "│ Telecommunications ┆ 0.7 ┆ 2.6 ┆ 1.9 │\n", "│ Transportation and warehousing ┆ 4.1 ┆ 2.6 ┆ -1.5 │\n", "│ Utilities ┆ 1.0 ┆ 2.6 ┆ 1.6 │\n", "│ Wholesale ┆ 6.5 ┆ 7.9 ┆ 1.4 │\n", "└─────────────────────────────────┴──────────┴───────────────┴───────────┘\n", "\n", "============================================================\n", "Consumer (% within each group)\n", "============================================================\n", "shape: (10, 4)\n", "┌─────────────────────────────────┬──────────┬───────────────┬───────────┐\n", "│ Consumer ┆ Complete ┆ Missing QID98 ┆ diff_ppts │\n", "│ --- ┆ --- ┆ --- ┆ --- │\n", "│ str ┆ f64 ┆ f64 ┆ f64 │\n", "╞═════════════════════════════════╪══════════╪═══════════════╪═══════════╡\n", "│ Early_Professional ┆ 2.9 ┆ null ┆ -2.9 │\n", "│ Lower_Mass_A ┆ 7.5 ┆ 13.3 ┆ 5.8 │\n", "│ Lower_Mass_B ┆ 2.6 ┆ null ┆ -2.6 │\n", "│ MassAffluent_A ┆ 7.5 ┆ 13.3 ┆ 5.8 │\n", "│ MassAffluent_B ┆ 2.9 ┆ null ┆ -2.9 │\n", "│ Mass_A ┆ 19.5 ┆ 23.3 ┆ 3.8 │\n", "│ Mass_B ┆ 5.7 ┆ 3.3 ┆ -2.4 │\n", "│ Mix_of_Affluent_Wealth_&_High_… ┆ 7.8 ┆ 10.0 ┆ 2.2 │\n", "│ Mix_of_Affluent_Wealth_&_High_… ┆ 2.9 ┆ null ┆ -2.9 │\n", "│ Starter ┆ 40.8 ┆ 36.7 ┆ -4.1 │\n", "└─────────────────────────────────┴──────────┴───────────────┴───────────┘\n" ] } ], "source": [ "# Percentage distribution comparison for each demographic\n", "def compare_distribution(df: pl.DataFrame, col: str, status_col: str = 'ranking_status'):\n", " \"\"\"Show side-by-side percentage distributions for a column.\"\"\"\n", " counts = (\n", " df.filter(pl.col(col).is_not_null())\n", " .group_by([status_col, col])\n", " .agg(pl.len().alias('n'))\n", " )\n", " totals = counts.group_by(status_col).agg(pl.col('n').sum().alias('total'))\n", " pcts = counts.join(totals, on=status_col).with_columns(\n", " (pl.col('n') / pl.col('total') * 100).round(1).alias('pct')\n", " )\n", " # Pivot for readability\n", " pivot = pcts.pivot(\n", " on=status_col,\n", " index=col,\n", " values='pct',\n", " ).sort(col)\n", " \n", " # Add difference column if both groups exist\n", " if 'Missing QID98' in pivot.columns and 'Complete' in pivot.columns:\n", " pivot = pivot.with_columns(\n", " (pl.col('Missing QID98').fill_null(0) - pl.col('Complete').fill_null(0))\n", " .round(1)\n", " .alias('diff_ppts')\n", " )\n", " return pivot\n", "\n", "\n", "for col in demo_cols:\n", " print(f'\\n{\"=\" * 60}')\n", " print(f'{col} (% within each group)')\n", " print('=' * 60)\n", " print(compare_distribution(demos_df, col))" ] }, { "cell_type": "code", "execution_count": 13, "id": "4e53657a", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "
\n", "" ], "text/plain": [ "alt.HConcatChart(...)" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Visual comparison of key demographics\n", "charts = []\n", "for col in ['Age', 'Gender', 'Consumer']:\n", " if col not in demos_df.columns:\n", " continue\n", " _df = demos_df.filter(pl.col(col).is_not_null())\n", " _counts = _df.group_by(['ranking_status', col]).agg(pl.len().alias('n'))\n", " _totals = _counts.group_by('ranking_status').agg(pl.col('n').sum().alias('total'))\n", " _pcts = _counts.join(_totals, on='ranking_status').with_columns(\n", " (pl.col('n') / pl.col('total') * 100).round(1).alias('pct')\n", " )\n", " c = alt.Chart(_pcts.to_pandas()).mark_bar().encode(\n", " x=alt.X(f'{col}:N', title=col),\n", " y=alt.Y('pct:Q', title='% of group'),\n", " color='ranking_status:N',\n", " xOffset='ranking_status:N',\n", " tooltip=[col, 'ranking_status', 'n', 'pct'],\n", " ).properties(width=350, height=250, title=f'{col} Distribution')\n", " charts.append(c)\n", "\n", "alt.hconcat(*charts)" ] }, { "cell_type": "markdown", "id": "d847c8cc", "metadata": {}, "source": [ "---\n", "## 3. Survey Flow: Which Questions Were Answered?\n", "\n", "Check which questions the 38 respondents answered vs skipped to identify where the survey flow diverged." ] }, { "cell_type": "code", "execution_count": 14, "id": "794e312e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Null rates (%) for key questions by group:\n", "shape: (9, 4)\n", "┌─────────────────────────────────┬───────────────┬───────────────┬────────────────┐\n", "│ Question ┆ QID ┆ Missing_null% ┆ Complete_null% │\n", "│ --- ┆ --- ┆ --- ┆ --- │\n", "│ str ┆ str ┆ f64 ┆ f64 │\n", "╞═════════════════════════════════╪═══════════════╪═══════════════╪════════════════╡\n", "│ Demographics (QID1 Age) ┆ QID1 ┆ 0.0 ┆ 0.0 │\n", "│ Demographics (QID2 Gender) ┆ QID2 ┆ 0.0 ┆ 0.0 │\n", "│ Top 8 Traits (QID25) ┆ QID25 ┆ 0.0 ┆ 0.0 │\n", "│ Character Ranking (QID27_1) ┆ QID27_1 ┆ 0.0 ┆ 0.0 │\n", "│ 18→8 Set A (QID29) ┆ QID29 ┆ 0.0 ┆ 0.0 │\n", "│ 18→8 Set B (QID101) ┆ QID101 ┆ 0.0 ┆ 0.0 │\n", "│ 8→3 Selection (QID36_0_GROUP) ┆ QID36_0_GROUP ┆ 0.0 ┆ 0.0 │\n", "│ Voice Ranking Q (QID98_1) ┆ QID98_1 ┆ 100.0 ┆ 74.8 │\n", "│ Voice Scale 1-10 first (QID98_… ┆ QID98_2 ┆ 100.0 ┆ 77.5 │\n", "└─────────────────────────────────┴───────────────┴───────────────┴────────────────┘\n" ] } ], "source": [ "# Check null rates for key question groups: missing vs complete respondents\n", "missing_raw = raw.filter(pl.col('ranking_status') == 'Missing QID98')\n", "complete_raw = raw.filter(pl.col('ranking_status') == 'Complete')\n", "\n", "# Key question groups to check\n", "question_groups = {\n", " 'Demographics (QID1 Age)': 'QID1',\n", " 'Demographics (QID2 Gender)': 'QID2',\n", " 'Top 8 Traits (QID25)': 'QID25',\n", " 'Character Ranking (QID27_1)': 'QID27_1',\n", " '18→8 Set A (QID29)': 'QID29',\n", " '18→8 Set B (QID101)': 'QID101',\n", " '8→3 Selection (QID36_0_GROUP)': 'QID36_0_GROUP',\n", " 'Voice Ranking Q (QID98_1)': 'QID98_1',\n", " 'Voice Scale 1-10 first (QID98_2)': 'QID98_2',\n", "}\n", "\n", "null_comparison = []\n", "for label, qid in question_groups.items():\n", " if qid not in raw.columns:\n", " null_comparison.append({'Question': label, 'QID': qid, 'Missing_null%': 'N/A', 'Complete_null%': 'N/A'})\n", " continue\n", " m_null = missing_raw[qid].null_count() / missing_raw.height * 100\n", " c_null = complete_raw[qid].null_count() / complete_raw.height * 100\n", " null_comparison.append({\n", " 'Question': label,\n", " 'QID': qid,\n", " 'Missing_null%': round(m_null, 1),\n", " 'Complete_null%': round(c_null, 1),\n", " })\n", "\n", "null_df = pl.DataFrame(null_comparison)\n", "print('Null rates (%) for key questions by group:')\n", "print(null_df)" ] }, { "cell_type": "code", "execution_count": 15, "id": "fdc504ea", "metadata": {}, "outputs": [ { "ename": "ColumnNotFoundError", "evalue": "unable to find column \"diff_ppts\"; valid columns: []", "output_type": "error", "traceback": [ "\u001b[31m---------------------------------------------------------------------------\u001b[39m", "\u001b[31mColumnNotFoundError\u001b[39m Traceback (most recent call last)", "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[15]\u001b[39m\u001b[32m, line 20\u001b[39m\n\u001b[32m 11\u001b[39m desc = S.qid_descr_map.get(col, {}).get(\u001b[33m'\u001b[39m\u001b[33mQName\u001b[39m\u001b[33m'\u001b[39m, col)\n\u001b[32m 12\u001b[39m diff_cols.append({\n\u001b[32m 13\u001b[39m \u001b[33m'\u001b[39m\u001b[33mcolumn\u001b[39m\u001b[33m'\u001b[39m: col,\n\u001b[32m 14\u001b[39m \u001b[33m'\u001b[39m\u001b[33mdescription\u001b[39m\u001b[33m'\u001b[39m: desc,\n\u001b[32m (...)\u001b[39m\u001b[32m 17\u001b[39m \u001b[33m'\u001b[39m\u001b[33mdiff_ppts\u001b[39m\u001b[33m'\u001b[39m: \u001b[38;5;28mround\u001b[39m(diff, \u001b[32m1\u001b[39m),\n\u001b[32m 18\u001b[39m })\n\u001b[32m---> \u001b[39m\u001b[32m20\u001b[39m diff_df = \u001b[43mpl\u001b[49m\u001b[43m.\u001b[49m\u001b[43mDataFrame\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdiff_cols\u001b[49m\u001b[43m)\u001b[49m\u001b[43m.\u001b[49m\u001b[43msort\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mdiff_ppts\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdescending\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[32m 21\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m'\u001b[39m\u001b[33mColumns with >30 ppts null-rate difference (\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mdiff_df.height\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m found):\u001b[39m\u001b[33m'\u001b[39m)\n\u001b[32m 22\u001b[39m \u001b[38;5;28mprint\u001b[39m(diff_df.head(\u001b[32m30\u001b[39m))\n", "\u001b[36mFile \u001b[39m\u001b[32m~/Documents/VoiceBranding/JPMC/Phase-3/.venv/lib/python3.12/site-packages/polars/dataframe/frame.py:5965\u001b[39m, in \u001b[36mDataFrame.sort\u001b[39m\u001b[34m(self, by, descending, nulls_last, multithreaded, maintain_order, *more_by)\u001b[39m\n\u001b[32m 5867\u001b[39m \u001b[38;5;250m\u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 5868\u001b[39m \u001b[33;03mSort the dataframe by the given columns.\u001b[39;00m\n\u001b[32m 5869\u001b[39m \n\u001b[32m (...)\u001b[39m\u001b[32m 5951\u001b[39m \u001b[33;03m└──────┴─────┴─────┘\u001b[39;00m\n\u001b[32m 5952\u001b[39m \u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 5953\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mpolars\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mlazyframe\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m QueryOptFlags\n\u001b[32m 5955\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m (\n\u001b[32m 5956\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mlazy\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 5957\u001b[39m \u001b[43m \u001b[49m\u001b[43m.\u001b[49m\u001b[43msort\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 5958\u001b[39m \u001b[43m \u001b[49m\u001b[43mby\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 5959\u001b[39m \u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43mmore_by\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 5960\u001b[39m \u001b[43m \u001b[49m\u001b[43mdescending\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdescending\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 5961\u001b[39m \u001b[43m \u001b[49m\u001b[43mnulls_last\u001b[49m\u001b[43m=\u001b[49m\u001b[43mnulls_last\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 5962\u001b[39m \u001b[43m \u001b[49m\u001b[43mmultithreaded\u001b[49m\u001b[43m=\u001b[49m\u001b[43mmultithreaded\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 5963\u001b[39m \u001b[43m \u001b[49m\u001b[43mmaintain_order\u001b[49m\u001b[43m=\u001b[49m\u001b[43mmaintain_order\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 5964\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m-> \u001b[39m\u001b[32m5965\u001b[39m \u001b[43m \u001b[49m\u001b[43m.\u001b[49m\u001b[43mcollect\u001b[49m\u001b[43m(\u001b[49m\u001b[43moptimizations\u001b[49m\u001b[43m=\u001b[49m\u001b[43mQueryOptFlags\u001b[49m\u001b[43m.\u001b[49m\u001b[43m_eager\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 5966\u001b[39m )\n", "\u001b[36mFile \u001b[39m\u001b[32m~/Documents/VoiceBranding/JPMC/Phase-3/.venv/lib/python3.12/site-packages/polars/_utils/deprecation.py:97\u001b[39m, in \u001b[36mdeprecate_streaming_parameter.