{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "7174c11a", "metadata": {}, "outputs": [], "source": [ "import polars as pl\n", "from utils import QualtricsSurvey, calculate_weighted_ranking_scores" ] }, { "cell_type": "code", "execution_count": 2, "id": "d9d11d52", "metadata": {}, "outputs": [], "source": [ "SPEECH_DATA = 'data/speech_data/JPMC Speech data ab samples (Final Speech Data AB samples).csv'\n", "RESULTS_FILE = 'data/exports/2-4-26/JPMC_Chase Brand Personality_Quant Round 1_February 4, 2026_Labels.csv'\n", "QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'" ] }, { "cell_type": "code", "execution_count": 37, "id": "c8f06ff8", "metadata": {}, "outputs": [], "source": [ "speech_df = pl.read_csv(SPEECH_DATA, separator=';')\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "id": "8da85898", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 38, "id": "e7ccd8ef", "metadata": {}, "outputs": [], "source": [ "\n", "# Convert comma decimal separator to period and cast to float\n", "cols_to_convert = [\n", " pl.col('dur (s)').str.replace(',', '.').cast(pl.Float64),\n", " pl.col('phonationtime (s)').str.replace(',', '.').cast(pl.Float64),\n", " pl.col('articulation rate (nsyll / phonationtime)').str.replace(',', '.').cast(pl.Float64),\n", " pl.col('speech rate words per minute').str.replace(',', '.').cast(pl.Float64),\n", "]\n", "\n", "# Convert specified columns to float, handling percent signs, commas and spaces\n", "if \"Standard deviation pitch\" in speech_df.columns:\n", " cols_to_convert.append(\n", " pl.col(\"Standard deviation pitch\")\n", " .cast(pl.Utf8)\n", " .str.replace(\",\", \".\")\n", " .str.replace(\" \", \"\")\n", " # strict=False converts unparseable strings (like empty ones) to Null\n", " .cast(pl.Float64, strict=False)\n", " )\n", "\n", "for col_name in [\"Jitter (local)\", \"Shimmer (local)\"]:\n", " if col_name in speech_df.columns:\n", " cols_to_convert.append(\n", " pl.col(col_name)\n", " .cast(pl.Utf8)\n", " .str.replace(\"%\", \"\")\n", " .str.replace(\" \", \"\")\n", " .str.replace(\",\", \".\")\n", " .cast(pl.Float64, strict=False)\n", " )\n", "\n", "if cols_to_convert:\n", " speech_df = speech_df.with_columns(cols_to_convert)\n", "\n", "# speech_df\n" ] }, { "cell_type": "code", "execution_count": 39, "id": "450d1d29", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "shape: (18, 3)\n", "┌───────┬────────┬──────┐\n", "│ Voice ┆ Gender ┆ Age │\n", "│ --- ┆ --- ┆ --- │\n", "│ str ┆ str ┆ i64 │\n", "╞═══════╪════════╪══════╡\n", "│ V04 ┆ female ┆ 28 │\n", "│ V08 ┆ female ┆ 45 │\n", "│ V14 ┆ female ┆ 50 │\n", "│ V16 ┆ male ┆ 40 │\n", "│ V34 ┆ male ┆ 42 │\n", "│ … ┆ … ┆ … │\n", "│ V82 ┆ female ┆ null │\n", "│ V86 ┆ male ┆ 62 │\n", "│ V88 ┆ male ┆ 42 │\n", "│ V89 ┆ female ┆ 32 │\n", "│ V91 ┆ female ┆ null │\n", "└───────┴────────┴──────┘\n" ] } ], "source": [ "# Convert Voice ints to zero-padded strings like \"V04\", \"V81\"\n", "# Survey uses zero-padded IDs (V04, V08) so we must match that format\n", "speech_df = speech_df.with_columns(\n", " pl.when(pl.col(\"Voice\").is_not_null())\n", " .then(pl.concat_str([pl.lit(\"V\"), pl.col(\"Voice\").cast(pl.Utf8).str.zfill(2)]))\n", " .otherwise(None)\n", " .alias(\"Voice\")\n", ")\n", "\n", "print(speech_df.select([\"Voice\", \"Gender\", \"Age\"]).sort(\"Voice\"))" ] }, { "cell_type": "markdown", "id": "5fb615fe", "metadata": {}, "source": [ "# Get survey data" ] }, { "cell_type": "code", "execution_count": 7, "id": "bb4200ee", "metadata": {}, "outputs": [], "source": [ "S = QualtricsSurvey(RESULTS_FILE, QSF_FILE, figures_dir=None)\n", "data_all = S.load_data()\n", "data = S.filter_data(data_all)" ] }, { "cell_type": "code", "execution_count": 8, "id": "57243afd", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "shape: (5, 2)\n", "┌───────────┬────────────────┐\n", "│ Character ┆ Weighted Score │\n", "│ --- ┆ --- │\n", "│ str ┆ i64 │\n", "╞═══════════╪════════════════╡\n", "│ V14 ┆ 209 │\n", "│ V04 ┆ 209 │\n", "│ V08 ┆ 180 │\n", "│ V82 ┆ 172 │\n", "│ V77 ┆ 158 │\n", "└───────────┴────────────────┘\n" ] } ], "source": [ "top3_voices = S.get_top_3_voices(data)[0]\n", "voices_weighted_rank = calculate_weighted_ranking_scores(top3_voices)\n", "print(voices_weighted_rank.head())" ] }, { "cell_type": "code", "execution_count": 9, "id": "b38d21fc", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "shape: (5, 18)\n", "┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐\n", "│ _recordId ┆ Voice_Sca ┆ Voice_Sca ┆ Voice_Sca ┆ … ┆ Voice_Sca ┆ Voice_Sca ┆ Voice_Sca ┆ Voice_Sc │\n", "│ --- ┆ le_1_10__ ┆ le_1_10__ ┆ le_1_10__ ┆ ┆ le_1_10__ ┆ le_1_10__ ┆ le_1_10__ ┆ ale_1_10 │\n", "│ str ┆ V14 ┆ V04 ┆ V08 ┆ ┆ V74 ┆ V81 ┆ V86 ┆ __V88 │\n", "│ ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ ┆ f64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", "╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡\n", "│ R_59pdrC3 ┆ null ┆ null ┆ null ┆ … ┆ null ┆ null ┆ 5.5 ┆ null │\n", "│ urLmZnbP ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", "│ R_3fJSKy5 ┆ 6.0 ┆ 5.0 ┆ null ┆ … ┆ null ┆ null ┆ 6.0 ┆ null │\n", "│ SVxmNdBC ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", "│ R_3g11G0u ┆ 9.5 ┆ null ┆ 5.0 ┆ … ┆ null ┆ null ┆ null ┆ 9.5 │\n", "│ pJ7iGt8Q ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", "│ R_3i3dGL7 ┆ 6.0 ┆ 9.0 ┆ 8.0 ┆ … ┆ null ┆ 2.0 ┆ 3.0 ┆ null │\n", "│ cfLOTgxb ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", "│ R_3BBF1fR ┆ null ┆ null ┆ null ┆ … ┆ 6.0 ┆ null ┆ 8.5 ┆ null │\n", "│ WGGeButr ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", "└───────────┴───────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴──────────┘\n" ] } ], "source": [ "voice_1_10 = S.get_voice_scale_1_10(data)[0].collect()\n", "print(voice_1_10.head())\n", "\n" ] }, { "cell_type": "code", "execution_count": 10, "id": "5b3e6ad0", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "shape: (5, 2)\n", "┌───────┬────────────────────────┐\n", "│ Voice ┆ Avg Voice Score (1-10) │\n", "│ --- ┆ --- │\n", "│ str ┆ f64 │\n", "╞═══════╪════════════════════════╡\n", "│ V08 ┆ 7.38172 │\n", "│ V82 ┆ 7.376984 │\n", "│ V89 ┆ 7.373206 │\n", "│ V86 ┆ 7.264444 │\n", "│ V69 ┆ 7.219577 │\n", "└───────┴────────────────────────┘\n" ] } ], "source": [ "# --- Compute average voice score (1-10) per voice ---\n", "voice_cols = [c for c in voice_1_10.columns if c.startswith(\"Voice_Scale_1_10__\")]\n", "avg_scores = []\n", "for col in voice_cols:\n", " voice_id = col.replace(\"Voice_Scale_1_10__\", \"\") # e.g. \"V14\"\n", " mean_val = voice_1_10.select(pl.col(col).mean()).item()\n", " avg_scores.append({\"Voice\": voice_id, \"Avg Voice Score (1-10)\": mean_val})\n", "\n", "avg_voice_scores = pl.DataFrame(avg_scores)\n", "print(avg_voice_scores.sort(\"Avg Voice Score (1-10)\", descending=True).head())" ] }, { "cell_type": "code", "execution_count": 40, "id": "79626ffb", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Merged rows: 18 (voices with both speech data and survey data)\n", " → Voices missing Avg Voice Score: ['V46']\n", "shape: (5, 4)\n", "┌───────┬────────┬────────────────────────┬────────────────┐\n", "│ Voice ┆ Gender ┆ Avg Voice Score (1-10) ┆ Weighted Score │\n", "│ --- ┆ --- ┆ --- ┆ --- │\n", "│ str ┆ str ┆ f64 ┆ i64 │\n", "╞═══════╪════════╪════════════════════════╪════════════════╡\n", "│ V14 ┆ female ┆ 7.216279 ┆ 209 │\n", "│ V04 ┆ female ┆ 7.07971 ┆ 209 │\n", "│ V08 ┆ female ┆ 7.38172 ┆ 180 │\n", "│ V82 ┆ female ┆ 7.376984 ┆ 172 │\n", "│ V77 ┆ female ┆ 6.960894 ┆ 158 │\n", "└───────┴────────┴────────────────────────┴────────────────┘\n" ] } ], "source": [ "# --- Normalize weighted rank column name and join all data ---\n", "weighted_rank = voices_weighted_rank.rename({\"Character\": \"Voice\"})\n", "\n", "# Join speech attributes with both survey metrics\n", "# Left join on avg_voice_scores so V46 (excluded from survey voice scale) is kept\n", "# — its Avg Score will be null but Weighted Ranking Score is still valid\n", "merged = (\n", " speech_df\n", " .join(avg_voice_scores, on=\"Voice\", how=\"left\")\n", " .join(weighted_rank, on=\"Voice\", how=\"inner\")\n", ")\n", "\n", "print(f\"Merged rows: {merged.height} (voices with both speech data and survey data)\")\n", "print(f\" → Voices missing Avg Voice Score: {merged.filter(pl.col('Avg Voice Score (1-10)').is_null())['Voice'].to_list()}\")\n", "print(merged.select([\"Voice\", \"Gender\", \"Avg Voice Score (1-10)\", \"Weighted Score\"]).head())" ] }, { "cell_type": "markdown", "id": "932cfb9e", "metadata": {}, "source": [ "# Correlation: Speech Attributes vs Survey Metrics\n", "\n", "We correlate each speech characteristic (pitch, duration, jitter, etc.) against two survey metrics:\n", "\n", "| Metric | Type | Correlation Method | Why |\n", "|---|---|---|---|\n", "| **Avg Voice Score (1-10)** | Continuous | **Pearson** | Both variables are continuous and approximately interval-scaled — Pearson captures linear relationships well here. |\n", "| **Weighted Ranking Score** | Ordinal / count-based | **Spearman** | The weighted score (1st=3pts, 2nd=2pts, 3rd=1pt) is ordinal in nature with a small number of discrete values. Spearman is rank-based, making no assumptions about linearity or normality — more appropriate for this type of data. |\n", "\n", "> **Note:** With only ~17 voices, all correlations should be interpreted cautiously. Small samples amplify the influence of individual data points." ] }, { "cell_type": "code", "execution_count": 41, "id": "77658327", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "shape: (39, 3)\n", "┌─────────────────────────────────┬─────────────────────────────────┬─────────────┐\n", "│ attribute ┆ metric ┆ correlation │\n", "│ --- ┆ --- ┆ --- │\n", "│ str ┆ str ┆ f64 │\n", "╞═════════════════════════════════╪═════════════════════════════════╪═════════════╡\n", "│ Age ┆ Avg Voice Score (1-10) [Pearso… ┆ 0.386054 │\n", "│ Age ┆ Weighted Ranking Score [Spearm… ┆ 0.094086 │\n", "│ Mean pitch ┆ Avg Voice Score (1-10) [Pearso… ┆ 0.459684 │\n", "│ Mean pitch ┆ Weighted Ranking Score [Spearm… ┆ 0.63429 │\n", "│ Standard deviation pitch ┆ Avg Voice Score (1-10) [Pearso… ┆ 0.664432 │\n", "│ … ┆ … ┆ … │\n", "│ speechrate (nsyll/dur) ┆ Weighted Ranking Score [Spearm… ┆ -0.336524 │\n", "│ articulation rate (nsyll / pho… ┆ Avg Voice Score (1-10) [Pearso… ┆ -0.456181 │\n", "│ articulation rate (nsyll / pho… ┆ Weighted Ranking Score [Spearm… ┆ -0.268239 │\n", "│ speech rate words per minute ┆ Avg Voice Score (1-10) [Pearso… ┆ -0.26437 │\n", "│ speech rate words per minute ┆ Weighted Ranking Score [Spearm… ┆ 0.252577 │\n", "└─────────────────────────────────┴─────────────────────────────────┴─────────────┘\n" ] } ], "source": [ "# --- Compute correlations ---\n", "# Pearson for continuous Voice Score, Spearman for ordinal Weighted Ranking Score\n", "exclude_cols = {\"Voice\", \"Gender\", \"Avg Voice Score (1-10)\", \"Weighted Score\"}\n", "speech_attrs = [c for c in merged.columns if c not in exclude_cols]\n", "\n", "rows = []\n", "for attr in speech_attrs:\n", " # Drop nulls for the pair before computing correlation\n", " valid = merged.select([attr, \"Avg Voice Score (1-10)\", \"Weighted Score\"]).drop_nulls()\n", " if valid.height > 2:\n", " # Pearson for continuous 1-10 score\n", " r_score = valid.select(pl.corr(attr, \"Avg Voice Score (1-10)\", method=\"pearson\")).item()\n", " # Spearman for ordinal weighted ranking score\n", " r_rank = valid.select(pl.corr(attr, \"Weighted Score\", method=\"spearman\")).item()\n", " else:\n", " r_score = None\n", " r_rank = None\n", " rows.append({\"attribute\": attr, \"metric\": \"Avg Voice Score (1-10) [Pearson]\", \"correlation\": r_score})\n", " rows.append({\"attribute\": attr, \"metric\": \"Weighted Ranking Score [Spearman]\", \"correlation\": r_rank})\n", "\n", "corr_long = (\n", " pl.DataFrame(rows)\n", " .drop_nulls()\n", " .filter(pl.col(\"correlation\").is_not_nan())\n", ")\n", "print(corr_long)" ] }, { "cell_type": "code", "execution_count": 46, "id": "ef4ceefc", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Female Age correlation:\n", "shape: (2, 3)\n", "┌───────────┬─────────────────────────────────┬─────────────┐\n", "│ attribute ┆ metric ┆ correlation │\n", "│ --- ┆ --- ┆ --- │\n", "│ str ┆ str ┆ f64 │\n", "╞═══════════╪═════════════════════════════════╪═════════════╡\n", "│ Age ┆ Avg Voice Score (1-10) [Pearso… ┆ -0.023566 │\n", "│ Age ┆ Weighted Ranking Score [Spearm… ┆ 0.231908 │\n", "└───────────┴─────────────────────────────────┴─────────────┘\n", "\n", "Female data (Age + Weighted Score):\n", "shape: (6, 3)\n", "┌───────┬─────┬────────────────┐\n", "│ Voice ┆ Age ┆ Weighted Score │\n", "│ --- ┆ --- ┆ --- │\n", "│ str ┆ i64 ┆ i64 │\n", "╞═══════╪═════╪════════════════╡\n", "│ V04 ┆ 28 ┆ 209 │\n", "│ V89 ┆ 32 ┆ 130 │\n", "│ V48 ┆ 35 ┆ 144 │\n", "│ V08 ┆ 45 ┆ 180 │\n", "│ V77 ┆ 48 ┆ 158 │\n", "│ V14 ┆ 50 ┆ 209 │\n", "└───────┴─────┴────────────────┘\n" ] } ], "source": [ "# Verify: Age correlation in female split should no longer be 1.0\n", "print(\"Female Age correlation:\")\n", "print(corr_female.filter(pl.col(\"attribute\") == \"Age\"))\n", "\n", "print(\"\\nFemale data (Age + Weighted Score):\")\n", "female_check = merged.filter(pl.col(\"Gender\") == \"female\").select([\"Voice\", \"Age\", \"Weighted Score\"]).drop_nulls().sort(\"Age\")\n", "print(female_check)" ] }, { "cell_type": "code", "execution_count": 42, "id": "0d9567ff", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Saved plot to figures/2-4-26/All_Respondents/speech_attr_vs_survey_correlation.png\n" ] }, { "data": { "text/html": [ "\n", "\n", "
\n", "" ], "text/plain": [ "alt.LayerChart(...)" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# --- Plot correlation heatmap ---\n", "S.plot_speech_attribute_correlation(\n", " corr_long,\n", " title=\"Speech Characteristics vs Survey Metrics