From 8720bb670d4e5a0d2c9545ef1203148179e28544 Mon Sep 17 00:00:00 2001
From: Luigi Maiorano <luigi.maiorano@qumo.io>
Date: Tue, 10 Feb 2026 14:58:13 +0100
Subject: [PATCH] started speech data notebook

---
 speech_data_correlation.ipynb | 249 ++++++++++++++++++++++++++++++++++
 1 file changed, 249 insertions(+)
 create mode 100644 speech_data_correlation.ipynb

diff --git a/speech_data_correlation.ipynb b/speech_data_correlation.ipynb
new file mode 100644
index 0000000..f783603
--- /dev/null
+++ b/speech_data_correlation.ipynb
@@ -0,0 +1,249 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "7174c11a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import polars as pl\n",
+    "from utils import QualtricsSurvey, calculate_weighted_ranking_scores"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "d9d11d52",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "SPEECH_DATA = 'data/speech_data/JPMC Speech data ab samples (Final Speech Data AB samples).csv'\n",
+    "RESULTS_FILE = 'data/exports/2-4-26/JPMC_Chase Brand Personality_Quant Round 1_February 4, 2026_Labels.csv'\n",
+    "QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "c8f06ff8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "speech_df = pl.read_csv(SPEECH_DATA, separator=';')\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "8da85898",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Parse comma-decimal text columns as Float64; casting to Utf8 first keeps the cell re-runnable.\n",
+    "decimal_cols = ['dur (s)', 'phonationtime (s)', 'articulation rate (nsyll / phonationtime)', 'speech rate words per minute']\n",
+    "if 'speechrate (nsyll/dur)' in speech_df.columns:  # also loaded as comma-decimal text, was never converted\n",
+    "    decimal_cols.append('speechrate (nsyll/dur)')\n",
+    "speech_df = speech_df.with_columns(\n",
+    "    [pl.col(name).cast(pl.Utf8).str.replace(',', '.', literal=True).cast(pl.Float64) for name in decimal_cols]\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "e7ccd8ef",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Convert pitch / jitter / shimmer text columns to Float64, handling percent signs,\n",
+    "# decimal commas and spaces. Columns absent from the CSV are skipped.\n",
+    "def _as_float(col_name, strip_percent=False):\n",
+    "    \"\"\"Build an expression parsing `col_name` as Float64; unparseable text becomes null.\"\"\"\n",
+    "    # Cast to Utf8 first so the cell also works when the column is already numeric.\n",
+    "    text = pl.col(col_name).cast(pl.Utf8)\n",
+    "    if strip_percent:\n",
+    "        text = text.str.replace_all(\"%\", \"\", literal=True)\n",
+    "    return (\n",
+    "        text\n",
+    "        # replace_all rather than replace: str.replace rewrites only the FIRST match, so a\n",
+    "        # value containing more than one space kept a space and could then fail the float cast.\n",
+    "        .str.replace_all(\" \", \"\", literal=True)\n",
+    "        .str.replace(\",\", \".\", literal=True)\n",
+    "        # strict=False converts unparseable strings (like empty ones) to Null\n",
+    "        .cast(pl.Float64, strict=False)\n",
+    "    )\n",
+    "\n",
+    "\n",
+    "# Percent signs are stripped from the jitter/shimmer columns only.\n",
+    "cols_to_convert = [\n",
+    "    _as_float(col_name, strip_percent=col_name != \"Standard deviation pitch\")\n",
+    "    for col_name in [\"Standard deviation pitch\", \"Jitter (local)\", \"Shimmer (local)\"]\n",
+    "    if col_name in speech_df.columns\n",
+    "]\n",
+    "\n",
+    "if cols_to_convert:\n",
+    "    speech_df = speech_df.with_columns(cols_to_convert)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "id": "450d1d29",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "shape: (5, 22)\n",
+      "┌───────┬────────┬─────┬────────────┬───┬──────────────┬──────────────┬──────────────┬─────────────┐\n",
+      "│ Voice ┆ Gender ┆ Age ┆ Mean pitch ┆ … ┆ phonationtim ┆ speechrate   ┆ articulation ┆ speech rate │\n",
+      "│ ---   ┆ ---    ┆ --- ┆ ---        ┆   ┆ e (s)        ┆ (nsyll/dur)  ┆ rate (nsyll  ┆ words per   │\n",
+      "│ str   ┆ str    ┆ i64 ┆ f64        ┆   ┆ ---          ┆ ---          ┆ / pho…       ┆ minute      │\n",
+      "│       ┆        ┆     ┆            ┆   ┆ str          ┆ str          ┆ ---          ┆ ---         │\n",
+      "│       ┆        ┆     ┆            ┆   ┆              ┆              ┆ str          ┆ str         │\n",
+      "╞═══════╪════════╪═════╪════════════╪═══╪══════════════╪══════════════╪══════════════╪═════════════╡\n",
+      "│ VVV74 ┆ male   ┆ 41  ┆ 95.25      ┆ … ┆ 11,17        ┆ 4,32468397   ┆ 5,81915846   ┆ 185,3436    │\n",
+      "│ VVV54 ┆ male   ┆ 36  ┆ 126.93     ┆ … ┆ 10,83        ┆ 4,38596491   ┆ 6,00184672   ┆ 187,9699    │\n",
+      "│ VVV48 ┆ female ┆ 35  ┆ 193.296    ┆ … ┆ 11,92        ┆ 4,02227723   ┆ 5,45302013   ┆ 172,3833    │\n",
+      "│ VVV14 ┆ female ┆ 50  ┆ 169.214    ┆ … ┆ 12,75        ┆ 3,97553517   ┆ 5,09803922   ┆ 170,3801    │\n",
+      "│ VVV4  ┆ female ┆ 28  ┆ 208.079    ┆ … ┆ 12,22        ┆ 4,10094637   ┆ 5,31914894   ┆ 175,7548    │\n",
+      "└───────┴────────┴─────┴────────────┴───┴──────────────┴──────────────┴──────────────┴─────────────┘\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Normalise Voice ids to the survey's zero-padded form (\"V04\", \"V81\") so the two sources can be joined.\n",
+    "# Existing \"V\" prefixes are stripped first, so re-running the cell no longer stacks them (\"VVV74\").\n",
+    "voice_number = pl.col(\"Voice\").cast(pl.Utf8).str.replace(\"^V+\", \"\").str.zfill(2)\n",
+    "speech_df = speech_df.with_columns(\n",
+    "    pl.when(pl.col(\"Voice\").is_not_null())\n",
+    "      .then(pl.concat_str([pl.lit(\"V\"), voice_number]))\n",
+    "      .otherwise(None).alias(\"Voice\")\n",
+    ")\n",
+    "\n",
+    "print(speech_df.head())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5fb615fe",
+   "metadata": {},
+   "source": [
+    "# Get survey data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bb4200ee",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "S = QualtricsSurvey(RESULTS_FILE, QSF_FILE, figures_dir=None)\n",
+    "data_all = S.load_data()\n",
+    "data = S.filter_data(data_all)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "id": "57243afd",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "shape: (5, 2)\n",
+      "┌───────────┬────────────────┐\n",
+      "│ Character ┆ Weighted Score │\n",
+      "│ ---       ┆ ---            │\n",
+      "│ str       ┆ i64            │\n",
+      "╞═══════════╪════════════════╡\n",
+      "│ V14       ┆ 209            │\n",
+      "│ V04       ┆ 209            │\n",
+      "│ V08       ┆ 180            │\n",
+      "│ V82       ┆ 172            │\n",
+      "│ V77       ┆ 158            │\n",
+      "└───────────┴────────────────┘\n"
+     ]
+    }
+   ],
+   "source": [
+    "top3_voices = S.get_top_3_voices(data)[0]\n",
+    "voices_weighted_rank = calculate_weighted_ranking_scores(top3_voices)\n",
+    "print(voices_weighted_rank.head())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "id": "b38d21fc",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "shape: (5, 18)\n",
+      "┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐\n",
+      "│ _recordId ┆ Voice_Sca ┆ Voice_Sca ┆ Voice_Sca ┆ … ┆ Voice_Sca ┆ Voice_Sca ┆ Voice_Sca ┆ Voice_Sc │\n",
+      "│ ---       ┆ le_1_10__ ┆ le_1_10__ ┆ le_1_10__ ┆   ┆ le_1_10__ ┆ le_1_10__ ┆ le_1_10__ ┆ ale_1_10 │\n",
+      "│ str       ┆ V14       ┆ V04       ┆ V08       ┆   ┆ V74       ┆ V81       ┆ V86       ┆ __V88    │\n",
+      "│           ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---      │\n",
+      "│           ┆ f64       ┆ f64       ┆ f64       ┆   ┆ f64       ┆ f64       ┆ f64       ┆ f64      │\n",
+      "╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡\n",
+      "│ R_59pdrC3 ┆ null      ┆ null      ┆ null      ┆ … ┆ null      ┆ null      ┆ 5.5       ┆ null     │\n",
+      "│ urLmZnbP  ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆          │\n",
+      "│ R_3fJSKy5 ┆ 6.0       ┆ 5.0       ┆ null      ┆ … ┆ null      ┆ null      ┆ 6.0       ┆ null     │\n",
+      "│ SVxmNdBC  ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆          │\n",
+      "│ R_3g11G0u ┆ 9.5       ┆ null      ┆ 5.0       ┆ … ┆ null      ┆ null      ┆ null      ┆ 9.5      │\n",
+      "│ pJ7iGt8Q  ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆          │\n",
+      "│ R_3i3dGL7 ┆ 6.0       ┆ 9.0       ┆ 8.0       ┆ … ┆ null      ┆ 2.0       ┆ 3.0       ┆ null     │\n",
+      "│ cfLOTgxb  ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆          │\n",
+      "│ R_3BBF1fR ┆ null      ┆ null      ┆ null      ┆ … ┆ 6.0       ┆ null      ┆ 8.5       ┆ null     │\n",
+      "│ WGGeButr  ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆          │\n",
+      "└───────────┴───────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴──────────┘\n"
+     ]
+    }
+   ],
+   "source": [
+    "voice_1_10 = S.get_voice_scale_1_10(data)[0].collect()\n",
+    "print(voice_1_10.head())\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5b3e6ad0",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "phase-3-quant",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}