From 8720bb670d4e5a0d2c9545ef1203148179e28544 Mon Sep 17 00:00:00 2001 From: Luigi Maiorano Date: Tue, 10 Feb 2026 14:58:13 +0100 Subject: [PATCH] started speech data notebook --- speech_data_correlation.ipynb | 249 ++++++++++++++++++++++++++++++++++ 1 file changed, 249 insertions(+) create mode 100644 speech_data_correlation.ipynb diff --git a/speech_data_correlation.ipynb b/speech_data_correlation.ipynb new file mode 100644 index 0000000..f783603 --- /dev/null +++ b/speech_data_correlation.ipynb @@ -0,0 +1,249 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 24, + "id": "7174c11a", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "from utils import QualtricsSurvey, calculate_weighted_ranking_scores" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "d9d11d52", + "metadata": {}, + "outputs": [], + "source": [ + "SPEECH_DATA = 'data/speech_data/JPMC Speech data ab samples (Final Speech Data AB samples).csv'\n", + "RESULTS_FILE = 'data/exports/2-4-26/JPMC_Chase Brand Personality_Quant Round 1_February 4, 2026_Labels.csv'\n", + "QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "c8f06ff8", + "metadata": {}, + "outputs": [], + "source": [ + "speech_df = pl.read_csv(SPEECH_DATA, separator=';')\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "8da85898", + "metadata": {}, + "outputs": [], + "source": [ + "# Convert comma-decimal text columns to Float64 (now also 'speechrate (nsyll/dur)', which was left as text)\n", + "speech_df = speech_df.with_columns(\n", + "    pl.col([\n", + "        'dur (s)', 'phonationtime (s)', 'speechrate (nsyll/dur)',\n", + "        'articulation rate (nsyll / phonationtime)', 'speech rate words per minute',\n", + "    ]).str.replace(',', '.').cast(pl.Float64)\n", + ")" + ] + 
}, + { + "cell_type": "code", + "execution_count": 22, + "id": "e7ccd8ef", + "metadata": {}, + "outputs": [], + "source": [ + "# Convert specified columns to float; replace_all (not replace) so every space/percent sign is removed, not just the first\n", + "cols_to_convert = []\n", + "\n", + "if \"Standard deviation pitch\" in speech_df.columns:\n", + "    cols_to_convert.append(\n", + "        pl.col(\"Standard deviation pitch\")\n", + "        .cast(pl.Utf8)\n", + "        .str.replace_all(\",\", \".\")\n", + "        .str.replace_all(\" \", \"\")\n", + "        # strict=False converts unparseable strings (like empty ones) to Null\n", + "        .cast(pl.Float64, strict=False)\n", + "    )\n", + "\n", + "for col_name in [\"Jitter (local)\", \"Shimmer (local)\"]:\n", + "    if col_name in speech_df.columns:\n", + "        cols_to_convert.append(\n", + "            pl.col(col_name)\n", + "            .cast(pl.Utf8)\n", + "            .str.replace_all(\"%\", \"\")\n", + "            .str.replace_all(\" \", \"\")\n", + "            .str.replace_all(\",\", \".\")\n", + "            .cast(pl.Float64, strict=False)\n", + "        )\n", + "\n", + "if cols_to_convert:\n", + "    speech_df = speech_df.with_columns(cols_to_convert)\n", + "\n", + "# speech_df\n" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "450d1d29", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "shape: (5, 22)\n", + "┌───────┬────────┬─────┬────────────┬───┬──────────────┬──────────────┬──────────────┬─────────────┐\n", + "│ Voice ┆ Gender ┆ Age ┆ Mean pitch ┆ … ┆ phonationtim ┆ speechrate ┆ articulation ┆ speech rate │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ e (s) ┆ (nsyll/dur) ┆ rate (nsyll ┆ words per │\n", + "│ str ┆ str ┆ i64 ┆ f64 ┆ ┆ --- ┆ --- ┆ / pho… ┆ minute │\n", + "│ ┆ ┆ ┆ ┆ ┆ str ┆ str ┆ --- ┆ --- │\n", + "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ str ┆ str │\n", + "╞═══════╪════════╪═════╪════════════╪═══╪══════════════╪══════════════╪══════════════╪═════════════╡\n", + "│ VVV74 ┆ male ┆ 41 ┆ 95.25 ┆ … ┆ 11,17 ┆ 4,32468397 ┆ 5,81915846 ┆ 185,3436 │\n", + "│ VVV54 ┆ male ┆ 36 ┆ 126.93 ┆ … ┆ 10,83 ┆ 4,38596491 ┆ 6,00184672 ┆ 187,9699 │\n", + "│ 
VVV48 ┆ female ┆ 35 ┆ 193.296 ┆ … ┆ 11,92 ┆ 4,02227723 ┆ 5,45302013 ┆ 172,3833 │\n", + "│ VVV14 ┆ female ┆ 50 ┆ 169.214 ┆ … ┆ 12,75 ┆ 3,97553517 ┆ 5,09803922 ┆ 170,3801 │\n", + "│ VVV4 ┆ female ┆ 28 ┆ 208.079 ┆ … ┆ 12,22 ┆ 4,10094637 ┆ 5,31914894 ┆ 175,7548 │\n", + "└───────┴────────┴─────┴────────────┴───┴──────────────┴──────────────┴──────────────┴─────────────┘\n" + ] + } + ], + "source": [ + "# Convert Voice numbers to zero-padded IDs like \"V04\" so they match the survey's voice codes\n", + "# Strip any existing \"V\" prefix first so re-running this cell cannot produce \"VV4\" / \"VVV4\"\n", + "speech_df = speech_df.with_columns(\n", + "    pl.when(pl.col(\"Voice\").is_not_null())\n", + "    .then(pl.concat_str([pl.lit(\"V\"), pl.col(\"Voice\").cast(pl.Utf8).str.strip_chars_start(\"V\").str.zfill(2)]))\n", + "    .otherwise(None)\n", + "    .alias(\"Voice\")\n", + ")\n", + "\n", + "print(speech_df.head())" + ] + }, + { + "cell_type": "markdown", + "id": "5fb615fe", + "metadata": {}, + "source": [ + "# Get survey data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bb4200ee", + "metadata": {}, + "outputs": [], + "source": [ + "S = QualtricsSurvey(RESULTS_FILE, QSF_FILE, figures_dir=None)\n", + "data_all = S.load_data()\n", + "data = S.filter_data(data_all)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "57243afd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "shape: (5, 2)\n", + "┌───────────┬────────────────┐\n", + "│ Character ┆ Weighted Score │\n", + "│ --- ┆ --- │\n", + "│ str ┆ i64 │\n", + "╞═══════════╪════════════════╡\n", + "│ V14 ┆ 209 │\n", + "│ V04 ┆ 209 │\n", + "│ V08 ┆ 180 │\n", + "│ V82 ┆ 172 │\n", + "│ V77 ┆ 158 │\n", + "└───────────┴────────────────┘\n" + ] + } + ], + "source": [ + "top3_voices = S.get_top_3_voices(data)[0]\n", + "voices_weighted_rank = calculate_weighted_ranking_scores(top3_voices)\n", + "print(voices_weighted_rank.head())" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "b38d21fc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": 
"stream", + "text": [ + "shape: (5, 18)\n", + "┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐\n", + "│ _recordId ┆ Voice_Sca ┆ Voice_Sca ┆ Voice_Sca ┆ … ┆ Voice_Sca ┆ Voice_Sca ┆ Voice_Sca ┆ Voice_Sc │\n", + "│ --- ┆ le_1_10__ ┆ le_1_10__ ┆ le_1_10__ ┆ ┆ le_1_10__ ┆ le_1_10__ ┆ le_1_10__ ┆ ale_1_10 │\n", + "│ str ┆ V14 ┆ V04 ┆ V08 ┆ ┆ V74 ┆ V81 ┆ V86 ┆ __V88 │\n", + "│ ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ ┆ f64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", + "╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡\n", + "│ R_59pdrC3 ┆ null ┆ null ┆ null ┆ … ┆ null ┆ null ┆ 5.5 ┆ null │\n", + "│ urLmZnbP ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ R_3fJSKy5 ┆ 6.0 ┆ 5.0 ┆ null ┆ … ┆ null ┆ null ┆ 6.0 ┆ null │\n", + "│ SVxmNdBC ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ R_3g11G0u ┆ 9.5 ┆ null ┆ 5.0 ┆ … ┆ null ┆ null ┆ null ┆ 9.5 │\n", + "│ pJ7iGt8Q ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ R_3i3dGL7 ┆ 6.0 ┆ 9.0 ┆ 8.0 ┆ … ┆ null ┆ 2.0 ┆ 3.0 ┆ null │\n", + "│ cfLOTgxb ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "│ R_3BBF1fR ┆ null ┆ null ┆ null ┆ … ┆ 6.0 ┆ null ┆ 8.5 ┆ null │\n", + "│ WGGeButr ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", + "└───────────┴───────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴──────────┘\n" + ] + } + ], + "source": [ + "voice_1_10 = S.get_voice_scale_1_10(data)[0].collect()\n", + "print(voice_1_10.head())\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b3e6ad0", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "phase-3-quant", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}