Files
JPMC-quant/speech_data_correlation.ipynb

250 lines
11 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 24,
"id": "7174c11a",
"metadata": {},
"outputs": [],
"source": [
"import polars as pl\n",
"from utils import QualtricsSurvey, calculate_weighted_ranking_scores"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "d9d11d52",
"metadata": {},
"outputs": [],
"source": [
"SPEECH_DATA = 'data/speech_data/JPMC Speech data ab samples (Final Speech Data AB samples).csv'\n",
"RESULTS_FILE = 'data/exports/2-4-26/JPMC_Chase Brand Personality_Quant Round 1_February 4, 2026_Labels.csv'\n",
"QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "c8f06ff8",
"metadata": {},
"outputs": [],
"source": [
"speech_df = pl.read_csv(SPEECH_DATA, separator=';')\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "8da85898",
"metadata": {},
"outputs": [],
"source": [
"# Convert comma decimal separator to period and cast to float\n",
"speech_df = speech_df.with_columns([\n",
" pl.col('dur (s)').str.replace(',', '.').cast(pl.Float64),\n",
" pl.col('phonationtime (s)').str.replace(',', '.').cast(pl.Float64),\n",
" pl.col('articulation rate (nsyll / phonationtime)').str.replace(',', '.').cast(pl.Float64),\n",
" pl.col('speech rate words per minute').str.replace(',', '.').cast(pl.Float64),\n",
"])"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "e7ccd8ef",
"metadata": {},
"outputs": [],
"source": [
"# Convert specified columns to float, handling percent signs, commas and spaces\n",
"cols_to_convert = []\n",
"\n",
"if \"Standard deviation pitch\" in speech_df.columns:\n",
" cols_to_convert.append(\n",
" pl.col(\"Standard deviation pitch\")\n",
" .cast(pl.Utf8)\n",
" .str.replace(\",\", \".\")\n",
" .str.replace(\" \", \"\")\n",
" # strict=False converts unparseable strings (like empty ones) to Null\n",
" .cast(pl.Float64, strict=False)\n",
" )\n",
"\n",
"for col_name in [\"Jitter (local)\", \"Shimmer (local)\"]:\n",
" if col_name in speech_df.columns:\n",
" cols_to_convert.append(\n",
" pl.col(col_name)\n",
" .cast(pl.Utf8)\n",
" .str.replace(\"%\", \"\")\n",
" .str.replace(\" \", \"\")\n",
" .str.replace(\",\", \".\")\n",
" .cast(pl.Float64, strict=False)\n",
" )\n",
"\n",
"if cols_to_convert:\n",
" speech_df = speech_df.with_columns(cols_to_convert)\n",
"\n",
"# speech_df\n"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "450d1d29",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"shape: (5, 22)\n",
"┌───────┬────────┬─────┬────────────┬───┬──────────────┬──────────────┬──────────────┬─────────────┐\n",
"│ Voice ┆ Gender ┆ Age ┆ Mean pitch ┆ … ┆ phonationtim ┆ speechrate ┆ articulation ┆ speech rate │\n",
"│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ e (s) ┆ (nsyll/dur) ┆ rate (nsyll ┆ words per │\n",
"│ str ┆ str ┆ i64 ┆ f64 ┆ ┆ --- ┆ --- ┆ / pho… ┆ minute │\n",
"│ ┆ ┆ ┆ ┆ ┆ str ┆ str ┆ --- ┆ --- │\n",
"│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ str ┆ str │\n",
"╞═══════╪════════╪═════╪════════════╪═══╪══════════════╪══════════════╪══════════════╪═════════════╡\n",
"│ VVV74 ┆ male ┆ 41 ┆ 95.25 ┆ … ┆ 11,17 ┆ 4,32468397 ┆ 5,81915846 ┆ 185,3436 │\n",
"│ VVV54 ┆ male ┆ 36 ┆ 126.93 ┆ … ┆ 10,83 ┆ 4,38596491 ┆ 6,00184672 ┆ 187,9699 │\n",
"│ VVV48 ┆ female ┆ 35 ┆ 193.296 ┆ … ┆ 11,92 ┆ 4,02227723 ┆ 5,45302013 ┆ 172,3833 │\n",
"│ VVV14 ┆ female ┆ 50 ┆ 169.214 ┆ … ┆ 12,75 ┆ 3,97553517 ┆ 5,09803922 ┆ 170,3801 │\n",
"│ VVV4 ┆ female ┆ 28 ┆ 208.079 ┆ … ┆ 12,22 ┆ 4,10094637 ┆ 5,31914894 ┆ 175,7548 │\n",
"└───────┴────────┴─────┴────────────┴───┴──────────────┴──────────────┴──────────────┴─────────────┘\n"
]
}
],
"source": [
"# Convert Voice ints to strings like \"V81\"\n",
"# Convert Voice ints to strings like \"V81\"\n",
"speech_df = speech_df.with_columns(\n",
" pl.when(pl.col(\"Voice\").is_not_null())\n",
" .then(pl.concat_str([pl.lit(\"V\"), pl.col(\"Voice\").cast(pl.Utf8)]))\n",
" .otherwise(None)\n",
" .alias(\"Voice\")\n",
")\n",
"\n",
"print(speech_df.head())"
]
},
{
"cell_type": "markdown",
"id": "5fb615fe",
"metadata": {},
"source": [
"# Get survey data"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bb4200ee",
"metadata": {},
"outputs": [],
"source": [
"S = QualtricsSurvey(RESULTS_FILE, QSF_FILE, figures_dir=None)\n",
"data_all = S.load_data()\n",
"data = S.filter_data(data_all)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "57243afd",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"shape: (5, 2)\n",
"┌───────────┬────────────────┐\n",
"│ Character ┆ Weighted Score │\n",
"│ --- ┆ --- │\n",
"│ str ┆ i64 │\n",
"╞═══════════╪════════════════╡\n",
"│ V14 ┆ 209 │\n",
"│ V04 ┆ 209 │\n",
"│ V08 ┆ 180 │\n",
"│ V82 ┆ 172 │\n",
"│ V77 ┆ 158 │\n",
"└───────────┴────────────────┘\n"
]
}
],
"source": [
"top3_voices = S.get_top_3_voices(data)[0]\n",
"voices_weighted_rank = calculate_weighted_ranking_scores(top3_voices)\n",
"print(voices_weighted_rank.head())"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "b38d21fc",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"shape: (5, 18)\n",
"┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐\n",
"│ _recordId ┆ Voice_Sca ┆ Voice_Sca ┆ Voice_Sca ┆ … ┆ Voice_Sca ┆ Voice_Sca ┆ Voice_Sca ┆ Voice_Sc │\n",
"│ --- ┆ le_1_10__ ┆ le_1_10__ ┆ le_1_10__ ┆ ┆ le_1_10__ ┆ le_1_10__ ┆ le_1_10__ ┆ ale_1_10 │\n",
"│ str ┆ V14 ┆ V04 ┆ V08 ┆ ┆ V74 ┆ V81 ┆ V86 ┆ __V88 │\n",
"│ ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n",
"│ ┆ f64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n",
"╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡\n",
"│ R_59pdrC3 ┆ null ┆ null ┆ null ┆ … ┆ null ┆ null ┆ 5.5 ┆ null │\n",
"│ urLmZnbP ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
"│ R_3fJSKy5 ┆ 6.0 ┆ 5.0 ┆ null ┆ … ┆ null ┆ null ┆ 6.0 ┆ null │\n",
"│ SVxmNdBC ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
"│ R_3g11G0u ┆ 9.5 ┆ null ┆ 5.0 ┆ … ┆ null ┆ null ┆ null ┆ 9.5 │\n",
"│ pJ7iGt8Q ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
"│ R_3i3dGL7 ┆ 6.0 ┆ 9.0 ┆ 8.0 ┆ … ┆ null ┆ 2.0 ┆ 3.0 ┆ null │\n",
"│ cfLOTgxb ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
"│ R_3BBF1fR ┆ null ┆ null ┆ null ┆ … ┆ 6.0 ┆ null ┆ 8.5 ┆ null │\n",
"│ WGGeButr ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
"└───────────┴───────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴──────────┘\n"
]
}
],
"source": [
"voice_1_10 = S.get_voice_scale_1_10(data)[0].collect()\n",
"print(voice_1_10.head())\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5b3e6ad0",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "phase-3-quant",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}