started speech data notebook
This commit is contained in:
249
speech_data_correlation.ipynb
Normal file
249
speech_data_correlation.ipynb
Normal file
@@ -0,0 +1,249 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 24,
|
||||||
|
"id": "7174c11a",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import polars as pl\n",
|
||||||
|
"from utils import QualtricsSurvey, calculate_weighted_ranking_scores"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "d9d11d52",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Input files, given as relative paths.\n",
|
||||||
|
"# SPEECH_DATA is read below with pl.read_csv(..., separator=';').\n",
|
||||||
|
"SPEECH_DATA = 'data/speech_data/JPMC Speech data ab samples (Final Speech Data AB samples).csv'\n",
|
||||||
|
"# RESULTS_FILE and QSF_FILE are both passed to QualtricsSurvey in the survey section.\n",
|
||||||
|
"RESULTS_FILE = 'data/exports/2-4-26/JPMC_Chase Brand Personality_Quant Round 1_February 4, 2026_Labels.csv'\n",
|
||||||
|
"QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 23,
|
||||||
|
"id": "c8f06ff8",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Load the speech measurements; the file is read as semicolon-delimited.\n",
|
||||||
|
"speech_df = pl.read_csv(SPEECH_DATA, separator=';')\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 12,
|
||||||
|
"id": "8da85898",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Convert comma-decimal text columns (values such as 11,17) to Float64.\n",
|
||||||
|
"comma_decimal_cols = [\n",
|
||||||
|
"    'dur (s)',\n",
|
||||||
|
"    'phonationtime (s)',\n",
|
||||||
|
"    'articulation rate (nsyll / phonationtime)',\n",
|
||||||
|
"    'speech rate words per minute',\n",
|
||||||
|
"]\n",
|
||||||
|
"# Same comma-decimal format, but it used to be left as str. Guarded because the\n",
|
||||||
|
"# column name is taken from a printed frame header, not from the file itself.\n",
|
||||||
|
"if 'speechrate (nsyll/dur)' in speech_df.columns:\n",
|
||||||
|
"    comma_decimal_cols.append('speechrate (nsyll/dur)')\n",
|
||||||
|
"\n",
|
||||||
|
"speech_df = speech_df.with_columns([\n",
|
||||||
|
"    # Cast to Utf8 first so that re-running the cell on columns that are already\n",
|
||||||
|
"    # Float64 round-trips through text instead of failing on the .str namespace.\n",
|
||||||
|
"    pl.col(name).cast(pl.Utf8).str.replace(',', '.').cast(pl.Float64)\n",
|
||||||
|
"    for name in comma_decimal_cols\n",
|
||||||
|
"])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 22,
|
||||||
|
"id": "e7ccd8ef",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Parse text-formatted numeric columns (decimal commas, stray spaces, % signs) as Float64.\n",
|
||||||
|
"def _clean_numeric(col_name, strip_percent=False):\n",
|
||||||
|
"    \"\"\"Build an expression that parses `col_name` as Float64.\n",
|
||||||
|
"\n",
|
||||||
|
"    Every space (and every % sign when `strip_percent` is set) is removed and the\n",
|
||||||
|
"    decimal comma becomes a period. The final cast uses strict=False, so text\n",
|
||||||
|
"    that still cannot be parsed (e.g. an empty string) becomes null.\n",
|
||||||
|
"    \"\"\"\n",
|
||||||
|
"    text = pl.col(col_name).cast(pl.Utf8)\n",
|
||||||
|
"    if strip_percent:\n",
|
||||||
|
"        text = text.str.replace_all(\"%\", \"\", literal=True)\n",
|
||||||
|
"    return (\n",
|
||||||
|
"        text\n",
|
||||||
|
"        # replace_all, not replace: str.replace rewrites only the first match,\n",
|
||||||
|
"        # so a value with more than one space could end up null after the cast.\n",
|
||||||
|
"        .str.replace_all(\" \", \"\", literal=True)\n",
|
||||||
|
"        .str.replace(\",\", \".\")\n",
|
||||||
|
"        .cast(pl.Float64, strict=False)\n",
|
||||||
|
"    )\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"# Column name -> whether its values may carry a % sign.\n",
|
||||||
|
"numeric_text_cols = {\n",
|
||||||
|
"    \"Standard deviation pitch\": False,\n",
|
||||||
|
"    \"Jitter (local)\": True,\n",
|
||||||
|
"    \"Shimmer (local)\": True,\n",
|
||||||
|
"}\n",
|
||||||
|
"# Columns missing from the frame are skipped rather than raising.\n",
|
||||||
|
"cols_to_convert = [\n",
|
||||||
|
"    _clean_numeric(col_name, strip_percent)\n",
|
||||||
|
"    for col_name, strip_percent in numeric_text_cols.items()\n",
|
||||||
|
"    if col_name in speech_df.columns\n",
|
||||||
|
"]\n",
|
||||||
|
"\n",
|
||||||
|
"if cols_to_convert:\n",
|
||||||
|
"    speech_df = speech_df.with_columns(cols_to_convert)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 33,
|
||||||
|
"id": "450d1d29",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"shape: (5, 22)\n",
|
||||||
|
"┌───────┬────────┬─────┬────────────┬───┬──────────────┬──────────────┬──────────────┬─────────────┐\n",
|
||||||
|
"│ Voice ┆ Gender ┆ Age ┆ Mean pitch ┆ … ┆ phonationtim ┆ speechrate ┆ articulation ┆ speech rate │\n",
|
||||||
|
"│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ e (s) ┆ (nsyll/dur) ┆ rate (nsyll ┆ words per │\n",
|
||||||
|
"│ str ┆ str ┆ i64 ┆ f64 ┆ ┆ --- ┆ --- ┆ / pho… ┆ minute │\n",
|
||||||
|
"│ ┆ ┆ ┆ ┆ ┆ str ┆ str ┆ --- ┆ --- │\n",
|
||||||
|
"│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ str ┆ str │\n",
|
||||||
|
"╞═══════╪════════╪═════╪════════════╪═══╪══════════════╪══════════════╪══════════════╪═════════════╡\n",
|
||||||
|
"│ VVV74 ┆ male ┆ 41 ┆ 95.25 ┆ … ┆ 11,17 ┆ 4,32468397 ┆ 5,81915846 ┆ 185,3436 │\n",
|
||||||
|
"│ VVV54 ┆ male ┆ 36 ┆ 126.93 ┆ … ┆ 10,83 ┆ 4,38596491 ┆ 6,00184672 ┆ 187,9699 │\n",
|
||||||
|
"│ VVV48 ┆ female ┆ 35 ┆ 193.296 ┆ … ┆ 11,92 ┆ 4,02227723 ┆ 5,45302013 ┆ 172,3833 │\n",
|
||||||
|
"│ VVV14 ┆ female ┆ 50 ┆ 169.214 ┆ … ┆ 12,75 ┆ 3,97553517 ┆ 5,09803922 ┆ 170,3801 │\n",
|
||||||
|
"│ VVV4 ┆ female ┆ 28 ┆ 208.079 ┆ … ┆ 12,22 ┆ 4,10094637 ┆ 5,31914894 ┆ 175,7548 │\n",
|
||||||
|
"└───────┴────────┴─────┴────────────┴───┴──────────────┴──────────────┴──────────────┴─────────────┘\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Normalise Voice ids to the zero-padded form used by the survey data, e.g. 4 -> V04, 81 -> V81.\n",
|
||||||
|
"# Any existing V prefix is stripped first so the cell can be re-run safely;\n",
|
||||||
|
"# without that, each re-run stacked another prefix (V74 -> VV74 -> VVV74).\n",
|
||||||
|
"voice_digits = pl.col(\"Voice\").cast(pl.Utf8).str.strip_chars_start(\"V\").str.zfill(2)\n",
|
||||||
|
"speech_df = speech_df.with_columns(\n",
|
||||||
|
"    # concat_str yields null when an input is null, so null Voice values stay null.\n",
|
||||||
|
"    pl.concat_str([pl.lit(\"V\"), voice_digits]).alias(\"Voice\")\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"print(speech_df.head())"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "5fb615fe",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Get survey data"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "bb4200ee",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Build the survey helper from the response export (RESULTS_FILE) and the .qsf file (QSF_FILE).\n",
|
||||||
|
"# NOTE(review): QualtricsSurvey is imported from utils; figures_dir=None presumably\n",
|
||||||
|
"# turns off figure export - confirm there.\n",
|
||||||
|
"S = QualtricsSurvey(RESULTS_FILE, QSF_FILE, figures_dir=None)\n",
|
||||||
|
"# data_all keeps the loaded data; data is the result of S.filter_data and is what the cells below use.\n",
|
||||||
|
"data_all = S.load_data()\n",
|
||||||
|
"data = S.filter_data(data_all)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 34,
|
||||||
|
"id": "57243afd",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"shape: (5, 2)\n",
|
||||||
|
"┌───────────┬────────────────┐\n",
|
||||||
|
"│ Character ┆ Weighted Score │\n",
|
||||||
|
"│ --- ┆ --- │\n",
|
||||||
|
"│ str ┆ i64 │\n",
|
||||||
|
"╞═══════════╪════════════════╡\n",
|
||||||
|
"│ V14 ┆ 209 │\n",
|
||||||
|
"│ V04 ┆ 209 │\n",
|
||||||
|
"│ V08 ┆ 180 │\n",
|
||||||
|
"│ V82 ┆ 172 │\n",
|
||||||
|
"│ V77 ┆ 158 │\n",
|
||||||
|
"└───────────┴────────────────┘\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Only the first element of what get_top_3_voices returns is used here.\n",
|
||||||
|
"top3_voices = S.get_top_3_voices(data)[0]\n",
|
||||||
|
"# calculate_weighted_ranking_scores is imported from utils. Per the saved output, the result\n",
|
||||||
|
"# has a str column Character (voice ids such as V14) and an i64 column Weighted Score.\n",
|
||||||
|
"voices_weighted_rank = calculate_weighted_ranking_scores(top3_voices)\n",
|
||||||
|
"print(voices_weighted_rank.head())"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 35,
|
||||||
|
"id": "b38d21fc",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"shape: (5, 18)\n",
|
||||||
|
"┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐\n",
|
||||||
|
"│ _recordId ┆ Voice_Sca ┆ Voice_Sca ┆ Voice_Sca ┆ … ┆ Voice_Sca ┆ Voice_Sca ┆ Voice_Sca ┆ Voice_Sc │\n",
|
||||||
|
"│ --- ┆ le_1_10__ ┆ le_1_10__ ┆ le_1_10__ ┆ ┆ le_1_10__ ┆ le_1_10__ ┆ le_1_10__ ┆ ale_1_10 │\n",
|
||||||
|
"│ str ┆ V14 ┆ V04 ┆ V08 ┆ ┆ V74 ┆ V81 ┆ V86 ┆ __V88 │\n",
|
||||||
|
"│ ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n",
|
||||||
|
"│ ┆ f64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n",
|
||||||
|
"╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡\n",
|
||||||
|
"│ R_59pdrC3 ┆ null ┆ null ┆ null ┆ … ┆ null ┆ null ┆ 5.5 ┆ null │\n",
|
||||||
|
"│ urLmZnbP ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
|
||||||
|
"│ R_3fJSKy5 ┆ 6.0 ┆ 5.0 ┆ null ┆ … ┆ null ┆ null ┆ 6.0 ┆ null │\n",
|
||||||
|
"│ SVxmNdBC ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
|
||||||
|
"│ R_3g11G0u ┆ 9.5 ┆ null ┆ 5.0 ┆ … ┆ null ┆ null ┆ null ┆ 9.5 │\n",
|
||||||
|
"│ pJ7iGt8Q ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
|
||||||
|
"│ R_3i3dGL7 ┆ 6.0 ┆ 9.0 ┆ 8.0 ┆ … ┆ null ┆ 2.0 ┆ 3.0 ┆ null │\n",
|
||||||
|
"│ cfLOTgxb ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
|
||||||
|
"│ R_3BBF1fR ┆ null ┆ null ┆ null ┆ … ┆ 6.0 ┆ null ┆ 8.5 ┆ null │\n",
|
||||||
|
"│ WGGeButr ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
|
||||||
|
"└───────────┴───────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴──────────┘\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Only the first element of the returned result is used; it is materialised with .collect()\n",
|
||||||
|
"# (presumably a polars LazyFrame - confirm in utils).\n",
|
||||||
|
"voice_1_10 = S.get_voice_scale_1_10(data)[0].collect()\n",
|
||||||
|
"# Per the saved output: a _recordId column plus f64 columns named Voice_Scale_1_10__<voice id>.\n",
|
||||||
|
"print(voice_1_10.head())\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "5b3e6ad0",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "phase-3-quant",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.12.10"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user