{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7174c11a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import polars as pl\n",
    "from utils import QualtricsSurvey, calculate_weighted_ranking_scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d9d11d52",
   "metadata": {},
   "outputs": [],
   "source": [
    "SPEECH_DATA = 'data/speech_data/JPMC Speech data ab samples (Final Speech Data AB samples).csv'\n",
    "RESULTS_FILE = 'data/exports/2-4-26/JPMC_Chase Brand Personality_Quant Round 1_February 4, 2026_Labels.csv'\n",
    "QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3e9a7c10",
   "metadata": {},
   "source": [
    "# Load and clean speech data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c8f06ff8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep the raw frame untouched: every later cell derives a new frame from it,\n",
    "# so cells can be re-run in any order without compounding their effects.\n",
    "speech_raw = pl.read_csv(SPEECH_DATA, separator=';')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8da85898",
   "metadata": {},
   "outputs": [],
   "source": [
    "def decimal_comma_to_float(\n",
    "    column: str, *, strip_percent: bool = False, strict: bool = True\n",
    ") -> pl.Expr:\n",
    "    \"\"\"Build an expression that parses a decimal-comma text column as Float64.\n",
    "\n",
    "    Args:\n",
    "        column: Name of the column to convert.\n",
    "        strip_percent: Also drop '%' signs before parsing.\n",
    "        strict: Passed to the final cast. With False, values that still cannot\n",
    "            be parsed (e.g. empty strings) become null instead of raising.\n",
    "\n",
    "    Returns:\n",
    "        A polars expression producing the converted column under its original name.\n",
    "    \"\"\"\n",
    "    # Cast to text first so the expression also works on columns that polars\n",
    "    # already inferred as numeric.\n",
    "    text = pl.col(column).cast(pl.Utf8)\n",
    "    if strip_percent:\n",
    "        text = text.str.replace_all('%', '', literal=True)\n",
    "    return (\n",
    "        text\n",
    "        # replace_all: str.replace would only remove the first space.\n",
    "        .str.replace_all(' ', '', literal=True)\n",
    "        .str.replace(',', '.', literal=True)\n",
    "        .cast(pl.Float64, strict=strict)\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e7ccd8ef",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Always converted; a missing column or an unparseable value is an error.\n",
    "REQUIRED_DECIMAL_COLS = [\n",
    "    'dur (s)',\n",
    "    'phonationtime (s)',\n",
    "    'articulation rate (nsyll / phonationtime)',\n",
    "    'speech rate words per minute',\n",
    "]\n",
    "# Converted only when present; unparseable values become null.\n",
    "OPTIONAL_DECIMAL_COLS = ['speechrate (nsyll/dur)', 'Standard deviation pitch']\n",
    "OPTIONAL_PERCENT_COLS = ['Jitter (local)', 'Shimmer (local)']\n",
    "\n",
    "present = set(speech_raw.columns)\n",
    "conversions = [decimal_comma_to_float(name) for name in REQUIRED_DECIMAL_COLS]\n",
    "conversions += [\n",
    "    decimal_comma_to_float(name, strict=False)\n",
    "    for name in OPTIONAL_DECIMAL_COLS\n",
    "    if name in present\n",
    "]\n",
    "conversions += [\n",
    "    decimal_comma_to_float(name, strip_percent=True, strict=False)\n",
    "    for name in OPTIONAL_PERCENT_COLS\n",
    "    if name in present\n",
    "]\n",
    "\n",
    "speech_numeric = speech_raw.with_columns(conversions)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "450d1d29",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build survey-style voice labels such as 'V04' or 'V81' from the Voice ids.\n",
    "# Any existing 'V' prefix is stripped first and the number is zero-padded to two\n",
    "# digits, so the result is the same however often this cell is run and lines up\n",
    "# with the zero-padded labels used in the survey ranking.\n",
    "voice_number = (\n",
    "    pl.col('Voice')\n",
    "    .cast(pl.Utf8)\n",
    "    .str.strip_chars()\n",
    "    .str.strip_chars_start('V')\n",
    "    .str.zfill(2)\n",
    ")\n",
    "\n",
    "speech_df = speech_numeric.with_columns(\n",
    "    pl.when(pl.col('Voice').is_not_null())\n",
    "    .then(pl.concat_str([pl.lit('V'), voice_number]))\n",
    "    .otherwise(None)\n",
    "    .alias('Voice')\n",
    ")\n",
    "\n",
    "speech_df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5fb615fe",
   "metadata": {},
   "source": [
    "# Get survey data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bb4200ee",
   "metadata": {},
   "outputs": [],
   "source": [
    "S = QualtricsSurvey(RESULTS_FILE, QSF_FILE, figures_dir=None)\n",
    "data_all = S.load_data()\n",
    "data = S.filter_data(data_all)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "57243afd",
   "metadata": {},
   "outputs": [],
   "source": [
    "top3_voices = S.get_top_3_voices(data)[0]\n",
    "voices_weighted_rank = calculate_weighted_ranking_scores(top3_voices)\n",
    "voices_weighted_rank.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b38d21fc",
   "metadata": {},
   "outputs": [],
   "source": [
    "voice_1_10 = S.get_voice_scale_1_10(data)[0].collect()\n",
    "\n",
    "# Preview without the respondent record ids so they are not stored in the saved notebook.\n",
    "voice_1_10.drop('_recordId').head()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "phase-3-quant",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}