{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "7174c11a", "metadata": {}, "outputs": [], "source": [ "import polars as pl\n", "from utils import QualtricsSurvey, calculate_weighted_ranking_scores" ] }, { "cell_type": "code", "execution_count": 2, "id": "d9d11d52", "metadata": {}, "outputs": [], "source": [ "SPEECH_DATA = 'data/speech_data/JPMC Speech data ab samples (Final Speech Data AB samples).csv'\n", "RESULTS_FILE = 'data/exports/2-4-26/JPMC_Chase Brand Personality_Quant Round 1_February 4, 2026_Labels.csv'\n", "QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'" ] }, { "cell_type": "code", "execution_count": 37, "id": "c8f06ff8", "metadata": {}, "outputs": [], "source": [ "speech_df = pl.read_csv(SPEECH_DATA, separator=';')\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "id": "8da85898", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 38, "id": "e7ccd8ef", "metadata": {}, "outputs": [], "source": [ "\n", "# Convert comma decimal separator to period and cast to float\n", "cols_to_convert = [\n", " pl.col('dur (s)').str.replace(',', '.').cast(pl.Float64),\n", " pl.col('phonationtime (s)').str.replace(',', '.').cast(pl.Float64),\n", " pl.col('articulation rate (nsyll / phonationtime)').str.replace(',', '.').cast(pl.Float64),\n", " pl.col('speech rate words per minute').str.replace(',', '.').cast(pl.Float64),\n", "]\n", "\n", "# Convert specified columns to float, handling percent signs, commas and spaces\n", "if \"Standard deviation pitch\" in speech_df.columns:\n", " cols_to_convert.append(\n", " pl.col(\"Standard deviation pitch\")\n", " .cast(pl.Utf8)\n", " .str.replace(\",\", \".\")\n", " .str.replace(\" \", \"\")\n", " # strict=False converts unparseable strings (like empty ones) to Null\n", " .cast(pl.Float64, strict=False)\n", " )\n", "\n", "for col_name in [\"Jitter (local)\", \"Shimmer (local)\"]:\n", " if col_name in speech_df.columns:\n", " cols_to_convert.append(\n", " pl.col(col_name)\n", " .cast(pl.Utf8)\n", " .str.replace(\"%\", \"\")\n", " .str.replace(\" \", \"\")\n", " .str.replace(\",\", \".\")\n", " .cast(pl.Float64, strict=False)\n", " )\n", "\n", "if cols_to_convert:\n", " speech_df = speech_df.with_columns(cols_to_convert)\n", "\n", "# speech_df\n" ] }, { "cell_type": "code", "execution_count": 39, "id": "450d1d29", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "shape: (18, 3)\n", "┌───────┬────────┬──────┐\n", "│ Voice ┆ Gender ┆ Age │\n", "│ --- ┆ --- ┆ --- │\n", "│ str ┆ str ┆ i64 │\n", "╞═══════╪════════╪══════╡\n", "│ V04 ┆ female ┆ 28 │\n", "│ V08 ┆ female ┆ 45 │\n", "│ V14 ┆ female ┆ 50 │\n", "│ V16 ┆ male ┆ 40 │\n", "│ V34 ┆ male ┆ 42 │\n", "│ … ┆ … ┆ … │\n", "│ V82 ┆ female ┆ null │\n", "│ V86 ┆ male ┆ 62 │\n", "│ V88 ┆ male ┆ 42 │\n", "│ V89 ┆ female ┆ 32 │\n", "│ V91 ┆ female ┆ null │\n", "└───────┴────────┴──────┘\n" ] } ], "source": [ "# Convert Voice ints to zero-padded strings like \"V04\", \"V81\"\n", "# Survey uses zero-padded IDs (V04, V08) so we must match that format\n", "speech_df = speech_df.with_columns(\n", " pl.when(pl.col(\"Voice\").is_not_null())\n", " .then(pl.concat_str([pl.lit(\"V\"), pl.col(\"Voice\").cast(pl.Utf8).str.zfill(2)]))\n", " .otherwise(None)\n", " .alias(\"Voice\")\n", ")\n", "\n", "print(speech_df.select([\"Voice\", \"Gender\", \"Age\"]).sort(\"Voice\"))" ] }, { "cell_type": "markdown", "id": "5fb615fe", "metadata": {}, "source": [ "# Get survey data" ] }, { "cell_type": "code", "execution_count": 7, "id": "bb4200ee", "metadata": {}, "outputs": [], "source": [ "S = QualtricsSurvey(RESULTS_FILE, QSF_FILE, figures_dir=None)\n", "data_all = S.load_data()\n", "data = S.filter_data(data_all)" ] }, { "cell_type": "code", "execution_count": 8, "id": "57243afd", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "shape: (5, 2)\n", "┌───────────┬────────────────┐\n", "│ Character ┆ Weighted Score │\n", "│ --- ┆ --- │\n", "│ str ┆ i64 │\n", "╞═══════════╪════════════════╡\n", "│ V14 ┆ 209 │\n", "│ V04 ┆ 209 │\n", "│ V08 ┆ 180 │\n", "│ V82 ┆ 172 │\n", "│ V77 ┆ 158 │\n", "└───────────┴────────────────┘\n" ] } ], "source": [ "top3_voices = S.get_top_3_voices(data)[0]\n", "voices_weighted_rank = calculate_weighted_ranking_scores(top3_voices)\n", "print(voices_weighted_rank.head())" ] }, { "cell_type": "code", "execution_count": 9, "id": "b38d21fc", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "shape: (5, 18)\n", "┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐\n", "│ _recordId ┆ Voice_Sca ┆ Voice_Sca ┆ Voice_Sca ┆ … ┆ Voice_Sca ┆ Voice_Sca ┆ Voice_Sca ┆ Voice_Sc │\n", "│ --- ┆ le_1_10__ ┆ le_1_10__ ┆ le_1_10__ ┆ ┆ le_1_10__ ┆ le_1_10__ ┆ le_1_10__ ┆ ale_1_10 │\n", "│ str ┆ V14 ┆ V04 ┆ V08 ┆ ┆ V74 ┆ V81 ┆ V86 ┆ __V88 │\n", "│ ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │\n", "│ ┆ f64 ┆ f64 ┆ f64 ┆ ┆ f64 ┆ f64 ┆ f64 ┆ f64 │\n", "╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡\n", "│ R_59pdrC3 ┆ null ┆ null ┆ null ┆ … ┆ null ┆ null ┆ 5.5 ┆ null │\n", "│ urLmZnbP ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", "│ R_3fJSKy5 ┆ 6.0 ┆ 5.0 ┆ null ┆ … ┆ null ┆ null ┆ 6.0 ┆ null │\n", "│ SVxmNdBC ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", "│ R_3g11G0u ┆ 9.5 ┆ null ┆ 5.0 ┆ … ┆ null ┆ null ┆ null ┆ 9.5 │\n", "│ pJ7iGt8Q ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", "│ R_3i3dGL7 ┆ 6.0 ┆ 9.0 ┆ 8.0 ┆ … ┆ null ┆ 2.0 ┆ 3.0 ┆ null │\n", "│ cfLOTgxb ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", "│ R_3BBF1fR ┆ null ┆ null ┆ null ┆ … ┆ 6.0 ┆ null ┆ 8.5 ┆ null │\n", "│ WGGeButr ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", "└───────────┴───────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴──────────┘\n" ] } ], "source": [ "voice_1_10 = S.get_voice_scale_1_10(data)[0].collect()\n", "print(voice_1_10.head())\n", "\n" ] }, { "cell_type": "code", "execution_count": 10, "id": "5b3e6ad0", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "shape: (5, 2)\n", "┌───────┬────────────────────────┐\n", "│ Voice ┆ Avg Voice Score (1-10) │\n", "│ --- ┆ --- │\n", "│ str ┆ f64 │\n", "╞═══════╪════════════════════════╡\n", "│ V08 ┆ 7.38172 │\n", "│ V82 ┆ 7.376984 │\n", "│ V89 ┆ 7.373206 │\n", "│ V86 ┆ 7.264444 │\n", "│ V69 ┆ 7.219577 │\n", "└───────┴────────────────────────┘\n" ] } ], "source": [ "# --- Compute average voice score (1-10) per voice ---\n", "voice_cols = [c for c in voice_1_10.columns if c.startswith(\"Voice_Scale_1_10__\")]\n", "avg_scores = []\n", "for col in voice_cols:\n", " voice_id = col.replace(\"Voice_Scale_1_10__\", \"\") # e.g. \"V14\"\n", " mean_val = voice_1_10.select(pl.col(col).mean()).item()\n", " avg_scores.append({\"Voice\": voice_id, \"Avg Voice Score (1-10)\": mean_val})\n", "\n", "avg_voice_scores = pl.DataFrame(avg_scores)\n", "print(avg_voice_scores.sort(\"Avg Voice Score (1-10)\", descending=True).head())" ] }, { "cell_type": "code", "execution_count": 40, "id": "79626ffb", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Merged rows: 18 (voices with both speech data and survey data)\n", " → Voices missing Avg Voice Score: ['V46']\n", "shape: (5, 4)\n", "┌───────┬────────┬────────────────────────┬────────────────┐\n", "│ Voice ┆ Gender ┆ Avg Voice Score (1-10) ┆ Weighted Score │\n", "│ --- ┆ --- ┆ --- ┆ --- │\n", "│ str ┆ str ┆ f64 ┆ i64 │\n", "╞═══════╪════════╪════════════════════════╪════════════════╡\n", "│ V14 ┆ female ┆ 7.216279 ┆ 209 │\n", "│ V04 ┆ female ┆ 7.07971 ┆ 209 │\n", "│ V08 ┆ female ┆ 7.38172 ┆ 180 │\n", "│ V82 ┆ female ┆ 7.376984 ┆ 172 │\n", "│ V77 ┆ female ┆ 6.960894 ┆ 158 │\n", "└───────┴────────┴────────────────────────┴────────────────┘\n" ] } ], "source": [ "# --- Normalize weighted rank column name and join all data ---\n", "weighted_rank = voices_weighted_rank.rename({\"Character\": \"Voice\"})\n", "\n", "# Join speech attributes with both survey metrics\n", "# Left join on avg_voice_scores so V46 (excluded from survey voice scale) is kept\n", "# — its Avg Score will be null but Weighted Ranking Score is still valid\n", "merged = (\n", " speech_df\n", " .join(avg_voice_scores, on=\"Voice\", how=\"left\")\n", " .join(weighted_rank, on=\"Voice\", how=\"inner\")\n", ")\n", "\n", "print(f\"Merged rows: {merged.height} (voices with both speech data and survey data)\")\n", "print(f\" → Voices missing Avg Voice Score: {merged.filter(pl.col('Avg Voice Score (1-10)').is_null())['Voice'].to_list()}\")\n", "print(merged.select([\"Voice\", \"Gender\", \"Avg Voice Score (1-10)\", \"Weighted Score\"]).head())" ] }, { "cell_type": "markdown", "id": "932cfb9e", "metadata": {}, "source": [ "# Correlation: Speech Attributes vs Survey Metrics\n", "\n", "We correlate each speech characteristic (pitch, duration, jitter, etc.) against two survey metrics:\n", "\n", "| Metric | Type | Correlation Method | Why |\n", "|---|---|---|---|\n", "| **Avg Voice Score (1-10)** | Continuous | **Pearson** | Both variables are continuous and approximately interval-scaled — Pearson captures linear relationships well here. |\n", "| **Weighted Ranking Score** | Ordinal / count-based | **Spearman** | The weighted score (1st=3pts, 2nd=2pts, 3rd=1pt) is ordinal in nature with a small number of discrete values. Spearman is rank-based, making no assumptions about linearity or normality — more appropriate for this type of data. |\n", "\n", "> **Note:** With only ~17 voices, all correlations should be interpreted cautiously. Small samples amplify the influence of individual data points." ] }, { "cell_type": "code", "execution_count": 41, "id": "77658327", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "shape: (39, 3)\n", "┌─────────────────────────────────┬─────────────────────────────────┬─────────────┐\n", "│ attribute ┆ metric ┆ correlation │\n", "│ --- ┆ --- ┆ --- │\n", "│ str ┆ str ┆ f64 │\n", "╞═════════════════════════════════╪═════════════════════════════════╪═════════════╡\n", "│ Age ┆ Avg Voice Score (1-10) [Pearso… ┆ 0.386054 │\n", "│ Age ┆ Weighted Ranking Score [Spearm… ┆ 0.094086 │\n", "│ Mean pitch ┆ Avg Voice Score (1-10) [Pearso… ┆ 0.459684 │\n", "│ Mean pitch ┆ Weighted Ranking Score [Spearm… ┆ 0.63429 │\n", "│ Standard deviation pitch ┆ Avg Voice Score (1-10) [Pearso… ┆ 0.664432 │\n", "│ … ┆ … ┆ … │\n", "│ speechrate (nsyll/dur) ┆ Weighted Ranking Score [Spearm… ┆ -0.336524 │\n", "│ articulation rate (nsyll / pho… ┆ Avg Voice Score (1-10) [Pearso… ┆ -0.456181 │\n", "│ articulation rate (nsyll / pho… ┆ Weighted Ranking Score [Spearm… ┆ -0.268239 │\n", "│ speech rate words per minute ┆ Avg Voice Score (1-10) [Pearso… ┆ -0.26437 │\n", "│ speech rate words per minute ┆ Weighted Ranking Score [Spearm… ┆ 0.252577 │\n", "└─────────────────────────────────┴─────────────────────────────────┴─────────────┘\n" ] } ], "source": [ "# --- Compute correlations ---\n", "# Pearson for continuous Voice Score, Spearman for ordinal Weighted Ranking Score\n", "exclude_cols = {\"Voice\", \"Gender\", \"Avg Voice Score (1-10)\", \"Weighted Score\"}\n", "speech_attrs = [c for c in merged.columns if c not in exclude_cols]\n", "\n", "rows = []\n", "for attr in speech_attrs:\n", " # Drop nulls for the pair before computing correlation\n", " valid = merged.select([attr, \"Avg Voice Score (1-10)\", \"Weighted Score\"]).drop_nulls()\n", " if valid.height > 2:\n", " # Pearson for continuous 1-10 score\n", " r_score = valid.select(pl.corr(attr, \"Avg Voice Score (1-10)\", method=\"pearson\")).item()\n", " # Spearman for ordinal weighted ranking score\n", " r_rank = valid.select(pl.corr(attr, \"Weighted Score\", method=\"spearman\")).item()\n", " else:\n", " r_score = None\n", " r_rank = None\n", " rows.append({\"attribute\": attr, \"metric\": \"Avg Voice Score (1-10) [Pearson]\", \"correlation\": r_score})\n", " rows.append({\"attribute\": attr, \"metric\": \"Weighted Ranking Score [Spearman]\", \"correlation\": r_rank})\n", "\n", "corr_long = (\n", " pl.DataFrame(rows)\n", " .drop_nulls()\n", " .filter(pl.col(\"correlation\").is_not_nan())\n", ")\n", "print(corr_long)" ] }, { "cell_type": "code", "execution_count": 46, "id": "ef4ceefc", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Female Age correlation:\n", "shape: (2, 3)\n", "┌───────────┬─────────────────────────────────┬─────────────┐\n", "│ attribute ┆ metric ┆ correlation │\n", "│ --- ┆ --- ┆ --- │\n", "│ str ┆ str ┆ f64 │\n", "╞═══════════╪═════════════════════════════════╪═════════════╡\n", "│ Age ┆ Avg Voice Score (1-10) [Pearso… ┆ -0.023566 │\n", "│ Age ┆ Weighted Ranking Score [Spearm… ┆ 0.231908 │\n", "└───────────┴─────────────────────────────────┴─────────────┘\n", "\n", "Female data (Age + Weighted Score):\n", "shape: (6, 3)\n", "┌───────┬─────┬────────────────┐\n", "│ Voice ┆ Age ┆ Weighted Score │\n", "│ --- ┆ --- ┆ --- │\n", "│ str ┆ i64 ┆ i64 │\n", "╞═══════╪═════╪════════════════╡\n", "│ V04 ┆ 28 ┆ 209 │\n", "│ V89 ┆ 32 ┆ 130 │\n", "│ V48 ┆ 35 ┆ 144 │\n", "│ V08 ┆ 45 ┆ 180 │\n", "│ V77 ┆ 48 ┆ 158 │\n", "│ V14 ┆ 50 ┆ 209 │\n", "└───────┴─────┴────────────────┘\n" ] } ], "source": [ "# Verify: Age correlation in female split should no longer be 1.0\n", "print(\"Female Age correlation:\")\n", "print(corr_female.filter(pl.col(\"attribute\") == \"Age\"))\n", "\n", "print(\"\\nFemale data (Age + Weighted Score):\")\n", "female_check = merged.filter(pl.col(\"Gender\") == \"female\").select([\"Voice\", \"Age\", \"Weighted Score\"]).drop_nulls().sort(\"Age\")\n", "print(female_check)" ] }, { "cell_type": "code", "execution_count": 42, "id": "0d9567ff", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Saved plot to figures/2-4-26/All_Respondents/speech_attr_vs_survey_correlation.png\n" ] }, { "data": { "text/html": [ "\n", "\n", "
\n", "" ], "text/plain": [ "alt.LayerChart(...)" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# --- Plot correlation heatmap ---\n", "S.plot_speech_attribute_correlation(\n", " corr_long,\n", " title=\"Speech Characteristics vs Survey Metrics
Correlation per Voice (Pearson / Spearman)\",\n", " filename=\"speech_attr_vs_survey_correlation\",\n", ")" ] }, { "cell_type": "code", "execution_count": 43, "id": "a173be9a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Male voices: 10\n", "Female voices: 8\n" ] } ], "source": [ "# --- Compute correlations by voice gender ---\n", "def compute_corr_for_gender(merged_df: pl.DataFrame, gender: str) -> pl.DataFrame:\n", " \"\"\"Compute Pearson (score) + Spearman (ranking) correlations for a gender subset.\"\"\"\n", " subset = merged_df.filter(pl.col(\"Gender\") == gender)\n", " exclude = {\"Voice\", \"Gender\", \"Avg Voice Score (1-10)\", \"Weighted Score\"}\n", " attrs = [c for c in subset.columns if c not in exclude]\n", "\n", " rows = []\n", " for attr in attrs:\n", " valid = subset.select([attr, \"Avg Voice Score (1-10)\", \"Weighted Score\"]).drop_nulls()\n", " if valid.height > 2:\n", " r_score = valid.select(pl.corr(attr, \"Avg Voice Score (1-10)\", method=\"pearson\")).item()\n", " r_rank = valid.select(pl.corr(attr, \"Weighted Score\", method=\"spearman\")).item()\n", " else:\n", " r_score = None\n", " r_rank = None\n", " rows.append({\"attribute\": attr, \"metric\": \"Avg Voice Score (1-10) [Pearson]\", \"correlation\": r_score})\n", " rows.append({\"attribute\": attr, \"metric\": \"Weighted Ranking Score [Spearman]\", \"correlation\": r_rank})\n", "\n", " return pl.DataFrame(rows).drop_nulls().filter(pl.col(\"correlation\").is_not_nan())\n", "\n", "corr_male = compute_corr_for_gender(merged, \"male\")\n", "corr_female = compute_corr_for_gender(merged, \"female\")\n", "\n", "print(f\"Male voices: {merged.filter(pl.col('Gender') == 'male').height}\")\n", "print(f\"Female voices: {merged.filter(pl.col('Gender') == 'female').height}\")" ] }, { "cell_type": "code", "execution_count": 44, "id": "84eaaff6", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Saved plot to figures/2-4-26/All_Respondents/speech_attr_vs_survey_correlation_male.png\n" ] }, { "data": { "text/html": [ "\n", "\n", "
\n", "" ], "text/plain": [ "alt.LayerChart(...)" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# --- Plot: Male voices ---\n", "S.plot_speech_attribute_correlation(\n", " corr_male,\n", " title=\"Speech Characteristics vs Survey Metrics
Male Voices Only (Pearson / Spearman)\",\n", " filename=\"speech_attr_vs_survey_correlation_male\",\n", ")" ] }, { "cell_type": "code", "execution_count": 45, "id": "d04225e1", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Saved plot to figures/2-4-26/All_Respondents/speech_attr_vs_survey_correlation_female.png\n" ] }, { "data": { "text/html": [ "\n", "\n", "
\n", "" ], "text/plain": [ "alt.LayerChart(...)" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# --- Plot: Female voices ---\n", "S.plot_speech_attribute_correlation(\n", " corr_female,\n", " title=\"Speech Characteristics vs Survey Metrics
Female Voices Only (Pearson / Spearman)\",\n", " filename=\"speech_attr_vs_survey_correlation_female\",\n", ")" ] }, { "cell_type": "code", "execution_count": 47, "id": "8e2fbc25", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "=== MALE | Age vs Avg Voice Score (1-10) [Pearson] | r = 0.793 ===\n", "shape: (9, 3)\n", "┌───────┬─────┬────────────────────────┐\n", "│ Voice ┆ Age ┆ Avg Voice Score (1-10) │\n", "│ --- ┆ --- ┆ --- │\n", "│ str ┆ i64 ┆ f64 │\n", "╞═══════╪═════╪════════════════════════╡\n", "│ V81 ┆ 28 ┆ 6.683007 │\n", "│ V54 ┆ 36 ┆ 6.67734 │\n", "│ V16 ┆ 40 ┆ 6.921053 │\n", "│ V74 ┆ 41 ┆ 6.89196 │\n", "│ V34 ┆ 42 ┆ 7.153005 │\n", "│ V88 ┆ 42 ┆ 6.916667 │\n", "│ V69 ┆ 43 ┆ 7.219577 │\n", "│ V45 ┆ 43 ┆ 7.062189 │\n", "│ V86 ┆ 62 ┆ 7.264444 │\n", "└───────┴─────┴────────────────────────┘\n", "\n", "=== MALE | Shimmer (local) vs Avg Voice Score (1-10) [Pearson] | r = 0.813 ===\n", "shape: (9, 3)\n", "┌───────┬─────────────────┬────────────────────────┐\n", "│ Voice ┆ Shimmer (local) ┆ Avg Voice Score (1-10) │\n", "│ --- ┆ --- ┆ --- │\n", "│ str ┆ f64 ┆ f64 │\n", "╞═══════╪═════════════════╪════════════════════════╡\n", "│ V88 ┆ 9.07 ┆ 6.916667 │\n", "│ V74 ┆ 9.09 ┆ 6.89196 │\n", "│ V54 ┆ 9.11 ┆ 6.67734 │\n", "│ V81 ┆ 9.13 ┆ 6.683007 │\n", "│ V16 ┆ 9.38 ┆ 6.921053 │\n", "│ V34 ┆ 10.07 ┆ 7.153005 │\n", "│ V86 ┆ 10.28 ┆ 7.264444 │\n", "│ V45 ┆ 10.39 ┆ 7.062189 │\n", "│ V69 ┆ 11.52 ┆ 7.219577 │\n", "└───────┴─────────────────┴────────────────────────┘\n", "\n", "=== MALE | Mean harmonics-to-noise ratio dB vs Avg Voice Score (1-10) [Pearson] | r = -0.738 ===\n", "shape: (9, 3)\n", "┌───────┬─────────────────────────────────┬────────────────────────┐\n", "│ Voice ┆ Mean harmonics-to-noise ratio … ┆ Avg Voice Score (1-10) │\n", "│ --- ┆ --- ┆ --- │\n", "│ str ┆ f64 ┆ f64 │\n", "╞═══════╪═════════════════════════════════╪════════════════════════╡\n", "│ V16 ┆ 7.544 ┆ 6.921053 │\n", "│ V34 ┆ 7.671 ┆ 7.153005 │\n", "│ V69 ┆ 7.763 ┆ 7.219577 │\n", "│ V86 ┆ 8.092 ┆ 7.264444 │\n", "│ V45 ┆ 8.647 ┆ 7.062189 │\n", "│ V74 ┆ 8.732 ┆ 6.89196 │\n", "│ V88 ┆ 9.009 ┆ 6.916667 │\n", "│ V81 ┆ 9.026 ┆ 6.683007 │\n", "│ V54 ┆ 9.598 ┆ 6.67734 │\n", "└───────┴─────────────────────────────────┴────────────────────────┘\n", "\n", "=== MALE | Mean harmonics-to-noise ratio dB vs Weighted Ranking Score [Spearman] | r = -0.767 ===\n", "shape: (10, 3)\n", "┌───────┬─────────────────────────────────┬────────────────┐\n", "│ Voice ┆ Mean harmonics-to-noise ratio … ┆ Weighted Score │\n", "│ --- ┆ --- ┆ --- │\n", "│ str ┆ f64 ┆ i64 │\n", "╞═══════╪═════════════════════════════════╪════════════════╡\n", "│ V16 ┆ 7.544 ┆ 156 │\n", "│ V34 ┆ 7.671 ┆ 128 │\n", "│ V69 ┆ 7.763 ┆ 121 │\n", "│ V86 ┆ 8.092 ┆ 76 │\n", "│ V46 ┆ 8.625 ┆ 148 │\n", "│ V45 ┆ 8.647 ┆ 118 │\n", "│ V74 ┆ 8.732 ┆ 117 │\n", "│ V88 ┆ 9.009 ┆ 97 │\n", "│ V81 ┆ 9.026 ┆ 69 │\n", "│ V54 ┆ 9.598 ┆ 116 │\n", "└───────┴─────────────────────────────────┴────────────────┘\n", "\n", "=== FEMALE: no correlations with |r| > 0.7 ===\n", "\n" ] } ], "source": [ "# Inspect underlying data for all |r| > 0.7 correlations in gender subgroups\n", "THRESHOLD = 0.7\n", "\n", "metric_col_map = {\n", " \"Avg Voice Score (1-10) [Pearson]\": \"Avg Voice Score (1-10)\",\n", " \"Weighted Ranking Score [Spearman]\": \"Weighted Score\",\n", "}\n", "\n", "for gender, corr_df in [(\"MALE\", corr_male), (\"FEMALE\", corr_female)]:\n", " strong = corr_df.filter(pl.col(\"correlation\").abs() > THRESHOLD)\n", " if strong.height == 0:\n", " print(f\"=== {gender}: no correlations with |r| > {THRESHOLD} ===\\n\")\n", " continue\n", "\n", " subset = merged.filter(pl.col(\"Gender\") == gender.lower())\n", "\n", " for row in strong.iter_rows(named=True):\n", " attr = row[\"attribute\"]\n", " metric_label = row[\"metric\"]\n", " r = row[\"correlation\"]\n", " survey_col = metric_col_map[metric_label]\n", "\n", " table = (\n", " subset\n", " .select([\"Voice\", attr, survey_col])\n", " .drop_nulls()\n", " .sort(attr)\n", " )\n", " print(f\"=== {gender} | {attr} vs {metric_label} | r = {r:.3f} ===\")\n", " print(table)\n", " print()" ] }, { "cell_type": "code", "execution_count": 48, "id": "959945f2", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Saved 4 sheets to figures/2-4-26/strong_correlations_by_gender.xlsx\n" ] } ], "source": [ "import xlsxwriter\n", "from pathlib import Path\n", "\n", "out_path = Path(S.fig_save_dir) / \"strong_correlations_by_gender.xlsx\"\n", "\n", "with xlsxwriter.Workbook(str(out_path)) as wb:\n", " sheet_count = 0\n", " for gender, corr_df in [(\"Male\", corr_male), (\"Female\", corr_female)]:\n", " strong = corr_df.filter(pl.col(\"correlation\").abs() > THRESHOLD)\n", " if strong.height == 0:\n", " continue\n", "\n", " subset = merged.filter(pl.col(\"Gender\") == gender.lower())\n", "\n", " for row in strong.iter_rows(named=True):\n", " attr = row[\"attribute\"]\n", " metric_label = row[\"metric\"]\n", " r = row[\"correlation\"]\n", " survey_col = metric_col_map[metric_label]\n", "\n", " table = subset.select([\"Voice\", attr, survey_col]).drop_nulls().sort(attr)\n", "\n", " # Sheet name: max 31 chars for Excel\n", " short_metric = \"Score\" if \"Pearson\" in metric_label else \"Rank\"\n", " sheet_name = f\"{gender}_{attr[:18]}_{short_metric}\"[:31]\n", "\n", " ws = wb.add_worksheet(sheet_name)\n", " # Header row with context\n", " bold = wb.add_format({\"bold\": True})\n", " ws.write(0, 0, f\"{gender} | {attr} vs {metric_label} | r = {r:.3f}\", bold)\n", "\n", " # Column headers\n", " for ci, col_name in enumerate(table.columns):\n", " ws.write(2, ci, col_name, bold)\n", "\n", " # Data rows\n", " for ri, data_row in enumerate(table.iter_rows()):\n", " for ci, val in enumerate(data_row):\n", " ws.write(3 + ri, ci, val)\n", "\n", " # Auto-fit column widths\n", " for ci, col_name in enumerate(table.columns):\n", " max_len = max(len(str(col_name)), *(len(str(v)) for v in table.get_column(col_name).to_list()))\n", " ws.set_column(ci, ci, max_len + 2)\n", "\n", " sheet_count += 1\n", "\n", "print(f\"Saved {sheet_count} sheets to {out_path}\")" ] } ], "metadata": { "kernelspec": { "display_name": "phase-3-quant", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.10" } }, "nbformat": 4, "nbformat_minor": 5 }