diff --git a/plots.py b/plots.py index 52a7d7e..3fd0631 100644 --- a/plots.py +++ b/plots.py @@ -2860,4 +2860,120 @@ class QualtricsPlotsMixin: ) chart = self._save_plot(chart, title) + return chart + + def plot_speech_attribute_correlation( + self, + corr_df: pl.DataFrame | pd.DataFrame, + title: str = "Speech Attributes vs Survey Metrics
Pearson Correlation", + filename: str | None = None, + height: int | None = None, + width: int | None = None, + show_values: bool = True, + color_scheme: str | None = None, + ) -> alt.Chart: + """Plot a correlation heatmap between speech attributes and survey metrics. + + Expects a long-form DataFrame with columns: + - metric: row label (e.g. "Weighted Rank", "Avg Voice Score") + - attribute: column label (speech characteristic name) + - correlation: Pearson r value + + Args: + corr_df: Long-form correlation DataFrame. + title: Chart title (supports
for line breaks). + filename: Optional explicit filename (without extension). + height: Chart height in pixels. + width: Chart width in pixels. + show_values: Whether to display correlation values as text. + color_scheme: Optional Altair diverging color scheme name. + + Returns: + alt.Chart: Altair heatmap chart. + """ + if isinstance(corr_df, pl.DataFrame): + plot_df = corr_df.to_pandas() + else: + plot_df = corr_df + + attributes = plot_df["attribute"].unique().tolist() + metrics = plot_df["metric"].unique().tolist() + + n_attrs = len(attributes) + chart_width = width or max(600, n_attrs * 55) + chart_height = height or max(120, len(metrics) * 50 + 60) + + heatmap = ( + alt.Chart(plot_df) + .mark_rect(stroke="white", strokeWidth=1) + .encode( + x=alt.X( + "attribute:N", + title=None, + sort=attributes, + axis=alt.Axis(labelAngle=-45, labelLimit=180, grid=False), + ), + y=alt.Y( + "metric:N", + title=None, + sort=metrics, + axis=alt.Axis(labelLimit=200, grid=False), + ), + color=alt.Color( + "correlation:Q", + scale=alt.Scale( + domain=[-1, 1], + scheme=color_scheme or "redblue", + ), + legend=alt.Legend(title="Pearson r"), + ), + tooltip=[ + alt.Tooltip("metric:N", title="Metric"), + alt.Tooltip("attribute:N", title="Attribute"), + alt.Tooltip("correlation:Q", title="r", format=".3f"), + ], + ) + ) + + if show_values: + # Split into two text layers with fixed mark colors to avoid + # conflicting color encodings that break vl_convert PNG export. + dark_rows = plot_df[plot_df["correlation"].abs() <= 0.45] + light_rows = plot_df[plot_df["correlation"].abs() > 0.45] + + text_layers = [] + if not dark_rows.empty: + text_layers.append( + alt.Chart(dark_rows) + .mark_text(fontSize=11, fontWeight="normal", color="black") + .encode( + x=alt.X("attribute:N", sort=attributes), + y=alt.Y("metric:N", sort=metrics), + text=alt.Text("correlation:Q", format=".2f"), + ) + ) + if not light_rows.empty: + text_layers.append( + alt.Chart(light_rows) + .mark_text(fontSize=11, fontWeight="normal", color="white") + .encode( + x=alt.X("attribute:N", sort=attributes), + y=alt.Y("metric:N", sort=metrics), + text=alt.Text("correlation:Q", format=".2f"), + ) + ) + + chart = heatmap + for tl in text_layers: + chart = chart + tl + else: + chart = heatmap + + chart = chart.properties( + title=self._process_title(title), + width=chart_width, + height=chart_height, + ) + + chart = self._save_plot(chart, title, filename=filename) return chart \ No newline at end of file diff --git a/speech_data_correlation.ipynb b/speech_data_correlation.ipynb index f783603..cb62e86 100644 --- a/speech_data_correlation.ipynb +++ b/speech_data_correlation.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 24, + "execution_count": 1, "id": "7174c11a", "metadata": {}, "outputs": [], @@ -13,7 +13,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "id": "d9d11d52", "metadata": {}, "outputs": [], @@ -25,7 +25,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 37, "id": "c8f06ff8", "metadata": {}, "outputs": [], @@ -36,30 +36,29 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "id": "8da85898", "metadata": {}, "outputs": [], - "source": [ - "# Convert comma decimal separator to period and cast to float\n", - "speech_df = speech_df.with_columns([\n", - " pl.col('dur (s)').str.replace(',', '.').cast(pl.Float64),\n", - " pl.col('phonationtime (s)').str.replace(',', '.').cast(pl.Float64),\n", - " pl.col('articulation rate (nsyll / phonationtime)').str.replace(',', '.').cast(pl.Float64),\n", - " pl.col('speech rate words per minute').str.replace(',', '.').cast(pl.Float64),\n", - "])" - ] + "source": [] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 38, "id": "e7ccd8ef", "metadata": {}, "outputs": [], "source": [ - "# Convert specified columns to float, handling percent signs, commas and spaces\n", - "cols_to_convert = []\n", "\n", + "# Convert comma decimal separator to period and cast to float\n", + "cols_to_convert = [\n", + " pl.col('dur (s)').str.replace(',', '.').cast(pl.Float64),\n", + " pl.col('phonationtime (s)').str.replace(',', '.').cast(pl.Float64),\n", + " pl.col('articulation rate (nsyll / phonationtime)').str.replace(',', '.').cast(pl.Float64),\n", + " pl.col('speech rate words per minute').str.replace(',', '.').cast(pl.Float64),\n", + "]\n", + "\n", + "# Convert specified columns to float, handling percent signs, commas and spaces\n", "if \"Standard deviation pitch\" in speech_df.columns:\n", " cols_to_convert.append(\n", " pl.col(\"Standard deviation pitch\")\n", @@ -89,7 +88,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 39, "id": "450d1d29", "metadata": {}, "outputs": [ @@ -97,34 +96,38 @@ "name": "stdout", "output_type": "stream", "text": [ - "shape: (5, 22)\n", - "┌───────┬────────┬─────┬────────────┬───┬──────────────┬──────────────┬──────────────┬─────────────┐\n", - "│ Voice ┆ Gender ┆ Age ┆ Mean pitch ┆ … ┆ phonationtim ┆ speechrate ┆ articulation ┆ speech rate │\n", - "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ e (s) ┆ (nsyll/dur) ┆ rate (nsyll ┆ words per │\n", - "│ str ┆ str ┆ i64 ┆ f64 ┆ ┆ --- ┆ --- ┆ / pho… ┆ minute │\n", - "│ ┆ ┆ ┆ ┆ ┆ str ┆ str ┆ --- ┆ --- │\n", - "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ str ┆ str │\n", - "╞═══════╪════════╪═════╪════════════╪═══╪══════════════╪══════════════╪══════════════╪═════════════╡\n", - "│ VVV74 ┆ male ┆ 41 ┆ 95.25 ┆ … ┆ 11,17 ┆ 4,32468397 ┆ 5,81915846 ┆ 185,3436 │\n", - "│ VVV54 ┆ male ┆ 36 ┆ 126.93 ┆ … ┆ 10,83 ┆ 4,38596491 ┆ 6,00184672 ┆ 187,9699 │\n", - "│ VVV48 ┆ female ┆ 35 ┆ 193.296 ┆ … ┆ 11,92 ┆ 4,02227723 ┆ 5,45302013 ┆ 172,3833 │\n", - "│ VVV14 ┆ female ┆ 50 ┆ 169.214 ┆ … ┆ 12,75 ┆ 3,97553517 ┆ 5,09803922 ┆ 170,3801 │\n", - "│ VVV4 ┆ female ┆ 28 ┆ 208.079 ┆ … ┆ 12,22 ┆ 4,10094637 ┆ 5,31914894 ┆ 175,7548 │\n", - "└───────┴────────┴─────┴────────────┴───┴──────────────┴──────────────┴──────────────┴─────────────┘\n" + "shape: (18, 3)\n", + "┌───────┬────────┬──────┐\n", + "│ Voice ┆ Gender ┆ Age │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ str ┆ str ┆ i64 │\n", + "╞═══════╪════════╪══════╡\n", + "│ V04 ┆ female ┆ 28 │\n", + "│ V08 ┆ female ┆ 45 │\n", + "│ V14 ┆ female ┆ 50 │\n", + "│ V16 ┆ male ┆ 40 │\n", + "│ V34 ┆ male ┆ 42 │\n", + "│ … ┆ … ┆ … │\n", + "│ V82 ┆ female ┆ null │\n", + "│ V86 ┆ male ┆ 62 │\n", + "│ V88 ┆ male ┆ 42 │\n", + "│ V89 ┆ female ┆ 32 │\n", + "│ V91 ┆ female ┆ null │\n", + "└───────┴────────┴──────┘\n" ] } ], "source": [ - "# Convert Voice ints to strings like \"V81\"\n", - "# Convert Voice ints to strings like \"V81\"\n", + "# Convert Voice ints to zero-padded strings like \"V04\", \"V81\"\n", + "# Survey uses zero-padded IDs (V04, V08) so we must match that format\n", "speech_df = speech_df.with_columns(\n", " pl.when(pl.col(\"Voice\").is_not_null())\n", - " .then(pl.concat_str([pl.lit(\"V\"), pl.col(\"Voice\").cast(pl.Utf8)]))\n", + " .then(pl.concat_str([pl.lit(\"V\"), pl.col(\"Voice\").cast(pl.Utf8).str.zfill(2)]))\n", " .otherwise(None)\n", " .alias(\"Voice\")\n", ")\n", "\n", - "print(speech_df.head())" + "print(speech_df.select([\"Voice\", \"Gender\", \"Age\"]).sort(\"Voice\"))" ] }, { @@ -137,7 +140,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "bb4200ee", "metadata": {}, "outputs": [], @@ -149,7 +152,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 8, "id": "57243afd", "metadata": {}, "outputs": [ @@ -180,7 +183,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 9, "id": "b38d21fc", "metadata": {}, "outputs": [ @@ -218,11 +221,751 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "5b3e6ad0", "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "shape: (5, 2)\n", + "┌───────┬────────────────────────┐\n", + "│ Voice ┆ Avg Voice Score (1-10) │\n", + "│ --- ┆ --- │\n", + "│ str ┆ f64 │\n", + "╞═══════╪════════════════════════╡\n", + "│ V08 ┆ 7.38172 │\n", + "│ V82 ┆ 7.376984 │\n", + "│ V89 ┆ 7.373206 │\n", + "│ V86 ┆ 7.264444 │\n", + "│ V69 ┆ 7.219577 │\n", + "└───────┴────────────────────────┘\n" + ] + } + ], + "source": [ + "# --- Compute average voice score (1-10) per voice ---\n", + "voice_cols = [c for c in voice_1_10.columns if c.startswith(\"Voice_Scale_1_10__\")]\n", + "avg_scores = []\n", + "for col in voice_cols:\n", + " voice_id = col.replace(\"Voice_Scale_1_10__\", \"\") # e.g. \"V14\"\n", + " mean_val = voice_1_10.select(pl.col(col).mean()).item()\n", + " avg_scores.append({\"Voice\": voice_id, \"Avg Voice Score (1-10)\": mean_val})\n", + "\n", + "avg_voice_scores = pl.DataFrame(avg_scores)\n", + "print(avg_voice_scores.sort(\"Avg Voice Score (1-10)\", descending=True).head())" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "79626ffb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Merged rows: 18 (voices with both speech data and survey data)\n", + " → Voices missing Avg Voice Score: ['V46']\n", + "shape: (5, 4)\n", + "┌───────┬────────┬────────────────────────┬────────────────┐\n", + "│ Voice ┆ Gender ┆ Avg Voice Score (1-10) ┆ Weighted Score │\n", + "│ --- ┆ --- ┆ --- ┆ --- │\n", + "│ str ┆ str ┆ f64 ┆ i64 │\n", + "╞═══════╪════════╪════════════════════════╪════════════════╡\n", + "│ V14 ┆ female ┆ 7.216279 ┆ 209 │\n", + "│ V04 ┆ female ┆ 7.07971 ┆ 209 │\n", + "│ V08 ┆ female ┆ 7.38172 ┆ 180 │\n", + "│ V82 ┆ female ┆ 7.376984 ┆ 172 │\n", + "│ V77 ┆ female ┆ 6.960894 ┆ 158 │\n", + "└───────┴────────┴────────────────────────┴────────────────┘\n" + ] + } + ], + "source": [ + "# --- Normalize weighted rank column name and join all data ---\n", + "weighted_rank = voices_weighted_rank.rename({\"Character\": \"Voice\"})\n", + "\n", + "# Join speech attributes with both survey metrics\n", + "# Left join on avg_voice_scores so V46 (excluded from survey voice scale) is kept\n", + "# — its Avg Score will be null but Weighted Ranking Score is still valid\n", + "merged = (\n", + " speech_df\n", + " .join(avg_voice_scores, on=\"Voice\", how=\"left\")\n", + " .join(weighted_rank, on=\"Voice\", how=\"inner\")\n", + ")\n", + "\n", + "print(f\"Merged rows: {merged.height} (voices with both speech data and survey data)\")\n", + "print(f\" → Voices missing Avg Voice Score: {merged.filter(pl.col('Avg Voice Score (1-10)').is_null())['Voice'].to_list()}\")\n", + "print(merged.select([\"Voice\", \"Gender\", \"Avg Voice Score (1-10)\", \"Weighted Score\"]).head())" + ] + }, + { + "cell_type": "markdown", + "id": "932cfb9e", + "metadata": {}, + "source": [ + "# Correlation: Speech Attributes vs Survey Metrics\n", + "\n", + "We correlate each speech characteristic (pitch, duration, jitter, etc.) against two survey metrics:\n", + "\n", + "| Metric | Type | Correlation Method | Why |\n", + "|---|---|---|---|\n", + "| **Avg Voice Score (1-10)** | Continuous | **Pearson** | Both variables are continuous and approximately interval-scaled — Pearson captures linear relationships well here. |\n", + "| **Weighted Ranking Score** | Ordinal / count-based | **Spearman** | The weighted score (1st=3pts, 2nd=2pts, 3rd=1pt) is ordinal in nature with a small number of discrete values. Spearman is rank-based, making no assumptions about linearity or normality — more appropriate for this type of data. |\n", + "\n", + "> **Note:** With only ~17 voices, all correlations should be interpreted cautiously. Small samples amplify the influence of individual data points." + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "77658327", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "shape: (39, 3)\n", + "┌─────────────────────────────────┬─────────────────────────────────┬─────────────┐\n", + "│ attribute ┆ metric ┆ correlation │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ str ┆ str ┆ f64 │\n", + "╞═════════════════════════════════╪═════════════════════════════════╪═════════════╡\n", + "│ Age ┆ Avg Voice Score (1-10) [Pearso… ┆ 0.386054 │\n", + "│ Age ┆ Weighted Ranking Score [Spearm… ┆ 0.094086 │\n", + "│ Mean pitch ┆ Avg Voice Score (1-10) [Pearso… ┆ 0.459684 │\n", + "│ Mean pitch ┆ Weighted Ranking Score [Spearm… ┆ 0.63429 │\n", + "│ Standard deviation pitch ┆ Avg Voice Score (1-10) [Pearso… ┆ 0.664432 │\n", + "│ … ┆ … ┆ … │\n", + "│ speechrate (nsyll/dur) ┆ Weighted Ranking Score [Spearm… ┆ -0.336524 │\n", + "│ articulation rate (nsyll / pho… ┆ Avg Voice Score (1-10) [Pearso… ┆ -0.456181 │\n", + "│ articulation rate (nsyll / pho… ┆ Weighted Ranking Score [Spearm… ┆ -0.268239 │\n", + "│ speech rate words per minute ┆ Avg Voice Score (1-10) [Pearso… ┆ -0.26437 │\n", + "│ speech rate words per minute ┆ Weighted Ranking Score [Spearm… ┆ 0.252577 │\n", + "└─────────────────────────────────┴─────────────────────────────────┴─────────────┘\n" + ] + } + ], + "source": [ + "# --- Compute correlations ---\n", + "# Pearson for continuous Voice Score, Spearman for ordinal Weighted Ranking Score\n", + "exclude_cols = {\"Voice\", \"Gender\", \"Avg Voice Score (1-10)\", \"Weighted Score\"}\n", + "speech_attrs = [c for c in merged.columns if c not in exclude_cols]\n", + "\n", + "rows = []\n", + "for attr in speech_attrs:\n", + " # Drop nulls for the pair before computing correlation\n", + " valid = merged.select([attr, \"Avg Voice Score (1-10)\", \"Weighted Score\"]).drop_nulls()\n", + " if valid.height > 2:\n", + " # Pearson for continuous 1-10 score\n", + " r_score = valid.select(pl.corr(attr, \"Avg Voice Score (1-10)\", method=\"pearson\")).item()\n", + " # Spearman for ordinal weighted ranking score\n", + " r_rank = valid.select(pl.corr(attr, \"Weighted Score\", method=\"spearman\")).item()\n", + " else:\n", + " r_score = None\n", + " r_rank = None\n", + " rows.append({\"attribute\": attr, \"metric\": \"Avg Voice Score (1-10) [Pearson]\", \"correlation\": r_score})\n", + " rows.append({\"attribute\": attr, \"metric\": \"Weighted Ranking Score [Spearman]\", \"correlation\": r_rank})\n", + "\n", + "corr_long = (\n", + " pl.DataFrame(rows)\n", + " .drop_nulls()\n", + " .filter(pl.col(\"correlation\").is_not_nan())\n", + ")\n", + "print(corr_long)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "ef4ceefc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Female Age correlation:\n", + "shape: (2, 3)\n", + "┌───────────┬─────────────────────────────────┬─────────────┐\n", + "│ attribute ┆ metric ┆ correlation │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ str ┆ str ┆ f64 │\n", + "╞═══════════╪═════════════════════════════════╪═════════════╡\n", + "│ Age ┆ Avg Voice Score (1-10) [Pearso… ┆ -0.023566 │\n", + "│ Age ┆ Weighted Ranking Score [Spearm… ┆ 0.231908 │\n", + "└───────────┴─────────────────────────────────┴─────────────┘\n", + "\n", + "Female data (Age + Weighted Score):\n", + "shape: (6, 3)\n", + "┌───────┬─────┬────────────────┐\n", + "│ Voice ┆ Age ┆ Weighted Score │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ str ┆ i64 ┆ i64 │\n", + "╞═══════╪═════╪════════════════╡\n", + "│ V04 ┆ 28 ┆ 209 │\n", + "│ V89 ┆ 32 ┆ 130 │\n", + "│ V48 ┆ 35 ┆ 144 │\n", + "│ V08 ┆ 45 ┆ 180 │\n", + "│ V77 ┆ 48 ┆ 158 │\n", + "│ V14 ┆ 50 ┆ 209 │\n", + "└───────┴─────┴────────────────┘\n" + ] + } + ], + "source": [ + "# Verify: Age correlation in female split should no longer be 1.0\n", + "print(\"Female Age correlation:\")\n", + "print(corr_female.filter(pl.col(\"attribute\") == \"Age\"))\n", + "\n", + "print(\"\\nFemale data (Age + Weighted Score):\")\n", + "female_check = merged.filter(pl.col(\"Gender\") == \"female\").select([\"Voice\", \"Age\", \"Weighted Score\"]).drop_nulls().sort(\"Age\")\n", + "print(female_check)" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "0d9567ff", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saved plot to figures/2-4-26/All_Respondents/speech_attr_vs_survey_correlation.png\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.LayerChart(...)" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# --- Plot correlation heatmap ---\n", + "S.plot_speech_attribute_correlation(\n", + " corr_long,\n", + " title=\"Speech Characteristics vs Survey Metrics
Correlation per Voice (Pearson / Spearman)\",\n", + " filename=\"speech_attr_vs_survey_correlation\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "a173be9a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Male voices: 10\n", + "Female voices: 8\n" + ] + } + ], + "source": [ + "# --- Compute correlations by voice gender ---\n", + "def compute_corr_for_gender(merged_df: pl.DataFrame, gender: str) -> pl.DataFrame:\n", + " \"\"\"Compute Pearson (score) + Spearman (ranking) correlations for a gender subset.\"\"\"\n", + " subset = merged_df.filter(pl.col(\"Gender\") == gender)\n", + " exclude = {\"Voice\", \"Gender\", \"Avg Voice Score (1-10)\", \"Weighted Score\"}\n", + " attrs = [c for c in subset.columns if c not in exclude]\n", + "\n", + " rows = []\n", + " for attr in attrs:\n", + " valid = subset.select([attr, \"Avg Voice Score (1-10)\", \"Weighted Score\"]).drop_nulls()\n", + " if valid.height > 2:\n", + " r_score = valid.select(pl.corr(attr, \"Avg Voice Score (1-10)\", method=\"pearson\")).item()\n", + " r_rank = valid.select(pl.corr(attr, \"Weighted Score\", method=\"spearman\")).item()\n", + " else:\n", + " r_score = None\n", + " r_rank = None\n", + " rows.append({\"attribute\": attr, \"metric\": \"Avg Voice Score (1-10) [Pearson]\", \"correlation\": r_score})\n", + " rows.append({\"attribute\": attr, \"metric\": \"Weighted Ranking Score [Spearman]\", \"correlation\": r_rank})\n", + "\n", + " return pl.DataFrame(rows).drop_nulls().filter(pl.col(\"correlation\").is_not_nan())\n", + "\n", + "corr_male = compute_corr_for_gender(merged, \"male\")\n", + "corr_female = compute_corr_for_gender(merged, \"female\")\n", + "\n", + "print(f\"Male voices: {merged.filter(pl.col('Gender') == 'male').height}\")\n", + "print(f\"Female voices: {merged.filter(pl.col('Gender') == 'female').height}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "84eaaff6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saved plot to figures/2-4-26/All_Respondents/speech_attr_vs_survey_correlation_male.png\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.LayerChart(...)" + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# --- Plot: Male voices ---\n", + "S.plot_speech_attribute_correlation(\n", + " corr_male,\n", + " title=\"Speech Characteristics vs Survey Metrics
Male Voices Only (Pearson / Spearman)\",\n", + " filename=\"speech_attr_vs_survey_correlation_male\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "d04225e1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saved plot to figures/2-4-26/All_Respondents/speech_attr_vs_survey_correlation_female.png\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.LayerChart(...)" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# --- Plot: Female voices ---\n", + "S.plot_speech_attribute_correlation(\n", + " corr_female,\n", + " title=\"Speech Characteristics vs Survey Metrics
Female Voices Only (Pearson / Spearman)\",\n", + " filename=\"speech_attr_vs_survey_correlation_female\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "8e2fbc25", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== MALE | Age vs Avg Voice Score (1-10) [Pearson] | r = 0.793 ===\n", + "shape: (9, 3)\n", + "┌───────┬─────┬────────────────────────┐\n", + "│ Voice ┆ Age ┆ Avg Voice Score (1-10) │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ str ┆ i64 ┆ f64 │\n", + "╞═══════╪═════╪════════════════════════╡\n", + "│ V81 ┆ 28 ┆ 6.683007 │\n", + "│ V54 ┆ 36 ┆ 6.67734 │\n", + "│ V16 ┆ 40 ┆ 6.921053 │\n", + "│ V74 ┆ 41 ┆ 6.89196 │\n", + "│ V34 ┆ 42 ┆ 7.153005 │\n", + "│ V88 ┆ 42 ┆ 6.916667 │\n", + "│ V69 ┆ 43 ┆ 7.219577 │\n", + "│ V45 ┆ 43 ┆ 7.062189 │\n", + "│ V86 ┆ 62 ┆ 7.264444 │\n", + "└───────┴─────┴────────────────────────┘\n", + "\n", + "=== MALE | Shimmer (local) vs Avg Voice Score (1-10) [Pearson] | r = 0.813 ===\n", + "shape: (9, 3)\n", + "┌───────┬─────────────────┬────────────────────────┐\n", + "│ Voice ┆ Shimmer (local) ┆ Avg Voice Score (1-10) │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ str ┆ f64 ┆ f64 │\n", + "╞═══════╪═════════════════╪════════════════════════╡\n", + "│ V88 ┆ 9.07 ┆ 6.916667 │\n", + "│ V74 ┆ 9.09 ┆ 6.89196 │\n", + "│ V54 ┆ 9.11 ┆ 6.67734 │\n", + "│ V81 ┆ 9.13 ┆ 6.683007 │\n", + "│ V16 ┆ 9.38 ┆ 6.921053 │\n", + "│ V34 ┆ 10.07 ┆ 7.153005 │\n", + "│ V86 ┆ 10.28 ┆ 7.264444 │\n", + "│ V45 ┆ 10.39 ┆ 7.062189 │\n", + "│ V69 ┆ 11.52 ┆ 7.219577 │\n", + "└───────┴─────────────────┴────────────────────────┘\n", + "\n", + "=== MALE | Mean harmonics-to-noise ratio dB vs Avg Voice Score (1-10) [Pearson] | r = -0.738 ===\n", + "shape: (9, 3)\n", + "┌───────┬─────────────────────────────────┬────────────────────────┐\n", + "│ Voice ┆ Mean harmonics-to-noise ratio … ┆ Avg Voice Score (1-10) │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ str ┆ f64 ┆ f64 │\n", + "╞═══════╪═════════════════════════════════╪════════════════════════╡\n", + "│ V16 ┆ 7.544 ┆ 6.921053 │\n", + "│ V34 ┆ 7.671 ┆ 7.153005 │\n", + "│ V69 ┆ 7.763 ┆ 7.219577 │\n", + "│ V86 ┆ 8.092 ┆ 7.264444 │\n", + "│ V45 ┆ 8.647 ┆ 7.062189 │\n", + "│ V74 ┆ 8.732 ┆ 6.89196 │\n", + "│ V88 ┆ 9.009 ┆ 6.916667 │\n", + "│ V81 ┆ 9.026 ┆ 6.683007 │\n", + "│ V54 ┆ 9.598 ┆ 6.67734 │\n", + "└───────┴─────────────────────────────────┴────────────────────────┘\n", + "\n", + "=== MALE | Mean harmonics-to-noise ratio dB vs Weighted Ranking Score [Spearman] | r = -0.767 ===\n", + "shape: (10, 3)\n", + "┌───────┬─────────────────────────────────┬────────────────┐\n", + "│ Voice ┆ Mean harmonics-to-noise ratio … ┆ Weighted Score │\n", + "│ --- ┆ --- ┆ --- │\n", + "│ str ┆ f64 ┆ i64 │\n", + "╞═══════╪═════════════════════════════════╪════════════════╡\n", + "│ V16 ┆ 7.544 ┆ 156 │\n", + "│ V34 ┆ 7.671 ┆ 128 │\n", + "│ V69 ┆ 7.763 ┆ 121 │\n", + "│ V86 ┆ 8.092 ┆ 76 │\n", + "│ V46 ┆ 8.625 ┆ 148 │\n", + "│ V45 ┆ 8.647 ┆ 118 │\n", + "│ V74 ┆ 8.732 ┆ 117 │\n", + "│ V88 ┆ 9.009 ┆ 97 │\n", + "│ V81 ┆ 9.026 ┆ 69 │\n", + "│ V54 ┆ 9.598 ┆ 116 │\n", + "└───────┴─────────────────────────────────┴────────────────┘\n", + "\n", + "=== FEMALE: no correlations with |r| > 0.7 ===\n", + "\n" + ] + } + ], + "source": [ + "# Inspect underlying data for all |r| > 0.7 correlations in gender subgroups\n", + "THRESHOLD = 0.7\n", + "\n", + "metric_col_map = {\n", + " \"Avg Voice Score (1-10) [Pearson]\": \"Avg Voice Score (1-10)\",\n", + " \"Weighted Ranking Score [Spearman]\": \"Weighted Score\",\n", + "}\n", + "\n", + "for gender, corr_df in [(\"MALE\", corr_male), (\"FEMALE\", corr_female)]:\n", + " strong = corr_df.filter(pl.col(\"correlation\").abs() > THRESHOLD)\n", + " if strong.height == 0:\n", + " print(f\"=== {gender}: no correlations with |r| > {THRESHOLD} ===\\n\")\n", + " continue\n", + "\n", + " subset = merged.filter(pl.col(\"Gender\") == gender.lower())\n", + "\n", + " for row in strong.iter_rows(named=True):\n", + " attr = row[\"attribute\"]\n", + " metric_label = row[\"metric\"]\n", + " r = row[\"correlation\"]\n", + " survey_col = metric_col_map[metric_label]\n", + "\n", + " table = (\n", + " subset\n", + " .select([\"Voice\", attr, survey_col])\n", + " .drop_nulls()\n", + " .sort(attr)\n", + " )\n", + " print(f\"=== {gender} | {attr} vs {metric_label} | r = {r:.3f} ===\")\n", + " print(table)\n", + " print()" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "959945f2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saved 4 sheets to figures/2-4-26/strong_correlations_by_gender.xlsx\n" + ] + } + ], + "source": [ + "import xlsxwriter\n", + "from pathlib import Path\n", + "\n", + "out_path = Path(S.fig_save_dir) / \"strong_correlations_by_gender.xlsx\"\n", + "\n", + "with xlsxwriter.Workbook(str(out_path)) as wb:\n", + " sheet_count = 0\n", + " for gender, corr_df in [(\"Male\", corr_male), (\"Female\", corr_female)]:\n", + " strong = corr_df.filter(pl.col(\"correlation\").abs() > THRESHOLD)\n", + " if strong.height == 0:\n", + " continue\n", + "\n", + " subset = merged.filter(pl.col(\"Gender\") == gender.lower())\n", + "\n", + " for row in strong.iter_rows(named=True):\n", + " attr = row[\"attribute\"]\n", + " metric_label = row[\"metric\"]\n", + " r = row[\"correlation\"]\n", + " survey_col = metric_col_map[metric_label]\n", + "\n", + " table = subset.select([\"Voice\", attr, survey_col]).drop_nulls().sort(attr)\n", + "\n", + " # Sheet name: max 31 chars for Excel\n", + " short_metric = \"Score\" if \"Pearson\" in metric_label else \"Rank\"\n", + " sheet_name = f\"{gender}_{attr[:18]}_{short_metric}\"[:31]\n", + "\n", + " ws = wb.add_worksheet(sheet_name)\n", + " # Header row with context\n", + " bold = wb.add_format({\"bold\": True})\n", + " ws.write(0, 0, f\"{gender} | {attr} vs {metric_label} | r = {r:.3f}\", bold)\n", + "\n", + " # Column headers\n", + " for ci, col_name in enumerate(table.columns):\n", + " ws.write(2, ci, col_name, bold)\n", + "\n", + " # Data rows\n", + " for ri, data_row in enumerate(table.iter_rows()):\n", + " for ci, val in enumerate(data_row):\n", + " ws.write(3 + ri, ci, val)\n", + "\n", + " # Auto-fit column widths\n", + " for ci, col_name in enumerate(table.columns):\n", + " max_len = max(len(str(col_name)), *(len(str(v)) for v in table.get_column(col_name).to_list()))\n", + " ws.set_column(ci, ci, max_len + 2)\n", + "\n", + " sheet_count += 1\n", + "\n", + "print(f\"Saved {sheet_count} sheets to {out_path}\")" + ] } ], "metadata": {