diff --git a/plots.py b/plots.py
index 52a7d7e..3fd0631 100644
--- a/plots.py
+++ b/plots.py
@@ -2860,4 +2860,120 @@ class QualtricsPlotsMixin:
)
chart = self._save_plot(chart, title)
+ return chart
+
+    def plot_speech_attribute_correlation(
+        self,
+        corr_df: pl.DataFrame | pd.DataFrame,
+        title: str = "Speech Attributes vs Survey Metrics\nPearson Correlation",
+        filename: str | None = None,
+        height: int | None = None,
+        width: int | None = None,
+        show_values: bool = True,
+        color_scheme: str | None = None,
+        legend_title: str = "Pearson r",
+        text_contrast_threshold: float = 0.45,
+    ) -> alt.Chart:
+        """Plot a correlation heatmap between speech attributes and survey metrics.
+
+        Expects a long-form DataFrame with columns:
+            - metric: row label (e.g. "Weighted Rank", "Avg Voice Score")
+            - attribute: column label (speech characteristic name)
+            - correlation: correlation coefficient, plotted on a fixed [-1, 1] scale
+
+        Rows and columns are drawn in order of first appearance in ``corr_df``.
+
+        Args:
+            corr_df: Long-form correlation DataFrame (polars or pandas).
+            title: Chart title; it is passed through ``self._process_title``
+                (presumably where "\\n" line breaks are handled -- confirm).
+            filename: Optional explicit filename (without extension), forwarded
+                to ``self._save_plot``.
+            height: Chart height in pixels. Defaults to a size derived from the
+                number of distinct metrics.
+            width: Chart width in pixels. Defaults to a size derived from the
+                number of distinct attributes.
+            show_values: Whether to display correlation values as text.
+            color_scheme: Optional Altair diverging color scheme name
+                (falls back to "redblue").
+            legend_title: Title of the color legend. Override it when the
+                coefficients are not Pearson r (e.g. Spearman rho).
+            text_contrast_threshold: Cells with ``|correlation|`` above this
+                value get white labels, all others black labels.
+
+        Returns:
+            The value returned by ``self._save_plot`` for the finished chart
+            (a layered chart when value labels are drawn).
+
+        Raises:
+            KeyError: If ``corr_df`` lacks one of the required columns.
+        """
+        plot_df = corr_df.to_pandas() if isinstance(corr_df, pl.DataFrame) else corr_df
+
+        # Fail early with a readable message instead of an opaque lookup error
+        # (or a silently empty chart when only "correlation" is absent).
+        missing = [
+            name
+            for name in ("metric", "attribute", "correlation")
+            if name not in plot_df.columns
+        ]
+        if missing:
+            raise KeyError(f"corr_df is missing required column(s): {missing}")
+
+        # Order of first appearance drives the axis sort of every layer.
+        attributes = plot_df["attribute"].unique().tolist()
+        metrics = plot_df["metric"].unique().tolist()
+
+        chart_width = width or max(600, len(attributes) * 55)
+        chart_height = height or max(120, len(metrics) * 50 + 60)
+
+        chart = (
+            alt.Chart(plot_df)
+            .mark_rect(stroke="white", strokeWidth=1)
+            .encode(
+                x=alt.X(
+                    "attribute:N",
+                    title=None,
+                    sort=attributes,
+                    axis=alt.Axis(labelAngle=-45, labelLimit=180, grid=False),
+                ),
+                y=alt.Y(
+                    "metric:N",
+                    title=None,
+                    sort=metrics,
+                    axis=alt.Axis(labelLimit=200, grid=False),
+                ),
+                color=alt.Color(
+                    "correlation:Q",
+                    scale=alt.Scale(
+                        domain=[-1, 1],
+                        scheme=color_scheme or "redblue",
+                    ),
+                    legend=alt.Legend(title=legend_title),
+                ),
+                tooltip=[
+                    alt.Tooltip("metric:N", title="Metric"),
+                    alt.Tooltip("attribute:N", title="Attribute"),
+                    alt.Tooltip("correlation:Q", title="r", format=".3f"),
+                ],
+            )
+        )
+
+        if show_values:
+            # One text layer per label color, each with a fixed mark color, to
+            # avoid conflicting color encodings that break vl_convert PNG export.
+            # NaN correlations match neither mask and therefore get no label.
+            magnitude = plot_df["correlation"].abs()
+            label_groups = (
+                (magnitude <= text_contrast_threshold, "black"),
+                (magnitude > text_contrast_threshold, "white"),
+            )
+            for mask, text_color in label_groups:
+                label_rows = plot_df[mask]
+                if label_rows.empty:
+                    continue
+                chart = chart + (
+                    alt.Chart(label_rows)
+                    .mark_text(fontSize=11, fontWeight="normal", color=text_color)
+                    .encode(
+                        x=alt.X("attribute:N", sort=attributes),
+                        y=alt.Y("metric:N", sort=metrics),
+                        text=alt.Text("correlation:Q", format=".2f"),
+                    )
+                )
+
+        chart = chart.properties(
+            title=self._process_title(title),
+            width=chart_width,
+            height=chart_height,
+        )
+
+        chart = self._save_plot(chart, title, filename=filename)
         return chart
\ No newline at end of file
diff --git a/speech_data_correlation.ipynb b/speech_data_correlation.ipynb
index f783603..cb62e86 100644
--- a/speech_data_correlation.ipynb
+++ b/speech_data_correlation.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 24,
+ "execution_count": 1,
"id": "7174c11a",
"metadata": {},
"outputs": [],
@@ -13,7 +13,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 2,
"id": "d9d11d52",
"metadata": {},
"outputs": [],
@@ -25,7 +25,7 @@
},
{
"cell_type": "code",
- "execution_count": 23,
+ "execution_count": 37,
"id": "c8f06ff8",
"metadata": {},
"outputs": [],
@@ -36,30 +36,29 @@
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": null,
"id": "8da85898",
"metadata": {},
"outputs": [],
- "source": [
- "# Convert comma decimal separator to period and cast to float\n",
- "speech_df = speech_df.with_columns([\n",
- " pl.col('dur (s)').str.replace(',', '.').cast(pl.Float64),\n",
- " pl.col('phonationtime (s)').str.replace(',', '.').cast(pl.Float64),\n",
- " pl.col('articulation rate (nsyll / phonationtime)').str.replace(',', '.').cast(pl.Float64),\n",
- " pl.col('speech rate words per minute').str.replace(',', '.').cast(pl.Float64),\n",
- "])"
- ]
+ "source": []
},
{
"cell_type": "code",
- "execution_count": 22,
+ "execution_count": 38,
"id": "e7ccd8ef",
"metadata": {},
"outputs": [],
"source": [
- "# Convert specified columns to float, handling percent signs, commas and spaces\n",
- "cols_to_convert = []\n",
"\n",
+ "# Convert comma decimal separator to period and cast to float\n",
+ "cols_to_convert = [\n",
+ " pl.col('dur (s)').str.replace(',', '.').cast(pl.Float64),\n",
+ " pl.col('phonationtime (s)').str.replace(',', '.').cast(pl.Float64),\n",
+ " pl.col('articulation rate (nsyll / phonationtime)').str.replace(',', '.').cast(pl.Float64),\n",
+ " pl.col('speech rate words per minute').str.replace(',', '.').cast(pl.Float64),\n",
+ "]\n",
+ "\n",
+ "# Convert specified columns to float, handling percent signs, commas and spaces\n",
"if \"Standard deviation pitch\" in speech_df.columns:\n",
" cols_to_convert.append(\n",
" pl.col(\"Standard deviation pitch\")\n",
@@ -89,7 +88,7 @@
},
{
"cell_type": "code",
- "execution_count": 33,
+ "execution_count": 39,
"id": "450d1d29",
"metadata": {},
"outputs": [
@@ -97,34 +96,38 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "shape: (5, 22)\n",
- "┌───────┬────────┬─────┬────────────┬───┬──────────────┬──────────────┬──────────────┬─────────────┐\n",
- "│ Voice ┆ Gender ┆ Age ┆ Mean pitch ┆ … ┆ phonationtim ┆ speechrate ┆ articulation ┆ speech rate │\n",
- "│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ e (s) ┆ (nsyll/dur) ┆ rate (nsyll ┆ words per │\n",
- "│ str ┆ str ┆ i64 ┆ f64 ┆ ┆ --- ┆ --- ┆ / pho… ┆ minute │\n",
- "│ ┆ ┆ ┆ ┆ ┆ str ┆ str ┆ --- ┆ --- │\n",
- "│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ str ┆ str │\n",
- "╞═══════╪════════╪═════╪════════════╪═══╪══════════════╪══════════════╪══════════════╪═════════════╡\n",
- "│ VVV74 ┆ male ┆ 41 ┆ 95.25 ┆ … ┆ 11,17 ┆ 4,32468397 ┆ 5,81915846 ┆ 185,3436 │\n",
- "│ VVV54 ┆ male ┆ 36 ┆ 126.93 ┆ … ┆ 10,83 ┆ 4,38596491 ┆ 6,00184672 ┆ 187,9699 │\n",
- "│ VVV48 ┆ female ┆ 35 ┆ 193.296 ┆ … ┆ 11,92 ┆ 4,02227723 ┆ 5,45302013 ┆ 172,3833 │\n",
- "│ VVV14 ┆ female ┆ 50 ┆ 169.214 ┆ … ┆ 12,75 ┆ 3,97553517 ┆ 5,09803922 ┆ 170,3801 │\n",
- "│ VVV4 ┆ female ┆ 28 ┆ 208.079 ┆ … ┆ 12,22 ┆ 4,10094637 ┆ 5,31914894 ┆ 175,7548 │\n",
- "└───────┴────────┴─────┴────────────┴───┴──────────────┴──────────────┴──────────────┴─────────────┘\n"
+ "shape: (18, 3)\n",
+ "┌───────┬────────┬──────┐\n",
+ "│ Voice ┆ Gender ┆ Age │\n",
+ "│ --- ┆ --- ┆ --- │\n",
+ "│ str ┆ str ┆ i64 │\n",
+ "╞═══════╪════════╪══════╡\n",
+ "│ V04 ┆ female ┆ 28 │\n",
+ "│ V08 ┆ female ┆ 45 │\n",
+ "│ V14 ┆ female ┆ 50 │\n",
+ "│ V16 ┆ male ┆ 40 │\n",
+ "│ V34 ┆ male ┆ 42 │\n",
+ "│ … ┆ … ┆ … │\n",
+ "│ V82 ┆ female ┆ null │\n",
+ "│ V86 ┆ male ┆ 62 │\n",
+ "│ V88 ┆ male ┆ 42 │\n",
+ "│ V89 ┆ female ┆ 32 │\n",
+ "│ V91 ┆ female ┆ null │\n",
+ "└───────┴────────┴──────┘\n"
]
}
],
"source": [
- "# Convert Voice ints to strings like \"V81\"\n",
- "# Convert Voice ints to strings like \"V81\"\n",
+ "# Convert Voice ints to zero-padded strings like \"V04\", \"V81\"\n",
+ "# Survey uses zero-padded IDs (V04, V08) so we must match that format\n",
"speech_df = speech_df.with_columns(\n",
" pl.when(pl.col(\"Voice\").is_not_null())\n",
- " .then(pl.concat_str([pl.lit(\"V\"), pl.col(\"Voice\").cast(pl.Utf8)]))\n",
+ " .then(pl.concat_str([pl.lit(\"V\"), pl.col(\"Voice\").cast(pl.Utf8).str.zfill(2)]))\n",
" .otherwise(None)\n",
" .alias(\"Voice\")\n",
")\n",
"\n",
- "print(speech_df.head())"
+ "print(speech_df.select([\"Voice\", \"Gender\", \"Age\"]).sort(\"Voice\"))"
]
},
{
@@ -137,7 +140,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 7,
"id": "bb4200ee",
"metadata": {},
"outputs": [],
@@ -149,7 +152,7 @@
},
{
"cell_type": "code",
- "execution_count": 34,
+ "execution_count": 8,
"id": "57243afd",
"metadata": {},
"outputs": [
@@ -180,7 +183,7 @@
},
{
"cell_type": "code",
- "execution_count": 35,
+ "execution_count": 9,
"id": "b38d21fc",
"metadata": {},
"outputs": [
@@ -218,11 +221,751 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 10,
"id": "5b3e6ad0",
"metadata": {},
- "outputs": [],
- "source": []
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "shape: (5, 2)\n",
+ "┌───────┬────────────────────────┐\n",
+ "│ Voice ┆ Avg Voice Score (1-10) │\n",
+ "│ --- ┆ --- │\n",
+ "│ str ┆ f64 │\n",
+ "╞═══════╪════════════════════════╡\n",
+ "│ V08 ┆ 7.38172 │\n",
+ "│ V82 ┆ 7.376984 │\n",
+ "│ V89 ┆ 7.373206 │\n",
+ "│ V86 ┆ 7.264444 │\n",
+ "│ V69 ┆ 7.219577 │\n",
+ "└───────┴────────────────────────┘\n"
+ ]
+ }
+ ],
+ "source": [
+ "# --- Compute average voice score (1-10) per voice ---\n",
+ "voice_cols = [c for c in voice_1_10.columns if c.startswith(\"Voice_Scale_1_10__\")]\n",
+ "avg_scores = []\n",
+ "for col in voice_cols:\n",
+ " voice_id = col.replace(\"Voice_Scale_1_10__\", \"\") # e.g. \"V14\"\n",
+ " mean_val = voice_1_10.select(pl.col(col).mean()).item()\n",
+ " avg_scores.append({\"Voice\": voice_id, \"Avg Voice Score (1-10)\": mean_val})\n",
+ "\n",
+ "avg_voice_scores = pl.DataFrame(avg_scores)\n",
+ "print(avg_voice_scores.sort(\"Avg Voice Score (1-10)\", descending=True).head())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "id": "79626ffb",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Merged rows: 18 (voices with both speech data and survey data)\n",
+ " → Voices missing Avg Voice Score: ['V46']\n",
+ "shape: (5, 4)\n",
+ "┌───────┬────────┬────────────────────────┬────────────────┐\n",
+ "│ Voice ┆ Gender ┆ Avg Voice Score (1-10) ┆ Weighted Score │\n",
+ "│ --- ┆ --- ┆ --- ┆ --- │\n",
+ "│ str ┆ str ┆ f64 ┆ i64 │\n",
+ "╞═══════╪════════╪════════════════════════╪════════════════╡\n",
+ "│ V14 ┆ female ┆ 7.216279 ┆ 209 │\n",
+ "│ V04 ┆ female ┆ 7.07971 ┆ 209 │\n",
+ "│ V08 ┆ female ┆ 7.38172 ┆ 180 │\n",
+ "│ V82 ┆ female ┆ 7.376984 ┆ 172 │\n",
+ "│ V77 ┆ female ┆ 6.960894 ┆ 158 │\n",
+ "└───────┴────────┴────────────────────────┴────────────────┘\n"
+ ]
+ }
+ ],
+ "source": [
+ "# --- Normalize weighted rank column name and join all data ---\n",
+ "weighted_rank = voices_weighted_rank.rename({\"Character\": \"Voice\"})\n",
+ "\n",
+ "# Join speech attributes with both survey metrics\n",
+ "# Left join on avg_voice_scores so V46 (excluded from survey voice scale) is kept\n",
+ "# — its Avg Score will be null but Weighted Ranking Score is still valid\n",
+ "merged = (\n",
+ " speech_df\n",
+ " .join(avg_voice_scores, on=\"Voice\", how=\"left\")\n",
+ " .join(weighted_rank, on=\"Voice\", how=\"inner\")\n",
+ ")\n",
+ "\n",
+ "print(f\"Merged rows: {merged.height} (voices with both speech data and survey data)\")\n",
+ "print(f\" → Voices missing Avg Voice Score: {merged.filter(pl.col('Avg Voice Score (1-10)').is_null())['Voice'].to_list()}\")\n",
+ "print(merged.select([\"Voice\", \"Gender\", \"Avg Voice Score (1-10)\", \"Weighted Score\"]).head())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "932cfb9e",
+ "metadata": {},
+ "source": [
+ "# Correlation: Speech Attributes vs Survey Metrics\n",
+ "\n",
+ "We correlate each speech characteristic (pitch, duration, jitter, etc.) against two survey metrics:\n",
+ "\n",
+ "| Metric | Type | Correlation Method | Why |\n",
+ "|---|---|---|---|\n",
+ "| **Avg Voice Score (1-10)** | Continuous | **Pearson** | Both variables are continuous and approximately interval-scaled — Pearson captures linear relationships well here. |\n",
+ "| **Weighted Ranking Score** | Ordinal / count-based | **Spearman** | The weighted score (1st=3pts, 2nd=2pts, 3rd=1pt) is ordinal in nature with a small number of discrete values. Spearman is rank-based, making no assumptions about linearity or normality — more appropriate for this type of data. |\n",
+ "\n",
+ "> **Note:** With only ~17 voices, all correlations should be interpreted cautiously. Small samples amplify the influence of individual data points."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "id": "77658327",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "shape: (39, 3)\n",
+ "┌─────────────────────────────────┬─────────────────────────────────┬─────────────┐\n",
+ "│ attribute ┆ metric ┆ correlation │\n",
+ "│ --- ┆ --- ┆ --- │\n",
+ "│ str ┆ str ┆ f64 │\n",
+ "╞═════════════════════════════════╪═════════════════════════════════╪═════════════╡\n",
+ "│ Age ┆ Avg Voice Score (1-10) [Pearso… ┆ 0.386054 │\n",
+ "│ Age ┆ Weighted Ranking Score [Spearm… ┆ 0.094086 │\n",
+ "│ Mean pitch ┆ Avg Voice Score (1-10) [Pearso… ┆ 0.459684 │\n",
+ "│ Mean pitch ┆ Weighted Ranking Score [Spearm… ┆ 0.63429 │\n",
+ "│ Standard deviation pitch ┆ Avg Voice Score (1-10) [Pearso… ┆ 0.664432 │\n",
+ "│ … ┆ … ┆ … │\n",
+ "│ speechrate (nsyll/dur) ┆ Weighted Ranking Score [Spearm… ┆ -0.336524 │\n",
+ "│ articulation rate (nsyll / pho… ┆ Avg Voice Score (1-10) [Pearso… ┆ -0.456181 │\n",
+ "│ articulation rate (nsyll / pho… ┆ Weighted Ranking Score [Spearm… ┆ -0.268239 │\n",
+ "│ speech rate words per minute ┆ Avg Voice Score (1-10) [Pearso… ┆ -0.26437 │\n",
+ "│ speech rate words per minute ┆ Weighted Ranking Score [Spearm… ┆ 0.252577 │\n",
+ "└─────────────────────────────────┴─────────────────────────────────┴─────────────┘\n"
+ ]
+ }
+ ],
+ "source": [
+ "# --- Compute correlations ---\n",
+ "# Pearson for continuous Voice Score, Spearman for ordinal Weighted Ranking Score\n",
+ "exclude_cols = {\"Voice\", \"Gender\", \"Avg Voice Score (1-10)\", \"Weighted Score\"}\n",
+ "speech_attrs = [c for c in merged.columns if c not in exclude_cols]\n",
+ "\n",
+ "rows = []\n",
+ "for attr in speech_attrs:\n",
+ " # Drop rows with a null in the attribute or in either metric, so both correlations use the same voices\n",
+ " valid = merged.select([attr, \"Avg Voice Score (1-10)\", \"Weighted Score\"]).drop_nulls()\n",
+ " if valid.height > 2:\n",
+ " # Pearson for continuous 1-10 score\n",
+ " r_score = valid.select(pl.corr(attr, \"Avg Voice Score (1-10)\", method=\"pearson\")).item()\n",
+ " # Spearman for ordinal weighted ranking score\n",
+ " r_rank = valid.select(pl.corr(attr, \"Weighted Score\", method=\"spearman\")).item()\n",
+ " else:\n",
+ " r_score = None\n",
+ " r_rank = None\n",
+ " rows.append({\"attribute\": attr, \"metric\": \"Avg Voice Score (1-10) [Pearson]\", \"correlation\": r_score})\n",
+ " rows.append({\"attribute\": attr, \"metric\": \"Weighted Ranking Score [Spearman]\", \"correlation\": r_rank})\n",
+ "\n",
+ "corr_long = (\n",
+ " pl.DataFrame(rows)\n",
+ " .drop_nulls()\n",
+ " .filter(pl.col(\"correlation\").is_not_nan())\n",
+ ")\n",
+ "print(corr_long)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "id": "ef4ceefc",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Female Age correlation:\n",
+ "shape: (2, 3)\n",
+ "┌───────────┬─────────────────────────────────┬─────────────┐\n",
+ "│ attribute ┆ metric ┆ correlation │\n",
+ "│ --- ┆ --- ┆ --- │\n",
+ "│ str ┆ str ┆ f64 │\n",
+ "╞═══════════╪═════════════════════════════════╪═════════════╡\n",
+ "│ Age ┆ Avg Voice Score (1-10) [Pearso… ┆ -0.023566 │\n",
+ "│ Age ┆ Weighted Ranking Score [Spearm… ┆ 0.231908 │\n",
+ "└───────────┴─────────────────────────────────┴─────────────┘\n",
+ "\n",
+ "Female data (Age + Weighted Score):\n",
+ "shape: (6, 3)\n",
+ "┌───────┬─────┬────────────────┐\n",
+ "│ Voice ┆ Age ┆ Weighted Score │\n",
+ "│ --- ┆ --- ┆ --- │\n",
+ "│ str ┆ i64 ┆ i64 │\n",
+ "╞═══════╪═════╪════════════════╡\n",
+ "│ V04 ┆ 28 ┆ 209 │\n",
+ "│ V89 ┆ 32 ┆ 130 │\n",
+ "│ V48 ┆ 35 ┆ 144 │\n",
+ "│ V08 ┆ 45 ┆ 180 │\n",
+ "│ V77 ┆ 48 ┆ 158 │\n",
+ "│ V14 ┆ 50 ┆ 209 │\n",
+ "└───────┴─────┴────────────────┘\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Verify: Age correlation in female split should no longer be 1.0\n",
+ "print(\"Female Age correlation:\")\n",
+ "print(corr_female.filter(pl.col(\"attribute\") == \"Age\"))\n",
+ "\n",
+ "print(\"\\nFemale data (Age + Weighted Score):\")\n",
+ "female_check = merged.filter(pl.col(\"Gender\") == \"female\").select([\"Voice\", \"Age\", \"Weighted Score\"]).drop_nulls().sort(\"Age\")\n",
+ "print(female_check)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "id": "0d9567ff",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Saved plot to figures/2-4-26/All_Respondents/speech_attr_vs_survey_correlation.png\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "