statistical significance

This commit is contained in:
2026-02-05 19:49:19 +01:00
parent 840bd2940d
commit 7fb6570190
2 changed files with 243 additions and 0 deletions

View File

@@ -186,6 +186,20 @@ S.plot_significance_heatmap(_pairwise_df, metadata=_meta, title="Statistical Sig
Yes, "The Coach" can be considered statistically more significant than the other options. It is clearly superior to the bottom two options and holds a statistically significant lead over the runner-up ("Personal Assistant") in direct comparison.
"""
# %% Mentions significance analysis
char_pairwise_df_mentions, _meta_mentions = S.compute_mentions_significance(
char_rank,
alpha=0.05,
correction="none",
)
S.plot_significance_heatmap(
char_pairwise_df_mentions,
metadata=_meta_mentions,
title="Statistical Significance: Character Total Mentions (Top 3 Visibility)"
)
# %% voices analysis
top3_voices = S.get_top_3_voices(data)[0]
@@ -200,4 +214,119 @@ S.plot_significance_heatmap(
metadata=_metadata,
title="Statistical Significance: Voice Top Choice Preference"
)
# %% Total Mentions Significance (Rank 1+2+3 Combined)
# This tests "Quantity" (Visibility) instead of "Quality" (Preference)
_pairwise_df_mentions, _meta_mentions = S.compute_mentions_significance(
top3_voices,
alpha=0.05,
correction="none"
)
S.plot_significance_heatmap(
_pairwise_df_mentions,
metadata=_meta_mentions,
title="Statistical Significance: Voice Total Mentions (Top 3 Visibility)"
)
# %% Male Voices Only Analysis
import reference
def filter_voices_by_gender(df: pl.DataFrame, target_gender: str) -> pl.DataFrame:
"""Filter ranking columns to keep only those matching target gender."""
cols_to_keep = []
# Always keep identifier if present
if '_recordId' in df.columns:
cols_to_keep.append('_recordId')
for col in df.columns:
# Check if column is a voice column (contains Vxx)
# Format is typically "Top_3_Voices_ranking__V14"
if '__V' in col:
voice_id = col.split('__')[1]
if reference.VOICE_GENDER_MAPPING.get(voice_id) == target_gender:
cols_to_keep.append(col)
return df.select(cols_to_keep)
# Get full ranking data as DataFrame
df_voices = top3_voices.collect()
# Filter for Male voices
df_male_voices = filter_voices_by_gender(df_voices, 'Male')
# 1. Male Voices: Top Choice Preference (Rank 1)
_pairwise_male_pref, _meta_male_pref = S.compute_ranking_significance(
df_male_voices,
alpha=0.05,
correction="none"
)
S.plot_significance_heatmap(
_pairwise_male_pref,
metadata=_meta_male_pref,
title="Male Voices Only: Top Choice Preference Significance"
)
# 2. Male Voices: Total Mentions (Visibility)
_pairwise_male_vis, _meta_male_vis = S.compute_mentions_significance(
df_male_voices,
alpha=0.05,
correction="none"
)
S.plot_significance_heatmap(
_pairwise_male_vis,
metadata=_meta_male_vis,
title="Male Voices Only: Total Mentions Significance"
)
# %% Male Voices (Excluding Bottom 3: V88, V86, V81)
# Start with the male voices dataframe from the previous step
voices_to_exclude = ['V88', 'V86', 'V81']
def filter_exclude_voices(df: pl.DataFrame, exclude_list: list[str]) -> pl.DataFrame:
"""Filter ranking columns to exclude specific voices."""
cols_to_keep = []
# Always keep identifier if present
if '_recordId' in df.columns:
cols_to_keep.append('_recordId')
for col in df.columns:
# Check if column is a voice column (contains Vxx)
if '__V' in col:
voice_id = col.split('__')[1]
if voice_id not in exclude_list:
cols_to_keep.append(col)
return df.select(cols_to_keep)
df_male_top = filter_exclude_voices(df_male_voices, voices_to_exclude)
# 1. Male Top Candidates: Top Choice Preference
_pairwise_male_top_pref, _meta_male_top_pref = S.compute_ranking_significance(
df_male_top,
alpha=0.05,
correction="none"
)
S.plot_significance_heatmap(
_pairwise_male_top_pref,
metadata=_meta_male_top_pref,
title="Male Voices (Excl. Bottom 3): Top Choice Preference Significance"
)
# 2. Male Top Candidates: Total Mentions
_pairwise_male_top_vis, _meta_male_top_vis = S.compute_mentions_significance(
df_male_top,
alpha=0.05,
correction="none"
)
S.plot_significance_heatmap(
_pairwise_male_top_vis,
metadata=_meta_male_top_vis,
title="Male Voices (Excl. Bottom 3): Total Mentions Significance"
)
# %%