statistical significance
This commit is contained in:
@@ -186,6 +186,20 @@ S.plot_significance_heatmap(_pairwise_df, metadata=_meta, title="Statistical Sig
|
||||
Yes, "The Coach" can be considered statistically more significant than the other options. It is clearly superior to the bottom two options and holds a statistically significant lead over the runner-up ("Personal Assistant") in direct comparison.
|
||||
"""
|
||||
|
||||
# %% Mentions significance analysis

# Pairwise significance on total character mentions, run on `char_rank` with
# alpha=0.05 and correction="none"; the result is drawn as a heatmap.
char_pairwise_df_mentions, _meta_mentions = S.compute_mentions_significance(
    char_rank, alpha=0.05, correction="none"
)

S.plot_significance_heatmap(
    char_pairwise_df_mentions,
    title="Statistical Significance: Character Total Mentions (Top 3 Visibility)",
    metadata=_meta_mentions,
)
|
||||
|
||||
|
||||
# %% voices analysis
|
||||
top3_voices = S.get_top_3_voices(data)[0]
|
||||
@@ -200,4 +214,119 @@ S.plot_significance_heatmap(
|
||||
metadata=_metadata,
|
||||
title="Statistical Significance: Voice Top Choice Preference"
|
||||
)
|
||||
# %% Total Mentions Significance (Rank 1+2+3 Combined)
# Measures "Quantity" (Visibility) rather than "Quality" (Preference):
# the same alpha/correction settings as the top-choice test, applied to
# mentions instead.

_pairwise_df_mentions, _meta_mentions = S.compute_mentions_significance(
    top3_voices, alpha=0.05, correction="none"
)

S.plot_significance_heatmap(
    _pairwise_df_mentions,
    title="Statistical Significance: Voice Total Mentions (Top 3 Visibility)",
    metadata=_meta_mentions,
)
|
||||
# %% Male Voices Only Analysis
|
||||
import reference
|
||||
|
||||
def filter_voices_by_gender(df: pl.DataFrame, target_gender: str) -> pl.DataFrame:
    """Keep only the voice ranking columns whose voice has the target gender.

    A column is treated as a voice column when its name contains ``'__V'``
    (typical format: ``"Top_3_Voices_ranking__V14"``). Its voice id is looked
    up with ``reference.VOICE_GENDER_MAPPING.get``; the column is kept only
    when the looked-up value equals ``target_gender``. All other columns are
    dropped, except ``_recordId``, which is placed first when present.

    Args:
        df: Frame whose voice ranking columns follow the ``...__Vxx`` naming.
        target_gender: Gender label compared for equality against the values
            of ``reference.VOICE_GENDER_MAPPING``.

    Returns:
        ``df.select(...)`` over ``_recordId`` (if present) followed by the
        matching voice columns in their original order.
    """
    # Always keep identifier if present
    cols_to_keep = ['_recordId'] if '_recordId' in df.columns else []

    for col in df.columns:
        if '__V' not in col:
            continue

        # Pick the first '__'-delimited segment that starts with 'V' rather
        # than blindly taking segment 1, so a prefix that itself contains
        # '__' (e.g. "Q1__Top_3__V14") still resolves to "V14". Falling back
        # to segment 1 keeps the previous result for names such as "a___V1".
        segments = col.split('__')[1:]
        voice_id = next((s for s in segments if s.startswith('V')), segments[0])

        if reference.VOICE_GENDER_MAPPING.get(voice_id) == target_gender:
            cols_to_keep.append(col)

    return df.select(cols_to_keep)
|
||||
|
||||
# Collect the full voice ranking data into a DataFrame, then narrow it to
# the columns of Male voices.
df_voices = top3_voices.collect()
df_male_voices = filter_voices_by_gender(df_voices, 'Male')

# 1. Male Voices: Top Choice Preference (Rank 1)
_pairwise_male_pref, _meta_male_pref = S.compute_ranking_significance(
    df_male_voices, alpha=0.05, correction="none"
)
S.plot_significance_heatmap(
    _pairwise_male_pref,
    title="Male Voices Only: Top Choice Preference Significance",
    metadata=_meta_male_pref,
)

# 2. Male Voices: Total Mentions (Visibility)
_pairwise_male_vis, _meta_male_vis = S.compute_mentions_significance(
    df_male_voices, alpha=0.05, correction="none"
)
S.plot_significance_heatmap(
    _pairwise_male_vis,
    title="Male Voices Only: Total Mentions Significance",
    metadata=_meta_male_vis,
)
|
||||
# %% Male Voices (Excluding Bottom 3: V88, V86, V81)
|
||||
|
||||
# Voices to drop from the male voices dataframe (df_male_voices) built in the previous step
|
||||
voices_to_exclude = ['V88', 'V86', 'V81']
|
||||
|
||||
def filter_exclude_voices(df: pl.DataFrame, exclude_list: list[str]) -> pl.DataFrame:
    """Drop the voice ranking columns whose voice id is in ``exclude_list``.

    A column is treated as a voice column when its name contains ``'__V'``
    (e.g. ``"Top_3_Voices_ranking__V14"``). Voice columns whose id is not
    excluded are kept in their original order. All non-voice columns are
    dropped, except ``_recordId``, which is placed first when present.

    Args:
        df: Frame whose voice ranking columns follow the ``...__Vxx`` naming.
        exclude_list: Voice ids (such as ``'V88'``) to remove.

    Returns:
        ``df.select(...)`` over ``_recordId`` (if present) followed by the
        surviving voice columns.
    """
    # Build the lookup once; membership is then O(1) per column.
    excluded = set(exclude_list)

    # Always keep identifier if present
    cols_to_keep = ['_recordId'] if '_recordId' in df.columns else []

    for col in df.columns:
        if '__V' not in col:
            continue

        # Pick the first '__'-delimited segment that starts with 'V' rather
        # than blindly taking segment 1; otherwise a prefix containing '__'
        # (e.g. "Q1__Top_3__V88") yields the wrong id and an excluded voice
        # slips through. Falling back to segment 1 keeps the previous result
        # for names such as "a___V1".
        segments = col.split('__')[1:]
        voice_id = next((s for s in segments if s.startswith('V')), segments[0])

        if voice_id not in excluded:
            cols_to_keep.append(col)

    return df.select(cols_to_keep)
|
||||
|
||||
# Male voice columns minus the voices listed in `voices_to_exclude`
df_male_top = filter_exclude_voices(df_male_voices, voices_to_exclude)

# 1. Male Top Candidates: Top Choice Preference
_pairwise_male_top_pref, _meta_male_top_pref = S.compute_ranking_significance(
    df_male_top, alpha=0.05, correction="none"
)
S.plot_significance_heatmap(
    _pairwise_male_top_pref,
    title="Male Voices (Excl. Bottom 3): Top Choice Preference Significance",
    metadata=_meta_male_top_pref,
)

# 2. Male Top Candidates: Total Mentions
_pairwise_male_top_vis, _meta_male_top_vis = S.compute_mentions_significance(
    df_male_top, alpha=0.05, correction="none"
)
S.plot_significance_heatmap(
    _pairwise_male_top_vis,
    title="Male Voices (Excl. Bottom 3): Total Mentions Significance",
    metadata=_meta_male_top_vis,
)
|
||||
# %%
|
||||
|
||||
Reference in New Issue
Block a user