statistical significance
This commit is contained in:
@@ -186,6 +186,20 @@ S.plot_significance_heatmap(_pairwise_df, metadata=_meta, title="Statistical Sig
|
|||||||
Yes, "The Coach" can be considered statistically more significant than the other options. It is clearly superior to the bottom two options and holds a statistically significant lead over the runner-up ("Personal Assistant") in direct comparison.
|
Yes, "The Coach" can be considered statistically more significant than the other options. It is clearly superior to the bottom two options and holds a statistically significant lead over the runner-up ("Personal Assistant") in direct comparison.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# %% Mentions significance analysis
|
||||||
|
|
||||||
|
# Pairwise comparison of how often each entry of `char_rank` is mentioned at
# any rank; p-values are left uncorrected (correction="none").
char_pairwise_df_mentions, _meta_mentions = S.compute_mentions_significance(
    char_rank, alpha=0.05, correction="none"
)

# Render the pairwise results as a heatmap.
S.plot_significance_heatmap(
    char_pairwise_df_mentions,
    metadata=_meta_mentions,
    title="Statistical Significance: Character Total Mentions (Top 3 Visibility)",
)
|
||||||
|
|
||||||
|
|
||||||
# %% voices analysis
|
# %% voices analysis
|
||||||
top3_voices = S.get_top_3_voices(data)[0]
|
top3_voices = S.get_top_3_voices(data)[0]
|
||||||
@@ -200,4 +214,119 @@ S.plot_significance_heatmap(
|
|||||||
metadata=_metadata,
|
metadata=_metadata,
|
||||||
title="Statistical Significance: Voice Top Choice Preference"
|
title="Statistical Significance: Voice Top Choice Preference"
|
||||||
)
|
)
|
||||||
|
# %% Total Mentions Significance (Rank 1+2+3 Combined)
# Measures "quantity" (visibility: being mentioned at any rank) rather than
# "quality" (preference: being the top choice).

_pairwise_df_mentions, _meta_mentions = S.compute_mentions_significance(
    top3_voices, alpha=0.05, correction="none"
)

# Render the pairwise results as a heatmap.
S.plot_significance_heatmap(
    _pairwise_df_mentions,
    metadata=_meta_mentions,
    title="Statistical Significance: Voice Total Mentions (Top 3 Visibility)",
)
|
||||||
|
# %% Male Voices Only Analysis
|
||||||
|
import reference
|
||||||
|
|
||||||
|
def filter_voices_by_gender(df: pl.DataFrame, target_gender: str) -> pl.DataFrame:
    """Keep only the ranking columns whose voice has the requested gender.

    A column counts as a voice column when its name contains ``__V``
    (e.g. ``"Top_3_Voices_ranking__V14"``). The voice id is the first
    ``__``-separated segment after the prefix that starts with ``V``, and its
    gender is looked up in ``reference.VOICE_GENDER_MAPPING``.

    Args:
        df: Ranking dataframe, one column per voice plus an optional
            ``_recordId`` column.
        target_gender: Gender value to match against the mapping
            (e.g. ``'Male'``).

    Returns:
        ``df`` restricted to ``_recordId`` (when present) followed by the
        matching voice columns, in their original order.
    """
    # Always keep the respondent identifier if present.
    cols_to_keep = ['_recordId'] if '_recordId' in df.columns else []

    for col in df.columns:
        if '__V' not in col:
            continue

        # Locate the voice id by looking for the segment that starts with
        # "V" rather than blindly taking segment [1]; the latter misparses
        # (and silently drops) columns whose prefix itself contains "__".
        voice_id = next(
            (part for part in col.split('__')[1:] if part.startswith('V')),
            None,
        )
        if voice_id is None:
            continue

        if reference.VOICE_GENDER_MAPPING.get(voice_id) == target_gender:
            cols_to_keep.append(col)

    return df.select(cols_to_keep)
|
||||||
|
|
||||||
|
# Materialise the ranking data, then keep only the male-voice columns.
df_voices = top3_voices.collect()
df_male_voices = filter_voices_by_gender(df_voices, 'Male')

# 1. Male voices: top-choice preference (rank 1)
_pairwise_male_pref, _meta_male_pref = S.compute_ranking_significance(
    df_male_voices, alpha=0.05, correction="none"
)
S.plot_significance_heatmap(
    _pairwise_male_pref,
    metadata=_meta_male_pref,
    title="Male Voices Only: Top Choice Preference Significance",
)

# 2. Male voices: total mentions (visibility)
_pairwise_male_vis, _meta_male_vis = S.compute_mentions_significance(
    df_male_voices, alpha=0.05, correction="none"
)
S.plot_significance_heatmap(
    _pairwise_male_vis,
    metadata=_meta_male_vis,
    title="Male Voices Only: Total Mentions Significance",
)
|
||||||
|
# %% Male Voices (Excluding Bottom 3: V88, V86, V81)
|
||||||
|
|
||||||
|
# Bottom-3 male voices to drop from the male-only dataframe built in the previous cell
|
||||||
|
voices_to_exclude = ['V88', 'V86', 'V81']
|
||||||
|
|
||||||
|
def filter_exclude_voices(df: pl.DataFrame, exclude_list: list[str]) -> pl.DataFrame:
    """Drop the ranking columns that belong to the listed voices.

    A column counts as a voice column when its name contains ``__V``; the
    voice id is the first ``__``-separated segment after the prefix that
    starts with ``V`` (e.g. ``"Top_3_Voices_ranking__V14"`` -> ``"V14"``).
    Columns that are neither ``_recordId`` nor voice columns are not kept.

    Args:
        df: Ranking dataframe, one column per voice plus an optional
            ``_recordId`` column.
        exclude_list: Voice ids (e.g. ``['V88', 'V86']``) whose columns
            must be removed.

    Returns:
        ``df`` restricted to ``_recordId`` (when present) followed by the
        voice columns that are not excluded, in their original order.
    """
    excluded = set(exclude_list)

    # Always keep the respondent identifier if present.
    cols_to_keep = ['_recordId'] if '_recordId' in df.columns else []

    for col in df.columns:
        if '__V' not in col:
            continue

        # Locate the voice id by looking for the segment that starts with
        # "V" rather than blindly taking segment [1]; with the latter, a
        # column whose prefix contains "__" yields a bogus id that is never
        # in the exclude list, so the voice would silently be kept.
        voice_id = next(
            (part for part in col.split('__')[1:] if part.startswith('V')),
            None,
        )
        if voice_id not in excluded:
            cols_to_keep.append(col)

    return df.select(cols_to_keep)
|
||||||
|
|
||||||
|
# Remove the excluded voices from the male-only frame.
df_male_top = filter_exclude_voices(df_male_voices, voices_to_exclude)

# 1. Male top candidates: top-choice preference
_pairwise_male_top_pref, _meta_male_top_pref = S.compute_ranking_significance(
    df_male_top, alpha=0.05, correction="none"
)
S.plot_significance_heatmap(
    _pairwise_male_top_pref,
    metadata=_meta_male_top_pref,
    title="Male Voices (Excl. Bottom 3): Top Choice Preference Significance",
)

# 2. Male top candidates: total mentions
_pairwise_male_top_vis, _meta_male_top_vis = S.compute_mentions_significance(
    df_male_top, alpha=0.05, correction="none"
)
S.plot_significance_heatmap(
    _pairwise_male_top_vis,
    metadata=_meta_male_top_vis,
    title="Male Voices (Excl. Bottom 3): Total Mentions Significance",
)
|
||||||
# %%
|
# %%
|
||||||
|
|||||||
114
utils.py
114
utils.py
@@ -1588,6 +1588,120 @@ class QualtricsSurvey(QualtricsPlotsMixin):
|
|||||||
|
|
||||||
return results_df, metadata
|
return results_df, metadata
|
||||||
|
|
||||||
|
def compute_mentions_significance(
    self,
    data: pl.LazyFrame | pl.DataFrame,
    alpha: float = 0.05,
    correction: str = "bonferroni",
) -> tuple[pl.DataFrame, dict]:
    """Compute pairwise statistical significance for Total Mentions (Rank 1+2+3).

    For every pair of ranking columns, tests whether the proportion of
    respondents who mentioned the item at any rank (i.e. the cell is not
    null) differs, using a pooled two-proportion z-test (two-sided).

    Args:
        data: Ranking data (rows=respondents, cols=voices, values=rank).
            A ``_recordId`` column, if present, is ignored. A LazyFrame is
            collected first.
        alpha: Significance level applied to the adjusted p-values.
        correction: Multiple comparison correction method: ``"bonferroni"``,
            ``"holm"`` or ``"none"``.

    Returns:
        tuple: (pairwise_df, metadata)

        ``pairwise_df`` has one row per pair, sorted by ascending
        ``p_value``, with columns ``group1``, ``group2``, ``p_value``,
        ``rank1_count1``, ``rank1_count2``, ``rank1_pct1``, ``rank1_pct2``
        (percentages rounded to one decimal), ``total1``, ``total2``,
        ``effect_size`` (difference in proportions as a fraction),
        ``p_adjusted`` and ``significant`` (``p_adjusted < alpha``).
        The ``rank1_*`` names are reused for compatibility with the heatmap
        plotting even though they hold mention counts here.

        ``metadata`` holds ``test_type``, ``alpha``, ``correction`` and
        ``n_comparisons``.

    Raises:
        ValueError: If ``correction`` is not a supported method, if there
            are fewer than 2 ranking columns / distinct labels, or if the
            data has no rows.
    """
    from itertools import combinations

    import numpy as np
    from scipy import stats as scipy_stats

    # Reject unknown methods up front; previously they fell through and
    # silently produced all-NaN adjusted p-values (nothing ever significant).
    if correction not in ("bonferroni", "holm", "none"):
        raise ValueError(
            f"Unknown correction {correction!r}; "
            "expected 'bonferroni', 'holm' or 'none'"
        )

    df = data.collect() if isinstance(data, pl.LazyFrame) else data

    ranking_cols = [c for c in df.columns if c != '_recordId']
    if len(ranking_cols) < 2:
        raise ValueError("Need at least 2 ranking columns")

    total_respondents = df.height
    if total_respondents == 0:
        # Proportions are undefined without respondents (would otherwise
        # surface as a bare ZeroDivisionError below).
        raise ValueError("Need at least 1 respondent row")

    # Count mentions (any rank, i.e. non-null cell) for each column, keyed by
    # its cleaned label.
    mentions_data = {
        self._clean_voice_label(col): df.filter(pl.col(col).is_not_null()).height
        for col in ranking_cols
    }

    labels = sorted(mentions_data)
    if len(labels) < 2:
        # Several columns collapsed onto the same cleaned label.
        raise ValueError("Need at least 2 distinct labels to compare")
    n_comparisons = len(labels) * (len(labels) - 1) // 2

    # NOTE(review): both proportions are measured on the same respondents
    # (n1 == n2 == total_respondents), so the samples are not independent;
    # a paired test (e.g. McNemar) may be more appropriate -- confirm with
    # the analysis owner before changing the method.
    n1 = n2 = total_respondents

    results = []
    for label1, label2 in combinations(labels, 2):
        count1 = mentions_data[label1]
        count2 = mentions_data[label2]

        pct1 = count1 / total_respondents
        pct2 = count2 / total_respondents

        # Z-test for two proportions with a pooled standard error.
        p_pooled = (count1 + count2) / (n1 + n2)
        se = np.sqrt(p_pooled * (1 - p_pooled) * (1 / n1 + 1 / n2))

        if se > 0:
            z_stat = (pct1 - pct2) / se
            # Survival function keeps precision in the far tail, where
            # 1 - cdf(|z|) would round to exactly 0.
            p_value = 2 * scipy_stats.norm.sf(abs(z_stat))
        else:
            # Pooled proportion is 0 or 1: the two groups are identical.
            p_value = 1.0

        results.append({
            'group1': label1,
            'group2': label2,
            'p_value': float(p_value),
            'rank1_count1': count1,  # Reusing column names for compatibility with heatmap plotting
            'rank1_count2': count2,
            'rank1_pct1': round(pct1 * 100, 1),
            'rank1_pct2': round(pct2 * 100, 1),
            'total1': n1,
            'total2': n2,
            'effect_size': pct1 - pct2,  # Difference in proportions
        })

    results_df = pl.DataFrame(results)
    p_values = results_df['p_value'].to_numpy()

    if correction == "bonferroni":
        p_adjusted = np.minimum(p_values * n_comparisons, 1.0)
    elif correction == "holm":
        # Step-down: the k-th smallest p-value is scaled by (m - k), then
        # monotonicity is enforced with a running maximum and capped at 1.
        order = np.argsort(p_values)
        m = len(p_values)
        stepped = p_values[order] * (m - np.arange(m))
        stepped = np.minimum(np.maximum.accumulate(stepped), 1.0)
        p_adjusted = np.empty(m, dtype=float)
        p_adjusted[order] = stepped  # scatter back to the original row order
    else:  # correction == "none"
        p_adjusted = p_values.astype(float)

    results_df = results_df.with_columns([
        pl.Series('p_adjusted', p_adjusted),
        pl.Series('significant', p_adjusted < alpha),
    ]).sort('p_value')

    metadata = {
        'test_type': 'proportion_z_test_mentions',
        'alpha': alpha,
        'correction': correction,
        'n_comparisons': n_comparisons,
    }

    return results_df, metadata
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def process_speaking_style_data(
|
def process_speaking_style_data(
|
||||||
df: Union[pl.LazyFrame, pl.DataFrame],
|
df: Union[pl.LazyFrame, pl.DataFrame],
|
||||||
|
|||||||
Reference in New Issue
Block a user