# =============================================================================
# Patch: XX_statistical_significance.script.py  (index 3d2f867..d7a1214)
# Hunk @@ -186,6 +186,20 @@ — unchanged surrounding context is kept as
# comments below; only the lines ADDED by the patch appear as live code.
# Assumes the script has already imported `polars as pl` and built `S`,
# `data`, and `char_rank` in earlier cells — TODO confirm against full script.
# =============================================================================
# Context (unchanged) — conclusion of the character-preference analysis:
#   Yes, "The Coach" can be considered statistically more significant than the
#   other options. It is clearly superior to the bottom two options and holds
#   a statistically significant lead over the runner-up ("Personal Assistant")
#   in direct comparison.

# %% Mentions significance analysis
# Tests "Quantity" (visibility): was the character mentioned anywhere in a
# respondent's Top 3, regardless of rank?

char_pairwise_df_mentions, _meta_mentions = S.compute_mentions_significance(
    char_rank,
    alpha=0.05,
    correction="none",
)

S.plot_significance_heatmap(
    char_pairwise_df_mentions,
    metadata=_meta_mentions,
    title="Statistical Significance: Character Total Mentions (Top 3 Visibility)"
)

# Context (unchanged) between the two hunks:
#   # %% voices analysis
#   top3_voices = S.get_top_3_voices(data)[0]
#   ... (rank-1 preference significance + heatmap, elided in the patch) ...
# Hunk @@ -200,4 +214,119 @@ context:
#   S.plot_significance_heatmap(..., metadata=_metadata,
#       title="Statistical Significance: Voice Top Choice Preference")

# %% Total Mentions Significance (Rank 1+2+3 Combined)
# This tests "Quantity" (Visibility) instead of "Quality" (Preference)

_pairwise_df_mentions, _meta_mentions = S.compute_mentions_significance(
    top3_voices,
    alpha=0.05,
    correction="none"
)

S.plot_significance_heatmap(
    _pairwise_df_mentions,
    metadata=_meta_mentions,
    title="Statistical Significance: Voice Total Mentions (Top 3 Visibility)"
)

# %% Male Voices Only Analysis
import reference  # noqa: E402 — cell-style script; imported at point of use


def _select_voice_columns(df: pl.DataFrame, keep_voice) -> pl.DataFrame:
    """Return *df* restricted to '_recordId' (if present) plus every voice
    column whose Vxx id satisfies ``keep_voice``.

    Voice columns are detected by the '__Vxx' suffix, e.g.
    "Top_3_Voices_ranking__V14"; the id is the segment after '__'.

    Args:
        df: Wide ranking frame (rows=respondents, cols=voices).
        keep_voice: Predicate ``(voice_id: str) -> bool``.
    """
    cols_to_keep = []
    # Always keep identifier if present
    if '_recordId' in df.columns:
        cols_to_keep.append('_recordId')
    for col in df.columns:
        if '__V' in col:
            voice_id = col.split('__')[1]
            if keep_voice(voice_id):
                cols_to_keep.append(col)
    return df.select(cols_to_keep)


def filter_voices_by_gender(df: pl.DataFrame, target_gender: str) -> pl.DataFrame:
    """Filter ranking columns to keep only those matching target gender."""
    # Voices missing from the mapping are dropped (mapping.get returns None).
    return _select_voice_columns(
        df,
        lambda vid: reference.VOICE_GENDER_MAPPING.get(vid) == target_gender,
    )


# Get full ranking data as DataFrame
df_voices = top3_voices.collect()

# Filter for Male voices
df_male_voices = filter_voices_by_gender(df_voices, 'Male')

# 1. Male Voices: Top Choice Preference (Rank 1)
_pairwise_male_pref, _meta_male_pref = S.compute_ranking_significance(
    df_male_voices,
    alpha=0.05,
    correction="none"
)

S.plot_significance_heatmap(
    _pairwise_male_pref,
    metadata=_meta_male_pref,
    title="Male Voices Only: Top Choice Preference Significance"
)

# 2. Male Voices: Total Mentions (Visibility)
_pairwise_male_vis, _meta_male_vis = S.compute_mentions_significance(
    df_male_voices,
    alpha=0.05,
    correction="none"
)

S.plot_significance_heatmap(
    _pairwise_male_vis,
    metadata=_meta_male_vis,
    title="Male Voices Only: Total Mentions Significance"
)

# %% Male Voices (Excluding Bottom 3: V88, V86, V81)

# Start with the male voices dataframe from the previous step
voices_to_exclude = ['V88', 'V86', 'V81']


def filter_exclude_voices(df: pl.DataFrame, exclude_list: list[str]) -> pl.DataFrame:
    """Filter ranking columns to exclude specific voices."""
    # Same column-scan as filter_voices_by_gender, via the shared helper.
    return _select_voice_columns(df, lambda vid: vid not in exclude_list)


df_male_top = filter_exclude_voices(df_male_voices, voices_to_exclude)
Male Top Candidates: Top Choice Preference +_pairwise_male_top_pref, _meta_male_top_pref = S.compute_ranking_significance( + df_male_top, + alpha=0.05, + correction="none" +) + +S.plot_significance_heatmap( + _pairwise_male_top_pref, + metadata=_meta_male_top_pref, + title="Male Voices (Excl. Bottom 3): Top Choice Preference Significance" +) + +# 2. Male Top Candidates: Total Mentions +_pairwise_male_top_vis, _meta_male_top_vis = S.compute_mentions_significance( + df_male_top, + alpha=0.05, + correction="none" +) + +S.plot_significance_heatmap( + _pairwise_male_top_vis, + metadata=_meta_male_top_vis, + title="Male Voices (Excl. Bottom 3): Total Mentions Significance" +) # %% diff --git a/utils.py b/utils.py index 3442f29..be2647d 100644 --- a/utils.py +++ b/utils.py @@ -1588,6 +1588,120 @@ class QualtricsSurvey(QualtricsPlotsMixin): return results_df, metadata + def compute_mentions_significance( + self, + data: pl.LazyFrame | pl.DataFrame, + alpha: float = 0.05, + correction: str = "bonferroni", + ) -> tuple[pl.DataFrame, dict]: + """Compute statistical significance for Total Mentions (Rank 1+2+3). + + Tests whether the proportion of respondents who included a voice in their Top 3 + is significantly different between voices. + + Args: + data: Ranking data (rows=respondents, cols=voices, values=rank). + alpha: Significance level. + correction: Multiple comparison correction method. 
+ + Returns: + tuple: (pairwise_df, metadata) + """ + from scipy import stats as scipy_stats + import numpy as np + + if isinstance(data, pl.LazyFrame): + df = data.collect() + else: + df = data + + ranking_cols = [c for c in df.columns if c != '_recordId'] + if len(ranking_cols) < 2: + raise ValueError("Need at least 2 ranking columns") + + total_respondents = df.height + mentions_data = {} + + # Count mentions (any rank) for each voice + for col in ranking_cols: + label = self._clean_voice_label(col) + count = df.filter(pl.col(col).is_not_null()).height + mentions_data[label] = count + + labels = sorted(list(mentions_data.keys())) + results = [] + n_comparisons = len(labels) * (len(labels) - 1) // 2 + + for i, label1 in enumerate(labels): + for label2 in labels[i+1:]: + count1 = mentions_data[label1] + count2 = mentions_data[label2] + + pct1 = count1 / total_respondents + pct2 = count2 / total_respondents + + # Z-test for two proportions + n1 = total_respondents + n2 = total_respondents + + p_pooled = (count1 + count2) / (n1 + n2) + se = np.sqrt(p_pooled * (1 - p_pooled) * (1/n1 + 1/n2)) + + if se > 0: + z_stat = (pct1 - pct2) / se + p_value = 2 * (1 - scipy_stats.norm.cdf(abs(z_stat))) + else: + p_value = 1.0 + + results.append({ + 'group1': label1, + 'group2': label2, + 'p_value': float(p_value), + 'rank1_count1': count1, # Reusing column names for compatibility with heatmap plotting + 'rank1_count2': count2, + 'rank1_pct1': round(pct1 * 100, 1), + 'rank1_pct2': round(pct2 * 100, 1), + 'total1': n1, + 'total2': n2, + 'effect_size': pct1 - pct2 # Difference in proportions + }) + + results_df = pl.DataFrame(results) + + p_values = results_df['p_value'].to_numpy() + p_adjusted = np.full_like(p_values, np.nan, dtype=float) + + if correction == "bonferroni": + p_adjusted = np.minimum(p_values * n_comparisons, 1.0) + elif correction == "holm": + sorted_idx = np.argsort(p_values) + sorted_p = p_values[sorted_idx] + m = len(sorted_p) + adjusted = np.zeros(m) + for j in 
range(m): + adjusted[j] = sorted_p[j] * (m - j) + for j in range(1, m): + adjusted[j] = max(adjusted[j], adjusted[j-1]) + adjusted = np.minimum(adjusted, 1.0) + p_adjusted = adjusted[np.argsort(sorted_idx)] + elif correction == "none": + p_adjusted = p_values.astype(float) # pyright: ignore + + results_df = results_df.with_columns([ + pl.Series('p_adjusted', p_adjusted), + pl.Series('significant', p_adjusted < alpha), + ]).sort('p_value') + + metadata = { + 'test_type': 'proportion_z_test_mentions', + 'alpha': alpha, + 'correction': correction, + 'n_comparisons': n_comparisons, + } + + return results_df, metadata + + def process_speaking_style_data( df: Union[pl.LazyFrame, pl.DataFrame],