diff --git a/.github/agents/plot-creator.agent.md b/.github/agents/plot-creator.agent.md index 1760213..3ff7559 100644 --- a/.github/agents/plot-creator.agent.md +++ b/.github/agents/plot-creator.agent.md @@ -48,13 +48,12 @@ Check if an existing `transform_` function exists in `utils.py def transform_(self, df: pl.LazyFrame | pl.DataFrame) -> tuple[pl.LazyFrame, dict | None]: """Transform to . - Original request: "" + Original use-case: "" This function . Args: - df: Pre-fetched data (e.g., from get_character_refine()). - Do NOT call get_*() methods inside this function. + df: Pre-fetched data as a Polars LazyFrame or DataFrame. Returns: tuple: (LazyFrame with columns [...], Optional metadata dict) @@ -96,19 +95,11 @@ chart = S.plot_character_trait_frequency(trait_freq) ``` ### Step 5: Create Temporary Test File -Create `debug_plot_temp.py` for testing. **You MUST ask the user to provide:** +Create `debug_plot_temp.py` for testing. **Prefer using the data snippet already provided by the user.** -1. **The exact code snippet to create the test data** - Do NOT generate or assume file paths -2. **Confirmation of which notebook they're working in** (so you can read it for context if needed) +**Option A: Use provided data snippet (preferred)** +If the user provided a `df.head()` or sample data output, create inline test data from it: -Example prompt to user: -> "To create the test file, please provide: -> 1. The exact code snippet that produces the dataframe you shared (copy from your notebook) -> 2. Which notebook are you working in? (I may read it for context, but won't modify it) -> -> I will NOT attempt to load any data without your explicit code." - -**Test file structure using user-provided data:** ```python """Temporary test file for . Delete after testing. 
@@ -118,15 +109,32 @@ from theme import ColorPalette import altair as alt # ============================================================ -# USER-PROVIDED TEST DATA (paste from user's snippet) +# TEST DATA (reconstructed from user's df.head() output) # ============================================================ -# +test_data = pl.DataFrame({ + "Column1": ["value1", "value2", ...], + "Column2": [1, 2, ...], + # ... recreate structure from provided sample +}) # ============================================================ # Test the plot function -# ... +from plots import QualtricsPlotsMixin +# ... test code ``` +**Option B: Ask user (only if necessary)** +Only ask the user for additional code if: +- The provided sample is insufficient to test the plot logic +- You need to understand complex data relationships not visible in the sample +- The transformation requires understanding the full data pipeline + +If you must ask: +> "The sample data you provided should work for basic testing. However, I need [specific reason]. Could you provide: +> 1. [specific information needed] +> +> If you'd prefer, I can proceed with a minimal test using the sample data you shared." 
+ ### Step 6: Create Plot Function Add a new method to `QualtricsPlotsMixin` in `plots.py`: diff --git a/02_quant_analysis.py b/02_quant_analysis.py index fdc6cac..52008a6 100644 --- a/02_quant_analysis.py +++ b/02_quant_analysis.py @@ -16,8 +16,8 @@ def _(): from speaking_styles import SPEAKING_STYLES return ( - QualtricsSurvey, Path, + QualtricsSurvey, SPEAKING_STYLES, calculate_weighted_ranking_scores, check_progress, @@ -49,7 +49,7 @@ def _(Path, file_browser, mo): @app.cell -def _(QualtricsSurvey, QSF_FILE, RESULTS_FILE, mo): +def _(QSF_FILE, QualtricsSurvey, RESULTS_FILE, mo): S = QualtricsSurvey(RESULTS_FILE, QSF_FILE) try: data_all = S.load_data() @@ -285,6 +285,7 @@ def _(S, mo, v_18_8_3): def _(S, calculate_weighted_ranking_scores, data): top3_voices = S.get_top_3_voices(data)[0] top3_voices_weighted = calculate_weighted_ranking_scores(top3_voices) + return top3_voices, top3_voices_weighted @@ -383,6 +384,12 @@ def _(S, data, mo): return (vscales,) +@app.cell +def _(vscales): + print(vscales.collect().head()) + return + + @app.cell def _(pl, vscales): # Count non-null values per row diff --git a/03_quant_report.py b/03_quant_report.py index f9447f7..17178bb 100644 --- a/03_quant_report.py +++ b/03_quant_report.py @@ -44,14 +44,14 @@ def _(QSF_FILE, RESULTS_FILE): @app.cell(hide_code=True) -def _(): - mo.md(r""" +def _(RESULTS_FILE, data_all): + mo.md(rf""" --- # Load Data - **Dataset:** `{Path(RESULTS_FILE).name}` + **Dataset:** {Path(RESULTS_FILE).name} - **Responses**: `{data_all.collect().shape[0]}` + **Responses**: {data_all.collect().shape[0]} """) return @@ -112,11 +112,9 @@ def _(): @app.cell -def _(data_validated): - data = data_validated - - data.collect() - return (data,) +def _(): + # + return @app.cell(hide_code=True) @@ -130,8 +128,8 @@ def _(): @app.cell -def _(S, data): - demographics = S.get_demographics(data)[0].collect() +def _(S, data_validated): + demographics = S.get_demographics(data_validated)[0].collect() demographics return 
(demographics,) @@ -148,7 +146,7 @@ def _(): def _(demographics): # Demographics where 'Consumer' is null demographics_no_consumer = demographics.filter(pl.col('Consumer').is_null())['_recordId'].to_list() - # demographics_no_consumer + demographics_no_consumer return (demographics_no_consumer,) @@ -160,9 +158,26 @@ def _(data_all, demographics_no_consumer): @app.cell -def _(data_all): +def _(): + mo.md(r""" + # Filter Data (Global corrections) + """) + return + + +@app.cell +def _(data_validated): + # drop rows where 'consumer' is null + # data = data_validated.filter(pl.col('Consumer').is_not_null()) + data = data_validated + data.collect() + return (data,) + + +@app.cell +def _(): # Check if all business owners are missing a 'Consumer type' in demographics - assert all([a is None for a in data_all.filter(pl.col('QID4') == 'Yes').collect()['Consumer'].unique()]) , "Not all business owners are missing 'Consumer type' in demographics." + # assert all([a is None for a in data_all.filter(pl.col('QID4') == 'Yes').collect()['Consumer'].unique()]) , "Not all business owners are missing 'Consumer type' in demographics." 
return @@ -187,14 +202,14 @@ def _(): @app.cell -def _(S, demo_plot_cols, demographics): +def _(S, data, demo_plot_cols): _content = """ ## Demographic Distributions """ for c in demo_plot_cols: _fig = S.plot_demographic_distribution( - data=demographics, + data=S.get_demographics(data)[0], column=c, title=f"{c.replace('Bussiness', 'Business').replace('_', ' ')} Distribution of Survey Respondents" ) @@ -265,6 +280,22 @@ def _(S, char_rank): return +@app.cell +def _(S, char_rank): + _pairwise_df, _meta = S.compute_ranking_significance(char_rank) + + print(_pairwise_df.columns) + + mo.md(f""" + ### Statistical Significance Character Ranking + + {mo.ui.altair_chart(S.plot_significance_heatmap(_pairwise_df, metadata=_meta))} + + {mo.ui.altair_chart(S.plot_significance_summary(_pairwise_df, metadata=_meta))} + """) + return + + @app.cell def _(): mo.md(r""" @@ -307,28 +338,69 @@ def _(): @app.cell -def _(): - # Join respondent +def _(S, data): + char_df = S.get_character_refine(data)[0] + return (char_df,) + + +@app.cell +def _(S, char_df): + from theme import ColorPalette + + # Assuming you already have char_df (your data from get_character_refine or similar) + characters = ['Bank Teller', 'Familiar Friend', 'The Coach', 'Personal Assistant'] + character_colors = { + 'Bank Teller': (ColorPalette.CHARACTER_BANK_TELLER, ColorPalette.CHARACTER_BANK_TELLER_HIGHLIGHT), + 'Familiar Friend': (ColorPalette.CHARACTER_FAMILIAR_FRIEND, ColorPalette.CHARACTER_FAMILIAR_FRIEND_HIGHLIGHT), + 'The Coach': (ColorPalette.CHARACTER_COACH, ColorPalette.CHARACTER_COACH_HIGHLIGHT), + 'Personal Assistant': (ColorPalette.CHARACTER_PERSONAL_ASSISTANT, ColorPalette.CHARACTER_PERSONAL_ASSISTANT_HIGHLIGHT), + } + + # Build consistent sort order (by total frequency across all characters) + all_trait_counts = {} + for char in characters: + freq_df, _ = S.transform_character_trait_frequency(char_df, char) + for row in freq_df.iter_rows(named=True): + all_trait_counts[row['trait']] = 
all_trait_counts.get(row['trait'], 0) + row['count'] + + consistent_sort_order = sorted(all_trait_counts.keys(), key=lambda x: -all_trait_counts[x]) + + _content = """""" + # Generate 4 plots (one per character) + for char in characters: + freq_df, _ = S.transform_character_trait_frequency(char_df, char) + main_color, highlight_color = character_colors[char] + chart = S.plot_single_character_trait_frequency( + data=freq_df, + character_name=char, + bar_color=main_color, + highlight_color=highlight_color, + trait_sort_order=consistent_sort_order, + ) + _content += f""" + {mo.ui.altair_chart(chart)} + + + """ + + mo.md(_content) return @app.cell def _(): mo.md(r""" - --- + ## Statistical significance best characters - # Spoken Voice Results + zie chat + > voorbeeld: als de nr 1 en 2 niet significant verschillen maar wel van de nr 3 bijvoorbeeld is dat ook top. Beetje meedenkend over hoe ik het kan presenteren weetje wat ik bedoel?:) + > """) return -@app.cell(hide_code=True) +@app.cell def _(): - mo.md(r""" - --- - - # Brand Character Results - """) return @@ -342,5 +414,174 @@ def _(): return +@app.cell +def _(S, data): + top3_voices = S.get_top_3_voices(data)[0] + top3_voices_weighted = calculate_weighted_ranking_scores(top3_voices) + return top3_voices, top3_voices_weighted + + +@app.cell +def _(): + mo.md(r""" + ## Which voice is ranked best in the ranking question for top 3? 
+ + (not best 3 out of 8 question) + """) + return + + +@app.cell +def _(S, top3_voices): + _plot = S.plot_ranking_distribution(top3_voices, x_label='Voice') + mo.md(f""" + {mo.ui.altair_chart(_plot)} + """) + return + + +@app.cell +def _(): + mo.md(r""" + ### Statistical significance for voice ranking + """) + return + + +@app.cell +def _(): + # print(top3_voices.collect().head()) + return + + +@app.cell +def _(): + + # _pairwise_df, _metadata = S.compute_ranking_significance( + # top3_voices,alpha=0.05,correction="none") + + # # View significant pairs + # # print(pairwise_df.filter(pl.col('significant') == True)) + + # # Create heatmap visualization + # _heatmap = S.plot_significance_heatmap( + # _pairwise_df, + # metadata=_metadata, + # title="Weighted Voice Ranking Significance
(Pairwise Comparisons)" + # ) + + # # Create summary bar chart + # _summary = S.plot_significance_summary( + # _pairwise_df, + # metadata=_metadata + # ) + + # mo.md(f""" + # {mo.ui.altair_chart(_heatmap)} + + # {mo.ui.altair_chart(_summary)} + # """) + return + + +@app.cell +def _(): + mo.md(r""" + ## Weighted Popularity Scores + """) + return + + +@app.cell +def _(S, top3_voices_weighted): + _plot = S.plot_weighted_ranking_score(top3_voices_weighted, title="Most Popular Voice - Weighted Popularity Score
(1st = 3pts, 2nd = 2pts, 3rd = 1pt)") + + mo.md(f""" + {mo.ui.altair_chart(_plot)} + """) + return + + +@app.cell +def _(): + return + + +@app.cell +def _(top3_voices_weighted): + print(top3_voices_weighted.head()) + return + + +@app.cell +def _(): + return + + +@app.cell(hide_code=True) +def _(): + mo.md(r""" + ## Voice Scale 1-10 + """) + return + + +@app.cell +def _(S, data): + # Get your voice scale data (from notebook) + voice_1_10, _ = S.get_voice_scale_1_10(data) + return (voice_1_10,) + + +@app.cell +def _(S, voice_1_10): + S.plot_average_scores_with_counts(voice_1_10, x_label='Voice', width=1000, domain=[1,10], title="Voice General Impression (Scale 1-10)") + return + + +@app.cell +def _(): + mo.md(r""" + ### Statistical Significance (Scale 1-10) + """) + return + + +@app.cell +def _(S, voice_1_10): + # Compute pairwise significance tests + pairwise_df, metadata = S.compute_pairwise_significance( + voice_1_10, + test_type="mannwhitney", # or "ttest", "chi2", "auto" + alpha=0.05, + correction="bonferroni" # or "holm", "none" + ) + + # View significant pairs + # print(pairwise_df.filter(pl.col('significant') == True)) + + # Create heatmap visualization + _heatmap = S.plot_significance_heatmap( + pairwise_df, + metadata=metadata, + title="Voice Rating Significance
(Pairwise Comparisons)" + ) + + # Create summary bar chart + _summary = S.plot_significance_summary( + pairwise_df, + metadata=metadata + ) + + mo.md(f""" + {mo.ui.altair_chart(_heatmap)} + + {mo.ui.altair_chart(_summary)} + """) + + + return + + if __name__ == "__main__": app.run() diff --git a/docs/statistical-significance-guide.md b/docs/statistical-significance-guide.md new file mode 100644 index 0000000..92f29cc --- /dev/null +++ b/docs/statistical-significance-guide.md @@ -0,0 +1,428 @@ +# Statistical Significance Testing Guide + +A beginner-friendly reference for choosing the right statistical test and correction method for your Voice Branding analysis. + +--- + +## Table of Contents +1. [Quick Decision Flowchart](#quick-decision-flowchart) +2. [Understanding Your Data Types](#understanding-your-data-types) +3. [Available Tests](#available-tests) +4. [Multiple Comparison Corrections](#multiple-comparison-corrections) +5. [Interpreting Results](#interpreting-results) +6. [Code Examples](#code-examples) + +--- + +## Quick Decision Flowchart + +``` +What kind of data do you have? +│ +├─► Continuous scores (1-10 ratings, averages) +│ │ +│ └─► Use: compute_pairwise_significance() +│ │ +│ ├─► Data normally distributed? → test_type="ttest" +│ └─► Not sure / skewed data? → test_type="mannwhitney" (safer choice) +│ +└─► Ranking data (1st, 2nd, 3rd place votes) + │ + └─► Use: compute_ranking_significance() + (automatically uses proportion z-test) +``` + +--- + +## Understanding Your Data Types + +### Continuous Data +**What it looks like:** Numbers on a scale with many possible values. 
+ +| Example | Data Source | +|---------|-------------| +| Voice ratings 1-10 | `get_voice_scale_1_10()` | +| Speaking style scores | `get_ss_green_blue()` | +| Any averaged scores | Custom aggregations | + +``` +shape: (5, 3) +┌───────────┬─────────────────┬─────────────────┐ +│ _recordId │ Voice_Scale__V14│ Voice_Scale__V04│ +│ str │ f64 │ f64 │ +├───────────┼─────────────────┼─────────────────┤ +│ R_001 │ 7.5 │ 6.0 │ +│ R_002 │ 8.0 │ 7.5 │ +│ R_003 │ 6.5 │ 8.0 │ +``` + +### Ranking Data +**What it looks like:** Discrete ranks (1, 2, 3) or null if not ranked. + +| Example | Data Source | +|---------|-------------| +| Top 3 voice rankings | `get_top_3_voices()` | +| Character rankings | `get_character_ranking()` | + +``` +shape: (5, 3) +┌───────────┬──────────────────┬──────────────────┐ +│ _recordId │ Top_3__V14 │ Top_3__V04 │ +│ str │ i64 │ i64 │ +├───────────┼──────────────────┼──────────────────┤ +│ R_001 │ 1 │ null │ ← V14 was ranked 1st +│ R_002 │ 2 │ 1 │ ← V04 was ranked 1st +│ R_003 │ null │ 3 │ ← V04 was ranked 3rd +``` + +### ⚠️ Aggregated Data (Cannot Test!) +**What it looks like:** Already summarized/totaled data. + +``` +shape: (3, 2) +┌───────────┬────────────────┐ +│ Character │ Weighted Score │ ← ALREADY AGGREGATED +│ str │ i64 │ Lost individual variance +├───────────┼────────────────┤ Cannot do significance tests! +│ V14 │ 209 │ +│ V04 │ 180 │ +``` + +**Solution:** Go back to the raw data before aggregation. + +--- + +## Available Tests + +### 1. 
Mann-Whitney U Test (Default for Continuous) +**Use when:** Comparing scores/ratings between groups +**Assumes:** Nothing about distribution shape (non-parametric) +**Best for:** Most survey data, Likert scales, ratings + +```python +pairwise_df, meta = S.compute_pairwise_significance( + voice_data, + test_type="mannwhitney" # This is the default +) +``` + +**Pros:** +- Works with any distribution shape +- Robust to outliers +- Safe choice when unsure + +**Cons:** +- Slightly less powerful than t-test when data IS normally distributed + +--- + +### 2. Independent t-Test +**Use when:** Comparing means between groups +**Assumes:** Data is approximately normally distributed +**Best for:** Large samples (n > 30 per group), truly continuous data + +```python +pairwise_df, meta = S.compute_pairwise_significance( + voice_data, + test_type="ttest" +) +``` + +**Pros:** +- Most powerful when assumptions are met +- Well-understood, commonly reported + +**Cons:** +- Can give misleading results if data is skewed +- Sensitive to outliers + +--- + +### 3. Chi-Square Test +**Use when:** Comparing frequency distributions +**Assumes:** Expected counts ≥ 5 in each cell +**Best for:** Count data, categorical comparisons + +```python +pairwise_df, meta = S.compute_pairwise_significance( + count_data, + test_type="chi2" +) +``` + +**Pros:** +- Designed for count/frequency data +- Tests if distributions differ + +**Cons:** +- Needs sufficient sample sizes +- Less informative about direction of difference + +--- + +### 4. Two-Proportion Z-Test (For Rankings) +**Use when:** Comparing ranking vote proportions +**Automatically used by:** `compute_ranking_significance()` + +```python +pairwise_df, meta = S.compute_ranking_significance(ranking_data) +``` + +**What it tests:** "Does Voice A get a significantly different proportion of Rank 1 votes than Voice B?" + +--- + +## Multiple Comparison Corrections + +### Why Do We Need Corrections? 
+ +When you compare many groups, you're doing many tests. Each test has a 5% chance of a false positive (if α = 0.05). With 17 voices: + +| Comparisons | Expected False Positives (no correction) | +|-------------|------------------------------------------| +| 136 pairs | ~7 false "significant" results! | + +**Corrections adjust p-values to account for this.** + +--- + +### Bonferroni Correction (Conservative) +**Formula:** `p_adjusted = p_value × number_of_comparisons` + +```python +pairwise_df, meta = S.compute_pairwise_significance( + data, + correction="bonferroni" # This is the default +) +``` + +**Use when:** +- You want to be very confident about significant results +- False positives are costly (publishing, major decisions) +- You have few comparisons (< 20) + +**Trade-off:** May miss real differences (more false negatives) + +--- + +### Holm-Bonferroni Correction (Less Conservative) +**Formula:** Step-down procedure that's less strict than Bonferroni + +```python +pairwise_df, meta = S.compute_pairwise_significance( + data, + correction="holm" +) +``` + +**Use when:** +- You have many comparisons +- You want better power to detect real differences +- Exploratory analysis where missing a real effect is costly + +**Trade-off:** Slightly higher false positive risk than Bonferroni + +--- + +### No Correction +**Not recommended for final analysis**, but useful for exploration. 
+ +```python +pairwise_df, meta = S.compute_pairwise_significance( + data, + correction="none" +) +``` + +**Use when:** +- Initial exploration only +- You'll follow up with specific hypotheses +- You understand and accept the inflated false positive rate + +--- + +### Correction Method Comparison + +| Method | Strictness | Best For | Risk | +|--------|------------|----------|------| +| Bonferroni | Most strict | Few comparisons, high stakes | Miss real effects | +| Holm | Moderate | Many comparisons, balanced approach | Slightly more false positives | +| None | No control | Exploration only | Many false positives | + +**Recommendation for Voice Branding:** Use **Holm** for exploratory analysis, **Bonferroni** for final reporting. + +--- + +## Interpreting Results + +### Key Output Columns + +| Column | Meaning | +|--------|---------| +| `p_value` | Raw probability this difference happened by chance | +| `p_adjusted` | Corrected p-value (use this for decisions!) | +| `significant` | TRUE if p_adjusted < alpha (usually 0.05) | +| `effect_size` | How big is the difference (practical significance) | + +### What the p-value Means + +| p-value | Interpretation | +|---------|----------------| +| < 0.001 | Very strong evidence of difference | +| < 0.01 | Strong evidence | +| < 0.05 | Moderate evidence (traditional threshold) | +| 0.05 - 0.10 | Weak evidence, "trending" | +| > 0.10 | No significant evidence | + +### Statistical vs Practical Significance + +**Statistical significance** (p < 0.05) means the difference is unlikely due to chance. + +**Practical significance** (effect size) means the difference matters in the real world. + +| Effect Size (Cohen's d) | Interpretation | +|-------------------------|----------------| +| < 0.2 | Small (may not matter practically) | +| 0.2 - 0.5 | Medium | +| 0.5 - 0.8 | Large | +| > 0.8 | Very large | + +**Example:** A p-value of 0.001 with effect size of 0.1 means "we're confident there's a difference, but it's tiny." 
+ +--- + +## Code Examples + +### Example 1: Voice Scale Ratings + +```python +# Get the raw rating data +voice_data, _ = S.get_voice_scale_1_10(data) + +# Test for significant differences +pairwise_df, meta = S.compute_pairwise_significance( + voice_data, + test_type="mannwhitney", # Safe default for ratings + alpha=0.05, + correction="bonferroni" +) + +# Check overall test first +print(f"Overall test: {meta['overall_test']}") +print(f"Overall p-value: {meta['overall_p_value']:.4f}") + +# If overall is significant, look at pairwise +if meta['overall_p_value'] < 0.05: + sig_pairs = pairwise_df.filter(pl.col('significant') == True) + print(f"Found {sig_pairs.height} significant pairwise differences") + +# Visualize +S.plot_significance_heatmap(pairwise_df, metadata=meta) +``` + +### Example 2: Top 3 Voice Rankings + +```python +# Get the raw ranking data (NOT the weighted scores!) +ranking_data, _ = S.get_top_3_voices(data) + +# Test for significant differences in Rank 1 proportions +pairwise_df, meta = S.compute_ranking_significance( + ranking_data, + alpha=0.05, + correction="holm" # Less conservative for many comparisons +) + +# Check chi-square test +print(f"Chi-square p-value: {meta['chi2_p_value']:.4f}") + +# View contingency table (Rank 1, 2, 3 counts per voice) +for voice, counts in meta['contingency_table'].items(): + print(f"{voice}: R1={counts[0]}, R2={counts[1]}, R3={counts[2]}") + +# Find significant pairs +sig_pairs = pairwise_df.filter(pl.col('significant') == True) +print(sig_pairs) +``` + +### Example 3: Comparing Demographic Subgroups + +```python +# Filter to specific demographics +S.filter_data(data, consumer=['Early Professional']) +early_pro_data, _ = S.get_voice_scale_1_10(data) + +S.filter_data(data, consumer=['Established Professional']) +estab_pro_data, _ = S.get_voice_scale_1_10(data) + +# Test each group separately, then compare results qualitatively +# (For direct group comparison, you'd need a different test design) +``` + +--- + +## 
Common Mistakes to Avoid

### ❌ Using Aggregated Data
```python
# WRONG - already summarized, lost individual variance
weighted_scores = calculate_weighted_ranking_scores(ranking_data)
S.compute_pairwise_significance(weighted_scores)  # Will fail!
```

### ✅ Use Raw Data
```python
# RIGHT - use raw data before aggregation
ranking_data, _ = S.get_top_3_voices(data)
S.compute_ranking_significance(ranking_data)
```

### ❌ Ignoring Multiple Comparisons
```python
# WRONG - ~5% of pairs (about 7 of 136) will be "significant" by chance alone!
S.compute_pairwise_significance(data, correction="none")
```

### ✅ Apply Correction
```python
# RIGHT - corrected p-values control false positives
S.compute_pairwise_significance(data, correction="bonferroni")
```

### ❌ Only Reporting p-values
```python
# WRONG - statistical significance isn't everything
print(f"p = {p_value}")  # Missing context!
```

### ✅ Report Effect Sizes Too
```python
# RIGHT - include practical significance
print(f"p = {p_value}, effect size = {effect_size}")
print(f"Mean difference: {mean1 - mean2:.2f} points")
```

---

## Quick Reference Card

| Data Type | Function | Default Test | Recommended Correction |
|-----------|----------|--------------|------------------------|
| Ratings (1-10) | `compute_pairwise_significance()` | Mann-Whitney U | Bonferroni |
| Rankings (1st/2nd/3rd) | `compute_ranking_significance()` | Proportion Z | Holm |
| Count frequencies | `compute_pairwise_significance(test_type="chi2")` | Chi-square | Bonferroni |

| Scenario | Correction |
|----------|------------|
| Publishing results | Bonferroni |
| Client presentation | Bonferroni |
| Exploratory analysis | Holm |
| Quick internal check | Holm or None |

---

## Further Reading

- [Statistics for Dummies Cheat Sheet](https://www.dummies.com/article/academics-the-arts/math/statistics/statistics-for-dummies-cheat-sheet-208650/)
- [Choosing the Right Statistical 
Test](https://stats.oarc.ucla.edu/other/mult-pkg/whatstat/) +- [Multiple Comparisons Problem (Wikipedia)](https://en.wikipedia.org/wiki/Multiple_comparisons_problem) diff --git a/plots.py b/plots.py index c58c541..47be312 100644 --- a/plots.py +++ b/plots.py @@ -290,10 +290,11 @@ class QualtricsPlotsMixin: if domain is None: domain = [stats_df['average'].min(), stats_df['average'].max()] - # Base bar chart + # Base bar chart - use y2 to explicitly start bars at domain minimum bars = alt.Chart(stats_df).mark_bar(color=color).encode( x=alt.X('voice:N', title=x_label, sort='-y'), y=alt.Y('average:Q', title=y_label, scale=alt.Scale(domain=domain)), + y2=alt.datum(domain[0]), # Bars start at domain minimum (bottom edge) tooltip=[ alt.Tooltip('voice:N', title='Voice'), alt.Tooltip('average:Q', title='Average', format='.2f'), @@ -1099,5 +1100,493 @@ class QualtricsPlotsMixin: height=height or getattr(self, 'plot_height', 400) ) + chart = self._save_plot(chart, title) + return chart + + def plot_single_character_trait_frequency( + self, + data: pl.LazyFrame | pl.DataFrame | None = None, + character_name: str = "Character", + bar_color: str = ColorPalette.PRIMARY, + highlight_color: str = ColorPalette.NEUTRAL, + title: str | None = None, + x_label: str = "Trait", + y_label: str = "Frequency", + trait_sort_order: list[str] | None = None, + height: int | None = None, + width: int | str | None = None, + ) -> alt.Chart: + """Create a bar plot showing trait frequency for a single character. + + Original request: "I need a bar plot that shows the frequency of the times + each trait is chosen per brand character. The function should be generalized + so that it can be used 4 times, once for each character. Each character should + use a slightly different color. Original traits should be highlighted." + + This function creates one plot per character. Call it 4 times (once per + character) to generate all plots for a slide. 
+ + Args: + data: DataFrame with columns ['trait', 'count', 'is_original'] + as produced by transform_character_trait_frequency() + character_name: Name of the character (for title). E.g., "Bank Teller" + bar_color: Main bar color for non-original traits. Use ColorPalette + constants like ColorPalette.CHARACTER_BANK_TELLER + highlight_color: Lighter color for original/expected traits. Use the + matching highlight like ColorPalette.CHARACTER_BANK_TELLER_HIGHLIGHT + title: Custom title. If None, auto-generates from character_name + x_label: X-axis label + y_label: Y-axis label + trait_sort_order: Optional list of traits for consistent sorting across + all character plots. If None, sorts by count descending. + height: Chart height + width: Chart width + + Returns: + alt.Chart: Altair bar chart + """ + df = self._ensure_dataframe(data) + + # Ensure we have the expected columns + required_cols = {'trait', 'count', 'is_original'} + if not required_cols.issubset(set(df.columns)): + return alt.Chart(pd.DataFrame({ + 'text': ['Data must have trait, count, is_original columns'] + })).mark_text().encode(text='text:N') + + # Convert to pandas for Altair + plot_df = df.to_pandas() if hasattr(df, 'to_pandas') else df + + # Determine sort order + if trait_sort_order is not None: + # Use provided order, append any missing traits at the end (sorted by count) + known_traits = set(trait_sort_order) + extra_traits = plot_df[~plot_df['trait'].isin(known_traits)].sort_values( + 'count', ascending=False + )['trait'].tolist() + sort_order = trait_sort_order + extra_traits + else: + # Default: sort by count descending + sort_order = plot_df.sort_values('count', ascending=False)['trait'].tolist() + + # Create category column for color encoding + plot_df['category'] = plot_df['is_original'].map({ + True: 'Original Trait', + False: 'Other Trait' + }) + + # Generate title if not provided + if title is None: + title = f"{character_name}
Trait Selection Frequency" + + # Build title config with sort order note as subtitle + sort_note = "Sorted by total frequency across all characters" if trait_sort_order else "Sorted by frequency (descending)" + title_text = self._process_title(title) + title_config = { + 'text': title_text, + 'subtitle': sort_note, + 'subtitleColor': 'gray', + 'subtitleFontSize': 10, + 'anchor': 'start', + } + + # Create HORIZONTAL bar chart with conditional coloring + # Reverse sort order for horizontal bars (highest at top) + reversed_sort = list(reversed(sort_order)) + + bars = alt.Chart(plot_df).mark_bar().encode( + y=alt.Y('trait:N', + title=x_label, + sort=reversed_sort, + axis=alt.Axis(labelLimit=200)), + x=alt.X('count:Q', title=y_label), + color=alt.Color('category:N', + scale=alt.Scale( + domain=['Original Trait', 'Other Trait'], + range=[highlight_color, bar_color] + ), + legend=alt.Legend( + orient='top', + direction='horizontal', + title=None + )), + tooltip=[ + alt.Tooltip('trait:N', title='Trait'), + alt.Tooltip('count:Q', title='Frequency'), + alt.Tooltip('category:N', title='Type') + ] + ) + + # Add count labels on bars (to the right of bars for horizontal) + text = alt.Chart(plot_df).mark_text( + dx=12, + color='black', + fontSize=10, + align='left' + ).encode( + y=alt.Y('trait:N', sort=reversed_sort), + x=alt.X('count:Q'), + text=alt.Text('count:Q') + ) + + chart = (bars + text).properties( + title=title_config, + width=width or 400, + height=height or getattr(self, 'plot_height', 450) + ) + + chart = self._save_plot(chart, title) + return chart + + def plot_significance_heatmap( + self, + pairwise_df: pl.LazyFrame | pl.DataFrame | None = None, + metadata: dict | None = None, + title: str = "Pairwise Statistical Significance
(Adjusted p-values)", + show_p_values: bool = True, + show_effect_size: bool = False, + height: int | None = None, + width: int | None = None, + ) -> alt.Chart: + """Create a heatmap showing pairwise statistical significance between groups. + + Original use-case: "I need to test for statistical significance and present + this in a logical manner - as a heatmap or similar visualization." + + This function visualizes the output of compute_pairwise_significance() as + a color-coded heatmap where color intensity indicates significance level. + + Args: + pairwise_df: Output from compute_pairwise_significance(). + Expected columns: ['group1', 'group2', 'p_value', 'p_adjusted', 'significant'] + metadata: Metadata dict from compute_pairwise_significance() (optional). + Used to add test information to the plot subtitle. + title: Chart title (supports
for line breaks) + show_p_values: Whether to display p-values as text annotations + show_effect_size: Whether to display effect sizes instead of p-values + height: Chart height (default: auto-sized based on groups) + width: Chart width (default: auto-sized based on groups) + + Returns: + alt.Chart: Altair heatmap chart + """ + df = self._ensure_dataframe(pairwise_df) + + # Get unique groups + all_groups = sorted(set(df['group1'].to_list() + df['group2'].to_list())) + n_groups = len(all_groups) + + # Create symmetric matrix data for heatmap + # We need both directions (A,B) and (B,A) for the full matrix + heatmap_data = [] + for row_group in all_groups: + for col_group in all_groups: + if row_group == col_group: + # Diagonal - self comparison + heatmap_data.append({ + 'row': row_group, + 'col': col_group, + 'p_adjusted': None, + 'p_value': None, + 'significant': None, + 'effect_size': None, + 'text_label': '—', + 'sig_category': 'Self', + }) + else: + # Find the comparison (could be in either order) + match = df.filter( + ((pl.col('group1') == row_group) & (pl.col('group2') == col_group)) | + ((pl.col('group1') == col_group) & (pl.col('group2') == row_group)) + ) + if match.height > 0: + p_adj = match['p_adjusted'][0] + p_val = match['p_value'][0] + sig = match['significant'][0] + eff = match['effect_size'][0] if 'effect_size' in match.columns else None + + # For ranking data, we can show Rank 1 % difference + has_rank_pcts = 'rank1_pct1' in match.columns and 'rank1_pct2' in match.columns + if has_rank_pcts: + pct_diff = abs(match['rank1_pct1'][0] - match['rank1_pct2'][0]) + else: + pct_diff = None + + # Helper to get display text when not showing p-values + def get_alt_text(): + if eff is not None: + return f'{eff:.2f}' + elif pct_diff is not None: + return f'{pct_diff:.1f}%' + else: + return '—' + + # Categorize significance level + if p_adj is None: + sig_cat = 'N/A' + text = 'N/A' + elif p_adj < 0.001: + sig_cat = 'p < 0.001' + text = '<.001' if show_p_values 
else get_alt_text() + elif p_adj < 0.01: + sig_cat = 'p < 0.01' + text = f'{p_adj:.3f}' if show_p_values else get_alt_text() + elif p_adj < 0.05: + sig_cat = 'p < 0.05' + text = f'{p_adj:.3f}' if show_p_values else get_alt_text() + else: + sig_cat = 'n.s.' + text = f'{p_adj:.2f}' if show_p_values else get_alt_text() + + if show_effect_size: + text = get_alt_text() + + heatmap_data.append({ + 'row': row_group, + 'col': col_group, + 'p_adjusted': p_adj, + 'p_value': p_val, + 'significant': sig, + 'effect_size': eff, + 'text_label': text, + 'sig_category': sig_cat, + }) + else: + heatmap_data.append({ + 'row': row_group, + 'col': col_group, + 'p_adjusted': None, + 'p_value': None, + 'significant': None, + 'effect_size': None, + 'text_label': 'N/A', + 'sig_category': 'N/A', + }) + + heatmap_df = pl.DataFrame(heatmap_data).to_pandas() + + # Define color scale for significance categories + sig_domain = ['p < 0.001', 'p < 0.01', 'p < 0.05', 'n.s.', 'Self', 'N/A'] + sig_range = [ + ColorPalette.SIG_STRONG, # p < 0.001 + ColorPalette.SIG_MODERATE, # p < 0.01 + ColorPalette.SIG_WEAK, # p < 0.05 + ColorPalette.SIG_NONE, # not significant + ColorPalette.SIG_DIAGONAL, # diagonal (self) + ColorPalette.NEUTRAL, # N/A + ] + + # Build tooltip fields based on available data + tooltip_fields = [ + alt.Tooltip('row:N', title='Group 1'), + alt.Tooltip('col:N', title='Group 2'), + alt.Tooltip('p_value:Q', title='p-value', format='.4f'), + alt.Tooltip('p_adjusted:Q', title='Adjusted p', format='.4f'), + ] + # Only add effect_size if it has non-null values (continuous data) + has_effect_size = 'effect_size' in heatmap_df.columns and heatmap_df['effect_size'].notna().any() + if has_effect_size: + tooltip_fields.append(alt.Tooltip('effect_size:Q', title='Effect Size', format='.3f')) + # Add rank info for ranking data + has_rank_pcts = 'rank1_pct1' in df.columns if isinstance(df, pl.DataFrame) else False + if has_rank_pcts: + tooltip_fields.append(alt.Tooltip('text_label:N', title='Rank 1 % 
Diff')) + + # Calculate dimensions + cell_size = 45 + auto_size = n_groups * cell_size + 100 + chart_width = width or auto_size + chart_height = height or auto_size + + # Base heatmap + heatmap = alt.Chart(heatmap_df).mark_rect(stroke='white', strokeWidth=1).encode( + x=alt.X('col:N', title=None, sort=all_groups, + axis=alt.Axis(labelAngle=-45, labelLimit=150)), + y=alt.Y('row:N', title=None, sort=all_groups, + axis=alt.Axis(labelLimit=150)), + color=alt.Color('sig_category:N', + scale=alt.Scale(domain=sig_domain, range=sig_range), + legend=alt.Legend( + title='Significance', + orient='right', + direction='vertical' + )), + tooltip=tooltip_fields + ) + + # Text annotations + if show_p_values or show_effect_size: + # Add a column for text color based on significance + heatmap_df['text_color'] = heatmap_df['sig_category'].apply( + lambda x: 'white' if x in ['p < 0.001', 'p < 0.01'] else 'black' + ) + + text = alt.Chart(heatmap_df).mark_text( + fontSize=9, + fontWeight='normal' + ).encode( + x=alt.X('col:N', sort=all_groups), + y=alt.Y('row:N', sort=all_groups), + text='text_label:N', + color=alt.Color('text_color:N', scale=None), + ) + chart = (heatmap + text) + else: + chart = heatmap + + # Build subtitle with test info + subtitle_lines = [] + if metadata: + test_info = f"Test: {metadata.get('test_type', 'N/A')}" + if metadata.get('overall_p_value') is not None: + test_info += f" | Overall p={metadata['overall_p_value']:.4f}" + correction = metadata.get('correction', 'none') + if correction != 'none': + test_info += f" | Correction: {correction}" + subtitle_lines.append(test_info) + + title_config = { + 'text': self._process_title(title), + 'subtitle': subtitle_lines if subtitle_lines else None, + 'subtitleColor': 'gray', + 'subtitleFontSize': 10, + 'anchor': 'start', + } + + chart = chart.properties( + title=title_config, + width=chart_width, + height=chart_height, + ) + + chart = self._save_plot(chart, title) + return chart + + def plot_significance_summary( + 
def plot_significance_summary(
    self,
    pairwise_df: pl.LazyFrame | pl.DataFrame | None = None,
    metadata: dict | None = None,
    # NOTE(review): the line-break token in this default title was lost in
    # transit; reconstructed as <br> on the assumption _process_title handles
    # it (the heatmap docstring says the title "supports ... line breaks") --
    # confirm against the sibling plot defaults.
    title: str = "Significant Differences Summary<br>(Groups with significantly different means)",
    height: int | None = None,
    width: int | None = None,
) -> alt.Chart:
    """Create a summary bar chart showing which groups have significant differences.

    Each group is shown with a count of how many other groups it differs
    from significantly; the group's mean score (continuous data) or Rank 1
    percentage (ranking data) is drawn above its bar for reference.

    Args:
        pairwise_df: Output from compute_pairwise_significance() or
            compute_ranking_significance().
        metadata: Metadata dict from the significance computation (optional);
            used to show alpha in the subtitle.
        title: Chart title.
        height: Chart height (default: self.plot_height, else 400).
        width: Chart width (default: 800).

    Returns:
        alt.Chart: Altair bar chart with significance count per group.
    """
    df = self._ensure_dataframe(pairwise_df)

    # Detect data type: continuous (has mean1/mean2) vs ranking (rank1_pct1/rank1_pct2).
    has_means = 'mean1' in df.columns
    has_ranks = 'rank1_pct1' in df.columns

    # Keep only significant comparisons; a group may appear on either side.
    sig_df = df.filter(pl.col('significant'))
    group1_counts = sig_df.group_by('group1').agg(pl.len().alias('count'))
    group2_counts = sig_df.group_by('group2').agg(pl.len().alias('count'))

    all_groups = sorted(set(df['group1'].to_list() + df['group2'].to_list()))
    summary_data = []

    for group in all_groups:
        count1 = group1_counts.filter(pl.col('group1') == group)['count'].to_list()
        count2 = group2_counts.filter(pl.col('group2') == group)['count'].to_list()
        total_sig = (count1[0] if count1 else 0) + (count2[0] if count2 else 0)

        # Reference score for the group, read from whichever side of the
        # pairwise table it appears on first.
        if has_means:
            scores = df.filter(pl.col('group1') == group)['mean1'].to_list()
            if not scores:
                scores = df.filter(pl.col('group2') == group)['mean2'].to_list()
            score_val = scores[0] if scores else None
        elif has_ranks:
            scores = df.filter(pl.col('group1') == group)['rank1_pct1'].to_list()
            if not scores:
                scores = df.filter(pl.col('group2') == group)['rank1_pct2'].to_list()
            score_val = scores[0] if scores else None
        else:
            score_val = None

        summary_data.append({
            'group': group,
            'sig_count': total_sig,
            'score': score_val,
        })

    summary_df = (
        pl.DataFrame(summary_data)
        .sort('score', descending=True, nulls_last=True)
        .to_pandas()
    )

    tooltip_title = 'Mean Score' if has_means else 'Rank 1 %' if has_ranks else 'Score'

    bars = alt.Chart(summary_df).mark_bar(color=ColorPalette.PRIMARY).encode(
        x=alt.X('group:N', title='Group', sort='-y'),
        y=alt.Y('sig_count:Q', title='# of Significant Differences'),
        tooltip=[
            alt.Tooltip('group:N', title='Group'),
            alt.Tooltip('sig_count:Q', title='Sig. Differences'),
            alt.Tooltip('score:Q', title=tooltip_title, format='.1f'),
        ]
    )

    # Only add text labels when at least one group has a reference score.
    if summary_df['score'].notna().any():
        text_format = '.1f' if has_means else '.0f'
        text = alt.Chart(summary_df).mark_text(
            dy=-8,
            color='black',
            fontSize=9
        ).encode(
            x=alt.X('group:N', sort='-y'),
            y=alt.Y('sig_count:Q'),
            text=alt.Text('score:Q', format=text_format)
        )
        chart_layers = bars + text
    else:
        chart_layers = bars

    # Subtitle states alpha and which score is annotated above the bars.
    subtitle = None
    if metadata:
        if has_means:
            subtitle = f"Mean scores shown above bars | α={metadata.get('alpha', 0.05)}"
        elif has_ranks:
            subtitle = f"Rank 1 % shown above bars | α={metadata.get('alpha', 0.05)}"
        else:
            subtitle = f"α={metadata.get('alpha', 0.05)}"

    title_config = {
        'text': self._process_title(title),
        'subtitle': subtitle,
        'subtitleColor': 'gray',
        'subtitleFontSize': 10,
        'anchor': 'start',
    }

    chart = chart_layers.properties(
        title=title_config,
        width=width or 800,
        height=height or getattr(self, 'plot_height', 400),
    )

    chart = self._save_plot(chart, title)
    return chart
def transform_character_trait_frequency(
    self,
    char_df: pl.LazyFrame | pl.DataFrame,
    character_column: str,
) -> tuple[pl.DataFrame, dict | None]:
    """Transform character refine data to trait frequency counts for a single character.

    Original use-case: "I need a bar plot that shows the frequency of the times
    each trait is chosen per brand character."

    Explodes the comma-separated trait selections in `character_column` and
    counts how often each individual trait was chosen, flagging traits that
    belong to the character's original definition.

    Args:
        char_df: Pre-fetched data. Expected columns: '_recordId' and
            `character_column` (comma-separated traits per respondent).
        character_column: Name of the character column to analyze
            (e.g., 'Bank Teller').

    Returns:
        tuple: (DataFrame with columns ['trait', 'count', 'is_original'], None)
            - 'trait': individual trait name
            - 'count': frequency count
            - 'is_original': boolean indicating if trait is in the original definition
    """
    from reference import ORIGINAL_CHARACTER_TRAITS

    if isinstance(char_df, pl.LazyFrame):
        char_df = char_df.collect()

    # Irregular display-name -> reference-key mappings.
    character_key_map = {
        'Bank Teller': 'the_bank_teller',
        'Familiar Friend': 'the_familiar_friend',
        'The Coach': 'the_coach',
        'Personal Assistant': 'the_personal_assistant',
    }

    ref_key = character_key_map.get(character_column)
    if ref_key is None:
        # Generalize to characters missing from the explicit map:
        # 'Some Name' -> 'the_some_name'. Previously an unmapped name
        # silently produced an empty original-trait set.
        normalized = character_column.strip().lower().replace(' ', '_')
        ref_key = normalized if normalized.startswith('the_') else f'the_{normalized}'
    original_traits = set(ORIGINAL_CHARACTER_TRAITS.get(ref_key, []))

    # Keep rows where this character actually has selections.
    char_data = char_df.filter(pl.col(character_column).is_not_null())

    # One row per selected trait, whitespace-trimmed, empties dropped.
    exploded = (
        char_data
        .select(
            pl.col(character_column)
            .str.split(',')
            .alias('traits')
        )
        .explode('traits')
        .with_columns(
            pl.col('traits').str.strip_chars().alias('trait')
        )
        .filter(pl.col('trait') != '')
    )

    # Frequency count per trait, most frequent first, flagged as original
    # when the trait appears in the character's reference definition.
    freq_df = (
        exploded
        .group_by('trait')
        .agg(pl.len().alias('count'))
        .sort('count', descending=True)
        .with_columns(
            pl.col('trait').is_in(list(original_traits)).alias('is_original')
        )
    )

    return freq_df, None
def compute_pairwise_significance(
    self,
    data: pl.LazyFrame | pl.DataFrame,
    test_type: str = "auto",
    alpha: float = 0.05,
    correction: str = "bonferroni",
) -> tuple[pl.DataFrame, dict]:
    """Compute pairwise statistical significance tests between columns.

    Original use-case: "I need to test for statistical significance and present
    this in a logical manner. It should be a generalized function to work on
    many dataframes."

    Performs pairwise statistical tests between all numeric columns
    (excluding '_recordId') to determine which groups differ significantly.

    Args:
        data: Pre-fetched data with numeric columns to compare.
            Rows are observations, columns are groups/categories.
        test_type: Statistical test to use:
            - "auto": Automatically chooses based on data (default)
            - "mannwhitney": Mann-Whitney U test (non-parametric, continuous)
            - "ttest": Independent samples t-test (parametric, continuous)
            - "chi2": Chi-square test (count/frequency data)
        alpha: Significance level (default 0.05).
        correction: Multiple comparison correction method:
            - "bonferroni": Bonferroni correction (conservative)
            - "holm": Holm-Bonferroni (less conservative)
            - "none": No correction

    Returns:
        tuple: (pairwise_df, metadata)
            - pairwise_df: DataFrame with columns ['group1', 'group2',
              'p_value', 'p_adjusted', 'significant', 'effect_size',
              'mean1', 'mean2', 'n1', 'n2']
            - metadata: dict with 'test_type', 'alpha', 'correction',
              'n_comparisons', 'overall_test', 'overall_stat',
              'overall_p_value'

    Raises:
        ValueError: If fewer than 2 numeric columns are found, or if
            `test_type` / `correction` is not a recognized value.
    """
    from scipy import stats as scipy_stats
    import numpy as np

    # Fail fast on an unknown correction instead of silently producing
    # all-NaN adjusted p-values (and therefore significant == False).
    if correction not in ("bonferroni", "holm", "none"):
        raise ValueError(f"Unknown correction: {correction}")

    df = data.collect() if isinstance(data, pl.LazyFrame) else data

    # Numeric group columns only; '_recordId' is an identifier.
    numeric_dtypes = (pl.Float64, pl.Float32, pl.Int64, pl.Int32)
    value_cols = [
        c for c in df.columns
        if c != '_recordId' and df[c].dtype in numeric_dtypes
    ]
    if len(value_cols) < 2:
        raise ValueError(f"Need at least 2 numeric columns for comparison, found {len(value_cols)}")

    # Auto-detect: integer data with few distinct values looks like counts
    # (chi-square); otherwise default to the non-parametric Mann-Whitney U.
    if test_type == "auto":
        sample_col = df[value_cols[0]].drop_nulls()
        if len(sample_col) > 0:
            is_integer = sample_col.dtype in (pl.Int64, pl.Int32)
            unique_ratio = sample_col.n_unique() / len(sample_col)
            test_type = "chi2" if (is_integer and unique_ratio < 0.1) else "mannwhitney"
        else:
            test_type = "mannwhitney"

    if test_type not in ("mannwhitney", "ttest", "chi2"):
        raise ValueError(f"Unknown test_type: {test_type}")

    # Per-column observations with nulls dropped.
    group_data = {col: df[col].drop_nulls().to_numpy() for col in value_cols}

    # Overall (omnibus) test across all non-empty groups.
    all_groups = [group_data[col] for col in value_cols if len(group_data[col]) > 0]
    if test_type == "mannwhitney":
        overall_stat, overall_p = scipy_stats.kruskal(*all_groups)
        overall_test_name = "Kruskal-Wallis"
    elif test_type == "ttest":
        overall_stat, overall_p = scipy_stats.f_oneway(*all_groups)
        overall_test_name = "One-way ANOVA"
    else:
        overall_stat, overall_p = None, None
        overall_test_name = "N/A (Chi-square)"

    results = []
    n_comparisons = len(value_cols) * (len(value_cols) - 1) // 2

    for i, col1 in enumerate(value_cols):
        for col2 in value_cols[i + 1:]:
            data1 = group_data[col1]
            data2 = group_data[col2]

            n1, n2 = len(data1), len(data2)
            mean1 = float(np.mean(data1)) if n1 > 0 else None
            mean2 = float(np.mean(data2)) if n2 > 0 else None

            # No data on one side -> record the pair without a test.
            if n1 == 0 or n2 == 0:
                results.append({
                    'group1': self._clean_voice_label(col1),
                    'group2': self._clean_voice_label(col2),
                    'p_value': None,
                    'effect_size': None,
                    'mean1': mean1,
                    'mean2': mean2,
                    'n1': n1,
                    'n2': n2,
                })
                continue

            if test_type == "mannwhitney":
                stat, p_value = scipy_stats.mannwhitneyu(data1, data2, alternative='two-sided')
                # Effect size: rank-biserial correlation r = 1 - 2U/(n1*n2).
                effect_size = 1 - (2 * stat) / (n1 * n2)
            elif test_type == "ttest":
                stat, p_value = scipy_stats.ttest_ind(data1, data2)
                # Effect size: Cohen's d. The pooled formula
                # ((n1-1)s1^2 + (n2-1)s2^2)/(n1+n2-2) expects *sample*
                # variances, so use ddof=1 (np.std defaults to ddof=0,
                # which biases d upward for small samples).
                var1 = np.std(data1, ddof=1) ** 2
                var2 = np.std(data2, ddof=1) ** 2
                pooled_std = np.sqrt(((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2))
                effect_size = (mean1 - mean2) / pooled_std if pooled_std > 0 else 0
            else:  # chi2
                # Bin both samples on a shared grid and compare the binned
                # distributions; drop empty bins to keep chi2 well-defined.
                all_data = np.concatenate([data1, data2])
                bins = np.histogram_bin_edges(all_data, bins='auto')
                counts1, _ = np.histogram(data1, bins=bins)
                counts2, _ = np.histogram(data2, bins=bins)
                contingency = np.array([counts1, counts2])
                contingency = contingency[:, contingency.sum(axis=0) > 0]
                if contingency.shape[1] > 1:
                    stat, p_value, _, _ = scipy_stats.chi2_contingency(contingency)
                    # Effect size: Cramer's V.
                    effect_size = np.sqrt(stat / (contingency.sum() * (min(contingency.shape) - 1)))
                else:
                    p_value, effect_size = 1.0, 0.0

            results.append({
                'group1': self._clean_voice_label(col1),
                'group2': self._clean_voice_label(col2),
                'p_value': float(p_value),
                'effect_size': float(effect_size),
                'mean1': mean1,
                'mean2': mean2,
                'n1': n1,
                'n2': n2,
            })

    results_df = pl.DataFrame(results)

    # Multiple-comparison correction over the valid (non-null) p-values.
    p_values = results_df['p_value'].to_numpy()
    valid_mask = ~np.isnan(p_values.astype(float))
    p_adjusted = np.full_like(p_values, np.nan, dtype=float)

    if correction == "bonferroni":
        p_adjusted[valid_mask] = np.minimum(p_values[valid_mask] * n_comparisons, 1.0)
    elif correction == "holm":
        # Holm-Bonferroni step-down: sort ascending, scale each p by the
        # number of remaining hypotheses, then enforce monotonicity.
        valid_p = p_values[valid_mask]
        sorted_idx = np.argsort(valid_p)
        sorted_p = valid_p[sorted_idx]
        m = len(sorted_p)
        adjusted = np.zeros(m)
        for j in range(m):
            adjusted[j] = sorted_p[j] * (m - j)
        for j in range(1, m):
            adjusted[j] = max(adjusted[j], adjusted[j - 1])
        adjusted = np.minimum(adjusted, 1.0)
        # Restore the original pair order.
        p_adjusted[valid_mask] = adjusted[np.argsort(sorted_idx)]
    else:  # "none"
        p_adjusted = p_values.astype(float)

    results_df = results_df.with_columns([
        pl.Series('p_adjusted', p_adjusted),
        pl.Series('significant', p_adjusted < alpha),
    ])

    metadata = {
        'test_type': test_type,
        'alpha': alpha,
        'correction': correction,
        'n_comparisons': n_comparisons,
        'overall_test': overall_test_name,
        'overall_stat': overall_stat,
        'overall_p_value': overall_p,
    }

    return results_df, metadata
def compute_ranking_significance(
    self,
    data: pl.LazyFrame | pl.DataFrame,
    alpha: float = 0.05,
    correction: str = "bonferroni",
) -> tuple[pl.DataFrame, dict]:
    """Compute statistical significance for ranking data (e.g., Top 3 Voices).

    Original use-case: "Test whether voices are ranked significantly differently
    based on the distribution of 1st, 2nd, 3rd place votes."

    Takes raw ranking data (rows = respondents, columns = voices, values =
    rank 1/2/3 or null) and performs:
    1. Overall chi-square test on the full contingency table
    2. Pairwise two-proportion z-tests comparing Rank 1 vote shares

    Args:
        data: Pre-fetched ranking data from get_top_3_voices() or
            get_character_ranking(). Values are 1, 2, 3 (rank) or null.
        alpha: Significance level (default 0.05).
        correction: Multiple comparison correction method:
            - "bonferroni": Bonferroni correction (conservative)
            - "holm": Holm-Bonferroni (less conservative)
            - "none": No correction

    Returns:
        tuple: (pairwise_df, metadata)
            - pairwise_df: DataFrame with columns ['group1', 'group2',
              'p_value', 'p_adjusted', 'significant', 'rank1_count1',
              'rank1_count2', 'rank1_pct1', 'rank1_pct2', 'total1', 'total2']
            - metadata: dict with test/chi-square stats and the contingency
              table keyed by voice label.

    Raises:
        ValueError: If fewer than 2 ranking columns are found, or if
            `correction` is not a recognized value.

    Example:
        >>> ranking_data, _ = S.get_top_3_voices(data)
        >>> pairwise_df, meta = S.compute_ranking_significance(ranking_data)
        >>> print(pairwise_df.filter(pl.col('significant')))
    """
    from scipy import stats as scipy_stats
    import numpy as np

    # Fail fast on an unknown correction instead of silently producing
    # all-NaN adjusted p-values.
    if correction not in ("bonferroni", "holm", "none"):
        raise ValueError(f"Unknown correction: {correction}")

    df = data.collect() if isinstance(data, pl.LazyFrame) else data

    # Ranking columns only; '_recordId' is an identifier.
    ranking_cols = [c for c in df.columns if c != '_recordId']
    if len(ranking_cols) < 2:
        raise ValueError(f"Need at least 2 ranking columns, found {len(ranking_cols)}")

    # Contingency counts: how often each voice received rank 1, 2, 3.
    contingency_data = {}
    for col in ranking_cols:
        label = self._clean_voice_label(col)
        contingency_data[label] = [
            df.filter(pl.col(col) == rank).height for rank in (1, 2, 3)
        ]

    labels = list(contingency_data.keys())
    # 3 x n_voices matrix: rows = ranks, columns = voices.
    contingency_table = np.array([contingency_data[l] for l in labels]).T

    # Omnibus chi-square: is the rank distribution independent of voice?
    chi2_stat, chi2_p, chi2_dof, _ = scipy_stats.chi2_contingency(contingency_table)

    results = []
    n_comparisons = len(labels) * (len(labels) - 1) // 2

    for i, label1 in enumerate(labels):
        for label2 in labels[i + 1:]:
            r1_count1 = contingency_data[label1][0]  # Rank 1 votes, voice 1
            r1_count2 = contingency_data[label2][0]  # Rank 1 votes, voice 2

            # Total times each voice was ranked at all (1st + 2nd + 3rd).
            total1 = sum(contingency_data[label1])
            total2 = sum(contingency_data[label2])

            # Share of Rank 1 among all rankings the voice received.
            pct1 = r1_count1 / total1 if total1 > 0 else 0
            pct2 = r1_count2 / total2 if total2 > 0 else 0

            # Two-proportion z-test. H0: both voices have the same
            # proportion of Rank 1 votes.
            if total1 > 0 and total2 > 0 and (r1_count1 + r1_count2) > 0:
                p_pooled = (r1_count1 + r1_count2) / (total1 + total2)
                se = np.sqrt(p_pooled * (1 - p_pooled) * (1 / total1 + 1 / total2))
                if se > 0:
                    z_stat = (pct1 - pct2) / se
                    # sf() instead of 1 - cdf() keeps precision for tiny
                    # two-tailed p-values at large |z|.
                    p_value = 2 * scipy_stats.norm.sf(abs(z_stat))
                else:
                    p_value = 1.0
            else:
                p_value = 1.0

            results.append({
                'group1': label1,
                'group2': label2,
                'p_value': float(p_value),
                'rank1_count1': r1_count1,
                'rank1_count2': r1_count2,
                'rank1_pct1': round(pct1 * 100, 1),
                'rank1_pct2': round(pct2 * 100, 1),
                'total1': total1,
                'total2': total2,
            })

    results_df = pl.DataFrame(results)

    # Multiple-comparison correction.
    p_values = results_df['p_value'].to_numpy()

    if correction == "bonferroni":
        p_adjusted = np.minimum(p_values * n_comparisons, 1.0)
    elif correction == "holm":
        # Holm-Bonferroni step-down with monotonicity enforcement.
        sorted_idx = np.argsort(p_values)
        sorted_p = p_values[sorted_idx]
        m = len(sorted_p)
        adjusted = np.zeros(m)
        for j in range(m):
            adjusted[j] = sorted_p[j] * (m - j)
        for j in range(1, m):
            adjusted[j] = max(adjusted[j], adjusted[j - 1])
        adjusted = np.minimum(adjusted, 1.0)
        # Restore the original pair order.
        p_adjusted = adjusted[np.argsort(sorted_idx)]
    else:  # "none"
        p_adjusted = p_values.astype(float)

    results_df = results_df.with_columns([
        pl.Series('p_adjusted', p_adjusted),
        pl.Series('significant', p_adjusted < alpha),
    ])

    # Sort by p_value for easier inspection.
    results_df = results_df.sort('p_value')

    metadata = {
        'test_type': 'proportion_z_test',
        'alpha': alpha,
        'correction': correction,
        'n_comparisons': n_comparisons,
        'chi2_stat': chi2_stat,
        'chi2_p_value': chi2_p,
        'chi2_dof': chi2_dof,
        'overall_test': 'Chi-square',
        'overall_stat': chi2_stat,
        'overall_p_value': chi2_p,
        'contingency_table': {label: contingency_data[label] for label in labels},
    }

    return results_df, metadata
"vl-convert-python" }, { name = "wordcloud" }, @@ -1459,6 +1460,7 @@ requires-dist = [ { name = "python-pptx", specifier = ">=1.0.2" }, { name = "pyzmq", specifier = ">=27.1.0" }, { name = "requests", specifier = ">=2.32.5" }, + { name = "scipy", specifier = ">=1.14.0" }, { name = "taguette", specifier = ">=1.5.1" }, { name = "vl-convert-python", specifier = ">=1.9.0.post1" }, { name = "wordcloud", specifier = ">=1.9.5" },