statistical tests
This commit is contained in:
42
.github/agents/plot-creator.agent.md
vendored
42
.github/agents/plot-creator.agent.md
vendored
@@ -48,13 +48,12 @@ Check if an existing `transform_<descriptive_name>` function exists in `utils.py
|
||||
def transform_<descriptive_name>(self, df: pl.LazyFrame | pl.DataFrame) -> tuple[pl.LazyFrame, dict | None]:
|
||||
"""Transform <input_description> to <output_description>.
|
||||
|
||||
Original request: "<paste user's original question here>"
|
||||
Original use-case: "<paste user's original question here>"
|
||||
|
||||
This function <concise 1-2 sentence explanation of what it does>.
|
||||
|
||||
Args:
|
||||
df: Pre-fetched data (e.g., from get_character_refine()).
|
||||
Do NOT call get_*() methods inside this function.
|
||||
df: Pre-fetched data as a Polars LazyFrame or DataFrame.
|
||||
|
||||
Returns:
|
||||
tuple: (LazyFrame with columns [...], Optional metadata dict)
|
||||
@@ -96,19 +95,11 @@ chart = S.plot_character_trait_frequency(trait_freq)
|
||||
```
|
||||
|
||||
### Step 5: Create Temporary Test File
|
||||
Create `debug_plot_temp.py` for testing. **You MUST ask the user to provide:**
|
||||
Create `debug_plot_temp.py` for testing. **Prefer using the data snippet already provided by the user.**
|
||||
|
||||
1. **The exact code snippet to create the test data** - Do NOT generate or assume file paths
|
||||
2. **Confirmation of which notebook they're working in** (so you can read it for context if needed)
|
||||
**Option A: Use provided data snippet (preferred)**
|
||||
If the user provided a `df.head()` or sample data output, create inline test data from it:
|
||||
|
||||
Example prompt to user:
|
||||
> "To create the test file, please provide:
|
||||
> 1. The exact code snippet that produces the dataframe you shared (copy from your notebook)
|
||||
> 2. Which notebook are you working in? (I may read it for context, but won't modify it)
|
||||
>
|
||||
> I will NOT attempt to load any data without your explicit code."
|
||||
|
||||
**Test file structure using user-provided data:**
|
||||
```python
|
||||
"""Temporary test file for <plot_name>.
|
||||
Delete after testing.
|
||||
@@ -118,15 +109,32 @@ from theme import ColorPalette
|
||||
import altair as alt
|
||||
|
||||
# ============================================================
|
||||
# USER-PROVIDED TEST DATA (paste from user's snippet)
|
||||
# TEST DATA (reconstructed from user's df.head() output)
|
||||
# ============================================================
|
||||
# <user's code goes here>
|
||||
test_data = pl.DataFrame({
|
||||
"Column1": ["value1", "value2", ...],
|
||||
"Column2": [1, 2, ...],
|
||||
# ... recreate structure from provided sample
|
||||
})
|
||||
# ============================================================
|
||||
|
||||
# Test the plot function
|
||||
# ...
|
||||
from plots import QualtricsPlotsMixin
|
||||
# ... test code
|
||||
```
|
||||
|
||||
**Option B: Ask user (only if necessary)**
|
||||
Only ask the user for additional code if:
|
||||
- The provided sample is insufficient to test the plot logic
|
||||
- You need to understand complex data relationships not visible in the sample
|
||||
- The transformation requires understanding the full data pipeline
|
||||
|
||||
If you must ask:
|
||||
> "The sample data you provided should work for basic testing. However, I need [specific reason]. Could you provide:
|
||||
> 1. [specific information needed]
|
||||
>
|
||||
> If you'd prefer, I can proceed with a minimal test using the sample data you shared."
|
||||
|
||||
### Step 6: Create Plot Function
|
||||
Add a new method to `QualtricsPlotsMixin` in `plots.py`:
|
||||
|
||||
|
||||
@@ -16,8 +16,8 @@ def _():
|
||||
|
||||
from speaking_styles import SPEAKING_STYLES
|
||||
return (
|
||||
QualtricsSurvey,
|
||||
Path,
|
||||
QualtricsSurvey,
|
||||
SPEAKING_STYLES,
|
||||
calculate_weighted_ranking_scores,
|
||||
check_progress,
|
||||
@@ -49,7 +49,7 @@ def _(Path, file_browser, mo):
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(QualtricsSurvey, QSF_FILE, RESULTS_FILE, mo):
|
||||
def _(QSF_FILE, QualtricsSurvey, RESULTS_FILE, mo):
|
||||
S = QualtricsSurvey(RESULTS_FILE, QSF_FILE)
|
||||
try:
|
||||
data_all = S.load_data()
|
||||
@@ -285,6 +285,7 @@ def _(S, mo, v_18_8_3):
|
||||
def _(S, calculate_weighted_ranking_scores, data):
|
||||
top3_voices = S.get_top_3_voices(data)[0]
|
||||
top3_voices_weighted = calculate_weighted_ranking_scores(top3_voices)
|
||||
|
||||
return top3_voices, top3_voices_weighted
|
||||
|
||||
|
||||
@@ -383,6 +384,12 @@ def _(S, data, mo):
|
||||
return (vscales,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(vscales):
|
||||
print(vscales.collect().head())
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(pl, vscales):
|
||||
# Count non-null values per row
|
||||
|
||||
@@ -44,14 +44,14 @@ def _(QSF_FILE, RESULTS_FILE):
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _():
|
||||
mo.md(r"""
|
||||
def _(RESULTS_FILE, data_all):
|
||||
mo.md(rf"""
|
||||
---
|
||||
# Load Data
|
||||
|
||||
**Dataset:** `{Path(RESULTS_FILE).name}`
|
||||
**Dataset:** {Path(RESULTS_FILE).name}
|
||||
|
||||
**Responses**: `{data_all.collect().shape[0]}`
|
||||
**Responses**: {data_all.collect().shape[0]}
|
||||
""")
|
||||
return
|
||||
|
||||
@@ -112,11 +112,9 @@ def _():
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(data_validated):
|
||||
data = data_validated
|
||||
|
||||
data.collect()
|
||||
return (data,)
|
||||
def _():
|
||||
#
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
@@ -130,8 +128,8 @@ def _():
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(S, data):
|
||||
demographics = S.get_demographics(data)[0].collect()
|
||||
def _(S, data_validated):
|
||||
demographics = S.get_demographics(data_validated)[0].collect()
|
||||
demographics
|
||||
return (demographics,)
|
||||
|
||||
@@ -148,7 +146,7 @@ def _():
|
||||
def _(demographics):
|
||||
# Demographics where 'Consumer' is null
|
||||
demographics_no_consumer = demographics.filter(pl.col('Consumer').is_null())['_recordId'].to_list()
|
||||
# demographics_no_consumer
|
||||
demographics_no_consumer
|
||||
return (demographics_no_consumer,)
|
||||
|
||||
|
||||
@@ -160,9 +158,26 @@ def _(data_all, demographics_no_consumer):
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(data_all):
|
||||
def _():
|
||||
mo.md(r"""
|
||||
# Filter Data (Global corrections)
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(data_validated):
|
||||
# drop rows where 'consumer' is null
|
||||
# data = data_validated.filter(pl.col('Consumer').is_not_null())
|
||||
data = data_validated
|
||||
data.collect()
|
||||
return (data,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
# Check if all business owners are missing a 'Consumer type' in demographics
|
||||
assert all([a is None for a in data_all.filter(pl.col('QID4') == 'Yes').collect()['Consumer'].unique()]) , "Not all business owners are missing 'Consumer type' in demographics."
|
||||
# assert all([a is None for a in data_all.filter(pl.col('QID4') == 'Yes').collect()['Consumer'].unique()]) , "Not all business owners are missing 'Consumer type' in demographics."
|
||||
return
|
||||
|
||||
|
||||
@@ -187,14 +202,14 @@ def _():
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(S, demo_plot_cols, demographics):
|
||||
def _(S, data, demo_plot_cols):
|
||||
_content = """
|
||||
## Demographic Distributions
|
||||
|
||||
"""
|
||||
for c in demo_plot_cols:
|
||||
_fig = S.plot_demographic_distribution(
|
||||
data=demographics,
|
||||
data=S.get_demographics(data)[0],
|
||||
column=c,
|
||||
title=f"{c.replace('Bussiness', 'Business').replace('_', ' ')} Distribution of Survey Respondents"
|
||||
)
|
||||
@@ -265,6 +280,22 @@ def _(S, char_rank):
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(S, char_rank):
|
||||
_pairwise_df, _meta = S.compute_ranking_significance(char_rank)
|
||||
|
||||
print(_pairwise_df.columns)
|
||||
|
||||
mo.md(f"""
|
||||
### Statistical Significance Character Ranking
|
||||
|
||||
{mo.ui.altair_chart(S.plot_significance_heatmap(_pairwise_df, metadata=_meta))}
|
||||
|
||||
{mo.ui.altair_chart(S.plot_significance_summary(_pairwise_df, metadata=_meta))}
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
mo.md(r"""
|
||||
@@ -307,28 +338,69 @@ def _():
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
# Join respondent
|
||||
def _(S, data):
|
||||
char_df = S.get_character_refine(data)[0]
|
||||
return (char_df,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(S, char_df):
|
||||
from theme import ColorPalette
|
||||
|
||||
# Assuming you already have char_df (your data from get_character_refine or similar)
|
||||
characters = ['Bank Teller', 'Familiar Friend', 'The Coach', 'Personal Assistant']
|
||||
character_colors = {
|
||||
'Bank Teller': (ColorPalette.CHARACTER_BANK_TELLER, ColorPalette.CHARACTER_BANK_TELLER_HIGHLIGHT),
|
||||
'Familiar Friend': (ColorPalette.CHARACTER_FAMILIAR_FRIEND, ColorPalette.CHARACTER_FAMILIAR_FRIEND_HIGHLIGHT),
|
||||
'The Coach': (ColorPalette.CHARACTER_COACH, ColorPalette.CHARACTER_COACH_HIGHLIGHT),
|
||||
'Personal Assistant': (ColorPalette.CHARACTER_PERSONAL_ASSISTANT, ColorPalette.CHARACTER_PERSONAL_ASSISTANT_HIGHLIGHT),
|
||||
}
|
||||
|
||||
# Build consistent sort order (by total frequency across all characters)
|
||||
all_trait_counts = {}
|
||||
for char in characters:
|
||||
freq_df, _ = S.transform_character_trait_frequency(char_df, char)
|
||||
for row in freq_df.iter_rows(named=True):
|
||||
all_trait_counts[row['trait']] = all_trait_counts.get(row['trait'], 0) + row['count']
|
||||
|
||||
consistent_sort_order = sorted(all_trait_counts.keys(), key=lambda x: -all_trait_counts[x])
|
||||
|
||||
_content = """"""
|
||||
# Generate 4 plots (one per character)
|
||||
for char in characters:
|
||||
freq_df, _ = S.transform_character_trait_frequency(char_df, char)
|
||||
main_color, highlight_color = character_colors[char]
|
||||
chart = S.plot_single_character_trait_frequency(
|
||||
data=freq_df,
|
||||
character_name=char,
|
||||
bar_color=main_color,
|
||||
highlight_color=highlight_color,
|
||||
trait_sort_order=consistent_sort_order,
|
||||
)
|
||||
_content += f"""
|
||||
{mo.ui.altair_chart(chart)}
|
||||
|
||||
|
||||
"""
|
||||
|
||||
mo.md(_content)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
mo.md(r"""
|
||||
---
|
||||
## Statistical significance best characters
|
||||
|
||||
# Spoken Voice Results
|
||||
zie chat
|
||||
> voorbeeld: als de nr 1 en 2 niet significant verschillen maar wel van de nr 3 bijvoorbeeld is dat ook top. Beetje meedenkend over hoe ik het kan presenteren weetje wat ik bedoel?:)
|
||||
>
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
@app.cell
|
||||
def _():
|
||||
mo.md(r"""
|
||||
---
|
||||
|
||||
# Brand Character Results
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@@ -342,5 +414,174 @@ def _():
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(S, data):
|
||||
top3_voices = S.get_top_3_voices(data)[0]
|
||||
top3_voices_weighted = calculate_weighted_ranking_scores(top3_voices)
|
||||
return top3_voices, top3_voices_weighted
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
mo.md(r"""
|
||||
## Which voice is ranked best in the ranking question for top 3?
|
||||
|
||||
(not best 3 out of 8 question)
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(S, top3_voices):
|
||||
_plot = S.plot_ranking_distribution(top3_voices, x_label='Voice')
|
||||
mo.md(f"""
|
||||
{mo.ui.altair_chart(_plot)}
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
mo.md(r"""
|
||||
### Statistical significance for voice ranking
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
# print(top3_voices.collect().head())
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
|
||||
# _pairwise_df, _metadata = S.compute_ranking_significance(
|
||||
# top3_voices,alpha=0.05,correction="none")
|
||||
|
||||
# # View significant pairs
|
||||
# # print(pairwise_df.filter(pl.col('significant') == True))
|
||||
|
||||
# # Create heatmap visualization
|
||||
# _heatmap = S.plot_significance_heatmap(
|
||||
# _pairwise_df,
|
||||
# metadata=_metadata,
|
||||
# title="Weighted Voice Ranking Significance<br>(Pairwise Comparisons)"
|
||||
# )
|
||||
|
||||
# # Create summary bar chart
|
||||
# _summary = S.plot_significance_summary(
|
||||
# _pairwise_df,
|
||||
# metadata=_metadata
|
||||
# )
|
||||
|
||||
# mo.md(f"""
|
||||
# {mo.ui.altair_chart(_heatmap)}
|
||||
|
||||
# {mo.ui.altair_chart(_summary)}
|
||||
# """)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
mo.md(r"""
|
||||
## Weighted Popularity Scores
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(S, top3_voices_weighted):
|
||||
_plot = S.plot_weighted_ranking_score(top3_voices_weighted, title="Most Popular Voice - Weighted Popularity Score<br>(1st = 3pts, 2nd = 2pts, 3rd = 1pt)")
|
||||
|
||||
mo.md(f"""
|
||||
{mo.ui.altair_chart(_plot)}
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(top3_voices_weighted):
|
||||
print(top3_voices_weighted.head())
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _():
|
||||
mo.md(r"""
|
||||
## Voice Scale 1-10
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(S, data):
|
||||
# Get your voice scale data (from notebook)
|
||||
voice_1_10, _ = S.get_voice_scale_1_10(data)
|
||||
return (voice_1_10,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(S, voice_1_10):
|
||||
S.plot_average_scores_with_counts(voice_1_10, x_label='Voice', width=1000, domain=[1,10], title="Voice General Impression (Scale 1-10)")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
mo.md(r"""
|
||||
### Statistical Significance (Scale 1-10)
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(S, voice_1_10):
|
||||
# Compute pairwise significance tests
|
||||
pairwise_df, metadata = S.compute_pairwise_significance(
|
||||
voice_1_10,
|
||||
test_type="mannwhitney", # or "ttest", "chi2", "auto"
|
||||
alpha=0.05,
|
||||
correction="bonferroni" # or "holm", "none"
|
||||
)
|
||||
|
||||
# View significant pairs
|
||||
# print(pairwise_df.filter(pl.col('significant') == True))
|
||||
|
||||
# Create heatmap visualization
|
||||
_heatmap = S.plot_significance_heatmap(
|
||||
pairwise_df,
|
||||
metadata=metadata,
|
||||
title="Voice Rating Significance<br>(Pairwise Comparisons)"
|
||||
)
|
||||
|
||||
# Create summary bar chart
|
||||
_summary = S.plot_significance_summary(
|
||||
pairwise_df,
|
||||
metadata=metadata
|
||||
)
|
||||
|
||||
mo.md(f"""
|
||||
{mo.ui.altair_chart(_heatmap)}
|
||||
|
||||
{mo.ui.altair_chart(_summary)}
|
||||
""")
|
||||
|
||||
|
||||
return
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app.run()
|
||||
|
||||
428
docs/statistical-significance-guide.md
Normal file
428
docs/statistical-significance-guide.md
Normal file
@@ -0,0 +1,428 @@
|
||||
# Statistical Significance Testing Guide
|
||||
|
||||
A beginner-friendly reference for choosing the right statistical test and correction method for your Voice Branding analysis.
|
||||
|
||||
---
|
||||
|
||||
## Table of Contents
|
||||
1. [Quick Decision Flowchart](#quick-decision-flowchart)
|
||||
2. [Understanding Your Data Types](#understanding-your-data-types)
|
||||
3. [Available Tests](#available-tests)
|
||||
4. [Multiple Comparison Corrections](#multiple-comparison-corrections)
|
||||
5. [Interpreting Results](#interpreting-results)
|
||||
6. [Code Examples](#code-examples)
|
||||
|
||||
---
|
||||
|
||||
## Quick Decision Flowchart
|
||||
|
||||
```
|
||||
What kind of data do you have?
|
||||
│
|
||||
├─► Continuous scores (1-10 ratings, averages)
|
||||
│ │
|
||||
│ └─► Use: compute_pairwise_significance()
|
||||
│ │
|
||||
│ ├─► Data normally distributed? → test_type="ttest"
|
||||
│ └─► Not sure / skewed data? → test_type="mannwhitney" (safer choice)
|
||||
│
|
||||
└─► Ranking data (1st, 2nd, 3rd place votes)
|
||||
│
|
||||
└─► Use: compute_ranking_significance()
|
||||
(automatically uses proportion z-test)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Understanding Your Data Types
|
||||
|
||||
### Continuous Data
|
||||
**What it looks like:** Numbers on a scale with many possible values.
|
||||
|
||||
| Example | Data Source |
|
||||
|---------|-------------|
|
||||
| Voice ratings 1-10 | `get_voice_scale_1_10()` |
|
||||
| Speaking style scores | `get_ss_green_blue()` |
|
||||
| Any averaged scores | Custom aggregations |
|
||||
|
||||
```
|
||||
shape: (5, 3)
|
||||
┌───────────┬─────────────────┬─────────────────┐
|
||||
│ _recordId │ Voice_Scale__V14│ Voice_Scale__V04│
|
||||
│ str │ f64 │ f64 │
|
||||
├───────────┼─────────────────┼─────────────────┤
|
||||
│ R_001 │ 7.5 │ 6.0 │
|
||||
│ R_002 │ 8.0 │ 7.5 │
|
||||
│ R_003 │ 6.5 │ 8.0 │
|
||||
```
|
||||
|
||||
### Ranking Data
|
||||
**What it looks like:** Discrete ranks (1, 2, 3) or null if not ranked.
|
||||
|
||||
| Example | Data Source |
|
||||
|---------|-------------|
|
||||
| Top 3 voice rankings | `get_top_3_voices()` |
|
||||
| Character rankings | `get_character_ranking()` |
|
||||
|
||||
```
|
||||
shape: (5, 3)
|
||||
┌───────────┬──────────────────┬──────────────────┐
|
||||
│ _recordId │ Top_3__V14 │ Top_3__V04 │
|
||||
│ str │ i64 │ i64 │
|
||||
├───────────┼──────────────────┼──────────────────┤
|
||||
│ R_001 │ 1 │ null │ ← V14 was ranked 1st
|
||||
│ R_002 │ 2 │ 1 │ ← V04 was ranked 1st
|
||||
│ R_003 │ null │ 3 │ ← V04 was ranked 3rd
|
||||
```
|
||||
|
||||
### ⚠️ Aggregated Data (Cannot Test!)
|
||||
**What it looks like:** Already summarized/totaled data.
|
||||
|
||||
```
|
||||
shape: (3, 2)
|
||||
┌───────────┬────────────────┐
|
||||
│ Character │ Weighted Score │ ← ALREADY AGGREGATED
|
||||
│ str │ i64 │ Lost individual variance
|
||||
├───────────┼────────────────┤ Cannot do significance tests!
|
||||
│ V14 │ 209 │
|
||||
│ V04 │ 180 │
|
||||
```
|
||||
|
||||
**Solution:** Go back to the raw data before aggregation.
|
||||
|
||||
---
|
||||
|
||||
## Available Tests
|
||||
|
||||
### 1. Mann-Whitney U Test (Default for Continuous)
|
||||
**Use when:** Comparing scores/ratings between groups
|
||||
**Assumes:** Nothing about distribution shape (non-parametric)
|
||||
**Best for:** Most survey data, Likert scales, ratings
|
||||
|
||||
```python
|
||||
pairwise_df, meta = S.compute_pairwise_significance(
|
||||
voice_data,
|
||||
test_type="mannwhitney" # This is the default
|
||||
)
|
||||
```
|
||||
|
||||
**Pros:**
|
||||
- Works with any distribution shape
|
||||
- Robust to outliers
|
||||
- Safe choice when unsure
|
||||
|
||||
**Cons:**
|
||||
- Slightly less powerful than t-test when data IS normally distributed
|
||||
|
||||
---
|
||||
|
||||
### 2. Independent t-Test
|
||||
**Use when:** Comparing means between groups
|
||||
**Assumes:** Data is approximately normally distributed
|
||||
**Best for:** Large samples (n > 30 per group), truly continuous data
|
||||
|
||||
```python
|
||||
pairwise_df, meta = S.compute_pairwise_significance(
|
||||
voice_data,
|
||||
test_type="ttest"
|
||||
)
|
||||
```
|
||||
|
||||
**Pros:**
|
||||
- Most powerful when assumptions are met
|
||||
- Well-understood, commonly reported
|
||||
|
||||
**Cons:**
|
||||
- Can give misleading results if data is skewed
|
||||
- Sensitive to outliers
|
||||
|
||||
---
|
||||
|
||||
### 3. Chi-Square Test
|
||||
**Use when:** Comparing frequency distributions
|
||||
**Assumes:** Expected counts ≥ 5 in each cell
|
||||
**Best for:** Count data, categorical comparisons
|
||||
|
||||
```python
|
||||
pairwise_df, meta = S.compute_pairwise_significance(
|
||||
count_data,
|
||||
test_type="chi2"
|
||||
)
|
||||
```
|
||||
|
||||
**Pros:**
|
||||
- Designed for count/frequency data
|
||||
- Tests if distributions differ
|
||||
|
||||
**Cons:**
|
||||
- Needs sufficient sample sizes
|
||||
- Less informative about direction of difference
|
||||
|
||||
---
|
||||
|
||||
### 4. Two-Proportion Z-Test (For Rankings)
|
||||
**Use when:** Comparing ranking vote proportions
|
||||
**Automatically used by:** `compute_ranking_significance()`
|
||||
|
||||
```python
|
||||
pairwise_df, meta = S.compute_ranking_significance(ranking_data)
|
||||
```
|
||||
|
||||
**What it tests:** "Does Voice A get a significantly different proportion of Rank 1 votes than Voice B?"
|
||||
|
||||
---
|
||||
|
||||
## Multiple Comparison Corrections
|
||||
|
||||
### Why Do We Need Corrections?
|
||||
|
||||
When you compare many groups, you're doing many tests. Each test has a 5% chance of a false positive (if α = 0.05). With 17 voices:
|
||||
|
||||
| Comparisons | Expected False Positives (no correction) |
|
||||
|-------------|------------------------------------------|
|
||||
| 136 pairs | ~7 false "significant" results! |
|
||||
|
||||
**Corrections adjust p-values to account for this.**
|
||||
|
||||
---
|
||||
|
||||
### Bonferroni Correction (Conservative)
|
||||
**Formula:** `p_adjusted = p_value × number_of_comparisons`
|
||||
|
||||
```python
|
||||
pairwise_df, meta = S.compute_pairwise_significance(
|
||||
data,
|
||||
correction="bonferroni" # This is the default
|
||||
)
|
||||
```
|
||||
|
||||
**Use when:**
|
||||
- You want to be very confident about significant results
|
||||
- False positives are costly (publishing, major decisions)
|
||||
- You have few comparisons (< 20)
|
||||
|
||||
**Trade-off:** May miss real differences (more false negatives)
|
||||
|
||||
---
|
||||
|
||||
### Holm-Bonferroni Correction (Less Conservative)
|
||||
**Formula:** Step-down procedure that's less strict than Bonferroni
|
||||
|
||||
```python
|
||||
pairwise_df, meta = S.compute_pairwise_significance(
|
||||
data,
|
||||
correction="holm"
|
||||
)
|
||||
```
|
||||
|
||||
**Use when:**
|
||||
- You have many comparisons
|
||||
- You want better power to detect real differences
|
||||
- Exploratory analysis where missing a real effect is costly
|
||||
|
||||
**Trade-off:** Slightly higher false positive risk than Bonferroni
|
||||
|
||||
---
|
||||
|
||||
### No Correction
|
||||
**Not recommended for final analysis**, but useful for exploration.
|
||||
|
||||
```python
|
||||
pairwise_df, meta = S.compute_pairwise_significance(
|
||||
data,
|
||||
correction="none"
|
||||
)
|
||||
```
|
||||
|
||||
**Use when:**
|
||||
- Initial exploration only
|
||||
- You'll follow up with specific hypotheses
|
||||
- You understand and accept the inflated false positive rate
|
||||
|
||||
---
|
||||
|
||||
### Correction Method Comparison
|
||||
|
||||
| Method | Strictness | Best For | Risk |
|
||||
|--------|------------|----------|------|
|
||||
| Bonferroni | Most strict | Few comparisons, high stakes | Miss real effects |
|
||||
| Holm | Moderate | Many comparisons, balanced approach | Slightly more false positives |
|
||||
| None | No control | Exploration only | Many false positives |
|
||||
|
||||
**Recommendation for Voice Branding:** Use **Holm** for exploratory analysis, **Bonferroni** for final reporting.
|
||||
|
||||
---
|
||||
|
||||
## Interpreting Results
|
||||
|
||||
### Key Output Columns
|
||||
|
||||
| Column | Meaning |
|
||||
|--------|---------|
|
||||
| `p_value` | Raw probability this difference happened by chance |
|
||||
| `p_adjusted` | Corrected p-value (use this for decisions!) |
|
||||
| `significant` | TRUE if p_adjusted < alpha (usually 0.05) |
|
||||
| `effect_size` | How big is the difference (practical significance) |
|
||||
|
||||
### What the p-value Means
|
||||
|
||||
| p-value | Interpretation |
|
||||
|---------|----------------|
|
||||
| < 0.001 | Very strong evidence of difference |
|
||||
| < 0.01 | Strong evidence |
|
||||
| < 0.05 | Moderate evidence (traditional threshold) |
|
||||
| 0.05 - 0.10 | Weak evidence, "trending" |
|
||||
| > 0.10 | No significant evidence |
|
||||
|
||||
### Statistical vs Practical Significance
|
||||
|
||||
**Statistical significance** (p < 0.05) means the difference is unlikely due to chance.
|
||||
|
||||
**Practical significance** (effect size) means the difference matters in the real world.
|
||||
|
||||
| Effect Size (Cohen's d) | Interpretation |
|
||||
|-------------------------|----------------|
|
||||
| < 0.2 | Small (may not matter practically) |
|
||||
| 0.2 - 0.5 | Medium |
|
||||
| 0.5 - 0.8 | Large |
|
||||
| > 0.8 | Very large |
|
||||
|
||||
**Example:** A p-value of 0.001 with effect size of 0.1 means "we're confident there's a difference, but it's tiny."
|
||||
|
||||
---
|
||||
|
||||
## Code Examples
|
||||
|
||||
### Example 1: Voice Scale Ratings
|
||||
|
||||
```python
|
||||
# Get the raw rating data
|
||||
voice_data, _ = S.get_voice_scale_1_10(data)
|
||||
|
||||
# Test for significant differences
|
||||
pairwise_df, meta = S.compute_pairwise_significance(
|
||||
voice_data,
|
||||
test_type="mannwhitney", # Safe default for ratings
|
||||
alpha=0.05,
|
||||
correction="bonferroni"
|
||||
)
|
||||
|
||||
# Check overall test first
|
||||
print(f"Overall test: {meta['overall_test']}")
|
||||
print(f"Overall p-value: {meta['overall_p_value']:.4f}")
|
||||
|
||||
# If overall is significant, look at pairwise
|
||||
if meta['overall_p_value'] < 0.05:
|
||||
sig_pairs = pairwise_df.filter(pl.col('significant') == True)
|
||||
print(f"Found {sig_pairs.height} significant pairwise differences")
|
||||
|
||||
# Visualize
|
||||
S.plot_significance_heatmap(pairwise_df, metadata=meta)
|
||||
```
|
||||
|
||||
### Example 2: Top 3 Voice Rankings
|
||||
|
||||
```python
|
||||
# Get the raw ranking data (NOT the weighted scores!)
|
||||
ranking_data, _ = S.get_top_3_voices(data)
|
||||
|
||||
# Test for significant differences in Rank 1 proportions
|
||||
pairwise_df, meta = S.compute_ranking_significance(
|
||||
ranking_data,
|
||||
alpha=0.05,
|
||||
correction="holm" # Less conservative for many comparisons
|
||||
)
|
||||
|
||||
# Check chi-square test
|
||||
print(f"Chi-square p-value: {meta['chi2_p_value']:.4f}")
|
||||
|
||||
# View contingency table (Rank 1, 2, 3 counts per voice)
|
||||
for voice, counts in meta['contingency_table'].items():
|
||||
print(f"{voice}: R1={counts[0]}, R2={counts[1]}, R3={counts[2]}")
|
||||
|
||||
# Find significant pairs
|
||||
sig_pairs = pairwise_df.filter(pl.col('significant') == True)
|
||||
print(sig_pairs)
|
||||
```
|
||||
|
||||
### Example 3: Comparing Demographic Subgroups
|
||||
|
||||
```python
|
||||
# Filter to specific demographics
|
||||
S.filter_data(data, consumer=['Early Professional'])
|
||||
early_pro_data, _ = S.get_voice_scale_1_10(data)
|
||||
|
||||
S.filter_data(data, consumer=['Established Professional'])
|
||||
estab_pro_data, _ = S.get_voice_scale_1_10(data)
|
||||
|
||||
# Test each group separately, then compare results qualitatively
|
||||
# (For direct group comparison, you'd need a different test design)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Common Mistakes to Avoid
|
||||
|
||||
### ❌ Using Aggregated Data
|
||||
```python
|
||||
# WRONG - already summarized, lost individual variance
|
||||
weighted_scores = calculate_weighted_ranking_scores(ranking_data)
|
||||
S.compute_pairwise_significance(weighted_scores) # Will fail!
|
||||
```
|
||||
|
||||
### ✅ Use Raw Data
|
||||
```python
|
||||
# RIGHT - use raw data before aggregation
|
||||
ranking_data, _ = S.get_top_3_voices(data)
|
||||
S.compute_ranking_significance(ranking_data)
|
||||
```
|
||||
|
||||
### ❌ Ignoring Multiple Comparisons
|
||||
```python
|
||||
# WRONG - 7% of pairs will be "significant" by chance alone!
|
||||
S.compute_pairwise_significance(data, correction="none")
|
||||
```
|
||||
|
||||
### ✅ Apply Correction
|
||||
```python
|
||||
# RIGHT - corrected p-values control false positives
|
||||
S.compute_pairwise_significance(data, correction="bonferroni")
|
||||
```
|
||||
|
||||
### ❌ Only Reporting p-values
|
||||
```python
|
||||
# WRONG - statistical significance isn't everything
|
||||
print(f"p = {p_value}") # Missing context!
|
||||
```
|
||||
|
||||
### ✅ Report Effect Sizes Too
|
||||
```python
|
||||
# RIGHT - include practical significance
|
||||
print(f"p = {p_value}, effect size = {effect_size}")
|
||||
print(f"Mean difference: {mean1 - mean2:.2f} points")
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Quick Reference Card
|
||||
|
||||
| Data Type | Function | Default Test | Recommended Correction |
|
||||
|-----------|----------|--------------|------------------------|
|
||||
| Ratings (1-10) | `compute_pairwise_significance()` | Mann-Whitney U | Bonferroni |
|
||||
| Rankings (1st/2nd/3rd) | `compute_ranking_significance()` | Proportion Z | Holm |
|
||||
| Count frequencies | `compute_pairwise_significance(test_type="chi2")` | Chi-square | Bonferroni |
|
||||
|
||||
| Scenario | Correction |
|
||||
|----------|------------|
|
||||
| Publishing results | Bonferroni |
|
||||
| Client presentation | Bonferroni |
|
||||
| Exploratory analysis | Holm |
|
||||
| Quick internal check | Holm or None |
|
||||
|
||||
---
|
||||
|
||||
## Further Reading
|
||||
|
||||
- [Statistics for Dummies Cheat Sheet](https://www.dummies.com/article/academics-the-arts/math/statistics/statistics-for-dummies-cheat-sheet-208650/)
|
||||
- [Choosing the Right Statistical Test](https://stats.oarc.ucla.edu/other/mult-pkg/whatstat/)
|
||||
- [Multiple Comparisons Problem (Wikipedia)](https://en.wikipedia.org/wiki/Multiple_comparisons_problem)
|
||||
491
plots.py
491
plots.py
@@ -290,10 +290,11 @@ class QualtricsPlotsMixin:
|
||||
if domain is None:
|
||||
domain = [stats_df['average'].min(), stats_df['average'].max()]
|
||||
|
||||
# Base bar chart
|
||||
# Base bar chart - use y2 to explicitly start bars at domain minimum
|
||||
bars = alt.Chart(stats_df).mark_bar(color=color).encode(
|
||||
x=alt.X('voice:N', title=x_label, sort='-y'),
|
||||
y=alt.Y('average:Q', title=y_label, scale=alt.Scale(domain=domain)),
|
||||
y2=alt.datum(domain[0]), # Bars start at domain minimum (bottom edge)
|
||||
tooltip=[
|
||||
alt.Tooltip('voice:N', title='Voice'),
|
||||
alt.Tooltip('average:Q', title='Average', format='.2f'),
|
||||
@@ -1099,5 +1100,493 @@ class QualtricsPlotsMixin:
|
||||
height=height or getattr(self, 'plot_height', 400)
|
||||
)
|
||||
|
||||
chart = self._save_plot(chart, title)
|
||||
return chart
|
||||
|
||||
def plot_single_character_trait_frequency(
    self,
    data: pl.LazyFrame | pl.DataFrame | None = None,
    character_name: str = "Character",
    bar_color: str = ColorPalette.PRIMARY,
    highlight_color: str = ColorPalette.NEUTRAL,
    title: str | None = None,
    x_label: str = "Trait",
    y_label: str = "Frequency",
    trait_sort_order: list[str] | None = None,
    height: int | None = None,
    width: int | str | None = None,
) -> alt.Chart:
    """Create a bar plot showing trait frequency for a single character.

    Original use-case: "I need a bar plot that shows the frequency of the times
    each trait is chosen per brand character. The function should be generalized
    so that it can be used 4 times, once for each character. Each character should
    use a slightly different color. Original traits should be highlighted."

    This function creates one plot per character. Call it 4 times (once per
    character) to generate all plots for a slide.

    Args:
        data: DataFrame with columns ['trait', 'count', 'is_original']
            as produced by transform_character_trait_frequency()
        character_name: Name of the character (for title). E.g., "Bank Teller"
        bar_color: Main bar color for non-original traits. Use ColorPalette
            constants like ColorPalette.CHARACTER_BANK_TELLER
        highlight_color: Lighter color for original/expected traits. Use the
            matching highlight like ColorPalette.CHARACTER_BANK_TELLER_HIGHLIGHT
        title: Custom title. If None, auto-generates from character_name
            (supports <br> for line breaks, handled by _process_title)
        x_label: X-axis label (applied to the trait axis; bars are horizontal,
            so this ends up on the y encoding)
        y_label: Y-axis label (applied to the count axis; x encoding)
        trait_sort_order: Optional list of traits for consistent sorting across
            all character plots. If None, sorts by count descending.
        height: Chart height
        width: Chart width

    Returns:
        alt.Chart: Altair bar chart (bars + count labels layered)
    """
    df = self._ensure_dataframe(data)

    # Ensure we have the expected columns; on mismatch return a text-only
    # chart with the error so the notebook still renders something visible.
    required_cols = {'trait', 'count', 'is_original'}
    if not required_cols.issubset(set(df.columns)):
        return alt.Chart(pd.DataFrame({
            'text': ['Data must have trait, count, is_original columns']
        })).mark_text().encode(text='text:N')

    # Convert to pandas for Altair (Polars frames expose .to_pandas()).
    plot_df = df.to_pandas() if hasattr(df, 'to_pandas') else df

    # Determine sort order
    if trait_sort_order is not None:
        # Use provided order, append any missing traits at the end (sorted by count)
        known_traits = set(trait_sort_order)
        extra_traits = plot_df[~plot_df['trait'].isin(known_traits)].sort_values(
            'count', ascending=False
        )['trait'].tolist()
        sort_order = trait_sort_order + extra_traits
    else:
        # Default: sort by count descending
        sort_order = plot_df.sort_values('count', ascending=False)['trait'].tolist()

    # Create category column for color encoding (legend labels).
    plot_df['category'] = plot_df['is_original'].map({
        True: 'Original Trait',
        False: 'Other Trait'
    })

    # Generate title if not provided
    if title is None:
        title = f"{character_name}<br>Trait Selection Frequency"

    # Build title config with sort order note as subtitle
    sort_note = "Sorted by total frequency across all characters" if trait_sort_order else "Sorted by frequency (descending)"
    title_text = self._process_title(title)
    title_config = {
        'text': title_text,
        'subtitle': sort_note,
        'subtitleColor': 'gray',
        'subtitleFontSize': 10,
        'anchor': 'start',
    }

    # Create HORIZONTAL bar chart with conditional coloring.
    # Reverse sort order for horizontal bars (Altair's y axis runs top-down,
    # so reversing puts the highest-count trait at the top).
    reversed_sort = list(reversed(sort_order))

    bars = alt.Chart(plot_df).mark_bar().encode(
        y=alt.Y('trait:N',
                title=x_label,
                sort=reversed_sort,
                axis=alt.Axis(labelLimit=200)),
        x=alt.X('count:Q', title=y_label),
        color=alt.Color('category:N',
                        scale=alt.Scale(
                            domain=['Original Trait', 'Other Trait'],
                            range=[highlight_color, bar_color]
                        ),
                        legend=alt.Legend(
                            orient='top',
                            direction='horizontal',
                            title=None
                        )),
        tooltip=[
            alt.Tooltip('trait:N', title='Trait'),
            alt.Tooltip('count:Q', title='Frequency'),
            alt.Tooltip('category:N', title='Type')
        ]
    )

    # Add count labels on bars (to the right of bars for horizontal)
    text = alt.Chart(plot_df).mark_text(
        dx=12,
        color='black',
        fontSize=10,
        align='left'
    ).encode(
        y=alt.Y('trait:N', sort=reversed_sort),
        x=alt.X('count:Q'),
        text=alt.Text('count:Q')
    )

    # Layer bars and labels; fall back to instance plot_height if unset.
    chart = (bars + text).properties(
        title=title_config,
        width=width or 400,
        height=height or getattr(self, 'plot_height', 450)
    )

    chart = self._save_plot(chart, title)
    return chart
|
||||
|
||||
def plot_significance_heatmap(
    self,
    pairwise_df: pl.LazyFrame | pl.DataFrame | None = None,
    metadata: dict | None = None,
    title: str = "Pairwise Statistical Significance<br>(Adjusted p-values)",
    show_p_values: bool = True,
    show_effect_size: bool = False,
    height: int | None = None,
    width: int | None = None,
) -> alt.Chart:
    """Create a heatmap showing pairwise statistical significance between groups.

    Original use-case: "I need to test for statistical significance and present
    this in a logical manner - as a heatmap or similar visualization."

    This function visualizes the output of compute_pairwise_significance() as
    a color-coded heatmap where color intensity indicates significance level.
    The matrix is rendered symmetric: each pair is looked up in both (A, B)
    and (B, A) orientations, and the diagonal is a self-comparison marker.

    Args:
        pairwise_df: Output from compute_pairwise_significance().
            Expected columns: ['group1', 'group2', 'p_value', 'p_adjusted', 'significant']
        metadata: Metadata dict from compute_pairwise_significance() (optional).
            Used to add test information to the plot subtitle.
        title: Chart title (supports <br> for line breaks)
        show_p_values: Whether to display p-values as text annotations
        show_effect_size: Whether to display effect sizes instead of p-values
        height: Chart height (default: auto-sized based on groups)
        width: Chart width (default: auto-sized based on groups)

    Returns:
        alt.Chart: Altair heatmap chart
    """
    df = self._ensure_dataframe(pairwise_df)

    # Get unique groups (union of both comparison columns, sorted for a
    # stable axis order).
    all_groups = sorted(set(df['group1'].to_list() + df['group2'].to_list()))
    n_groups = len(all_groups)

    # Create symmetric matrix data for heatmap
    # We need both directions (A,B) and (B,A) for the full matrix
    heatmap_data = []
    for row_group in all_groups:
        for col_group in all_groups:
            if row_group == col_group:
                # Diagonal - self comparison
                heatmap_data.append({
                    'row': row_group,
                    'col': col_group,
                    'p_adjusted': None,
                    'p_value': None,
                    'significant': None,
                    'effect_size': None,
                    'text_label': '—',
                    'sig_category': 'Self',
                })
            else:
                # Find the comparison (could be in either order)
                match = df.filter(
                    ((pl.col('group1') == row_group) & (pl.col('group2') == col_group)) |
                    ((pl.col('group1') == col_group) & (pl.col('group2') == row_group))
                )
                if match.height > 0:
                    p_adj = match['p_adjusted'][0]
                    p_val = match['p_value'][0]
                    sig = match['significant'][0]
                    eff = match['effect_size'][0] if 'effect_size' in match.columns else None

                    # For ranking data, we can show Rank 1 % difference
                    has_rank_pcts = 'rank1_pct1' in match.columns and 'rank1_pct2' in match.columns
                    if has_rank_pcts:
                        pct_diff = abs(match['rank1_pct1'][0] - match['rank1_pct2'][0])
                    else:
                        pct_diff = None

                    # Helper to get display text when not showing p-values.
                    # NOTE: closes over eff/pct_diff of the current iteration;
                    # it is called before the next iteration rebinds them, so
                    # the late-binding closure is safe here.
                    def get_alt_text():
                        if eff is not None:
                            return f'{eff:.2f}'
                        elif pct_diff is not None:
                            return f'{pct_diff:.1f}%'
                        else:
                            return '—'

                    # Categorize significance level (drives cell color);
                    # thresholds follow the conventional 0.001/0.01/0.05 tiers.
                    if p_adj is None:
                        sig_cat = 'N/A'
                        text = 'N/A'
                    elif p_adj < 0.001:
                        sig_cat = 'p < 0.001'
                        text = '<.001' if show_p_values else get_alt_text()
                    elif p_adj < 0.01:
                        sig_cat = 'p < 0.01'
                        text = f'{p_adj:.3f}' if show_p_values else get_alt_text()
                    elif p_adj < 0.05:
                        sig_cat = 'p < 0.05'
                        text = f'{p_adj:.3f}' if show_p_values else get_alt_text()
                    else:
                        sig_cat = 'n.s.'
                        text = f'{p_adj:.2f}' if show_p_values else get_alt_text()

                    # show_effect_size overrides show_p_values for the label.
                    if show_effect_size:
                        text = get_alt_text()

                    heatmap_data.append({
                        'row': row_group,
                        'col': col_group,
                        'p_adjusted': p_adj,
                        'p_value': p_val,
                        'significant': sig,
                        'effect_size': eff,
                        'text_label': text,
                        'sig_category': sig_cat,
                    })
                else:
                    # Pair absent from the pairwise results entirely.
                    heatmap_data.append({
                        'row': row_group,
                        'col': col_group,
                        'p_adjusted': None,
                        'p_value': None,
                        'significant': None,
                        'effect_size': None,
                        'text_label': 'N/A',
                        'sig_category': 'N/A',
                    })

    heatmap_df = pl.DataFrame(heatmap_data).to_pandas()

    # Define color scale for significance categories
    sig_domain = ['p < 0.001', 'p < 0.01', 'p < 0.05', 'n.s.', 'Self', 'N/A']
    sig_range = [
        ColorPalette.SIG_STRONG,    # p < 0.001
        ColorPalette.SIG_MODERATE,  # p < 0.01
        ColorPalette.SIG_WEAK,      # p < 0.05
        ColorPalette.SIG_NONE,      # not significant
        ColorPalette.SIG_DIAGONAL,  # diagonal (self)
        ColorPalette.NEUTRAL,       # N/A
    ]

    # Build tooltip fields based on available data
    tooltip_fields = [
        alt.Tooltip('row:N', title='Group 1'),
        alt.Tooltip('col:N', title='Group 2'),
        alt.Tooltip('p_value:Q', title='p-value', format='.4f'),
        alt.Tooltip('p_adjusted:Q', title='Adjusted p', format='.4f'),
    ]
    # Only add effect_size if it has non-null values (continuous data)
    has_effect_size = 'effect_size' in heatmap_df.columns and heatmap_df['effect_size'].notna().any()
    if has_effect_size:
        tooltip_fields.append(alt.Tooltip('effect_size:Q', title='Effect Size', format='.3f'))
    # Add rank info for ranking data
    # NOTE(review): the tooltip reuses 'text_label', which only contains the
    # Rank 1 % diff when show_p_values is False — verify this matches intent.
    has_rank_pcts = 'rank1_pct1' in df.columns if isinstance(df, pl.DataFrame) else False
    if has_rank_pcts:
        tooltip_fields.append(alt.Tooltip('text_label:N', title='Rank 1 % Diff'))

    # Calculate dimensions: square cells, plus margin for axis labels/legend.
    cell_size = 45
    auto_size = n_groups * cell_size + 100
    chart_width = width or auto_size
    chart_height = height or auto_size

    # Base heatmap
    heatmap = alt.Chart(heatmap_df).mark_rect(stroke='white', strokeWidth=1).encode(
        x=alt.X('col:N', title=None, sort=all_groups,
                axis=alt.Axis(labelAngle=-45, labelLimit=150)),
        y=alt.Y('row:N', title=None, sort=all_groups,
                axis=alt.Axis(labelLimit=150)),
        color=alt.Color('sig_category:N',
                        scale=alt.Scale(domain=sig_domain, range=sig_range),
                        legend=alt.Legend(
                            title='Significance',
                            orient='right',
                            direction='vertical'
                        )),
        tooltip=tooltip_fields
    )

    # Text annotations
    if show_p_values or show_effect_size:
        # Add a column for text color based on significance
        # (white text on the two darkest cell colors for contrast).
        heatmap_df['text_color'] = heatmap_df['sig_category'].apply(
            lambda x: 'white' if x in ['p < 0.001', 'p < 0.01'] else 'black'
        )

        text = alt.Chart(heatmap_df).mark_text(
            fontSize=9,
            fontWeight='normal'
        ).encode(
            x=alt.X('col:N', sort=all_groups),
            y=alt.Y('row:N', sort=all_groups),
            text='text_label:N',
            color=alt.Color('text_color:N', scale=None),
        )
        chart = (heatmap + text)
    else:
        chart = heatmap

    # Build subtitle with test info
    subtitle_lines = []
    if metadata:
        test_info = f"Test: {metadata.get('test_type', 'N/A')}"
        if metadata.get('overall_p_value') is not None:
            test_info += f" | Overall p={metadata['overall_p_value']:.4f}"
        correction = metadata.get('correction', 'none')
        if correction != 'none':
            test_info += f" | Correction: {correction}"
        subtitle_lines.append(test_info)

    title_config = {
        'text': self._process_title(title),
        'subtitle': subtitle_lines if subtitle_lines else None,
        'subtitleColor': 'gray',
        'subtitleFontSize': 10,
        'anchor': 'start',
    }

    chart = chart.properties(
        title=title_config,
        width=chart_width,
        height=chart_height,
    )

    chart = self._save_plot(chart, title)
    return chart
|
||||
|
||||
def plot_significance_summary(
    self,
    pairwise_df: pl.LazyFrame | pl.DataFrame | None = None,
    metadata: dict | None = None,
    title: str = "Significant Differences Summary<br>(Groups with significantly different means)",
    height: int | None = None,
    width: int | None = None,
) -> alt.Chart:
    """Create a summary bar chart showing which groups have significant differences.

    This shows each group with a count of how many other groups it differs from
    significantly, plus the mean score or Rank 1 percentage for reference
    (rendered as a label above each bar).

    Args:
        pairwise_df: Output from compute_pairwise_significance() or compute_ranking_significance().
        metadata: Metadata dict from the significance computation (optional).
        title: Chart title
        height: Chart height
        width: Chart width

    Returns:
        alt.Chart: Altair bar chart with significance count per group
    """
    df = self._ensure_dataframe(pairwise_df)

    # Detect data type: continuous (has mean1/mean2) vs ranking (has rank1_pct1/rank1_pct2)
    has_means = 'mean1' in df.columns
    has_ranks = 'rank1_pct1' in df.columns

    # Count significant differences per group
    sig_df = df.filter(pl.col('significant') == True)

    # Count for each group (appears as either group1 or group2)
    group1_counts = sig_df.group_by('group1').agg(pl.len().alias('count'))
    group2_counts = sig_df.group_by('group2').agg(pl.len().alias('count'))

    # Combine counts across both orientations of each pair
    all_groups = sorted(set(df['group1'].to_list() + df['group2'].to_list()))
    summary_data = []

    for group in all_groups:
        count1 = group1_counts.filter(pl.col('group1') == group)['count'].to_list()
        count2 = group2_counts.filter(pl.col('group2') == group)['count'].to_list()
        total_sig = (count1[0] if count1 else 0) + (count2[0] if count2 else 0)

        # Get reference score for this group from the pairwise data:
        # mean for continuous tests, Rank 1 % for ranking tests.
        if has_means:
            scores = df.filter(pl.col('group1') == group)['mean1'].to_list()
            if not scores:
                scores = df.filter(pl.col('group2') == group)['mean2'].to_list()
            score_val = scores[0] if scores else None
        elif has_ranks:
            scores = df.filter(pl.col('group1') == group)['rank1_pct1'].to_list()
            if not scores:
                scores = df.filter(pl.col('group2') == group)['rank1_pct2'].to_list()
            score_val = scores[0] if scores else None
        else:
            score_val = None

        summary_data.append({
            'group': group,
            'sig_count': total_sig,
            'score': score_val,
        })

    summary_df = pl.DataFrame(summary_data).sort('score', descending=True, nulls_last=True).to_pandas()

    # Create layered chart: bars for sig_count, text for score
    tooltip_title = 'Mean Score' if has_means else 'Rank 1 %' if has_ranks else 'Score'

    bars = alt.Chart(summary_df).mark_bar(color=ColorPalette.PRIMARY).encode(
        x=alt.X('group:N', title='Group', sort='-y'),
        y=alt.Y('sig_count:Q', title='# of Significant Differences'),
        tooltip=[
            alt.Tooltip('group:N', title='Group'),
            alt.Tooltip('sig_count:Q', title='Sig. Differences'),
            alt.Tooltip('score:Q', title=tooltip_title, format='.1f'),
        ]
    )

    # Only add text labels if we have scores
    if summary_df['score'].notna().any():
        # FIX: the '%' suffix for ranking data was previously computed but
        # never applied, so labels contradicted the "Rank 1 % shown above
        # bars" subtitle. Pre-format the label as a string column so the
        # suffix is actually rendered.
        text_format = '.1f' if has_means else '.0f'
        text_suffix = '%' if has_ranks else ''
        summary_df['score_label'] = summary_df['score'].map(
            lambda v: format(v, text_format) + text_suffix if pd.notna(v) else ''
        )
        text = alt.Chart(summary_df).mark_text(
            dy=-8,
            color='black',
            fontSize=9
        ).encode(
            x=alt.X('group:N', sort='-y'),
            y=alt.Y('sig_count:Q'),
            text=alt.Text('score_label:N')
        )
        chart_layers = bars + text
    else:
        chart_layers = bars

    # Build subtitle describing the label semantics and alpha level
    subtitle = None
    if metadata:
        if has_means:
            subtitle = f"Mean scores shown above bars | α={metadata.get('alpha', 0.05)}"
        elif has_ranks:
            subtitle = f"Rank 1 % shown above bars | α={metadata.get('alpha', 0.05)}"
        else:
            subtitle = f"α={metadata.get('alpha', 0.05)}"

    title_config = {
        'text': self._process_title(title),
        'subtitle': subtitle,
        'subtitleColor': 'gray',
        'subtitleFontSize': 10,
        'anchor': 'start',
    }

    chart = chart_layers.properties(
        title=title_config,
        width=width or 800,
        height=height or getattr(self, 'plot_height', 400),
    )

    chart = self._save_plot(chart, title)
    return chart
|
||||
@@ -22,6 +22,7 @@ dependencies = [
|
||||
"python-pptx>=1.0.2",
|
||||
"pyzmq>=27.1.0",
|
||||
"requests>=2.32.5",
|
||||
"scipy>=1.14.0",
|
||||
"taguette>=1.5.1",
|
||||
"vl-convert-python>=1.9.0.post1",
|
||||
"wordcloud>=1.9.5",
|
||||
|
||||
21
theme.py
21
theme.py
@@ -19,11 +19,32 @@ class ColorPalette:
|
||||
# Neutral color for unhighlighted comparison items
|
||||
NEUTRAL = "#D3D3D3" # Light Grey
|
||||
|
||||
# Character-specific colors (for individual character plots)
|
||||
# Each character has a main color and a lighter highlight for original traits
|
||||
CHARACTER_BANK_TELLER = "#004C6D" # Dark Blue
|
||||
CHARACTER_BANK_TELLER_HIGHLIGHT = "#669BBC" # Light Steel Blue
|
||||
|
||||
CHARACTER_FAMILIAR_FRIEND = "#008493" # Teal
|
||||
CHARACTER_FAMILIAR_FRIEND_HIGHLIGHT = "#A8DADC" # Pale Cyan
|
||||
|
||||
CHARACTER_COACH = "#5AAE95" # Sea Green
|
||||
CHARACTER_COACH_HIGHLIGHT = "#A8DADC" # Pale Cyan
|
||||
|
||||
CHARACTER_PERSONAL_ASSISTANT = "#457B9D" # Steel Blue
|
||||
CHARACTER_PERSONAL_ASSISTANT_HIGHLIGHT = "#669BBC" # Light Steel Blue
|
||||
|
||||
# General UI elements
|
||||
TEXT = "black"
|
||||
GRID = "lightgray"
|
||||
BACKGROUND = "white"
|
||||
|
||||
# Statistical significance colors (for heatmaps/annotations)
|
||||
SIG_STRONG = "#004C6D" # p < 0.001 - Dark Blue (highly significant)
|
||||
SIG_MODERATE = "#0077B6" # p < 0.01 - Medium Blue (significant)
|
||||
SIG_WEAK = "#5AAE95" # p < 0.05 - Sea Green (marginally significant)
|
||||
SIG_NONE = "#E8E8E8" # p >= 0.05 - Light Grey (not significant)
|
||||
SIG_DIAGONAL = "#FFFFFF" # White for diagonal (self-comparison)
|
||||
|
||||
# Extended palette for categorical charts (e.g., pie charts with many categories)
|
||||
CATEGORICAL = [
|
||||
"#0077B6", # PRIMARY - Medium Blue
|
||||
|
||||
437
utils.py
437
utils.py
@@ -714,7 +714,7 @@ def normalize_global_values(df: pl.DataFrame, target_cols: list[str]) -> pl.Data
|
||||
|
||||
|
||||
class QualtricsSurvey(QualtricsPlotsMixin):
|
||||
"""Class to handle JPMorgan Chase survey data."""
|
||||
"""Class to handle Qualtrics survey data."""
|
||||
|
||||
def __init__(self, data_path: Union[str, Path], qsf_path: Union[str, Path]):
|
||||
if isinstance(data_path, str):
|
||||
@@ -1072,6 +1072,441 @@ class QualtricsSurvey(QualtricsPlotsMixin):
|
||||
|
||||
return self._get_subset(q, QIDs, rename_cols=True), None
|
||||
|
||||
def transform_character_trait_frequency(
    self,
    char_df: pl.LazyFrame | pl.DataFrame,
    character_column: str,
) -> tuple[pl.DataFrame, dict | None]:
    """Transform character refine data to trait frequency counts for a single character.

    Original use-case: "I need a bar plot that shows the frequency of the times
    each trait is chosen per brand character."

    This function takes a DataFrame with comma-separated trait selections per
    character, explodes traits, and counts their frequency for a single character.

    Args:
        char_df: Pre-fetched data
            Expected columns: '_recordId', '<character_column>' (with comma-separated traits)
        character_column: Name of the character column to analyze (e.g., 'Bank Teller')

    Returns:
        tuple: (DataFrame with columns ['trait', 'count', 'is_original'], None)
            - 'trait': individual trait name
            - 'count': frequency count
            - 'is_original': boolean indicating if trait is in the original definition
    """
    from reference import ORIGINAL_CHARACTER_TRAITS

    if isinstance(char_df, pl.LazyFrame):
        char_df = char_df.collect()

    # Map display names to reference keys
    character_key_map = {
        'Bank Teller': 'the_bank_teller',
        'Familiar Friend': 'the_familiar_friend',
        'The Coach': 'the_coach',
        'Personal Assistant': 'the_personal_assistant',
    }

    # Get original traits for this character.
    # FIX: previously an unmapped display name (e.g. "Coach" instead of
    # "The Coach") silently produced an empty original-trait set, making
    # every trait 'is_original' == False. Fall back to a normalized
    # snake_case key ("Bank Teller" -> "the_bank_teller") before giving up.
    ref_key = character_key_map.get(character_column)
    if ref_key is None:
        normalized = character_column.strip().lower().replace(' ', '_')
        ref_key = normalized if normalized.startswith('the_') else f'the_{normalized}'
    original_traits = set(ORIGINAL_CHARACTER_TRAITS.get(ref_key, []))

    # Filter to rows where this character has a value (not null)
    char_data = char_df.filter(pl.col(character_column).is_not_null())

    # Split comma-separated traits, explode to one row per trait, and
    # trim whitespace; drop empty strings left by trailing commas.
    exploded = (
        char_data
        .select(
            pl.col(character_column)
            .str.split(',')
            .alias('traits')
        )
        .explode('traits')
        .with_columns(
            pl.col('traits').str.strip_chars().alias('trait')
        )
        .filter(pl.col('trait') != '')
    )

    # Count trait frequencies (most frequent first)
    freq_df = (
        exploded
        .group_by('trait')
        .agg(pl.len().alias('count'))
        .sort('count', descending=True)
    )

    # Add is_original flag so plots can highlight expected traits
    freq_df = freq_df.with_columns(
        pl.col('trait').is_in(list(original_traits)).alias('is_original')
    )

    return freq_df, None
|
||||
|
||||
def compute_pairwise_significance(
    self,
    data: pl.LazyFrame | pl.DataFrame,
    test_type: str = "auto",
    alpha: float = 0.05,
    correction: str = "bonferroni",
) -> tuple[pl.DataFrame, dict]:
    """Compute pairwise statistical significance tests between columns.

    Original use-case: "I need to test for statistical significance and present
    this in a logical manner. It should be a generalized function to work on
    many dataframes."

    This function performs pairwise statistical tests between all numeric columns
    (excluding '_recordId') to determine which groups differ significantly.

    Args:
        data: Pre-fetched data with numeric columns to compare.
            Expected format: rows are observations, columns are groups/categories.
            Example: Voice_Scale_1_10__V14, Voice_Scale_1_10__V04, etc.
        test_type: Statistical test to use:
            - "auto": Automatically chooses based on data (default)
            - "mannwhitney": Mann-Whitney U test (non-parametric, for continuous)
            - "ttest": Independent samples t-test (parametric, for continuous)
            - "chi2": Chi-square test (for count/frequency data)
        alpha: Significance level (default 0.05)
        correction: Multiple comparison correction method:
            - "bonferroni": Bonferroni correction (conservative)
            - "holm": Holm-Bonferroni (less conservative)
            - "none": No correction

    Returns:
        tuple: (pairwise_df, metadata)
            - pairwise_df: DataFrame with columns ['group1', 'group2', 'p_value',
              'p_adjusted', 'significant', 'effect_size', 'mean1', 'mean2', 'n1', 'n2']
            - metadata: dict with 'test_type', 'alpha', 'correction', 'n_comparisons',
              'overall_test', 'overall_stat', 'overall_p_value'

    Raises:
        ValueError: If fewer than 2 numeric columns are found, or test_type
            is not one of the supported values.
    """
    from scipy import stats as scipy_stats
    import numpy as np

    if isinstance(data, pl.LazyFrame):
        df = data.collect()
    else:
        df = data

    # Get numeric columns (exclude _recordId and other non-data columns)
    value_cols = [c for c in df.columns if c != '_recordId' and df[c].dtype in [pl.Float64, pl.Float32, pl.Int64, pl.Int32]]

    if len(value_cols) < 2:
        raise ValueError(f"Need at least 2 numeric columns for comparison, found {len(value_cols)}")

    # Auto-detect test type based on data characteristics: low-cardinality
    # integers look like count/category data -> chi-square; otherwise
    # default to the non-parametric Mann-Whitney U.
    if test_type == "auto":
        sample_col = df[value_cols[0]].drop_nulls()
        if len(sample_col) > 0:
            is_integer = sample_col.dtype in [pl.Int64, pl.Int32]
            unique_ratio = sample_col.n_unique() / len(sample_col)
            if is_integer and unique_ratio < 0.1:
                test_type = "chi2"
            else:
                test_type = "mannwhitney"
        else:
            test_type = "mannwhitney"

    # Extract data as numpy arrays (dropping nulls per column)
    group_data = {}
    for col in value_cols:
        group_data[col] = df[col].drop_nulls().to_numpy()

    # Compute overall test (Kruskal-Wallis for non-parametric, ANOVA for parametric).
    # "auto" has already been resolved above, so only concrete test names remain.
    all_groups = [group_data[col] for col in value_cols if len(group_data[col]) > 0]
    if test_type == "mannwhitney":
        overall_stat, overall_p = scipy_stats.kruskal(*all_groups)
        overall_test_name = "Kruskal-Wallis"
    elif test_type == "ttest":
        overall_stat, overall_p = scipy_stats.f_oneway(*all_groups)
        overall_test_name = "One-way ANOVA"
    else:
        overall_stat, overall_p = None, None
        overall_test_name = "N/A (Chi-square)"

    # Compute pairwise tests over all unordered column pairs
    results = []
    n_comparisons = len(value_cols) * (len(value_cols) - 1) // 2

    for i, col1 in enumerate(value_cols):
        for col2 in value_cols[i+1:]:
            data1 = group_data[col1]
            data2 = group_data[col2]

            n1, n2 = len(data1), len(data2)
            mean1 = float(np.mean(data1)) if n1 > 0 else None
            mean2 = float(np.mean(data2)) if n2 > 0 else None

            # Skip if either group has no data (p_value stays None and is
            # excluded from correction below)
            if n1 == 0 or n2 == 0:
                results.append({
                    'group1': self._clean_voice_label(col1),
                    'group2': self._clean_voice_label(col2),
                    'p_value': None,
                    'effect_size': None,
                    'mean1': mean1,
                    'mean2': mean2,
                    'n1': n1,
                    'n2': n2,
                })
                continue

            # Perform the appropriate test
            if test_type == "mannwhitney":
                stat, p_value = scipy_stats.mannwhitneyu(data1, data2, alternative='two-sided')
                # Effect size: rank-biserial correlation r = 1 - 2U/(n1*n2)
                effect_size = 1 - (2 * stat) / (n1 * n2)
            elif test_type == "ttest":
                stat, p_value = scipy_stats.ttest_ind(data1, data2)
                # Effect size: Cohen's d with pooled SAMPLE standard deviation.
                # FIX: use ddof=1 — the (n-1)-weighted pooling formula expects
                # sample variances; np.std defaults to population (ddof=0),
                # which biased d. Also guard the n1+n2-2 == 0 degenerate case.
                if n1 + n2 > 2:
                    pooled_std = np.sqrt(
                        ((n1 - 1) * np.std(data1, ddof=1) ** 2 + (n2 - 1) * np.std(data2, ddof=1) ** 2)
                        / (n1 + n2 - 2)
                    )
                else:
                    pooled_std = 0.0
                effect_size = (mean1 - mean2) / pooled_std if pooled_std > 0 else 0
            elif test_type == "chi2":
                # Create contingency table from the two distributions
                # by binning the pooled data for chi-square
                all_data = np.concatenate([data1, data2])
                bins = np.histogram_bin_edges(all_data, bins='auto')
                counts1, _ = np.histogram(data1, bins=bins)
                counts2, _ = np.histogram(data2, bins=bins)
                contingency = np.array([counts1, counts2])
                # Remove zero columns (chi2_contingency rejects all-zero cells)
                contingency = contingency[:, contingency.sum(axis=0) > 0]
                if contingency.shape[1] > 1:
                    stat, p_value, _, _ = scipy_stats.chi2_contingency(contingency)
                    # Effect size: Cramér's V
                    effect_size = np.sqrt(stat / (contingency.sum() * (min(contingency.shape) - 1)))
                else:
                    # Degenerate table: no discriminating bins
                    p_value, effect_size = 1.0, 0.0
            else:
                raise ValueError(f"Unknown test_type: {test_type}")

            results.append({
                'group1': self._clean_voice_label(col1),
                'group2': self._clean_voice_label(col2),
                'p_value': float(p_value),
                'effect_size': float(effect_size),
                'mean1': mean1,
                'mean2': mean2,
                'n1': n1,
                'n2': n2,
            })

    # Create DataFrame and apply multiple comparison correction
    results_df = pl.DataFrame(results)

    # Null p-values (empty groups) become NaN and are excluded from correction
    p_values = results_df['p_value'].to_numpy().astype(float)
    valid_mask = ~np.isnan(p_values)
    p_adjusted = np.full_like(p_values, np.nan, dtype=float)

    if correction == "bonferroni":
        p_adjusted[valid_mask] = np.minimum(p_values[valid_mask] * n_comparisons, 1.0)
    elif correction == "holm":
        # Holm-Bonferroni step-down procedure
        valid_p = p_values[valid_mask]
        sorted_idx = np.argsort(valid_p)
        sorted_p = valid_p[sorted_idx]
        m = len(sorted_p)
        adjusted = np.zeros(m)
        for j in range(m):
            adjusted[j] = sorted_p[j] * (m - j)
        # Ensure monotonicity (adjusted p's must be non-decreasing)
        for j in range(1, m):
            adjusted[j] = max(adjusted[j], adjusted[j-1])
        adjusted = np.minimum(adjusted, 1.0)
        # Restore original order via the inverse permutation
        p_adjusted[valid_mask] = adjusted[np.argsort(sorted_idx)]
    elif correction == "none":
        p_adjusted = p_values

    # NaN < alpha is False, so missing comparisons are marked not significant
    results_df = results_df.with_columns([
        pl.Series('p_adjusted', p_adjusted),
        pl.Series('significant', p_adjusted < alpha),
    ])

    metadata = {
        'test_type': test_type,
        'alpha': alpha,
        'correction': correction,
        'n_comparisons': n_comparisons,
        'overall_test': overall_test_name,
        'overall_stat': overall_stat,
        'overall_p_value': overall_p,
    }

    return results_df, metadata
|
||||
|
||||
def compute_ranking_significance(
    self,
    data: pl.LazyFrame | pl.DataFrame,
    alpha: float = 0.05,
    correction: str = "bonferroni",
) -> tuple[pl.DataFrame, dict]:
    """Compute statistical significance for ranking data (e.g., Top 3 Voices).

    Original use-case: "Test whether voices are ranked significantly differently
    based on the distribution of 1st, 2nd, 3rd place votes."

    This function takes raw ranking data (rows = respondents, columns = voices,
    values = rank 1/2/3 or null) and performs:
    1. Overall chi-square test on the full contingency table
    2. Pairwise two-proportion z-tests comparing Rank 1 vote shares

    Args:
        data: Pre-fetched ranking data from get_top_3_voices() or get_character_ranking().
            Expected format: rows are respondents, columns are voices/characters,
            values are 1, 2, 3 (rank) or null (not ranked).
        alpha: Significance level (default 0.05)
        correction: Multiple comparison correction method:
            - "bonferroni": Bonferroni correction (conservative)
            - "holm": Holm-Bonferroni (less conservative)
            - "none": No correction

    Returns:
        tuple: (pairwise_df, metadata)
            - pairwise_df: DataFrame with columns ['group1', 'group2', 'p_value',
              'p_adjusted', 'significant', 'rank1_count1', 'rank1_count2',
              'rank1_pct1', 'rank1_pct2', 'total1', 'total2']
            - metadata: dict with 'alpha', 'correction', 'n_comparisons',
              'chi2_stat', 'chi2_p_value', 'contingency_table'

    Raises:
        ValueError: If fewer than 2 ranking columns are present, or if
            `correction` is not one of "bonferroni", "holm", "none".

    Example:
        >>> ranking_data, _ = S.get_top_3_voices(data)
        >>> pairwise_df, meta = S.compute_ranking_significance(ranking_data)
        >>> # See which voices have significantly different Rank 1 proportions
        >>> print(pairwise_df.filter(pl.col('significant') == True))
    """
    from scipy import stats as scipy_stats
    import numpy as np

    # Fail fast on an unknown correction method. Previously an invalid value
    # fell through silently, leaving p_adjusted all-NaN (and every
    # 'significant' flag False, since NaN < alpha is False).
    if correction not in ("bonferroni", "holm", "none"):
        raise ValueError(
            f"Unknown correction method: {correction!r}; "
            "expected 'bonferroni', 'holm', or 'none'"
        )

    df = data.collect() if isinstance(data, pl.LazyFrame) else data

    # Get ranking columns (exclude _recordId)
    ranking_cols = [c for c in df.columns if c != '_recordId']

    if len(ranking_cols) < 2:
        raise ValueError(f"Need at least 2 ranking columns, found {len(ranking_cols)}")

    # Build contingency table: rows = ranks (1, 2, 3), columns = voices.
    # Count how many times each voice received each rank.
    contingency_data: dict[str, list[int]] = {}
    for col in ranking_cols:
        label = self._clean_voice_label(col)
        contingency_data[label] = [
            df.filter(pl.col(col) == rank).height for rank in (1, 2, 3)
        ]

    # Create contingency table as numpy array (3 x n_voices)
    labels = list(contingency_data.keys())
    contingency_table = np.array([contingency_data[l] for l in labels]).T

    # Overall chi-square test on contingency table.
    # Tests whether rank distribution is independent of voice.
    chi2_stat, chi2_p, chi2_dof, _ = scipy_stats.chi2_contingency(contingency_table)

    # Pairwise proportion tests for Rank 1 votes.
    # We use a two-proportion z-test to compare rank 1 proportions.
    results = []
    n_comparisons = len(labels) * (len(labels) - 1) // 2

    for i, label1 in enumerate(labels):
        for label2 in labels[i + 1:]:
            r1_count1 = contingency_data[label1][0]  # Rank 1 votes for voice 1
            r1_count2 = contingency_data[label2][0]  # Rank 1 votes for voice 2

            # Total times each voice was ranked (1st + 2nd + 3rd)
            total1 = sum(contingency_data[label1])
            total2 = sum(contingency_data[label2])

            # Proportion of Rank 1 out of all rankings for each voice
            pct1 = r1_count1 / total1 if total1 > 0 else 0
            pct2 = r1_count2 / total2 if total2 > 0 else 0

            # Two-proportion z-test
            # H0: p1 = p2 (both voices have same proportion of Rank 1)
            if total1 > 0 and total2 > 0 and (r1_count1 + r1_count2) > 0:
                # Pooled proportion under H0
                p_pooled = (r1_count1 + r1_count2) / (total1 + total2)

                # Standard error of the difference in proportions
                se = np.sqrt(p_pooled * (1 - p_pooled) * (1 / total1 + 1 / total2))

                if se > 0:
                    z_stat = (pct1 - pct2) / se
                    # Two-tailed p-value. norm.sf(|z|) is the numerically
                    # stable survival function; 1 - cdf(|z|) loses precision
                    # (underflows to 0) for large |z|.
                    p_value = 2 * scipy_stats.norm.sf(abs(z_stat))
                else:
                    p_value = 1.0
            else:
                p_value = 1.0

            results.append({
                'group1': label1,
                'group2': label2,
                'p_value': float(p_value),
                'rank1_count1': r1_count1,
                'rank1_count2': r1_count2,
                'rank1_pct1': round(pct1 * 100, 1),
                'rank1_pct2': round(pct2 * 100, 1),
                'total1': total1,
                'total2': total2,
            })

    # Create DataFrame and apply multiple-comparison correction
    results_df = pl.DataFrame(results)

    p_values = results_df['p_value'].to_numpy()

    if correction == "bonferroni":
        p_adjusted = np.minimum(p_values * n_comparisons, 1.0)
    elif correction == "holm":
        # Holm step-down: sort ascending, multiply the j-th smallest p by
        # (m - j), enforce monotonicity with a running max, clip at 1,
        # then restore the original order.
        order = np.argsort(p_values)
        m = p_values.size
        stepped = p_values[order] * (m - np.arange(m))
        adjusted = np.minimum(np.maximum.accumulate(stepped), 1.0)
        p_adjusted = adjusted[np.argsort(order)]
    else:  # correction == "none" (validated above)
        p_adjusted = p_values.astype(float)

    results_df = results_df.with_columns([
        pl.Series('p_adjusted', p_adjusted),
        pl.Series('significant', p_adjusted < alpha),
    ])

    # Sort by p_value for easier inspection
    results_df = results_df.sort('p_value')

    metadata = {
        'test_type': 'proportion_z_test',
        'alpha': alpha,
        'correction': correction,
        'n_comparisons': n_comparisons,
        'chi2_stat': chi2_stat,
        'chi2_p_value': chi2_p,
        'chi2_dof': chi2_dof,
        'overall_test': 'Chi-square',
        'overall_stat': chi2_stat,
        'overall_p_value': chi2_p,
        'contingency_table': {label: contingency_data[label] for label in labels},
    }

    return results_df, metadata
|
||||
|
||||
|
||||
def process_speaking_style_data(
|
||||
|
||||
2
uv.lock
generated
2
uv.lock
generated
@@ -1435,6 +1435,7 @@ dependencies = [
|
||||
{ name = "python-pptx" },
|
||||
{ name = "pyzmq" },
|
||||
{ name = "requests" },
|
||||
{ name = "scipy" },
|
||||
{ name = "taguette" },
|
||||
{ name = "vl-convert-python" },
|
||||
{ name = "wordcloud" },
|
||||
@@ -1459,6 +1460,7 @@ requires-dist = [
|
||||
{ name = "python-pptx", specifier = ">=1.0.2" },
|
||||
{ name = "pyzmq", specifier = ">=27.1.0" },
|
||||
{ name = "requests", specifier = ">=2.32.5" },
|
||||
{ name = "scipy", specifier = ">=1.14.0" },
|
||||
{ name = "taguette", specifier = ">=1.5.1" },
|
||||
{ name = "vl-convert-python", specifier = ">=1.9.0.post1" },
|
||||
{ name = "wordcloud", specifier = ">=1.9.5" },
|
||||
|
||||
Reference in New Issue
Block a user