statistical tests
This commit is contained in:
42
.github/agents/plot-creator.agent.md
vendored
42
.github/agents/plot-creator.agent.md
vendored
@@ -48,13 +48,12 @@ Check if an existing `transform_<descriptive_name>` function exists in `utils.py
|
||||
def transform_<descriptive_name>(self, df: pl.LazyFrame | pl.DataFrame) -> tuple[pl.LazyFrame, dict | None]:
|
||||
"""Transform <input_description> to <output_description>.
|
||||
|
||||
Original request: "<paste user's original question here>"
|
||||
Original use-case: "<paste user's original question here>"
|
||||
|
||||
This function <concise 1-2 sentence explanation of what it does>.
|
||||
|
||||
Args:
|
||||
df: Pre-fetched data (e.g., from get_character_refine()).
|
||||
Do NOT call get_*() methods inside this function.
|
||||
df: Pre-fetched data as a Polars LazyFrame or DataFrame.
|
||||
|
||||
Returns:
|
||||
tuple: (LazyFrame with columns [...], Optional metadata dict)
|
||||
@@ -96,19 +95,11 @@ chart = S.plot_character_trait_frequency(trait_freq)
|
||||
```
|
||||
|
||||
### Step 5: Create Temporary Test File
|
||||
Create `debug_plot_temp.py` for testing. **You MUST ask the user to provide:**
|
||||
Create `debug_plot_temp.py` for testing. **Prefer using the data snippet already provided by the user.**
|
||||
|
||||
1. **The exact code snippet to create the test data** - Do NOT generate or assume file paths
|
||||
2. **Confirmation of which notebook they're working in** (so you can read it for context if needed)
|
||||
**Option A: Use provided data snippet (preferred)**
|
||||
If the user provided a `df.head()` or sample data output, create inline test data from it:
|
||||
|
||||
Example prompt to user:
|
||||
> "To create the test file, please provide:
|
||||
> 1. The exact code snippet that produces the dataframe you shared (copy from your notebook)
|
||||
> 2. Which notebook are you working in? (I may read it for context, but won't modify it)
|
||||
>
|
||||
> I will NOT attempt to load any data without your explicit code."
|
||||
|
||||
**Test file structure using user-provided data:**
|
||||
```python
|
||||
"""Temporary test file for <plot_name>.
|
||||
Delete after testing.
|
||||
@@ -118,15 +109,32 @@ from theme import ColorPalette
|
||||
import altair as alt
|
||||
|
||||
# ============================================================
|
||||
# USER-PROVIDED TEST DATA (paste from user's snippet)
|
||||
# TEST DATA (reconstructed from user's df.head() output)
|
||||
# ============================================================
|
||||
# <user's code goes here>
|
||||
test_data = pl.DataFrame({
|
||||
"Column1": ["value1", "value2", ...],
|
||||
"Column2": [1, 2, ...],
|
||||
# ... recreate structure from provided sample
|
||||
})
|
||||
# ============================================================
|
||||
|
||||
# Test the plot function
|
||||
# ...
|
||||
from plots import QualtricsPlotsMixin
|
||||
# ... test code
|
||||
```
|
||||
|
||||
**Option B: Ask user (only if necessary)**
|
||||
Only ask the user for additional code if:
|
||||
- The provided sample is insufficient to test the plot logic
|
||||
- You need to understand complex data relationships not visible in the sample
|
||||
- The transformation requires understanding the full data pipeline
|
||||
|
||||
If you must ask:
|
||||
> "The sample data you provided should work for basic testing. However, I need [specific reason]. Could you provide:
|
||||
> 1. [specific information needed]
|
||||
>
|
||||
> If you'd prefer, I can proceed with a minimal test using the sample data you shared."
|
||||
|
||||
### Step 6: Create Plot Function
|
||||
Add a new method to `QualtricsPlotsMixin` in `plots.py`:
|
||||
|
||||
|
||||
@@ -16,8 +16,8 @@ def _():
|
||||
|
||||
from speaking_styles import SPEAKING_STYLES
|
||||
return (
|
||||
QualtricsSurvey,
|
||||
Path,
|
||||
QualtricsSurvey,
|
||||
SPEAKING_STYLES,
|
||||
calculate_weighted_ranking_scores,
|
||||
check_progress,
|
||||
@@ -49,7 +49,7 @@ def _(Path, file_browser, mo):
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(QualtricsSurvey, QSF_FILE, RESULTS_FILE, mo):
|
||||
def _(QSF_FILE, QualtricsSurvey, RESULTS_FILE, mo):
|
||||
S = QualtricsSurvey(RESULTS_FILE, QSF_FILE)
|
||||
try:
|
||||
data_all = S.load_data()
|
||||
@@ -285,6 +285,7 @@ def _(S, mo, v_18_8_3):
|
||||
def _(S, calculate_weighted_ranking_scores, data):
|
||||
top3_voices = S.get_top_3_voices(data)[0]
|
||||
top3_voices_weighted = calculate_weighted_ranking_scores(top3_voices)
|
||||
|
||||
return top3_voices, top3_voices_weighted
|
||||
|
||||
|
||||
@@ -383,6 +384,12 @@ def _(S, data, mo):
|
||||
return (vscales,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(vscales):
|
||||
print(vscales.collect().head())
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(pl, vscales):
|
||||
# Count non-null values per row
|
||||
|
||||
@@ -44,14 +44,14 @@ def _(QSF_FILE, RESULTS_FILE):
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _():
|
||||
mo.md(r"""
|
||||
def _(RESULTS_FILE, data_all):
|
||||
mo.md(rf"""
|
||||
---
|
||||
# Load Data
|
||||
|
||||
**Dataset:** `{Path(RESULTS_FILE).name}`
|
||||
**Dataset:** {Path(RESULTS_FILE).name}
|
||||
|
||||
**Responses**: `{data_all.collect().shape[0]}`
|
||||
**Responses**: {data_all.collect().shape[0]}
|
||||
""")
|
||||
return
|
||||
|
||||
@@ -112,11 +112,9 @@ def _():
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(data_validated):
|
||||
data = data_validated
|
||||
|
||||
data.collect()
|
||||
return (data,)
|
||||
def _():
|
||||
#
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
@@ -130,8 +128,8 @@ def _():
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(S, data):
|
||||
demographics = S.get_demographics(data)[0].collect()
|
||||
def _(S, data_validated):
|
||||
demographics = S.get_demographics(data_validated)[0].collect()
|
||||
demographics
|
||||
return (demographics,)
|
||||
|
||||
@@ -148,7 +146,7 @@ def _():
|
||||
def _(demographics):
|
||||
# Demographics where 'Consumer' is null
|
||||
demographics_no_consumer = demographics.filter(pl.col('Consumer').is_null())['_recordId'].to_list()
|
||||
# demographics_no_consumer
|
||||
demographics_no_consumer
|
||||
return (demographics_no_consumer,)
|
||||
|
||||
|
||||
@@ -160,9 +158,26 @@ def _(data_all, demographics_no_consumer):
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(data_all):
|
||||
def _():
|
||||
mo.md(r"""
|
||||
# Filter Data (Global corrections)
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(data_validated):
|
||||
# drop rows where 'consumer' is null
|
||||
# data = data_validated.filter(pl.col('Consumer').is_not_null())
|
||||
data = data_validated
|
||||
data.collect()
|
||||
return (data,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
# Check if all business owners are missing a 'Consumer type' in demographics
|
||||
assert all([a is None for a in data_all.filter(pl.col('QID4') == 'Yes').collect()['Consumer'].unique()]) , "Not all business owners are missing 'Consumer type' in demographics."
|
||||
# assert all([a is None for a in data_all.filter(pl.col('QID4') == 'Yes').collect()['Consumer'].unique()]) , "Not all business owners are missing 'Consumer type' in demographics."
|
||||
return
|
||||
|
||||
|
||||
@@ -187,14 +202,14 @@ def _():
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(S, demo_plot_cols, demographics):
|
||||
def _(S, data, demo_plot_cols):
|
||||
_content = """
|
||||
## Demographic Distributions
|
||||
|
||||
"""
|
||||
for c in demo_plot_cols:
|
||||
_fig = S.plot_demographic_distribution(
|
||||
data=demographics,
|
||||
data=S.get_demographics(data)[0],
|
||||
column=c,
|
||||
title=f"{c.replace('Bussiness', 'Business').replace('_', ' ')} Distribution of Survey Respondents"
|
||||
)
|
||||
@@ -265,6 +280,22 @@ def _(S, char_rank):
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(S, char_rank):
|
||||
_pairwise_df, _meta = S.compute_ranking_significance(char_rank)
|
||||
|
||||
print(_pairwise_df.columns)
|
||||
|
||||
mo.md(f"""
|
||||
### Statistical Significance Character Ranking
|
||||
|
||||
{mo.ui.altair_chart(S.plot_significance_heatmap(_pairwise_df, metadata=_meta))}
|
||||
|
||||
{mo.ui.altair_chart(S.plot_significance_summary(_pairwise_df, metadata=_meta))}
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
mo.md(r"""
|
||||
@@ -307,28 +338,69 @@ def _():
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
# Join respondent
|
||||
def _(S, data):
|
||||
char_df = S.get_character_refine(data)[0]
|
||||
return (char_df,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(S, char_df):
|
||||
from theme import ColorPalette
|
||||
|
||||
# Assuming you already have char_df (your data from get_character_refine or similar)
|
||||
characters = ['Bank Teller', 'Familiar Friend', 'The Coach', 'Personal Assistant']
|
||||
character_colors = {
|
||||
'Bank Teller': (ColorPalette.CHARACTER_BANK_TELLER, ColorPalette.CHARACTER_BANK_TELLER_HIGHLIGHT),
|
||||
'Familiar Friend': (ColorPalette.CHARACTER_FAMILIAR_FRIEND, ColorPalette.CHARACTER_FAMILIAR_FRIEND_HIGHLIGHT),
|
||||
'The Coach': (ColorPalette.CHARACTER_COACH, ColorPalette.CHARACTER_COACH_HIGHLIGHT),
|
||||
'Personal Assistant': (ColorPalette.CHARACTER_PERSONAL_ASSISTANT, ColorPalette.CHARACTER_PERSONAL_ASSISTANT_HIGHLIGHT),
|
||||
}
|
||||
|
||||
# Build consistent sort order (by total frequency across all characters)
|
||||
all_trait_counts = {}
|
||||
for char in characters:
|
||||
freq_df, _ = S.transform_character_trait_frequency(char_df, char)
|
||||
for row in freq_df.iter_rows(named=True):
|
||||
all_trait_counts[row['trait']] = all_trait_counts.get(row['trait'], 0) + row['count']
|
||||
|
||||
consistent_sort_order = sorted(all_trait_counts.keys(), key=lambda x: -all_trait_counts[x])
|
||||
|
||||
_content = """"""
|
||||
# Generate 4 plots (one per character)
|
||||
for char in characters:
|
||||
freq_df, _ = S.transform_character_trait_frequency(char_df, char)
|
||||
main_color, highlight_color = character_colors[char]
|
||||
chart = S.plot_single_character_trait_frequency(
|
||||
data=freq_df,
|
||||
character_name=char,
|
||||
bar_color=main_color,
|
||||
highlight_color=highlight_color,
|
||||
trait_sort_order=consistent_sort_order,
|
||||
)
|
||||
_content += f"""
|
||||
{mo.ui.altair_chart(chart)}
|
||||
|
||||
|
||||
"""
|
||||
|
||||
mo.md(_content)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
mo.md(r"""
|
||||
---
|
||||
## Statistical significance best characters
|
||||
|
||||
# Spoken Voice Results
|
||||
zie chat
|
||||
> voorbeeld: als de nr 1 en 2 niet significant verschillen maar wel van de nr 3 bijvoorbeeld is dat ook top. Beetje meedenkend over hoe ik het kan presenteren weetje wat ik bedoel?:)
|
||||
>
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
@app.cell
|
||||
def _():
|
||||
mo.md(r"""
|
||||
---
|
||||
|
||||
# Brand Character Results
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@@ -342,5 +414,174 @@ def _():
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(S, data):
|
||||
top3_voices = S.get_top_3_voices(data)[0]
|
||||
top3_voices_weighted = calculate_weighted_ranking_scores(top3_voices)
|
||||
return top3_voices, top3_voices_weighted
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
mo.md(r"""
|
||||
## Which voice is ranked best in the ranking question for top 3?
|
||||
|
||||
(not best 3 out of 8 question)
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(S, top3_voices):
|
||||
_plot = S.plot_ranking_distribution(top3_voices, x_label='Voice')
|
||||
mo.md(f"""
|
||||
{mo.ui.altair_chart(_plot)}
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
mo.md(r"""
|
||||
### Statistical significance for voice ranking
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
# print(top3_voices.collect().head())
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
|
||||
# _pairwise_df, _metadata = S.compute_ranking_significance(
|
||||
# top3_voices,alpha=0.05,correction="none")
|
||||
|
||||
# # View significant pairs
|
||||
# # print(pairwise_df.filter(pl.col('significant') == True))
|
||||
|
||||
# # Create heatmap visualization
|
||||
# _heatmap = S.plot_significance_heatmap(
|
||||
# _pairwise_df,
|
||||
# metadata=_metadata,
|
||||
# title="Weighted Voice Ranking Significance<br>(Pairwise Comparisons)"
|
||||
# )
|
||||
|
||||
# # Create summary bar chart
|
||||
# _summary = S.plot_significance_summary(
|
||||
# _pairwise_df,
|
||||
# metadata=_metadata
|
||||
# )
|
||||
|
||||
# mo.md(f"""
|
||||
# {mo.ui.altair_chart(_heatmap)}
|
||||
|
||||
# {mo.ui.altair_chart(_summary)}
|
||||
# """)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
mo.md(r"""
|
||||
## Weighted Popularity Scores
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(S, top3_voices_weighted):
|
||||
_plot = S.plot_weighted_ranking_score(top3_voices_weighted, title="Most Popular Voice - Weighted Popularity Score<br>(1st = 3pts, 2nd = 2pts, 3rd = 1pt)")
|
||||
|
||||
mo.md(f"""
|
||||
{mo.ui.altair_chart(_plot)}
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(top3_voices_weighted):
|
||||
print(top3_voices_weighted.head())
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _():
|
||||
mo.md(r"""
|
||||
## Voice Scale 1-10
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(S, data):
|
||||
# Get your voice scale data (from notebook)
|
||||
voice_1_10, _ = S.get_voice_scale_1_10(data)
|
||||
return (voice_1_10,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(S, voice_1_10):
|
||||
S.plot_average_scores_with_counts(voice_1_10, x_label='Voice', width=1000, domain=[1,10], title="Voice General Impression (Scale 1-10)")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
mo.md(r"""
|
||||
### Statistical Significance (Scale 1-10)
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(S, voice_1_10):
|
||||
# Compute pairwise significance tests
|
||||
pairwise_df, metadata = S.compute_pairwise_significance(
|
||||
voice_1_10,
|
||||
test_type="mannwhitney", # or "ttest", "chi2", "auto"
|
||||
alpha=0.05,
|
||||
correction="bonferroni" # or "holm", "none"
|
||||
)
|
||||
|
||||
# View significant pairs
|
||||
# print(pairwise_df.filter(pl.col('significant') == True))
|
||||
|
||||
# Create heatmap visualization
|
||||
_heatmap = S.plot_significance_heatmap(
|
||||
pairwise_df,
|
||||
metadata=metadata,
|
||||
title="Voice Rating Significance<br>(Pairwise Comparisons)"
|
||||
)
|
||||
|
||||
# Create summary bar chart
|
||||
_summary = S.plot_significance_summary(
|
||||
pairwise_df,
|
||||
metadata=metadata
|
||||
)
|
||||
|
||||
mo.md(f"""
|
||||
{mo.ui.altair_chart(_heatmap)}
|
||||
|
||||
{mo.ui.altair_chart(_summary)}
|
||||
""")
|
||||
|
||||
|
||||
return
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app.run()
|
||||
|
||||
428
docs/statistical-significance-guide.md
Normal file
428
docs/statistical-significance-guide.md
Normal file
@@ -0,0 +1,428 @@
|
||||
# Statistical Significance Testing Guide
|
||||
|
||||
A beginner-friendly reference for choosing the right statistical test and correction method for your Voice Branding analysis.
|
||||
|
||||
---
|
||||
|
||||
## Table of Contents
|
||||
1. [Quick Decision Flowchart](#quick-decision-flowchart)
|
||||
2. [Understanding Your Data Types](#understanding-your-data-types)
|
||||
3. [Available Tests](#available-tests)
|
||||
4. [Multiple Comparison Corrections](#multiple-comparison-corrections)
|
||||
5. [Interpreting Results](#interpreting-results)
|
||||
6. [Code Examples](#code-examples)
|
||||
|
||||
---
|
||||
|
||||
## Quick Decision Flowchart
|
||||
|
||||
```
|
||||
What kind of data do you have?
|
||||
│
|
||||
├─► Continuous scores (1-10 ratings, averages)
|
||||
│ │
|
||||
│ └─► Use: compute_pairwise_significance()
|
||||
│ │
|
||||
│ ├─► Data normally distributed? → test_type="ttest"
|
||||
│ └─► Not sure / skewed data? → test_type="mannwhitney" (safer choice)
|
||||
│
|
||||
└─► Ranking data (1st, 2nd, 3rd place votes)
|
||||
│
|
||||
└─► Use: compute_ranking_significance()
|
||||
(automatically uses proportion z-test)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Understanding Your Data Types
|
||||
|
||||
### Continuous Data
|
||||
**What it looks like:** Numbers on a scale with many possible values.
|
||||
|
||||
| Example | Data Source |
|
||||
|---------|-------------|
|
||||
| Voice ratings 1-10 | `get_voice_scale_1_10()` |
|
||||
| Speaking style scores | `get_ss_green_blue()` |
|
||||
| Any averaged scores | Custom aggregations |
|
||||
|
||||
```
|
||||
shape: (5, 3)
|
||||
┌───────────┬─────────────────┬─────────────────┐
|
||||
│ _recordId │ Voice_Scale__V14│ Voice_Scale__V04│
|
||||
│ str │ f64 │ f64 │
|
||||
├───────────┼─────────────────┼─────────────────┤
|
||||
│ R_001 │ 7.5 │ 6.0 │
|
||||
│ R_002 │ 8.0 │ 7.5 │
|
||||
│ R_003 │ 6.5 │ 8.0 │
|
||||
```
|
||||
|
||||
### Ranking Data
|
||||
**What it looks like:** Discrete ranks (1, 2, 3) or null if not ranked.
|
||||
|
||||
| Example | Data Source |
|
||||
|---------|-------------|
|
||||
| Top 3 voice rankings | `get_top_3_voices()` |
|
||||
| Character rankings | `get_character_ranking()` |
|
||||
|
||||
```
|
||||
shape: (5, 3)
|
||||
┌───────────┬──────────────────┬──────────────────┐
|
||||
│ _recordId │ Top_3__V14 │ Top_3__V04 │
|
||||
│ str │ i64 │ i64 │
|
||||
├───────────┼──────────────────┼──────────────────┤
|
||||
│ R_001 │ 1 │ null │ ← V14 was ranked 1st
|
||||
│ R_002 │ 2 │ 1 │ ← V04 was ranked 1st
|
||||
│ R_003 │ null │ 3 │ ← V04 was ranked 3rd
|
||||
```
|
||||
|
||||
### ⚠️ Aggregated Data (Cannot Test!)
|
||||
**What it looks like:** Already summarized/totaled data.
|
||||
|
||||
```
|
||||
shape: (3, 2)
|
||||
┌───────────┬────────────────┐
|
||||
│ Character │ Weighted Score │ ← ALREADY AGGREGATED
|
||||
│ str │ i64 │ Lost individual variance
|
||||
├───────────┼────────────────┤ Cannot do significance tests!
|
||||
│ V14 │ 209 │
|
||||
│ V04 │ 180 │
|
||||
```
|
||||
|
||||
**Solution:** Go back to the raw data before aggregation.
|
||||
|
||||
---
|
||||
|
||||
## Available Tests
|
||||
|
||||
### 1. Mann-Whitney U Test (Default for Continuous)
|
||||
**Use when:** Comparing scores/ratings between groups
|
||||
**Assumes:** Nothing about distribution shape (non-parametric)
|
||||
**Best for:** Most survey data, Likert scales, ratings
|
||||
|
||||
```python
|
||||
pairwise_df, meta = S.compute_pairwise_significance(
|
||||
voice_data,
|
||||
test_type="mannwhitney" # This is the default
|
||||
)
|
||||
```
|
||||
|
||||
**Pros:**
|
||||
- Works with any distribution shape
|
||||
- Robust to outliers
|
||||
- Safe choice when unsure
|
||||
|
||||
**Cons:**
|
||||
- Slightly less powerful than t-test when data IS normally distributed
|
||||
|
||||
---
|
||||
|
||||
### 2. Independent t-Test
|
||||
**Use when:** Comparing means between groups
|
||||
**Assumes:** Data is approximately normally distributed
|
||||
**Best for:** Large samples (n > 30 per group), truly continuous data
|
||||
|
||||
```python
|
||||
pairwise_df, meta = S.compute_pairwise_significance(
|
||||
voice_data,
|
||||
test_type="ttest"
|
||||
)
|
||||
```
|
||||
|
||||
**Pros:**
|
||||
- Most powerful when assumptions are met
|
||||
- Well-understood, commonly reported
|
||||
|
||||
**Cons:**
|
||||
- Can give misleading results if data is skewed
|
||||
- Sensitive to outliers
|
||||
|
||||
---
|
||||
|
||||
### 3. Chi-Square Test
|
||||
**Use when:** Comparing frequency distributions
|
||||
**Assumes:** Expected counts ≥ 5 in each cell
|
||||
**Best for:** Count data, categorical comparisons
|
||||
|
||||
```python
|
||||
pairwise_df, meta = S.compute_pairwise_significance(
|
||||
count_data,
|
||||
test_type="chi2"
|
||||
)
|
||||
```
|
||||
|
||||
**Pros:**
|
||||
- Designed for count/frequency data
|
||||
- Tests if distributions differ
|
||||
|
||||
**Cons:**
|
||||
- Needs sufficient sample sizes
|
||||
- Less informative about direction of difference
|
||||
|
||||
---
|
||||
|
||||
### 4. Two-Proportion Z-Test (For Rankings)
|
||||
**Use when:** Comparing ranking vote proportions
|
||||
**Automatically used by:** `compute_ranking_significance()`
|
||||
|
||||
```python
|
||||
pairwise_df, meta = S.compute_ranking_significance(ranking_data)
|
||||
```
|
||||
|
||||
**What it tests:** "Does Voice A get a significantly different proportion of Rank 1 votes than Voice B?"
|
||||
|
||||
---
|
||||
|
||||
## Multiple Comparison Corrections
|
||||
|
||||
### Why Do We Need Corrections?
|
||||
|
||||
When you compare many groups, you're doing many tests. Each test has a 5% chance of a false positive (if α = 0.05). With 17 voices:
|
||||
|
||||
| Comparisons | Expected False Positives (no correction) |
|
||||
|-------------|------------------------------------------|
|
||||
| 136 pairs | ~7 false "significant" results! |
|
||||
|
||||
**Corrections adjust p-values to account for this.**
|
||||
|
||||
---
|
||||
|
||||
### Bonferroni Correction (Conservative)
|
||||
**Formula:** `p_adjusted = p_value × number_of_comparisons`
|
||||
|
||||
```python
|
||||
pairwise_df, meta = S.compute_pairwise_significance(
|
||||
data,
|
||||
correction="bonferroni" # This is the default
|
||||
)
|
||||
```
|
||||
|
||||
**Use when:**
|
||||
- You want to be very confident about significant results
|
||||
- False positives are costly (publishing, major decisions)
|
||||
- You have few comparisons (< 20)
|
||||
|
||||
**Trade-off:** May miss real differences (more false negatives)
|
||||
|
||||
---
|
||||
|
||||
### Holm-Bonferroni Correction (Less Conservative)
|
||||
**Formula:** Step-down procedure that's less strict than Bonferroni
|
||||
|
||||
```python
|
||||
pairwise_df, meta = S.compute_pairwise_significance(
|
||||
data,
|
||||
correction="holm"
|
||||
)
|
||||
```
|
||||
|
||||
**Use when:**
|
||||
- You have many comparisons
|
||||
- You want better power to detect real differences
|
||||
- Exploratory analysis where missing a real effect is costly
|
||||
|
||||
**Trade-off:** Slightly higher false positive risk than Bonferroni
|
||||
|
||||
---
|
||||
|
||||
### No Correction
|
||||
**Not recommended for final analysis**, but useful for exploration.
|
||||
|
||||
```python
|
||||
pairwise_df, meta = S.compute_pairwise_significance(
|
||||
data,
|
||||
correction="none"
|
||||
)
|
||||
```
|
||||
|
||||
**Use when:**
|
||||
- Initial exploration only
|
||||
- You'll follow up with specific hypotheses
|
||||
- You understand and accept the inflated false positive rate
|
||||
|
||||
---
|
||||
|
||||
### Correction Method Comparison
|
||||
|
||||
| Method | Strictness | Best For | Risk |
|
||||
|--------|------------|----------|------|
|
||||
| Bonferroni | Most strict | Few comparisons, high stakes | Miss real effects |
|
||||
| Holm | Moderate | Many comparisons, balanced approach | Slightly more false positives |
|
||||
| None | No control | Exploration only | Many false positives |
|
||||
|
||||
**Recommendation for Voice Branding:** Use **Holm** for exploratory analysis, **Bonferroni** for final reporting.
|
||||
|
||||
---
|
||||
|
||||
## Interpreting Results
|
||||
|
||||
### Key Output Columns
|
||||
|
||||
| Column | Meaning |
|
||||
|--------|---------|
|
||||
| `p_value` | Raw probability this difference happened by chance |
|
||||
| `p_adjusted` | Corrected p-value (use this for decisions!) |
|
||||
| `significant` | TRUE if p_adjusted < alpha (usually 0.05) |
|
||||
| `effect_size` | How big is the difference (practical significance) |
|
||||
|
||||
### What the p-value Means
|
||||
|
||||
| p-value | Interpretation |
|
||||
|---------|----------------|
|
||||
| < 0.001 | Very strong evidence of difference |
|
||||
| < 0.01 | Strong evidence |
|
||||
| < 0.05 | Moderate evidence (traditional threshold) |
|
||||
| 0.05 - 0.10 | Weak evidence, "trending" |
|
||||
| > 0.10 | No significant evidence |
|
||||
|
||||
### Statistical vs Practical Significance
|
||||
|
||||
**Statistical significance** (p < 0.05) means the difference is unlikely due to chance.
|
||||
|
||||
**Practical significance** (effect size) means the difference matters in the real world.
|
||||
|
||||
| Effect Size (Cohen's d) | Interpretation |
|
||||
|-------------------------|----------------|
|
||||
| < 0.2 | Small (may not matter practically) |
|
||||
| 0.2 - 0.5 | Medium |
|
||||
| 0.5 - 0.8 | Large |
|
||||
| > 0.8 | Very large |
|
||||
|
||||
**Example:** A p-value of 0.001 with effect size of 0.1 means "we're confident there's a difference, but it's tiny."
|
||||
|
||||
---
|
||||
|
||||
## Code Examples
|
||||
|
||||
### Example 1: Voice Scale Ratings
|
||||
|
||||
```python
|
||||
# Get the raw rating data
|
||||
voice_data, _ = S.get_voice_scale_1_10(data)
|
||||
|
||||
# Test for significant differences
|
||||
pairwise_df, meta = S.compute_pairwise_significance(
|
||||
voice_data,
|
||||
test_type="mannwhitney", # Safe default for ratings
|
||||
alpha=0.05,
|
||||
correction="bonferroni"
|
||||
)
|
||||
|
||||
# Check overall test first
|
||||
print(f"Overall test: {meta['overall_test']}")
|
||||
print(f"Overall p-value: {meta['overall_p_value']:.4f}")
|
||||
|
||||
# If overall is significant, look at pairwise
|
||||
if meta['overall_p_value'] < 0.05:
|
||||
sig_pairs = pairwise_df.filter(pl.col('significant') == True)
|
||||
print(f"Found {sig_pairs.height} significant pairwise differences")
|
||||
|
||||
# Visualize
|
||||
S.plot_significance_heatmap(pairwise_df, metadata=meta)
|
||||
```
|
||||
|
||||
### Example 2: Top 3 Voice Rankings
|
||||
|
||||
```python
|
||||
# Get the raw ranking data (NOT the weighted scores!)
|
||||
ranking_data, _ = S.get_top_3_voices(data)
|
||||
|
||||
# Test for significant differences in Rank 1 proportions
|
||||
pairwise_df, meta = S.compute_ranking_significance(
|
||||
ranking_data,
|
||||
alpha=0.05,
|
||||
correction="holm" # Less conservative for many comparisons
|
||||
)
|
||||
|
||||
# Check chi-square test
|
||||
print(f"Chi-square p-value: {meta['chi2_p_value']:.4f}")
|
||||
|
||||
# View contingency table (Rank 1, 2, 3 counts per voice)
|
||||
for voice, counts in meta['contingency_table'].items():
|
||||
print(f"{voice}: R1={counts[0]}, R2={counts[1]}, R3={counts[2]}")
|
||||
|
||||
# Find significant pairs
|
||||
sig_pairs = pairwise_df.filter(pl.col('significant') == True)
|
||||
print(sig_pairs)
|
||||
```
|
||||
|
||||
### Example 3: Comparing Demographic Subgroups
|
||||
|
||||
```python
|
||||
# Filter to specific demographics
|
||||
S.filter_data(data, consumer=['Early Professional'])
|
||||
early_pro_data, _ = S.get_voice_scale_1_10(data)
|
||||
|
||||
S.filter_data(data, consumer=['Established Professional'])
|
||||
estab_pro_data, _ = S.get_voice_scale_1_10(data)
|
||||
|
||||
# Test each group separately, then compare results qualitatively
|
||||
# (For direct group comparison, you'd need a different test design)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Common Mistakes to Avoid
|
||||
|
||||
### ❌ Using Aggregated Data
|
||||
```python
|
||||
# WRONG - already summarized, lost individual variance
|
||||
weighted_scores = calculate_weighted_ranking_scores(ranking_data)
|
||||
S.compute_pairwise_significance(weighted_scores) # Will fail!
|
||||
```
|
||||
|
||||
### ✅ Use Raw Data
|
||||
```python
|
||||
# RIGHT - use raw data before aggregation
|
||||
ranking_data, _ = S.get_top_3_voices(data)
|
||||
S.compute_ranking_significance(ranking_data)
|
||||
```
|
||||
|
||||
### ❌ Ignoring Multiple Comparisons
|
||||
```python
|
||||
# WRONG - 7% of pairs will be "significant" by chance alone!
|
||||
S.compute_pairwise_significance(data, correction="none")
|
||||
```
|
||||
|
||||
### ✅ Apply Correction
|
||||
```python
|
||||
# RIGHT - corrected p-values control false positives
|
||||
S.compute_pairwise_significance(data, correction="bonferroni")
|
||||
```
|
||||
|
||||
### ❌ Only Reporting p-values
|
||||
```python
|
||||
# WRONG - statistical significance isn't everything
|
||||
print(f"p = {p_value}") # Missing context!
|
||||
```
|
||||
|
||||
### ✅ Report Effect Sizes Too
|
||||
```python
|
||||
# RIGHT - include practical significance
|
||||
print(f"p = {p_value}, effect size = {effect_size}")
|
||||
print(f"Mean difference: {mean1 - mean2:.2f} points")
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Quick Reference Card
|
||||
|
||||
| Data Type | Function | Default Test | Recommended Correction |
|
||||
|-----------|----------|--------------|------------------------|
|
||||
| Ratings (1-10) | `compute_pairwise_significance()` | Mann-Whitney U | Bonferroni |
|
||||
| Rankings (1st/2nd/3rd) | `compute_ranking_significance()` | Proportion Z | Holm |
|
||||
| Count frequencies | `compute_pairwise_significance(test_type="chi2")` | Chi-square | Bonferroni |
|
||||
|
||||
| Scenario | Correction |
|
||||
|----------|------------|
|
||||
| Publishing results | Bonferroni |
|
||||
| Client presentation | Bonferroni |
|
||||
| Exploratory analysis | Holm |
|
||||
| Quick internal check | Holm or None |
|
||||
|
||||
---
|
||||
|
||||
## Further Reading
|
||||
|
||||
- [Statistics for Dummies Cheat Sheet](https://www.dummies.com/article/academics-the-arts/math/statistics/statistics-for-dummies-cheat-sheet-208650/)
|
||||
- [Choosing the Right Statistical Test](https://stats.oarc.ucla.edu/other/mult-pkg/whatstat/)
|
||||
- [Multiple Comparisons Problem (Wikipedia)](https://en.wikipedia.org/wiki/Multiple_comparisons_problem)
|
||||
491
plots.py
491
plots.py
@@ -290,10 +290,11 @@ class QualtricsPlotsMixin:
|
||||
if domain is None:
|
||||
domain = [stats_df['average'].min(), stats_df['average'].max()]
|
||||
|
||||
# Base bar chart
|
||||
# Base bar chart - use y2 to explicitly start bars at domain minimum
|
||||
bars = alt.Chart(stats_df).mark_bar(color=color).encode(
|
||||
x=alt.X('voice:N', title=x_label, sort='-y'),
|
||||
y=alt.Y('average:Q', title=y_label, scale=alt.Scale(domain=domain)),
|
||||
y2=alt.datum(domain[0]), # Bars start at domain minimum (bottom edge)
|
||||
tooltip=[
|
||||
alt.Tooltip('voice:N', title='Voice'),
|
||||
alt.Tooltip('average:Q', title='Average', format='.2f'),
|
||||
@@ -1099,5 +1100,493 @@ class QualtricsPlotsMixin:
|
||||
height=height or getattr(self, 'plot_height', 400)
|
||||
)
|
||||
|
||||
chart = self._save_plot(chart, title)
|
||||
return chart
|
||||
|
||||
def plot_single_character_trait_frequency(
    self,
    data: pl.LazyFrame | pl.DataFrame | None = None,
    character_name: str = "Character",
    bar_color: str = ColorPalette.PRIMARY,
    highlight_color: str = ColorPalette.NEUTRAL,
    title: str | None = None,
    x_label: str = "Trait",
    y_label: str = "Frequency",
    trait_sort_order: list[str] | None = None,
    height: int | None = None,
    width: int | str | None = None,
) -> alt.Chart:
    """Create a bar plot showing trait frequency for a single character.

    Original use-case: "I need a bar plot that shows the frequency of the times
    each trait is chosen per brand character. The function should be generalized
    so that it can be used 4 times, once for each character. Each character should
    use a slightly different color. Original traits should be highlighted."

    This function creates one plot per character. Call it 4 times (once per
    character) to generate all plots for a slide.

    Args:
        data: DataFrame with columns ['trait', 'count', 'is_original']
            as produced by transform_character_trait_frequency()
        character_name: Name of the character (for title). E.g., "Bank Teller"
        bar_color: Main bar color for non-original traits. Use ColorPalette
            constants like ColorPalette.CHARACTER_BANK_TELLER
        highlight_color: Lighter color for original/expected traits. Use the
            matching highlight like ColorPalette.CHARACTER_BANK_TELLER_HIGHLIGHT
        title: Custom title. If None, auto-generates from character_name
            (supports <br> for line breaks, handled by _process_title)
        x_label: X-axis label (applied to the trait axis; bars are horizontal,
            so this ends up on the y encoding)
        y_label: Y-axis label (applied to the count axis; x encoding)
        trait_sort_order: Optional list of traits for consistent sorting across
            all character plots. If None, sorts by count descending.
        height: Chart height
        width: Chart width

    Returns:
        alt.Chart: Altair bar chart (bars + count labels layered)
    """
    df = self._ensure_dataframe(data)

    # Ensure we have the expected columns; on mismatch return a text-only
    # chart with the error so the notebook still renders something visible.
    required_cols = {'trait', 'count', 'is_original'}
    if not required_cols.issubset(set(df.columns)):
        return alt.Chart(pd.DataFrame({
            'text': ['Data must have trait, count, is_original columns']
        })).mark_text().encode(text='text:N')

    # Convert to pandas for Altair (Polars frames expose .to_pandas()).
    plot_df = df.to_pandas() if hasattr(df, 'to_pandas') else df

    # Determine sort order
    if trait_sort_order is not None:
        # Use provided order, append any missing traits at the end (sorted by count)
        known_traits = set(trait_sort_order)
        extra_traits = plot_df[~plot_df['trait'].isin(known_traits)].sort_values(
            'count', ascending=False
        )['trait'].tolist()
        sort_order = trait_sort_order + extra_traits
    else:
        # Default: sort by count descending
        sort_order = plot_df.sort_values('count', ascending=False)['trait'].tolist()

    # Create category column for color encoding (legend labels).
    plot_df['category'] = plot_df['is_original'].map({
        True: 'Original Trait',
        False: 'Other Trait'
    })

    # Generate title if not provided
    if title is None:
        title = f"{character_name}<br>Trait Selection Frequency"

    # Build title config with sort order note as subtitle
    sort_note = "Sorted by total frequency across all characters" if trait_sort_order else "Sorted by frequency (descending)"
    title_text = self._process_title(title)
    title_config = {
        'text': title_text,
        'subtitle': sort_note,
        'subtitleColor': 'gray',
        'subtitleFontSize': 10,
        'anchor': 'start',
    }

    # Create HORIZONTAL bar chart with conditional coloring.
    # Reverse sort order for horizontal bars (Altair's y axis runs top-down,
    # so reversing puts the highest-count trait at the top).
    reversed_sort = list(reversed(sort_order))

    bars = alt.Chart(plot_df).mark_bar().encode(
        y=alt.Y('trait:N',
                title=x_label,
                sort=reversed_sort,
                axis=alt.Axis(labelLimit=200)),
        x=alt.X('count:Q', title=y_label),
        color=alt.Color('category:N',
                        scale=alt.Scale(
                            domain=['Original Trait', 'Other Trait'],
                            range=[highlight_color, bar_color]
                        ),
                        legend=alt.Legend(
                            orient='top',
                            direction='horizontal',
                            title=None
                        )),
        tooltip=[
            alt.Tooltip('trait:N', title='Trait'),
            alt.Tooltip('count:Q', title='Frequency'),
            alt.Tooltip('category:N', title='Type')
        ]
    )

    # Add count labels on bars (to the right of bars for horizontal)
    text = alt.Chart(plot_df).mark_text(
        dx=12,
        color='black',
        fontSize=10,
        align='left'
    ).encode(
        y=alt.Y('trait:N', sort=reversed_sort),
        x=alt.X('count:Q'),
        text=alt.Text('count:Q')
    )

    # Layer bars and labels; fall back to instance plot_height if unset.
    chart = (bars + text).properties(
        title=title_config,
        width=width or 400,
        height=height or getattr(self, 'plot_height', 450)
    )

    chart = self._save_plot(chart, title)
    return chart
|
||||
|
||||
def plot_significance_heatmap(
    self,
    pairwise_df: pl.LazyFrame | pl.DataFrame | None = None,
    metadata: dict | None = None,
    title: str = "Pairwise Statistical Significance<br>(Adjusted p-values)",
    show_p_values: bool = True,
    show_effect_size: bool = False,
    height: int | None = None,
    width: int | None = None,
) -> alt.Chart:
    """Create a heatmap showing pairwise statistical significance between groups.

    Original use-case: "I need to test for statistical significance and present
    this in a logical manner - as a heatmap or similar visualization."

    This function visualizes the output of compute_pairwise_significance() as
    a color-coded heatmap where color intensity indicates significance level.
    The matrix is rendered symmetric: each pair is looked up in both (A, B)
    and (B, A) orientations, and the diagonal is a self-comparison marker.

    Args:
        pairwise_df: Output from compute_pairwise_significance().
            Expected columns: ['group1', 'group2', 'p_value', 'p_adjusted', 'significant']
        metadata: Metadata dict from compute_pairwise_significance() (optional).
            Used to add test information to the plot subtitle.
        title: Chart title (supports <br> for line breaks)
        show_p_values: Whether to display p-values as text annotations
        show_effect_size: Whether to display effect sizes instead of p-values
        height: Chart height (default: auto-sized based on groups)
        width: Chart width (default: auto-sized based on groups)

    Returns:
        alt.Chart: Altair heatmap chart
    """
    df = self._ensure_dataframe(pairwise_df)

    # Get unique groups (union of both comparison columns, sorted for a
    # stable axis order).
    all_groups = sorted(set(df['group1'].to_list() + df['group2'].to_list()))
    n_groups = len(all_groups)

    # Create symmetric matrix data for heatmap
    # We need both directions (A,B) and (B,A) for the full matrix
    heatmap_data = []
    for row_group in all_groups:
        for col_group in all_groups:
            if row_group == col_group:
                # Diagonal - self comparison
                heatmap_data.append({
                    'row': row_group,
                    'col': col_group,
                    'p_adjusted': None,
                    'p_value': None,
                    'significant': None,
                    'effect_size': None,
                    'text_label': '—',
                    'sig_category': 'Self',
                })
            else:
                # Find the comparison (could be in either order)
                match = df.filter(
                    ((pl.col('group1') == row_group) & (pl.col('group2') == col_group)) |
                    ((pl.col('group1') == col_group) & (pl.col('group2') == row_group))
                )
                if match.height > 0:
                    p_adj = match['p_adjusted'][0]
                    p_val = match['p_value'][0]
                    sig = match['significant'][0]
                    eff = match['effect_size'][0] if 'effect_size' in match.columns else None

                    # For ranking data, we can show Rank 1 % difference
                    has_rank_pcts = 'rank1_pct1' in match.columns and 'rank1_pct2' in match.columns
                    if has_rank_pcts:
                        pct_diff = abs(match['rank1_pct1'][0] - match['rank1_pct2'][0])
                    else:
                        pct_diff = None

                    # Helper to get display text when not showing p-values.
                    # NOTE: closes over eff/pct_diff of the current iteration;
                    # it is called before the next iteration rebinds them, so
                    # the late-binding closure is safe here.
                    def get_alt_text():
                        if eff is not None:
                            return f'{eff:.2f}'
                        elif pct_diff is not None:
                            return f'{pct_diff:.1f}%'
                        else:
                            return '—'

                    # Categorize significance level (drives cell color);
                    # thresholds follow the conventional 0.001/0.01/0.05 tiers.
                    if p_adj is None:
                        sig_cat = 'N/A'
                        text = 'N/A'
                    elif p_adj < 0.001:
                        sig_cat = 'p < 0.001'
                        text = '<.001' if show_p_values else get_alt_text()
                    elif p_adj < 0.01:
                        sig_cat = 'p < 0.01'
                        text = f'{p_adj:.3f}' if show_p_values else get_alt_text()
                    elif p_adj < 0.05:
                        sig_cat = 'p < 0.05'
                        text = f'{p_adj:.3f}' if show_p_values else get_alt_text()
                    else:
                        sig_cat = 'n.s.'
                        text = f'{p_adj:.2f}' if show_p_values else get_alt_text()

                    # show_effect_size overrides show_p_values for the label.
                    if show_effect_size:
                        text = get_alt_text()

                    heatmap_data.append({
                        'row': row_group,
                        'col': col_group,
                        'p_adjusted': p_adj,
                        'p_value': p_val,
                        'significant': sig,
                        'effect_size': eff,
                        'text_label': text,
                        'sig_category': sig_cat,
                    })
                else:
                    # Pair absent from the pairwise results entirely.
                    heatmap_data.append({
                        'row': row_group,
                        'col': col_group,
                        'p_adjusted': None,
                        'p_value': None,
                        'significant': None,
                        'effect_size': None,
                        'text_label': 'N/A',
                        'sig_category': 'N/A',
                    })

    heatmap_df = pl.DataFrame(heatmap_data).to_pandas()

    # Define color scale for significance categories
    sig_domain = ['p < 0.001', 'p < 0.01', 'p < 0.05', 'n.s.', 'Self', 'N/A']
    sig_range = [
        ColorPalette.SIG_STRONG,    # p < 0.001
        ColorPalette.SIG_MODERATE,  # p < 0.01
        ColorPalette.SIG_WEAK,      # p < 0.05
        ColorPalette.SIG_NONE,      # not significant
        ColorPalette.SIG_DIAGONAL,  # diagonal (self)
        ColorPalette.NEUTRAL,       # N/A
    ]

    # Build tooltip fields based on available data
    tooltip_fields = [
        alt.Tooltip('row:N', title='Group 1'),
        alt.Tooltip('col:N', title='Group 2'),
        alt.Tooltip('p_value:Q', title='p-value', format='.4f'),
        alt.Tooltip('p_adjusted:Q', title='Adjusted p', format='.4f'),
    ]
    # Only add effect_size if it has non-null values (continuous data)
    has_effect_size = 'effect_size' in heatmap_df.columns and heatmap_df['effect_size'].notna().any()
    if has_effect_size:
        tooltip_fields.append(alt.Tooltip('effect_size:Q', title='Effect Size', format='.3f'))
    # Add rank info for ranking data
    # NOTE(review): the tooltip reuses 'text_label', which only contains the
    # Rank 1 % diff when show_p_values is False — verify this matches intent.
    has_rank_pcts = 'rank1_pct1' in df.columns if isinstance(df, pl.DataFrame) else False
    if has_rank_pcts:
        tooltip_fields.append(alt.Tooltip('text_label:N', title='Rank 1 % Diff'))

    # Calculate dimensions: square cells, plus margin for axis labels/legend.
    cell_size = 45
    auto_size = n_groups * cell_size + 100
    chart_width = width or auto_size
    chart_height = height or auto_size

    # Base heatmap
    heatmap = alt.Chart(heatmap_df).mark_rect(stroke='white', strokeWidth=1).encode(
        x=alt.X('col:N', title=None, sort=all_groups,
                axis=alt.Axis(labelAngle=-45, labelLimit=150)),
        y=alt.Y('row:N', title=None, sort=all_groups,
                axis=alt.Axis(labelLimit=150)),
        color=alt.Color('sig_category:N',
                        scale=alt.Scale(domain=sig_domain, range=sig_range),
                        legend=alt.Legend(
                            title='Significance',
                            orient='right',
                            direction='vertical'
                        )),
        tooltip=tooltip_fields
    )

    # Text annotations
    if show_p_values or show_effect_size:
        # Add a column for text color based on significance
        # (white text on the two darkest cell colors for contrast).
        heatmap_df['text_color'] = heatmap_df['sig_category'].apply(
            lambda x: 'white' if x in ['p < 0.001', 'p < 0.01'] else 'black'
        )

        text = alt.Chart(heatmap_df).mark_text(
            fontSize=9,
            fontWeight='normal'
        ).encode(
            x=alt.X('col:N', sort=all_groups),
            y=alt.Y('row:N', sort=all_groups),
            text='text_label:N',
            color=alt.Color('text_color:N', scale=None),
        )
        chart = (heatmap + text)
    else:
        chart = heatmap

    # Build subtitle with test info
    subtitle_lines = []
    if metadata:
        test_info = f"Test: {metadata.get('test_type', 'N/A')}"
        if metadata.get('overall_p_value') is not None:
            test_info += f" | Overall p={metadata['overall_p_value']:.4f}"
        correction = metadata.get('correction', 'none')
        if correction != 'none':
            test_info += f" | Correction: {correction}"
        subtitle_lines.append(test_info)

    title_config = {
        'text': self._process_title(title),
        'subtitle': subtitle_lines if subtitle_lines else None,
        'subtitleColor': 'gray',
        'subtitleFontSize': 10,
        'anchor': 'start',
    }

    chart = chart.properties(
        title=title_config,
        width=chart_width,
        height=chart_height,
    )

    chart = self._save_plot(chart, title)
    return chart
|
||||
|
||||
def plot_significance_summary(
    self,
    pairwise_df: pl.LazyFrame | pl.DataFrame | None = None,
    metadata: dict | None = None,
    title: str = "Significant Differences Summary<br>(Groups with significantly different means)",
    height: int | None = None,
    width: int | None = None,
) -> alt.Chart:
    """Create a summary bar chart showing which groups have significant differences.

    This shows each group with a count of how many other groups it differs from
    significantly, plus the mean score or Rank 1 percentage for reference
    (rendered as a label above each bar).

    Args:
        pairwise_df: Output from compute_pairwise_significance() or compute_ranking_significance().
        metadata: Metadata dict from the significance computation (optional).
        title: Chart title
        height: Chart height
        width: Chart width

    Returns:
        alt.Chart: Altair bar chart with significance count per group
    """
    df = self._ensure_dataframe(pairwise_df)

    # Detect data type: continuous (has mean1/mean2) vs ranking (has rank1_pct1/rank1_pct2)
    has_means = 'mean1' in df.columns
    has_ranks = 'rank1_pct1' in df.columns

    # Count significant differences per group
    sig_df = df.filter(pl.col('significant') == True)

    # Count for each group (appears as either group1 or group2)
    group1_counts = sig_df.group_by('group1').agg(pl.len().alias('count'))
    group2_counts = sig_df.group_by('group2').agg(pl.len().alias('count'))

    # Combine counts across both orientations of each pair
    all_groups = sorted(set(df['group1'].to_list() + df['group2'].to_list()))
    summary_data = []

    for group in all_groups:
        count1 = group1_counts.filter(pl.col('group1') == group)['count'].to_list()
        count2 = group2_counts.filter(pl.col('group2') == group)['count'].to_list()
        total_sig = (count1[0] if count1 else 0) + (count2[0] if count2 else 0)

        # Get reference score for this group from the pairwise data:
        # mean for continuous tests, Rank 1 % for ranking tests.
        if has_means:
            scores = df.filter(pl.col('group1') == group)['mean1'].to_list()
            if not scores:
                scores = df.filter(pl.col('group2') == group)['mean2'].to_list()
            score_val = scores[0] if scores else None
        elif has_ranks:
            scores = df.filter(pl.col('group1') == group)['rank1_pct1'].to_list()
            if not scores:
                scores = df.filter(pl.col('group2') == group)['rank1_pct2'].to_list()
            score_val = scores[0] if scores else None
        else:
            score_val = None

        summary_data.append({
            'group': group,
            'sig_count': total_sig,
            'score': score_val,
        })

    summary_df = pl.DataFrame(summary_data).sort('score', descending=True, nulls_last=True).to_pandas()

    # Create layered chart: bars for sig_count, text for score
    tooltip_title = 'Mean Score' if has_means else 'Rank 1 %' if has_ranks else 'Score'

    bars = alt.Chart(summary_df).mark_bar(color=ColorPalette.PRIMARY).encode(
        x=alt.X('group:N', title='Group', sort='-y'),
        y=alt.Y('sig_count:Q', title='# of Significant Differences'),
        tooltip=[
            alt.Tooltip('group:N', title='Group'),
            alt.Tooltip('sig_count:Q', title='Sig. Differences'),
            alt.Tooltip('score:Q', title=tooltip_title, format='.1f'),
        ]
    )

    # Only add text labels if we have scores
    if summary_df['score'].notna().any():
        # FIX: the '%' suffix for ranking data was previously computed but
        # never applied, so labels contradicted the "Rank 1 % shown above
        # bars" subtitle. Pre-format the label as a string column so the
        # suffix is actually rendered.
        text_format = '.1f' if has_means else '.0f'
        text_suffix = '%' if has_ranks else ''
        summary_df['score_label'] = summary_df['score'].map(
            lambda v: format(v, text_format) + text_suffix if pd.notna(v) else ''
        )
        text = alt.Chart(summary_df).mark_text(
            dy=-8,
            color='black',
            fontSize=9
        ).encode(
            x=alt.X('group:N', sort='-y'),
            y=alt.Y('sig_count:Q'),
            text=alt.Text('score_label:N')
        )
        chart_layers = bars + text
    else:
        chart_layers = bars

    # Build subtitle describing the label semantics and alpha level
    subtitle = None
    if metadata:
        if has_means:
            subtitle = f"Mean scores shown above bars | α={metadata.get('alpha', 0.05)}"
        elif has_ranks:
            subtitle = f"Rank 1 % shown above bars | α={metadata.get('alpha', 0.05)}"
        else:
            subtitle = f"α={metadata.get('alpha', 0.05)}"

    title_config = {
        'text': self._process_title(title),
        'subtitle': subtitle,
        'subtitleColor': 'gray',
        'subtitleFontSize': 10,
        'anchor': 'start',
    }

    chart = chart_layers.properties(
        title=title_config,
        width=width or 800,
        height=height or getattr(self, 'plot_height', 400),
    )

    chart = self._save_plot(chart, title)
    return chart
|
||||
@@ -22,6 +22,7 @@ dependencies = [
|
||||
"python-pptx>=1.0.2",
|
||||
"pyzmq>=27.1.0",
|
||||
"requests>=2.32.5",
|
||||
"scipy>=1.14.0",
|
||||
"taguette>=1.5.1",
|
||||
"vl-convert-python>=1.9.0.post1",
|
||||
"wordcloud>=1.9.5",
|
||||
|
||||
21
theme.py
21
theme.py
@@ -19,11 +19,32 @@ class ColorPalette:
|
||||
# Neutral color for unhighlighted comparison items
|
||||
NEUTRAL = "#D3D3D3" # Light Grey
|
||||
|
||||
# Character-specific colors (for individual character plots)
|
||||
# Each character has a main color and a lighter highlight for original traits
|
||||
CHARACTER_BANK_TELLER = "#004C6D" # Dark Blue
|
||||
CHARACTER_BANK_TELLER_HIGHLIGHT = "#669BBC" # Light Steel Blue
|
||||
|
||||
CHARACTER_FAMILIAR_FRIEND = "#008493" # Teal
|
||||
CHARACTER_FAMILIAR_FRIEND_HIGHLIGHT = "#A8DADC" # Pale Cyan
|
||||
|
||||
CHARACTER_COACH = "#5AAE95" # Sea Green
|
||||
CHARACTER_COACH_HIGHLIGHT = "#A8DADC" # Pale Cyan
|
||||
|
||||
CHARACTER_PERSONAL_ASSISTANT = "#457B9D" # Steel Blue
|
||||
CHARACTER_PERSONAL_ASSISTANT_HIGHLIGHT = "#669BBC" # Light Steel Blue
|
||||
|
||||
# General UI elements
|
||||
TEXT = "black"
|
||||
GRID = "lightgray"
|
||||
BACKGROUND = "white"
|
||||
|
||||
# Statistical significance colors (for heatmaps/annotations)
|
||||
SIG_STRONG = "#004C6D" # p < 0.001 - Dark Blue (highly significant)
|
||||
SIG_MODERATE = "#0077B6" # p < 0.01 - Medium Blue (significant)
|
||||
SIG_WEAK = "#5AAE95" # p < 0.05 - Sea Green (marginally significant)
|
||||
SIG_NONE = "#E8E8E8" # p >= 0.05 - Light Grey (not significant)
|
||||
SIG_DIAGONAL = "#FFFFFF" # White for diagonal (self-comparison)
|
||||
|
||||
# Extended palette for categorical charts (e.g., pie charts with many categories)
|
||||
CATEGORICAL = [
|
||||
"#0077B6", # PRIMARY - Medium Blue
|
||||
|
||||
437
utils.py
437
utils.py
@@ -714,7 +714,7 @@ def normalize_global_values(df: pl.DataFrame, target_cols: list[str]) -> pl.Data
|
||||
|
||||
|
||||
class QualtricsSurvey(QualtricsPlotsMixin):
|
||||
"""Class to handle JPMorgan Chase survey data."""
|
||||
"""Class to handle Qualtrics survey data."""
|
||||
|
||||
def __init__(self, data_path: Union[str, Path], qsf_path: Union[str, Path]):
|
||||
if isinstance(data_path, str):
|
||||
@@ -1072,6 +1072,441 @@ class QualtricsSurvey(QualtricsPlotsMixin):
|
||||
|
||||
return self._get_subset(q, QIDs, rename_cols=True), None
|
||||
|
||||
def transform_character_trait_frequency(
    self,
    char_df: pl.LazyFrame | pl.DataFrame,
    character_column: str,
) -> tuple[pl.DataFrame, dict | None]:
    """Transform character refine data to trait frequency counts for a single character.

    Original use-case: "I need a bar plot that shows the frequency of the times
    each trait is chosen per brand character."

    This function takes a DataFrame with comma-separated trait selections per
    character, explodes traits, and counts their frequency for a single character.

    Args:
        char_df: Pre-fetched data
            Expected columns: '_recordId', '<character_column>' (with comma-separated traits)
        character_column: Name of the character column to analyze (e.g., 'Bank Teller')

    Returns:
        tuple: (DataFrame with columns ['trait', 'count', 'is_original'], None)
            - 'trait': individual trait name
            - 'count': frequency count
            - 'is_original': boolean indicating if trait is in the original definition
    """
    from reference import ORIGINAL_CHARACTER_TRAITS

    if isinstance(char_df, pl.LazyFrame):
        char_df = char_df.collect()

    # Map display names to reference keys
    character_key_map = {
        'Bank Teller': 'the_bank_teller',
        'Familiar Friend': 'the_familiar_friend',
        'The Coach': 'the_coach',
        'Personal Assistant': 'the_personal_assistant',
    }

    # Get original traits for this character.
    # FIX: previously an unmapped display name (e.g. "Coach" instead of
    # "The Coach") silently produced an empty original-trait set, making
    # every trait 'is_original' == False. Fall back to a normalized
    # snake_case key ("Bank Teller" -> "the_bank_teller") before giving up.
    ref_key = character_key_map.get(character_column)
    if ref_key is None:
        normalized = character_column.strip().lower().replace(' ', '_')
        ref_key = normalized if normalized.startswith('the_') else f'the_{normalized}'
    original_traits = set(ORIGINAL_CHARACTER_TRAITS.get(ref_key, []))

    # Filter to rows where this character has a value (not null)
    char_data = char_df.filter(pl.col(character_column).is_not_null())

    # Split comma-separated traits, explode to one row per trait, and
    # trim whitespace; drop empty strings left by trailing commas.
    exploded = (
        char_data
        .select(
            pl.col(character_column)
            .str.split(',')
            .alias('traits')
        )
        .explode('traits')
        .with_columns(
            pl.col('traits').str.strip_chars().alias('trait')
        )
        .filter(pl.col('trait') != '')
    )

    # Count trait frequencies (most frequent first)
    freq_df = (
        exploded
        .group_by('trait')
        .agg(pl.len().alias('count'))
        .sort('count', descending=True)
    )

    # Add is_original flag so plots can highlight expected traits
    freq_df = freq_df.with_columns(
        pl.col('trait').is_in(list(original_traits)).alias('is_original')
    )

    return freq_df, None
|
||||
|
||||
def compute_pairwise_significance(
    self,
    data: pl.LazyFrame | pl.DataFrame,
    test_type: str = "auto",
    alpha: float = 0.05,
    correction: str = "bonferroni",
) -> tuple[pl.DataFrame, dict]:
    """Compute pairwise statistical significance tests between columns.

    Original use-case: "I need to test for statistical significance and present
    this in a logical manner. It should be a generalized function to work on
    many dataframes."

    This function performs pairwise statistical tests between all numeric columns
    (excluding '_recordId') to determine which groups differ significantly.

    Args:
        data: Pre-fetched data with numeric columns to compare.
            Expected format: rows are observations, columns are groups/categories.
            Example: Voice_Scale_1_10__V14, Voice_Scale_1_10__V04, etc.
        test_type: Statistical test to use:
            - "auto": Automatically chooses based on data (default)
            - "mannwhitney": Mann-Whitney U test (non-parametric, for continuous)
            - "ttest": Independent samples t-test (parametric, for continuous)
            - "chi2": Chi-square test (for count/frequency data)
        alpha: Significance level (default 0.05)
        correction: Multiple comparison correction method:
            - "bonferroni": Bonferroni correction (conservative)
            - "holm": Holm-Bonferroni (less conservative)
            - "none": No correction

    Returns:
        tuple: (pairwise_df, metadata)
            - pairwise_df: DataFrame with columns ['group1', 'group2', 'p_value',
              'p_adjusted', 'significant', 'effect_size', 'mean1', 'mean2', 'n1', 'n2']
            - metadata: dict with 'test_type', 'alpha', 'correction', 'n_comparisons',
              'overall_test', 'overall_stat', 'overall_p_value'

    Raises:
        ValueError: If fewer than 2 numeric columns are found, or test_type
            is not one of the supported values.
    """
    from scipy import stats as scipy_stats
    import numpy as np

    if isinstance(data, pl.LazyFrame):
        df = data.collect()
    else:
        df = data

    # Get numeric columns (exclude _recordId and other non-data columns)
    value_cols = [c for c in df.columns if c != '_recordId' and df[c].dtype in [pl.Float64, pl.Float32, pl.Int64, pl.Int32]]

    if len(value_cols) < 2:
        raise ValueError(f"Need at least 2 numeric columns for comparison, found {len(value_cols)}")

    # Auto-detect test type based on data characteristics: low-cardinality
    # integers look like count/category data -> chi-square; otherwise
    # default to the non-parametric Mann-Whitney U.
    if test_type == "auto":
        sample_col = df[value_cols[0]].drop_nulls()
        if len(sample_col) > 0:
            is_integer = sample_col.dtype in [pl.Int64, pl.Int32]
            unique_ratio = sample_col.n_unique() / len(sample_col)
            if is_integer and unique_ratio < 0.1:
                test_type = "chi2"
            else:
                test_type = "mannwhitney"
        else:
            test_type = "mannwhitney"

    # Extract data as numpy arrays (dropping nulls per column)
    group_data = {}
    for col in value_cols:
        group_data[col] = df[col].drop_nulls().to_numpy()

    # Compute overall test (Kruskal-Wallis for non-parametric, ANOVA for parametric).
    # "auto" has already been resolved above, so only concrete test names remain.
    all_groups = [group_data[col] for col in value_cols if len(group_data[col]) > 0]
    if test_type == "mannwhitney":
        overall_stat, overall_p = scipy_stats.kruskal(*all_groups)
        overall_test_name = "Kruskal-Wallis"
    elif test_type == "ttest":
        overall_stat, overall_p = scipy_stats.f_oneway(*all_groups)
        overall_test_name = "One-way ANOVA"
    else:
        overall_stat, overall_p = None, None
        overall_test_name = "N/A (Chi-square)"

    # Compute pairwise tests over all unordered column pairs
    results = []
    n_comparisons = len(value_cols) * (len(value_cols) - 1) // 2

    for i, col1 in enumerate(value_cols):
        for col2 in value_cols[i+1:]:
            data1 = group_data[col1]
            data2 = group_data[col2]

            n1, n2 = len(data1), len(data2)
            mean1 = float(np.mean(data1)) if n1 > 0 else None
            mean2 = float(np.mean(data2)) if n2 > 0 else None

            # Skip if either group has no data (p_value stays None and is
            # excluded from correction below)
            if n1 == 0 or n2 == 0:
                results.append({
                    'group1': self._clean_voice_label(col1),
                    'group2': self._clean_voice_label(col2),
                    'p_value': None,
                    'effect_size': None,
                    'mean1': mean1,
                    'mean2': mean2,
                    'n1': n1,
                    'n2': n2,
                })
                continue

            # Perform the appropriate test
            if test_type == "mannwhitney":
                stat, p_value = scipy_stats.mannwhitneyu(data1, data2, alternative='two-sided')
                # Effect size: rank-biserial correlation r = 1 - 2U/(n1*n2)
                effect_size = 1 - (2 * stat) / (n1 * n2)
            elif test_type == "ttest":
                stat, p_value = scipy_stats.ttest_ind(data1, data2)
                # Effect size: Cohen's d with pooled SAMPLE standard deviation.
                # FIX: use ddof=1 — the (n-1)-weighted pooling formula expects
                # sample variances; np.std defaults to population (ddof=0),
                # which biased d. Also guard the n1+n2-2 == 0 degenerate case.
                if n1 + n2 > 2:
                    pooled_std = np.sqrt(
                        ((n1 - 1) * np.std(data1, ddof=1) ** 2 + (n2 - 1) * np.std(data2, ddof=1) ** 2)
                        / (n1 + n2 - 2)
                    )
                else:
                    pooled_std = 0.0
                effect_size = (mean1 - mean2) / pooled_std if pooled_std > 0 else 0
            elif test_type == "chi2":
                # Create contingency table from the two distributions
                # by binning the pooled data for chi-square
                all_data = np.concatenate([data1, data2])
                bins = np.histogram_bin_edges(all_data, bins='auto')
                counts1, _ = np.histogram(data1, bins=bins)
                counts2, _ = np.histogram(data2, bins=bins)
                contingency = np.array([counts1, counts2])
                # Remove zero columns (chi2_contingency rejects all-zero cells)
                contingency = contingency[:, contingency.sum(axis=0) > 0]
                if contingency.shape[1] > 1:
                    stat, p_value, _, _ = scipy_stats.chi2_contingency(contingency)
                    # Effect size: Cramér's V
                    effect_size = np.sqrt(stat / (contingency.sum() * (min(contingency.shape) - 1)))
                else:
                    # Degenerate table: no discriminating bins
                    p_value, effect_size = 1.0, 0.0
            else:
                raise ValueError(f"Unknown test_type: {test_type}")

            results.append({
                'group1': self._clean_voice_label(col1),
                'group2': self._clean_voice_label(col2),
                'p_value': float(p_value),
                'effect_size': float(effect_size),
                'mean1': mean1,
                'mean2': mean2,
                'n1': n1,
                'n2': n2,
            })

    # Create DataFrame and apply multiple comparison correction
    results_df = pl.DataFrame(results)

    # Null p-values (empty groups) become NaN and are excluded from correction
    p_values = results_df['p_value'].to_numpy().astype(float)
    valid_mask = ~np.isnan(p_values)
    p_adjusted = np.full_like(p_values, np.nan, dtype=float)

    if correction == "bonferroni":
        p_adjusted[valid_mask] = np.minimum(p_values[valid_mask] * n_comparisons, 1.0)
    elif correction == "holm":
        # Holm-Bonferroni step-down procedure
        valid_p = p_values[valid_mask]
        sorted_idx = np.argsort(valid_p)
        sorted_p = valid_p[sorted_idx]
        m = len(sorted_p)
        adjusted = np.zeros(m)
        for j in range(m):
            adjusted[j] = sorted_p[j] * (m - j)
        # Ensure monotonicity (adjusted p's must be non-decreasing)
        for j in range(1, m):
            adjusted[j] = max(adjusted[j], adjusted[j-1])
        adjusted = np.minimum(adjusted, 1.0)
        # Restore original order via the inverse permutation
        p_adjusted[valid_mask] = adjusted[np.argsort(sorted_idx)]
    elif correction == "none":
        p_adjusted = p_values

    # NaN < alpha is False, so missing comparisons are marked not significant
    results_df = results_df.with_columns([
        pl.Series('p_adjusted', p_adjusted),
        pl.Series('significant', p_adjusted < alpha),
    ])

    metadata = {
        'test_type': test_type,
        'alpha': alpha,
        'correction': correction,
        'n_comparisons': n_comparisons,
        'overall_test': overall_test_name,
        'overall_stat': overall_stat,
        'overall_p_value': overall_p,
    }

    return results_df, metadata
|
||||
|
||||
def compute_ranking_significance(
    self,
    data: pl.LazyFrame | pl.DataFrame,
    alpha: float = 0.05,
    correction: str = "bonferroni",
) -> tuple[pl.DataFrame, dict]:
    """Compute statistical significance for ranking data (e.g., Top 3 Voices).

    Original use-case: "Test whether voices are ranked significantly differently
    based on the distribution of 1st, 2nd, 3rd place votes."

    This function takes raw ranking data (rows = respondents, columns = voices,
    values = rank 1/2/3 or null) and performs:
    1. Overall chi-square test on the full contingency table
    2. Pairwise two-proportion z-tests comparing Rank 1 vote shares

    Args:
        data: Pre-fetched ranking data from get_top_3_voices() or get_character_ranking().
            Expected format: rows are respondents, columns are voices/characters,
            values are 1, 2, 3 (rank) or null (not ranked).
        alpha: Significance level (default 0.05)
        correction: Multiple comparison correction method:
            - "bonferroni": Bonferroni correction (conservative)
            - "holm": Holm-Bonferroni (less conservative)
            - "none": No correction

    Returns:
        tuple: (pairwise_df, metadata)
            - pairwise_df: DataFrame with columns ['group1', 'group2', 'p_value',
              'p_adjusted', 'significant', 'rank1_count1', 'rank1_count2',
              'rank1_pct1', 'rank1_pct2', 'total1', 'total2']
            - metadata: dict with 'alpha', 'correction', 'n_comparisons',
              'chi2_stat', 'chi2_p_value', 'contingency_table'

    Raises:
        ValueError: If fewer than 2 ranking columns are present, or if
            `correction` is not one of "bonferroni", "holm", "none".

    Example:
        >>> ranking_data, _ = S.get_top_3_voices(data)
        >>> pairwise_df, meta = S.compute_ranking_significance(ranking_data)
        >>> # See which voices have significantly different Rank 1 proportions
        >>> print(pairwise_df.filter(pl.col('significant') == True))
    """
    from scipy import stats as scipy_stats
    import numpy as np

    # Fail fast on an unknown correction method. Previously an invalid value
    # fell through silently, leaving p_adjusted all-NaN (and every
    # 'significant' flag False, since NaN < alpha is False).
    if correction not in ("bonferroni", "holm", "none"):
        raise ValueError(
            f"Unknown correction method: {correction!r}; "
            "expected 'bonferroni', 'holm', or 'none'"
        )

    df = data.collect() if isinstance(data, pl.LazyFrame) else data

    # Get ranking columns (exclude _recordId)
    ranking_cols = [c for c in df.columns if c != '_recordId']

    if len(ranking_cols) < 2:
        raise ValueError(f"Need at least 2 ranking columns, found {len(ranking_cols)}")

    # Build contingency table: rows = ranks (1, 2, 3), columns = voices.
    # Count how many times each voice received each rank.
    contingency_data: dict[str, list[int]] = {}
    for col in ranking_cols:
        label = self._clean_voice_label(col)
        contingency_data[label] = [
            df.filter(pl.col(col) == rank).height for rank in (1, 2, 3)
        ]

    # Create contingency table as numpy array (3 x n_voices)
    labels = list(contingency_data.keys())
    contingency_table = np.array([contingency_data[l] for l in labels]).T

    # Overall chi-square test on contingency table.
    # Tests whether rank distribution is independent of voice.
    chi2_stat, chi2_p, chi2_dof, _ = scipy_stats.chi2_contingency(contingency_table)

    # Pairwise proportion tests for Rank 1 votes.
    # We use a two-proportion z-test to compare rank 1 proportions.
    results = []
    n_comparisons = len(labels) * (len(labels) - 1) // 2

    for i, label1 in enumerate(labels):
        for label2 in labels[i + 1:]:
            r1_count1 = contingency_data[label1][0]  # Rank 1 votes for voice 1
            r1_count2 = contingency_data[label2][0]  # Rank 1 votes for voice 2

            # Total times each voice was ranked (1st + 2nd + 3rd)
            total1 = sum(contingency_data[label1])
            total2 = sum(contingency_data[label2])

            # Proportion of Rank 1 out of all rankings for each voice
            pct1 = r1_count1 / total1 if total1 > 0 else 0
            pct2 = r1_count2 / total2 if total2 > 0 else 0

            # Two-proportion z-test
            # H0: p1 = p2 (both voices have same proportion of Rank 1)
            if total1 > 0 and total2 > 0 and (r1_count1 + r1_count2) > 0:
                # Pooled proportion under H0
                p_pooled = (r1_count1 + r1_count2) / (total1 + total2)

                # Standard error of the difference in proportions
                se = np.sqrt(p_pooled * (1 - p_pooled) * (1 / total1 + 1 / total2))

                if se > 0:
                    z_stat = (pct1 - pct2) / se
                    # Two-tailed p-value. norm.sf(|z|) is the numerically
                    # stable survival function; 1 - cdf(|z|) loses precision
                    # (underflows to 0) for large |z|.
                    p_value = 2 * scipy_stats.norm.sf(abs(z_stat))
                else:
                    p_value = 1.0
            else:
                p_value = 1.0

            results.append({
                'group1': label1,
                'group2': label2,
                'p_value': float(p_value),
                'rank1_count1': r1_count1,
                'rank1_count2': r1_count2,
                'rank1_pct1': round(pct1 * 100, 1),
                'rank1_pct2': round(pct2 * 100, 1),
                'total1': total1,
                'total2': total2,
            })

    # Create DataFrame and apply multiple-comparison correction
    results_df = pl.DataFrame(results)

    p_values = results_df['p_value'].to_numpy()

    if correction == "bonferroni":
        p_adjusted = np.minimum(p_values * n_comparisons, 1.0)
    elif correction == "holm":
        # Holm step-down: sort ascending, multiply the j-th smallest p by
        # (m - j), enforce monotonicity with a running max, clip at 1,
        # then restore the original order.
        order = np.argsort(p_values)
        m = p_values.size
        stepped = p_values[order] * (m - np.arange(m))
        adjusted = np.minimum(np.maximum.accumulate(stepped), 1.0)
        p_adjusted = adjusted[np.argsort(order)]
    else:  # correction == "none" (validated above)
        p_adjusted = p_values.astype(float)

    results_df = results_df.with_columns([
        pl.Series('p_adjusted', p_adjusted),
        pl.Series('significant', p_adjusted < alpha),
    ])

    # Sort by p_value for easier inspection
    results_df = results_df.sort('p_value')

    metadata = {
        'test_type': 'proportion_z_test',
        'alpha': alpha,
        'correction': correction,
        'n_comparisons': n_comparisons,
        'chi2_stat': chi2_stat,
        'chi2_p_value': chi2_p,
        'chi2_dof': chi2_dof,
        'overall_test': 'Chi-square',
        'overall_stat': chi2_stat,
        'overall_p_value': chi2_p,
        'contingency_table': {label: contingency_data[label] for label in labels},
    }

    return results_df, metadata
|
||||
|
||||
|
||||
def process_speaking_style_data(
|
||||
|
||||
2
uv.lock
generated
2
uv.lock
generated
@@ -1435,6 +1435,7 @@ dependencies = [
|
||||
{ name = "python-pptx" },
|
||||
{ name = "pyzmq" },
|
||||
{ name = "requests" },
|
||||
{ name = "scipy" },
|
||||
{ name = "taguette" },
|
||||
{ name = "vl-convert-python" },
|
||||
{ name = "wordcloud" },
|
||||
@@ -1459,6 +1460,7 @@ requires-dist = [
|
||||
{ name = "python-pptx", specifier = ">=1.0.2" },
|
||||
{ name = "pyzmq", specifier = ">=27.1.0" },
|
||||
{ name = "requests", specifier = ">=2.32.5" },
|
||||
{ name = "scipy", specifier = ">=1.14.0" },
|
||||
{ name = "taguette", specifier = ">=1.5.1" },
|
||||
{ name = "vl-convert-python", specifier = ">=1.9.0.post1" },
|
||||
{ name = "wordcloud", specifier = ">=1.9.5" },
|
||||
|
||||
Reference in New Issue
Block a user