statistical tests
This commit is contained in:
491
plots.py
491
plots.py
@@ -290,10 +290,11 @@ class QualtricsPlotsMixin:
|
||||
if domain is None:
|
||||
domain = [stats_df['average'].min(), stats_df['average'].max()]
|
||||
|
||||
# Base bar chart
|
||||
# Base bar chart - use y2 to explicitly start bars at domain minimum
|
||||
bars = alt.Chart(stats_df).mark_bar(color=color).encode(
|
||||
x=alt.X('voice:N', title=x_label, sort='-y'),
|
||||
y=alt.Y('average:Q', title=y_label, scale=alt.Scale(domain=domain)),
|
||||
y2=alt.datum(domain[0]), # Bars start at domain minimum (bottom edge)
|
||||
tooltip=[
|
||||
alt.Tooltip('voice:N', title='Voice'),
|
||||
alt.Tooltip('average:Q', title='Average', format='.2f'),
|
||||
@@ -1099,5 +1100,493 @@ class QualtricsPlotsMixin:
|
||||
height=height or getattr(self, 'plot_height', 400)
|
||||
)
|
||||
|
||||
chart = self._save_plot(chart, title)
|
||||
return chart
|
||||
|
||||
def plot_single_character_trait_frequency(
    self,
    data: pl.LazyFrame | pl.DataFrame | None = None,
    character_name: str = "Character",
    bar_color: str = ColorPalette.PRIMARY,
    highlight_color: str = ColorPalette.NEUTRAL,
    title: str | None = None,
    x_label: str = "Trait",
    y_label: str = "Frequency",
    trait_sort_order: list[str] | None = None,
    height: int | None = None,
    width: int | str | None = None,
) -> alt.Chart:
    """Create a horizontal bar plot of trait frequency for a single character.

    Original request: "I need a bar plot that shows the frequency of the times
    each trait is chosen per brand character. The function should be generalized
    so that it can be used 4 times, once for each character. Each character should
    use a slightly different color. Original traits should be highlighted."

    This function creates one plot per character. Call it once per character
    to generate all plots for a slide.

    Args:
        data: DataFrame with columns ['trait', 'count', 'is_original']
            as produced by transform_character_trait_frequency().
        character_name: Name of the character; only used to build the
            default title. E.g., "Bank Teller".
        bar_color: Bar color for traits where ``is_original`` is False.
        highlight_color: Bar color for traits where ``is_original`` is True.
        title: Custom title. If None, it is generated from character_name.
        x_label: Title of the trait axis (drawn on the y axis, because the
            bars are horizontal).
        y_label: Title of the count axis (drawn on the x axis).
        trait_sort_order: Optional trait order shared across character plots.
            Traits present in the data but missing from this list are appended,
            sorted by count descending. If None, traits are sorted by count
            descending.
        height: Chart height. Defaults to ``self.plot_height`` or 450.
        width: Chart width. Defaults to 400.

    Returns:
        alt.Chart: Layered bar + count-label chart. If a required column is
        missing, a text-only chart carrying an error message is returned
        instead.
    """
    df = self._ensure_dataframe(data)

    # Ensure we have the expected columns
    required_cols = {'trait', 'count', 'is_original'}
    if not required_cols.issubset(df.columns):
        return alt.Chart(pd.DataFrame({
            'text': ['Data must have trait, count, is_original columns']
        })).mark_text().encode(text='text:N')

    # Convert to pandas for Altair. A frame without to_pandas() is used as
    # pandas directly; copy it so the 'category' column added below never
    # leaks into the caller's object.
    plot_df = df.to_pandas() if hasattr(df, 'to_pandas') else df.copy()

    # Determine sort order
    if trait_sort_order is not None:
        # Use provided order, append any missing traits at the end (sorted by count)
        known_traits = set(trait_sort_order)
        extra_traits = plot_df[~plot_df['trait'].isin(known_traits)].sort_values(
            'count', ascending=False
        )['trait'].tolist()
        sort_order = list(trait_sort_order) + extra_traits
    else:
        # Default: sort by count descending
        sort_order = plot_df.sort_values('count', ascending=False)['trait'].tolist()

    # Create category column for color encoding
    plot_df['category'] = plot_df['is_original'].map({
        True: 'Original Trait',
        False: 'Other Trait'
    })

    # Generate title if not provided
    if title is None:
        title = f"{character_name}<br>Trait Selection Frequency"

    # Build title config with sort order note as subtitle
    sort_note = (
        "Sorted by total frequency across all characters"
        if trait_sort_order
        else "Sorted by frequency (descending)"
    )
    title_config = {
        'text': self._process_title(title),
        'subtitle': sort_note,
        'subtitleColor': 'gray',
        'subtitleFontSize': 10,
        'anchor': 'start',
    }

    # Horizontal bars. Vega-Lite lays out a discrete y scale from the top
    # down, so the first entry of an explicit sort list is drawn at the top.
    # sort_order is therefore passed as-is (not reversed) to keep the
    # highest-ranked trait at the top, as the subtitle promises.
    bars = alt.Chart(plot_df).mark_bar().encode(
        y=alt.Y('trait:N',
                title=x_label,
                sort=sort_order,
                axis=alt.Axis(labelLimit=200)),
        x=alt.X('count:Q', title=y_label),
        color=alt.Color('category:N',
                        scale=alt.Scale(
                            domain=['Original Trait', 'Other Trait'],
                            range=[highlight_color, bar_color]
                        ),
                        legend=alt.Legend(
                            orient='top',
                            direction='horizontal',
                            title=None
                        )),
        tooltip=[
            alt.Tooltip('trait:N', title='Trait'),
            alt.Tooltip('count:Q', title='Frequency'),
            alt.Tooltip('category:N', title='Type')
        ]
    )

    # Count labels just to the right of each bar end; must share the bars'
    # sort so both layers agree on row positions.
    text = alt.Chart(plot_df).mark_text(
        dx=12,
        color='black',
        fontSize=10,
        align='left'
    ).encode(
        y=alt.Y('trait:N', sort=sort_order),
        x=alt.X('count:Q'),
        text=alt.Text('count:Q')
    )

    chart = (bars + text).properties(
        title=title_config,
        width=width or 400,
        height=height or getattr(self, 'plot_height', 450)
    )

    chart = self._save_plot(chart, title)
    return chart
|
||||
|
||||
def plot_significance_heatmap(
    self,
    pairwise_df: pl.LazyFrame | pl.DataFrame | None = None,
    metadata: dict | None = None,
    title: str = "Pairwise Statistical Significance<br>(Adjusted p-values)",
    show_p_values: bool = True,
    show_effect_size: bool = False,
    height: int | None = None,
    width: int | None = None,
) -> alt.Chart:
    """Create a heatmap showing pairwise statistical significance between groups.

    Original use-case: "I need to test for statistical significance and present
    this in a logical manner - as a heatmap or similar visualization."

    Every (row, col) pair of groups becomes one cell, colored by the bucket
    its adjusted p-value falls into (< 0.001, < 0.01, < 0.05, n.s.). Diagonal
    cells are marked 'Self'; pairs with no comparison row, or with a null
    adjusted p-value, are marked 'N/A'.

    Args:
        pairwise_df: Output from compute_pairwise_significance().
            Required columns: ['group1', 'group2', 'p_value', 'p_adjusted',
            'significant']. Optional columns: 'effect_size', and the pair
            'rank1_pct1' / 'rank1_pct2' for ranking data.
        metadata: Optional dict; the keys 'test_type', 'overall_p_value' and
            'correction' are read to build the subtitle.
        title: Chart title (supports <br> for line breaks).
        show_p_values: Label cells with the adjusted p-value. When False
            (and labels are shown at all) the alternative label is used.
        show_effect_size: Label cells with the alternative label instead of
            the p-value: effect size if available, else the Rank 1 %
            difference, else '—'. Takes precedence over show_p_values.
        height: Chart height (default: auto-sized based on group count).
        width: Chart width (default: auto-sized based on group count).

    Returns:
        alt.Chart: Heatmap, layered with text labels when show_p_values or
        show_effect_size is set.
    """
    df = self._ensure_dataframe(pairwise_df)

    # Get unique groups
    all_groups = sorted(set(df['group1'].to_list() + df['group2'].to_list()))
    n_groups = len(all_groups)

    has_effect_col = 'effect_size' in df.columns
    has_rank_pcts = 'rank1_pct1' in df.columns and 'rank1_pct2' in df.columns
    show_text = show_p_values or show_effect_size

    # Index the comparisons by unordered pair once, so each cell is a dict
    # lookup rather than a full-frame filter. setdefault keeps the first row
    # seen for a pair, i.e. the same row a first-match filter would return.
    pair_rows = {}
    for rec in df.iter_rows(named=True):
        pair_rows.setdefault(frozenset((rec['group1'], rec['group2'])), rec)

    def classify(p_adj):
        """Return (significance category, p-value label) for an adjusted p."""
        if p_adj is None:
            return 'N/A', 'N/A'
        if p_adj < 0.001:
            return 'p < 0.001', '<.001'
        if p_adj < 0.01:
            return 'p < 0.01', f'{p_adj:.3f}'
        if p_adj < 0.05:
            return 'p < 0.05', f'{p_adj:.3f}'
        return 'n.s.', f'{p_adj:.2f}'

    def alternative_label(eff, pct_diff):
        """Label used in place of the p-value: effect size, else rank-1 % diff."""
        if eff is not None:
            return f'{eff:.2f}'
        if pct_diff is not None:
            return f'{pct_diff:.1f}%'
        return '—'

    def empty_cell(row_group, col_group, label, category):
        """Cell with no statistics (diagonal or missing comparison)."""
        return {
            'row': row_group,
            'col': col_group,
            'p_adjusted': None,
            'p_value': None,
            'significant': None,
            'effect_size': None,
            'rank1_pct_diff': None,
            'text_label': label,
            'sig_category': category,
        }

    # Build the full symmetric matrix: both (A, B) and (B, A) get a cell.
    heatmap_data = []
    for row_group in all_groups:
        for col_group in all_groups:
            if row_group == col_group:
                # Diagonal - self comparison
                heatmap_data.append(empty_cell(row_group, col_group, '—', 'Self'))
                continue

            rec = pair_rows.get(frozenset((row_group, col_group)))
            if rec is None:
                heatmap_data.append(empty_cell(row_group, col_group, 'N/A', 'N/A'))
                continue

            p_adj = rec['p_adjusted']
            eff = rec['effect_size'] if has_effect_col else None

            # For ranking data, expose the Rank 1 % difference. Null
            # percentages yield no difference instead of a TypeError.
            pct_diff = None
            if has_rank_pcts:
                pct1, pct2 = rec['rank1_pct1'], rec['rank1_pct2']
                if pct1 is not None and pct2 is not None:
                    pct_diff = abs(pct1 - pct2)

            sig_cat, p_text = classify(p_adj)
            # A null adjusted p keeps its 'N/A' label unless effect sizes
            # were explicitly requested; otherwise the p-value text is used
            # only when show_p_values is on.
            if show_effect_size or (p_adj is not None and not show_p_values):
                text = alternative_label(eff, pct_diff)
            else:
                text = p_text

            heatmap_data.append({
                'row': row_group,
                'col': col_group,
                'p_adjusted': p_adj,
                'p_value': rec['p_value'],
                'significant': rec['significant'],
                'effect_size': eff,
                'rank1_pct_diff': pct_diff,
                'text_label': text,
                'sig_category': sig_cat,
            })

    heatmap_df = pl.DataFrame(heatmap_data).to_pandas()

    if show_text:
        # Text color per cell: white on the two strongest categories,
        # black elsewhere. Added before any chart is built so both layers
        # share one final frame.
        heatmap_df['text_color'] = heatmap_df['sig_category'].apply(
            lambda x: 'white' if x in ['p < 0.001', 'p < 0.01'] else 'black'
        )

    # Define color scale for significance categories
    sig_domain = ['p < 0.001', 'p < 0.01', 'p < 0.05', 'n.s.', 'Self', 'N/A']
    sig_range = [
        ColorPalette.SIG_STRONG,    # p < 0.001
        ColorPalette.SIG_MODERATE,  # p < 0.01
        ColorPalette.SIG_WEAK,      # p < 0.05
        ColorPalette.SIG_NONE,      # not significant
        ColorPalette.SIG_DIAGONAL,  # diagonal (self)
        ColorPalette.NEUTRAL,       # N/A
    ]

    # Build tooltip fields based on available data
    tooltip_fields = [
        alt.Tooltip('row:N', title='Group 1'),
        alt.Tooltip('col:N', title='Group 2'),
        alt.Tooltip('p_value:Q', title='p-value', format='.4f'),
        alt.Tooltip('p_adjusted:Q', title='Adjusted p', format='.4f'),
    ]
    # Only add effect_size if it has non-null values (continuous data)
    if heatmap_df['effect_size'].notna().any():
        tooltip_fields.append(alt.Tooltip('effect_size:Q', title='Effect Size', format='.3f'))
    # Ranking data: show the actual Rank 1 % difference. (text_label cannot
    # be used here because it holds p-value text whenever show_p_values is on.)
    if has_rank_pcts:
        tooltip_fields.append(
            alt.Tooltip('rank1_pct_diff:Q', title='Rank 1 % Diff', format='.1f')
        )

    # Calculate dimensions
    cell_size = 45
    auto_size = n_groups * cell_size + 100
    chart_width = width or auto_size
    chart_height = height or auto_size

    # Base heatmap
    heatmap = alt.Chart(heatmap_df).mark_rect(stroke='white', strokeWidth=1).encode(
        x=alt.X('col:N', title=None, sort=all_groups,
                axis=alt.Axis(labelAngle=-45, labelLimit=150)),
        y=alt.Y('row:N', title=None, sort=all_groups,
                axis=alt.Axis(labelLimit=150)),
        color=alt.Color('sig_category:N',
                        scale=alt.Scale(domain=sig_domain, range=sig_range),
                        legend=alt.Legend(
                            title='Significance',
                            orient='right',
                            direction='vertical'
                        )),
        tooltip=tooltip_fields
    )

    # Text annotations
    if show_text:
        text_layer = alt.Chart(heatmap_df).mark_text(
            fontSize=9,
            fontWeight='normal'
        ).encode(
            x=alt.X('col:N', sort=all_groups),
            y=alt.Y('row:N', sort=all_groups),
            text='text_label:N',
            # scale=None: the column already holds literal color names
            color=alt.Color('text_color:N', scale=None),
        )
        chart = (heatmap + text_layer)
    else:
        chart = heatmap

    # Build subtitle with test info
    subtitle_lines = []
    if metadata:
        test_info = f"Test: {metadata.get('test_type', 'N/A')}"
        if metadata.get('overall_p_value') is not None:
            test_info += f" | Overall p={metadata['overall_p_value']:.4f}"
        correction = metadata.get('correction', 'none')
        if correction != 'none':
            test_info += f" | Correction: {correction}"
        subtitle_lines.append(test_info)

    title_config = {
        'text': self._process_title(title),
        'subtitleColor': 'gray',
        'subtitleFontSize': 10,
        'anchor': 'start',
    }
    # Only set the subtitle when there is one, rather than handing the
    # chart spec an explicit None.
    if subtitle_lines:
        title_config['subtitle'] = subtitle_lines

    chart = chart.properties(
        title=title_config,
        width=chart_width,
        height=chart_height,
    )

    chart = self._save_plot(chart, title)
    return chart
|
||||
|
||||
def plot_significance_summary(
    self,
    pairwise_df: pl.LazyFrame | pl.DataFrame | None = None,
    metadata: dict | None = None,
    title: str = "Significant Differences Summary<br>(Groups with significantly different means)",
    height: int | None = None,
    width: int | None = None,
) -> alt.Chart:
    """Create a summary bar chart showing which groups have significant differences.

    Each group gets one bar whose height is the number of comparison rows
    flagged ``significant`` in which the group appears (as group1 or group2).
    When the data carries a per-group score, it is printed above the bar:
    the mean for continuous data ('mean1'/'mean2' columns) or the Rank 1
    percentage with a '%' suffix for ranking data ('rank1_pct1'/'rank1_pct2').

    Args:
        pairwise_df: Output from compute_pairwise_significance() or
            compute_ranking_significance(). Must contain 'group1', 'group2'
            and 'significant'.
        metadata: Optional dict; its 'alpha' entry (default 0.05) is shown
            in the subtitle. No subtitle is drawn when metadata is empty.
        title: Chart title.
        height: Chart height. Defaults to ``self.plot_height`` or 400.
        width: Chart width. Defaults to 800.

    Returns:
        alt.Chart: Altair bar chart with significance count per group.
    """
    # Function-scope imports keep this block self-contained.
    import math
    from collections import Counter

    df = self._ensure_dataframe(pairwise_df)

    # Detect data type: continuous (has mean1/mean2) vs ranking (has rank1_pct1/rank1_pct2)
    has_means = 'mean1' in df.columns
    has_ranks = 'rank1_pct1' in df.columns

    if has_means:
        score_cols = ('mean1', 'mean2')
    elif has_ranks:
        score_cols = ('rank1_pct1', 'rank1_pct2')
    else:
        score_cols = None

    # Count, per group, the significant rows it takes part in (either side).
    # `== True` is a polars expression here, not a Python truth test.
    sig_df = df.filter(pl.col('significant') == True)  # noqa: E712
    sig_counts = Counter(sig_df['group1'].to_list() + sig_df['group2'].to_list())

    def first_score(group):
        """Score of `group` from its first row as group1, else as group2."""
        if score_cols is None:
            return None
        as_first, as_second = score_cols
        scores = df.filter(pl.col('group1') == group)[as_first].to_list()
        if not scores:
            scores = df.filter(pl.col('group2') == group)[as_second].to_list()
        return scores[0] if scores else None

    def score_label(score):
        """Text shown above a bar; empty when there is no usable score."""
        if score is None or (isinstance(score, float) and math.isnan(score)):
            return ''
        if has_means:
            return f'{score:.1f}'
        # Ranking data: whole-number percentage with its unit.
        return f'{score:.0f}%'

    all_groups = sorted(set(df['group1'].to_list() + df['group2'].to_list()))
    summary_data = []
    for group in all_groups:
        score_val = first_score(group)
        summary_data.append({
            'group': group,
            'sig_count': sig_counts[group],
            'score': score_val,
            'score_text': score_label(score_val),
        })

    summary_df = (
        pl.DataFrame(summary_data)
        .sort('score', descending=True, nulls_last=True)
        .to_pandas()
    )

    # Create layered chart: bars for sig_count, text for score
    tooltip_title = 'Mean Score' if has_means else 'Rank 1 %' if has_ranks else 'Score'

    bars = alt.Chart(summary_df).mark_bar(color=ColorPalette.PRIMARY).encode(
        x=alt.X('group:N', title='Group', sort='-y'),
        y=alt.Y('sig_count:Q', title='# of Significant Differences'),
        tooltip=[
            alt.Tooltip('group:N', title='Group'),
            alt.Tooltip('sig_count:Q', title='Sig. Differences'),
            alt.Tooltip('score:Q', title=tooltip_title, format='.1f'),
        ]
    )

    # Only add text labels if we have scores
    if summary_df['score'].notna().any():
        # Labels are pre-formatted strings so ranking data can carry a
        # literal '%' suffix.
        text = alt.Chart(summary_df).mark_text(
            dy=-8,
            color='black',
            fontSize=9
        ).encode(
            x=alt.X('group:N', sort='-y'),
            y=alt.Y('sig_count:Q'),
            text=alt.Text('score_text:N')
        )
        chart_layers = bars + text
    else:
        chart_layers = bars

    # Build subtitle
    subtitle = None
    if metadata:
        alpha_note = f"α={metadata.get('alpha', 0.05)}"
        if has_means:
            subtitle = f"Mean scores shown above bars | {alpha_note}"
        elif has_ranks:
            subtitle = f"Rank 1 % shown above bars | {alpha_note}"
        else:
            subtitle = alpha_note

    title_config = {
        'text': self._process_title(title),
        'subtitleColor': 'gray',
        'subtitleFontSize': 10,
        'anchor': 'start',
    }
    # Only set the subtitle when there is one, rather than handing the
    # chart spec an explicit None.
    if subtitle is not None:
        title_config['subtitle'] = subtitle

    chart = chart_layers.properties(
        title=title_config,
        width=width or 800,
        height=height or getattr(self, 'plot_height', 400),
    )

    chart = self._save_plot(chart, title)
    return chart
|
||||
Reference in New Issue
Block a user