"""Plotting functions for Voice Branding analysis using Altair.""" import re import math from pathlib import Path import altair as alt import pandas as pd import polars as pl from theme import ColorPalette from reference import VOICE_GENDER_MAPPING import hashlib class QualtricsPlotsMixin: """Mixin class for plotting functions in QualtricsSurvey.""" def _process_title(self, title: str) -> str | list[str]: """Process title to handle
tags for Altair.""" if isinstance(title, str) and '
' in title: return title.split('
') return title def _sanitize_filename(self, title: str) -> str: """Convert plot title to a safe filename.""" # Remove HTML tags clean = re.sub(r'<[^>]+>', ' ', title) # Replace special characters with underscores clean = re.sub(r'[^\w\s-]', '', clean) # Replace whitespace with underscores clean = re.sub(r'\s+', '_', clean.strip()) # Remove consecutive underscores clean = re.sub(r'_+', '_', clean) # Lowercase and limit length return clean.lower()[:100] def _get_filter_slug(self) -> str: """Generate a directory-friendly slug based on active filters.""" parts = [] # Mapping of attribute name to (short_code, value, options_attr) filters = [ ('age', 'Age', getattr(self, 'filter_age', None), 'options_age'), ('gender', 'Gen', getattr(self, 'filter_gender', None), 'options_gender'), ('consumer', 'Cons', getattr(self, 'filter_consumer', None), 'options_consumer'), ('ethnicity', 'Eth', getattr(self, 'filter_ethnicity', None), 'options_ethnicity'), ('income', 'Inc', getattr(self, 'filter_income', None), 'options_income'), ] for _, short_code, value, options_attr in filters: if value is None: continue # Ensure value is a list for uniform handling if not isinstance(value, list): value = [value] if len(value) == 0: continue # Check if all options are selected (equivalent to no filter) # We compare the set of selected values to the set of all available options master_list = getattr(self, options_attr, None) if master_list and set(value) == set(master_list): continue if len(value) > 3: # If more than 3 options selected, create a hash of the sorted values # This ensures uniqueness properly while keeping the slug short sorted_vals = sorted([str(v) for v in value]) vals_str = "".join(sorted_vals) # Create short 6-char hash val_hash = hashlib.md5(vals_str.encode()).hexdigest()[:6] val_str = f"{len(value)}_grps_{val_hash}" else: # Join values with '+' clean_values = [] for v in value: # Simple sanitization: keep alphanum and hyphens/dots, remove others s = str(v) # Remove special 
chars that might be problematic in dir names s = re.sub(r'[^\w\-\.]', '', s) clean_values.append(s) val_str = "+".join(clean_values) parts.append(f"{short_code}-{val_str}") if not parts: return "All_Respondents" return "_".join(parts) def _get_filter_description(self) -> str: """Generate a human-readable description of active filters.""" parts = [] # Mapping of attribute name to (display_name, value, options_attr) filters = [ ('Age', getattr(self, 'filter_age', None), 'options_age'), ('Gender', getattr(self, 'filter_gender', None), 'options_gender'), ('Consumer', getattr(self, 'filter_consumer', None), 'options_consumer'), ('Ethnicity', getattr(self, 'filter_ethnicity', None), 'options_ethnicity'), ('Income', getattr(self, 'filter_income', None), 'options_income'), ] for display_name, value, options_attr in filters: if value is None: continue # Ensure value is a list for uniform handling if not isinstance(value, list): value = [value] if len(value) == 0: continue # Check if all options are selected (equivalent to no filter) master_list = getattr(self, options_attr, None) if master_list and set(value) == set(master_list): continue # Use original values for display (full list) clean_values = [str(v) for v in value] val_str = ", ".join(clean_values) # Use UPPERCASE for category name to distinguish from values parts.append(f"{display_name.upper()}: {val_str}") if not parts: return "" # Join with clear separator - double space for visual break return "Filters: " + " — ".join(parts) def _add_filter_footnote(self, chart: alt.Chart) -> alt.Chart: """Add a footnote with active filters to the chart. Uses chart subtitle for filter text to avoid layout issues with vconcat. Returns the modified chart (or original if no filters). 
""" filter_text = self._get_filter_description() # Skip if no filters active - return original chart if not filter_text: return chart # Wrap text into multiple lines at ~100 chars, but don't break mid-word max_line_length = 100 words = filter_text.split() lines = [] current_line = "" for word in words: test_line = f"{current_line} {word}".strip() if current_line else word if len(test_line) <= max_line_length: current_line = test_line else: if current_line: lines.append(current_line) current_line = word if current_line: lines.append(current_line) # Get existing title from chart spec chart_spec = chart.to_dict() existing_title = chart_spec.get('title', '') # Handle different title formats (string vs dict vs list) if isinstance(existing_title, (str, list)): title_config = { 'text': existing_title, 'subtitle': lines, 'subtitleColor': 'gray', 'subtitleFontSize': 10, 'anchor': 'start', } elif isinstance(existing_title, dict): title_config = existing_title.copy() title_config['subtitle'] = lines title_config['subtitleColor'] = 'gray' title_config['subtitleFontSize'] = 10 title_config['anchor'] = 'start' else: # No existing title, just add filters as subtitle title_config = { 'text': '', 'subtitle': lines, 'subtitleColor': 'gray', 'subtitleFontSize': 10, 'anchor': 'start', } return chart.properties(title=title_config) def _save_plot(self, chart: alt.Chart, title: str) -> alt.Chart: """Save chart to PNG file if fig_save_dir is set. Returns the (potentially modified) chart with filter footnote added. 
""" # Add filter footnote - returns combined chart if filters active chart = self._add_filter_footnote(chart) if hasattr(self, 'fig_save_dir') and self.fig_save_dir: path = Path(self.fig_save_dir) # Add filter slug subfolder filter_slug = self._get_filter_slug() path = path / filter_slug if not path.exists(): path.mkdir(parents=True, exist_ok=True) filename = f"{self._sanitize_filename(title)}.png" filepath = path / filename # Use vl_convert directly with theme config for consistent rendering import vl_convert as vlc from theme import jpmc_altair_theme # Get chart spec and theme config chart_spec = chart.to_dict() theme_config = jpmc_altair_theme()['config'] png_data = vlc.vegalite_to_png( vl_spec=chart_spec, scale=2.0, ppi=72, config=theme_config ) with open(filepath, 'wb') as f: f.write(png_data) print(f"Saved plot to {filepath}") return chart def _ensure_dataframe(self, data: pl.LazyFrame | pl.DataFrame | None) -> pl.DataFrame: """Ensure data is an eager DataFrame, collecting if necessary.""" df = data if data is not None else getattr(self, 'data_filtered', None) if df is None: raise ValueError("No data provided and self.data_filtered is None.") if isinstance(df, pl.LazyFrame): return df.collect() return df def _clean_voice_label(self, col_name: str) -> str: """Extract and clean voice name from column name for display. Handles patterns like: - 'Voice_Scale__The_Coach' -> 'The Coach' - 'Character_Ranking_The_Coach' -> 'The Coach' - 'Top_3_Voices_ranking__Familiar_Friend' -> 'Familiar Friend' """ # First split by __ if present label = col_name.split('__')[-1] if '__' in col_name else col_name # Remove common prefixes label = label.replace('Character_Ranking_', '') label = label.replace('Top_3_Voices_ranking_', '') # Replace underscores with spaces label = label.replace('_', ' ').strip() return label def _get_voice_gender(self, voice_label: str) -> str: """Get the gender of a voice from its label. Parameters: voice_label: Voice label (e.g., 'V14', 'Voice 14', etc.) 
Returns: 'Male' or 'Female', defaults to 'Male' if not found """ # Extract voice code (e.g., 'V14' from 'Voice 14' or 'V14') voice_code = None # Try to find VXX pattern match = re.search(r'V(\d+)', voice_label) if match: voice_code = f"V{match.group(1)}" else: # Try to extract number and prepend V match = re.search(r'(\d+)', voice_label) if match: voice_code = f"V{match.group(1)}" if voice_code and voice_code in VOICE_GENDER_MAPPING: return VOICE_GENDER_MAPPING[voice_code] return "Male" # Default to Male if unknown def _get_gender_color(self, gender: str, color_type: str = "primary") -> str: """Get the appropriate color based on gender. Parameters: gender: 'Male' or 'Female' color_type: One of 'primary', 'rank_1', 'rank_2', 'rank_3', 'neutral' Returns: Hex color string """ color_map = { "Male": { "primary": ColorPalette.GENDER_MALE, "rank_1": ColorPalette.GENDER_MALE_RANK_1, "rank_2": ColorPalette.GENDER_MALE_RANK_2, "rank_3": ColorPalette.GENDER_MALE_RANK_3, "neutral": ColorPalette.GENDER_MALE_NEUTRAL, }, "Female": { "primary": ColorPalette.GENDER_FEMALE, "rank_1": ColorPalette.GENDER_FEMALE_RANK_1, "rank_2": ColorPalette.GENDER_FEMALE_RANK_2, "rank_3": ColorPalette.GENDER_FEMALE_RANK_3, "neutral": ColorPalette.GENDER_FEMALE_NEUTRAL, } } return color_map.get(gender, color_map["Male"]).get(color_type, ColorPalette.PRIMARY) def plot_average_scores_with_counts( self, data: pl.LazyFrame | pl.DataFrame | None = None, title: str = "General Impression (1-10)\nPer Voice with Number of Participants Who Rated It", x_label: str = "Stimuli", y_label: str = "Average General Impression Rating (1-10)", color: str = ColorPalette.PRIMARY, height: int | None = None, width: int | str | None = None, domain: list[float] | None = None, color_gender: bool = False, ) -> alt.Chart: """Create a bar plot showing average scores and count of non-null values for each column. Parameters: color_gender: If True, color bars by voice gender (blue=male, pink=female). 
""" df = self._ensure_dataframe(data) # Calculate stats for each column (exclude _recordId) stats = [] for col in [c for c in df.columns if c != '_recordId']: avg_score = df[col].mean() non_null_count = df[col].drop_nulls().len() label = self._clean_voice_label(col) gender = self._get_voice_gender(label) if color_gender else None stats.append({ 'voice': label, 'average': avg_score, 'count': non_null_count, 'gender': gender }) # Convert to pandas for Altair (sort by average descending) stats_df = pl.DataFrame(stats).sort('average', descending=True).to_pandas() if domain is None: domain = [stats_df['average'].min(), stats_df['average'].max()] # Base bar chart - use y2 to explicitly start bars at domain minimum if color_gender: bars = alt.Chart(stats_df).mark_bar().encode( x=alt.X('voice:N', title=x_label, sort='-y'), y=alt.Y('average:Q', title=y_label, scale=alt.Scale(domain=domain)), y2=alt.datum(domain[0]), # Bars start at domain minimum (bottom edge) color=alt.Color('gender:N', scale=alt.Scale(domain=['Male', 'Female'], range=[ColorPalette.GENDER_MALE, ColorPalette.GENDER_FEMALE]), legend=alt.Legend(orient='top', direction='horizontal', title='Gender')), tooltip=[ alt.Tooltip('voice:N', title='Voice'), alt.Tooltip('average:Q', title='Average', format='.2f'), alt.Tooltip('count:Q', title='Count'), alt.Tooltip('gender:N', title='Gender') ] ) else: bars = alt.Chart(stats_df).mark_bar(color=color).encode( x=alt.X('voice:N', title=x_label, sort='-y'), y=alt.Y('average:Q', title=y_label, scale=alt.Scale(domain=domain)), y2=alt.datum(domain[0]), # Bars start at domain minimum (bottom edge) tooltip=[ alt.Tooltip('voice:N', title='Voice'), alt.Tooltip('average:Q', title='Average', format='.2f'), alt.Tooltip('count:Q', title='Count') ] ) # Text overlay for counts text = alt.Chart(stats_df).mark_text( dy=-5, color='black', fontSize=10 ).encode( x=alt.X('voice:N', sort='-y'), y=alt.Y('average:Q'), text=alt.Text('count:Q') ) # Combine layers chart = (bars + text).properties( 
title=self._process_title(title), width=width or 800, height=height or getattr(self, 'plot_height', 400) ) chart = self._save_plot(chart, title) return chart def plot_top3_ranking_distribution( self, data: pl.LazyFrame | pl.DataFrame | None = None, title: str = "Top 3 Rankings Distribution\nCount of 1st, 2nd, and 3rd Place Votes per Voice", x_label: str = "Voices", y_label: str = "Number of Mentions in Top 3", height: int | None = None, width: int | str | None = None, ) -> alt.Chart: """Create a stacked bar chart showing how often each voice was ranked 1st, 2nd, or 3rd.""" df = self._ensure_dataframe(data) # Calculate stats per column stats = [] for col in [c for c in df.columns if c != '_recordId']: rank1 = df.filter(pl.col(col) == 1).height rank2 = df.filter(pl.col(col) == 2).height rank3 = df.filter(pl.col(col) == 3).height total = rank1 + rank2 + rank3 if total > 0: label = self._clean_voice_label(col) # Add 3 rows (one per rank) stats.append({'voice': label, 'rank': 'Rank 1 (1st Choice)', 'count': rank1, 'total': total}) stats.append({'voice': label, 'rank': 'Rank 2 (2nd Choice)', 'count': rank2, 'total': total}) stats.append({'voice': label, 'rank': 'Rank 3 (3rd Choice)', 'count': rank3, 'total': total}) # Convert to long format, sort by total stats_df = pl.DataFrame(stats).to_pandas() # Interactive legend selection - click to filter selection = alt.selection_point(fields=['rank'], bind='legend') # Create stacked bar chart with interactive legend chart = alt.Chart(stats_df).mark_bar().encode( x=alt.X('voice:N', title=x_label, sort=alt.EncodingSortField(field='total', op='sum', order='descending')), y=alt.Y('count:Q', title=y_label, stack='zero'), color=alt.Color('rank:N', scale=alt.Scale(domain=['Rank 1 (1st Choice)', 'Rank 2 (2nd Choice)', 'Rank 3 (3rd Choice)'], range=[ColorPalette.RANK_1, ColorPalette.RANK_2, ColorPalette.RANK_3]), legend=alt.Legend(orient='top', direction='horizontal', title=None)), order=alt.Order('rank:N', sort='ascending'), 
opacity=alt.condition(selection, alt.value(1), alt.value(0.2)), tooltip=[ alt.Tooltip('voice:N', title='Voice'), alt.Tooltip('rank:N', title='Rank'), alt.Tooltip('count:Q', title='Count') ] ).add_params(selection).properties( title=self._process_title(title), width=width or 800, height=height or getattr(self, 'plot_height', 400) ) chart = self._save_plot(chart, title) return chart def plot_ranking_distribution( self, data: pl.LazyFrame | pl.DataFrame | None = None, title: str = "Rankings Distribution\n(1st to 3rd Place)", x_label: str = "Item", y_label: str = "Number of Votes", height: int | None = None, width: int | str | None = None, color_gender: bool = False, ) -> alt.Chart: """Create a stacked bar chart showing the distribution of rankings (1st to 3rd). Parameters: color_gender: If True, color bars by voice gender with rank intensity (blue shades=male, pink shades=female). """ df = self._ensure_dataframe(data) stats = [] ranking_cols = [c for c in df.columns if c != '_recordId'] for col in ranking_cols: r1 = df.filter(pl.col(col) == 1).height r2 = df.filter(pl.col(col) == 2).height r3 = df.filter(pl.col(col) == 3).height # r4 = df.filter(pl.col(col) == 4).height total = r1 + r2 + r3 if total > 0: label = self._clean_voice_label(col) gender = self._get_voice_gender(label) if color_gender else None stats.append({'item': label, 'rank': 'Rank 1 (Best)', 'count': r1, 'total': total, 'gender': gender, 'rank_order': 1}) stats.append({'item': label, 'rank': 'Rank 2', 'count': r2, 'total': total, 'gender': gender, 'rank_order': 2}) stats.append({'item': label, 'rank': 'Rank 3', 'count': r3, 'total': total, 'gender': gender, 'rank_order': 3}) # stats.append({'item': label, 'rank': 'Rank 4 (Worst)', 'count': r4, 'total': total, 'gender': gender, 'rank_order': 4}) if not stats: return alt.Chart(pd.DataFrame({'text': ['No data']})).mark_text().encode(text='text:N') stats_df = pl.DataFrame(stats).to_pandas() # Interactive legend selection - click to filter selection = 
alt.selection_point(fields=['rank'], bind='legend') if color_gender: # Add gender_rank column for combined color encoding stats_df['gender_rank'] = stats_df['gender'] + ' - ' + stats_df['rank'] # Define combined domain and range for gender + rank domain = [ 'Male - Rank 1 (Best)', 'Male - Rank 2', 'Male - Rank 3', 'Female - Rank 1 (Best)', 'Female - Rank 2', 'Female - Rank 3' ] range_colors = [ ColorPalette.GENDER_MALE_RANK_1, ColorPalette.GENDER_MALE_RANK_2, ColorPalette.GENDER_MALE_RANK_3, ColorPalette.GENDER_FEMALE_RANK_1, ColorPalette.GENDER_FEMALE_RANK_2, ColorPalette.GENDER_FEMALE_RANK_3 ] chart = alt.Chart(stats_df).mark_bar().encode( x=alt.X('item:N', title=x_label, sort=alt.EncodingSortField(field='total', order='descending')), y=alt.Y('count:Q', title=y_label, stack='zero'), color=alt.Color('gender_rank:N', scale=alt.Scale(domain=domain, range=range_colors), legend=alt.Legend(orient='top', direction='horizontal', title=None, columns=3)), order=alt.Order('rank_order:Q', sort='ascending'), opacity=alt.condition(selection, alt.value(1), alt.value(0.2)), tooltip=[ alt.Tooltip('item:N', title='Item'), alt.Tooltip('rank:N', title='Rank'), alt.Tooltip('count:Q', title='Count'), alt.Tooltip('gender:N', title='Gender') ] ).add_params(selection).properties( title=self._process_title(title), width=width or 800, height=height or getattr(self, 'plot_height', 400) ) else: chart = alt.Chart(stats_df).mark_bar().encode( x=alt.X('item:N', title=x_label, sort=alt.EncodingSortField(field='total', order='descending')), y=alt.Y('count:Q', title=y_label, stack='zero'), color=alt.Color('rank:N', scale=alt.Scale(domain=['Rank 1 (Best)', 'Rank 2', 'Rank 3'], range=[ColorPalette.RANK_1, ColorPalette.RANK_2, ColorPalette.RANK_3]), legend=alt.Legend(orient='top', direction='horizontal', title=None)), order=alt.Order('rank_order:Q', sort='ascending'), opacity=alt.condition(selection, alt.value(1), alt.value(0.2)), tooltip=[ alt.Tooltip('item:N', title='Item'), alt.Tooltip('rank:N', 
title='Rank'), alt.Tooltip('count:Q', title='Count') ] ).add_params(selection).properties( title=self._process_title(title), width=width or 800, height=height or getattr(self, 'plot_height', 400) ) chart = self._save_plot(chart, title) return chart def plot_most_ranked_1( self, data: pl.LazyFrame | pl.DataFrame | None = None, title: str = "Most Popular Choice\n(Number of Times Ranked 1st)", x_label: str = "Item", y_label: str = "Count of 1st Place Rankings", height: int | None = None, width: int | str | None = None, color_gender: bool = False, ) -> alt.Chart: """Create a bar chart showing which item was ranked #1 the most. Top 3 highlighted. Parameters: color_gender: If True, color bars by voice gender with highlight/neutral intensity (blue shades=male, pink shades=female). """ df = self._ensure_dataframe(data) stats = [] ranking_cols = [c for c in df.columns if c != '_recordId'] for col in ranking_cols: count_rank_1 = df.filter(pl.col(col) == 1).height label = self._clean_voice_label(col) gender = self._get_voice_gender(label) if color_gender else None stats.append({'item': label, 'count': count_rank_1, 'gender': gender}) # Convert and sort stats_df = pl.DataFrame(stats).sort('count', descending=True) # Add rank column for coloring (1-3 vs 4+) stats_df = stats_df.with_row_index('rank_index') stats_df = stats_df.with_columns( pl.when(pl.col('rank_index') < 3) .then(pl.lit('Top 3')) .otherwise(pl.lit('Other')) .alias('category') ).to_pandas() if color_gender: # Add gender_category column for combined color encoding stats_df['gender_category'] = stats_df['gender'] + ' - ' + stats_df['category'] # Define combined domain and range for gender + category domain = ['Male - Top 3', 'Male - Other', 'Female - Top 3', 'Female - Other'] range_colors = [ ColorPalette.GENDER_MALE, ColorPalette.GENDER_MALE_NEUTRAL, ColorPalette.GENDER_FEMALE, ColorPalette.GENDER_FEMALE_NEUTRAL ] chart = alt.Chart(stats_df).mark_bar().encode( x=alt.X('item:N', title=x_label, sort='-y'), 
y=alt.Y('count:Q', title=y_label), color=alt.Color('gender_category:N', scale=alt.Scale(domain=domain, range=range_colors), legend=alt.Legend(orient='top', direction='horizontal', title=None)), tooltip=[ alt.Tooltip('item:N', title='Item'), alt.Tooltip('count:Q', title='1st Place Votes'), alt.Tooltip('gender:N', title='Gender') ] ).properties( title=self._process_title(title), width=width or 800, height=height or getattr(self, 'plot_height', 400) ) else: # Bar chart with conditional color chart = alt.Chart(stats_df).mark_bar().encode( x=alt.X('item:N', title=x_label, sort='-y'), y=alt.Y('count:Q', title=y_label), color=alt.Color('category:N', scale=alt.Scale(domain=['Top 3', 'Other'], range=[ColorPalette.PRIMARY, ColorPalette.NEUTRAL]), legend=None), tooltip=[ alt.Tooltip('item:N', title='Item'), alt.Tooltip('count:Q', title='1st Place Votes') ] ).properties( title=self._process_title(title), width=width or 800, height=height or getattr(self, 'plot_height', 400) ) chart = self._save_plot(chart, title) return chart def plot_weighted_ranking_score( self, data: pl.LazyFrame | pl.DataFrame | None = None, title: str = "Weighted Popularity Score\n(1st=3pts, 2nd=2pts, 3rd=1pt)", x_label: str = "Character Personality", y_label: str = "Total Weighted Score", color: str = ColorPalette.PRIMARY, height: int | None = None, width: int | str | None = None, color_gender: bool = False, ) -> alt.Chart: """Create a bar chart showing the weighted ranking score for each character. Parameters: color_gender: If True, color bars by voice gender (blue=male, pink=female). 
""" weighted_df = self._ensure_dataframe(data).to_pandas() if color_gender: # Add gender column based on Character name weighted_df['gender'] = weighted_df['Character'].apply(self._get_voice_gender) # Bar chart with gender coloring bars = alt.Chart(weighted_df).mark_bar().encode( x=alt.X('Character:N', title=x_label, sort='-y'), y=alt.Y('Weighted Score:Q', title=y_label), color=alt.Color('gender:N', scale=alt.Scale(domain=['Male', 'Female'], range=[ColorPalette.GENDER_MALE, ColorPalette.GENDER_FEMALE]), legend=alt.Legend(orient='top', direction='horizontal', title='Gender')), tooltip=[ alt.Tooltip('Character:N'), alt.Tooltip('Weighted Score:Q', title='Score'), alt.Tooltip('gender:N', title='Gender') ] ) else: # Bar chart bars = alt.Chart(weighted_df).mark_bar(color=color).encode( x=alt.X('Character:N', title=x_label, sort='-y'), y=alt.Y('Weighted Score:Q', title=y_label), tooltip=[ alt.Tooltip('Character:N'), alt.Tooltip('Weighted Score:Q', title='Score') ] ) # Text overlay text = bars.mark_text( dy=-5, color='white', fontSize=11 ).encode( text='Weighted Score:Q' ) chart = (bars + text).properties( title=self._process_title(title), width=width or 800, height=height or getattr(self, 'plot_height', 400) ) chart = self._save_plot(chart, title) return chart def plot_voice_selection_counts( self, data: pl.LazyFrame | pl.DataFrame | None = None, target_column: str = "8_Combined", title: str = "Most Frequently Chosen Voices\n(Top 8 Highlighted)", x_label: str = "Voice", y_label: str = "Number of Times Chosen", height: int | None = None, width: int | str | None = None, color_gender: bool = False, ) -> alt.Chart: """Create a bar plot showing the frequency of voice selections. Parameters: color_gender: If True, color bars by voice gender with highlight/neutral intensity (blue shades=male, pink shades=female). 
""" df = self._ensure_dataframe(data) if target_column not in df.columns: return alt.Chart(pd.DataFrame({'text': [f"Column '{target_column}' not found"]})).mark_text().encode(text='text:N') # Process data: split, explode, count stats_df = ( df.select(pl.col(target_column)) .drop_nulls() .with_columns(pl.col(target_column).str.split(",")) .explode(target_column) .with_columns(pl.col(target_column).str.strip_chars()) .filter(pl.col(target_column) != "") .group_by(target_column) .agg(pl.len().alias("count")) .sort("count", descending=True) .with_row_index('rank_index') .with_columns( pl.when(pl.col('rank_index') < 8) .then(pl.lit('Top 8')) .otherwise(pl.lit('Other')) .alias('category') ) .to_pandas() ) if color_gender: # Add gender column based on voice label stats_df['gender'] = stats_df[target_column].apply(self._get_voice_gender) # Add gender_category column for combined color encoding stats_df['gender_category'] = stats_df['gender'] + ' - ' + stats_df['category'] # Define combined domain and range for gender + category domain = ['Male - Top 8', 'Male - Other', 'Female - Top 8', 'Female - Other'] range_colors = [ ColorPalette.GENDER_MALE, ColorPalette.GENDER_MALE_NEUTRAL, ColorPalette.GENDER_FEMALE, ColorPalette.GENDER_FEMALE_NEUTRAL ] chart = alt.Chart(stats_df).mark_bar().encode( x=alt.X(f'{target_column}:N', title=x_label, sort='-y'), y=alt.Y('count:Q', title=y_label), color=alt.Color('gender_category:N', scale=alt.Scale(domain=domain, range=range_colors), legend=alt.Legend(orient='top', direction='horizontal', title=None)), tooltip=[ alt.Tooltip(f'{target_column}:N', title='Voice'), alt.Tooltip('count:Q', title='Selections'), alt.Tooltip('gender:N', title='Gender') ] ).properties( title=self._process_title(title), width=width or 800, height=height or getattr(self, 'plot_height', 400) ) else: chart = alt.Chart(stats_df).mark_bar().encode( x=alt.X(f'{target_column}:N', title=x_label, sort='-y'), y=alt.Y('count:Q', title=y_label), color=alt.Color('category:N', 
scale=alt.Scale(domain=['Top 8', 'Other'], range=[ColorPalette.PRIMARY, ColorPalette.NEUTRAL]), legend=None), tooltip=[ alt.Tooltip(f'{target_column}:N', title='Voice'), alt.Tooltip('count:Q', title='Selections') ] ).properties( title=self._process_title(title), width=width or 800, height=height or getattr(self, 'plot_height', 400) ) chart = self._save_plot(chart, title) return chart def plot_top3_selection_counts( self, data: pl.LazyFrame | pl.DataFrame | None = None, target_column: str = "3_Ranked", title: str = "Most Frequently Chosen Top 3 Voices\n(Top 3 Highlighted)", x_label: str = "Voice", y_label: str = "Count of Mentions in Top 3", height: int | None = None, width: int | str | None = None, color_gender: bool = False, ) -> alt.Chart: """Question: Which 3 voices are chosen the most out of 18? Parameters: color_gender: If True, color bars by voice gender with highlight/neutral intensity (blue shades=male, pink shades=female). """ df = self._ensure_dataframe(data) if target_column not in df.columns: return alt.Chart(pd.DataFrame({'text': [f"Column '{target_column}' not found"]})).mark_text().encode(text='text:N') stats_df = ( df.select(pl.col(target_column)) .drop_nulls() .with_columns(pl.col(target_column).str.split(",")) .explode(target_column) .with_columns(pl.col(target_column).str.strip_chars()) .filter(pl.col(target_column) != "") .group_by(target_column) .agg(pl.len().alias("count")) .sort("count", descending=True) .with_row_index('rank_index') .with_columns( pl.when(pl.col('rank_index') < 3) .then(pl.lit('Top 3')) .otherwise(pl.lit('Other')) .alias('category') ) .to_pandas() ) if color_gender: # Add gender column based on voice label stats_df['gender'] = stats_df[target_column].apply(self._get_voice_gender) # Add gender_category column for combined color encoding stats_df['gender_category'] = stats_df['gender'] + ' - ' + stats_df['category'] # Define combined domain and range for gender + category domain = ['Male - Top 3', 'Male - Other', 'Female - Top 
3', 'Female - Other'] range_colors = [ ColorPalette.GENDER_MALE, ColorPalette.GENDER_MALE_NEUTRAL, ColorPalette.GENDER_FEMALE, ColorPalette.GENDER_FEMALE_NEUTRAL ] chart = alt.Chart(stats_df).mark_bar().encode( x=alt.X(f'{target_column}:N', title=x_label, sort='-y'), y=alt.Y('count:Q', title=y_label), color=alt.Color('gender_category:N', scale=alt.Scale(domain=domain, range=range_colors), legend=alt.Legend(orient='top', direction='horizontal', title=None)), tooltip=[ alt.Tooltip(f'{target_column}:N', title='Voice'), alt.Tooltip('count:Q', title='In Top 3'), alt.Tooltip('gender:N', title='Gender') ] ).properties( title=self._process_title(title), width=width or 800, height=height or getattr(self, 'plot_height', 400) ) else: chart = alt.Chart(stats_df).mark_bar().encode( x=alt.X(f'{target_column}:N', title=x_label, sort='-y'), y=alt.Y('count:Q', title=y_label), color=alt.Color('category:N', scale=alt.Scale(domain=['Top 3', 'Other'], range=[ColorPalette.PRIMARY, ColorPalette.NEUTRAL]), legend=None), tooltip=[ alt.Tooltip(f'{target_column}:N', title='Voice'), alt.Tooltip('count:Q', title='In Top 3') ] ).properties( title=self._process_title(title), width=width or 800, height=height or getattr(self, 'plot_height', 400) ) chart = self._save_plot(chart, title) return chart def plot_speaking_style_trait_scores( self, data: pl.LazyFrame | pl.DataFrame | None = None, trait_description: str = None, left_anchor: str = None, right_anchor: str = None, title: str = "Speaking Style Trait Analysis", height: int | None = None, width: int | str | None = None, ) -> alt.Chart: """Plot scores for a single speaking style trait across multiple voices.""" df = self._ensure_dataframe(data) if df.is_empty(): return alt.Chart(pd.DataFrame({'text': ['No data']})).mark_text().encode(text='text:N') required_cols = ["Voice", "score"] if not all(col in df.columns for col in required_cols): return alt.Chart(pd.DataFrame({'text': ['Missing required columns']})).mark_text().encode(text='text:N') # 
Calculate stats: Mean, Count stats = ( df.filter(pl.col("score").is_not_null()) .group_by("Voice") .agg([ pl.col("score").mean().alias("mean_score"), pl.col("score").count().alias("count") ]) .sort("mean_score", descending=False) # Ascending for bottom-to-top display .to_pandas() ) # Extract anchors from data if not provided if (left_anchor is None or right_anchor is None) and "Left_Anchor" in df.columns: head = df.filter(pl.col("Left_Anchor").is_not_null()).head(1) if not head.is_empty(): if left_anchor is None: left_anchor = head["Left_Anchor"][0] if right_anchor is None: right_anchor = head["Right_Anchor"][0] if trait_description is None: if left_anchor and right_anchor: trait_description = f"{left_anchor.split('|')[0]} vs. {right_anchor.split('|')[0]}" elif "Description" in df.columns: head = df.filter(pl.col("Description").is_not_null()).head(1) trait_description = head["Description"][0] if not head.is_empty() else "" else: trait_description = "" # Horizontal bar chart - use x2 to explicitly start bars at x=1 bars = alt.Chart(stats).mark_bar(color=ColorPalette.PRIMARY).encode( x=alt.X('mean_score:Q', title='Average Score (1-5)', scale=alt.Scale(domain=[1, 5])), x2=alt.datum(1), # Bars start at x=1 (left edge of domain) y=alt.Y('Voice:N', title='Voice', sort='-x'), tooltip=[ alt.Tooltip('Voice:N'), alt.Tooltip('mean_score:Q', title='Average', format='.2f'), alt.Tooltip('count:Q', title='Count') ] ) # Count text at end of bars (right-aligned inside bar) text = alt.Chart(stats).mark_text( align='right', baseline='middle', color='white', fontSize=12, dx=-5 # Slight padding from bar end ).encode( x='mean_score:Q', y=alt.Y('Voice:N', sort='-x'), text='count:Q' ) # Combine layers chart = (bars + text).properties( title={ "text": self._process_title(title), "subtitle": [trait_description, "(Numbers on bars indicate respondent count)"] }, width=width or 800, height=height or getattr(self, 'plot_height', 400) ) chart = self._save_plot(chart, title) return chart def 
plot_speaking_style_correlation( self, style_color: str, style_traits: list[str], data: pl.LazyFrame | pl.DataFrame | None = None, title: str | None = None, width: int | str | None = None, height: int | None = None, ) -> alt.Chart: """Plots correlation between Speaking Style Trait Scores (1-5) and Voice Scale (1-10).""" df = self._ensure_dataframe(data) if title is None: title = f"Speaking style and voice scale 1-10 correlations" trait_correlations = [] # Calculate correlations for i, trait in enumerate(style_traits): subset = df.filter(pl.col("Right_Anchor") == trait) valid_data = subset.select(["score", "Voice_Scale_Score"]).drop_nulls() if valid_data.height > 1: corr_val = valid_data.select(pl.corr("score", "Voice_Scale_Score")).item() # Wrap trait text at '|' for display trait_display = trait.replace('|', '\n') trait_correlations.append({ "trait_display": trait_display, "trait_index": f"Trait {i+1}", "correlation": corr_val if corr_val is not None else 0.0 }) if not trait_correlations: return alt.Chart(pd.DataFrame({'text': [f"No data for {style_color} Style"]})).mark_text().encode(text='text:N') plot_df = pl.DataFrame(trait_correlations).to_pandas() # Conditional color based on sign chart = alt.Chart(plot_df).mark_bar().encode( x=alt.X('trait_display:N', title=None, axis=alt.Axis(labelAngle=0)), y=alt.Y('correlation:Q', title='Correlation', scale=alt.Scale(domain=[-1, 1])), color=alt.condition( alt.datum.correlation >= 0, alt.value('green'), alt.value('red') ), tooltip=[ alt.Tooltip('trait_display:N', title='Trait'), alt.Tooltip('correlation:Q', format='.2f') ] ).properties( title=self._process_title(title), width=width or 800, height=height or 350 ) chart = self._save_plot(chart, title) return chart def plot_speaking_style_color_correlation( self, data: pl.LazyFrame | pl.DataFrame | None = None, title: str = "Speaking Style and Voice Scale 1-10 Correlations
def plot_speaking_style_color_correlation(
    self,
    data: pl.LazyFrame | pl.DataFrame | None = None,
    title: str = "Speaking Style and Voice Scale 1-10 Correlations<br>(Average by Color)",
    width: int | str | None = None,
    height: int | None = None,
) -> alt.Chart:
    """Draw one bar per speaking-style color showing its average correlation.

    High-level companion to the per-trait correlation plots: instead of one
    bar per trait, each of the Green / Blue / Orange / Red speaking styles
    gets a single bar.

    Args:
        data: DataFrame with columns [Color, correlation, n_traits] from
            utils.transform_speaking_style_color_correlation
        title: Chart title (supports <br> for line breaks)
        width: Chart width in pixels (default 400)
        height: Chart height in pixels (default 350)

    Returns:
        Altair chart with one bar per speaking style color
    """
    frame = self._ensure_dataframe(data).to_pandas()

    # Same sign-based coloring as plot_speaking_style_correlation.
    sign_color = alt.condition(
        alt.datum.correlation >= 0,
        alt.value('green'),
        alt.value('red'),
    )
    color_axis = alt.X(
        'Color:N',
        title=None,
        axis=alt.Axis(labelAngle=0),
        sort=["Green", "Blue", "Orange", "Red"],
    )
    corr_axis = alt.Y(
        'correlation:Q',
        title='Average Correlation',
        scale=alt.Scale(domain=[-1, 1]),
    )
    hover = [
        alt.Tooltip('Color:N', title='Speaking Style'),
        alt.Tooltip('correlation:Q', format='.3f', title='Avg Correlation'),
        alt.Tooltip('n_traits:Q', title='# Traits'),
    ]

    bars = alt.Chart(frame).mark_bar().encode(
        x=color_axis,
        y=corr_axis,
        color=sign_color,
        tooltip=hover,
    )
    sized = bars.properties(
        title=self._process_title(title),
        width=width or 400,
        height=height or 350,
    )
    return self._save_plot(sized, title)
def plot_demographic_distribution(
    self,
    column: str,
    data: pl.LazyFrame | pl.DataFrame | None = None,
    title: str | None = None,
    height: int | None = None,
    width: int | str | None = None,
    show_counts: bool = True,
) -> alt.Chart:
    """Create a horizontal bar chart of respondent counts for a demographic column.

    Designed to be compact so multiple charts (approx. 6) can fit on one slide.
    Uses horizontal bars for better readability with many categories.

    Parameters:
        column: The column name to analyze (e.g., 'Age', 'Gender', 'Race/Ethnicity').
        data: Optional DataFrame; resolved through ``self._ensure_dataframe``.
        title: Chart title. If None, auto-generates based on column name.
        height: Chart height in pixels (default: auto-sized based on categories).
        width: Chart width in pixels (default: 200 for compact layout).
        show_counts: If True, display count labels on the bars.

    Returns:
        alt.Chart: An Altair horizontal bar chart showing the distribution, or
        a text-only placeholder chart when the column is missing or there are
        no rows.
    """
    df = self._ensure_dataframe(data)

    if column not in df.columns:
        return alt.Chart(
            pd.DataFrame({'text': [f"Column '{column}' not found"]})
        ).mark_text().encode(text='text:N')

    # Count values in the column, including nulls. The column is cast to
    # string first so the "(No Response)" label can be filled in regardless
    # of the column's original dtype; it is plotted as nominal anyway.
    stats_df = (
        df.select(pl.col(column).cast(pl.Utf8).fill_null("(No Response)"))
        .group_by(column)
        .agg(pl.len().alias("count"))
        .sort("count", descending=True)
        .to_pandas()
    )

    if stats_df.empty:
        return alt.Chart(pd.DataFrame({'text': ['No data']})).mark_text().encode(text='text:N')

    # Calculate percentages
    total = stats_df['count'].sum()
    stats_df['percentage'] = (stats_df['count'] / total * 100).round(1)

    # Generate title if not provided
    if title is None:
        clean_col = column.replace('_', ' ').replace('/', ' / ')
        title = f"Distribution: {clean_col}"

    # Calculate appropriate height based on number of categories
    num_categories = len(stats_df)
    bar_height = 18  # pixels per bar
    calculated_height = max(120, num_categories * bar_height + 40)  # min 120px, +40 for title/padding

    # Horizontal bar chart - categories on Y axis, counts on X axis
    bars = alt.Chart(stats_df).mark_bar(color=ColorPalette.PRIMARY).encode(
        x=alt.X('count:Q', title='Count', axis=alt.Axis(grid=False)),
        y=alt.Y(f'{column}:N', title=None, sort='-x', axis=alt.Axis(labelLimit=150)),
        tooltip=[
            alt.Tooltip(f'{column}:N', title=column.replace('_', ' ')),
            alt.Tooltip('count:Q', title='Count'),
            alt.Tooltip('percentage:Q', title='Percentage', format='.1f'),
        ],
    )

    # Add count labels at end of bars
    if show_counts:
        text = alt.Chart(stats_df).mark_text(
            align='left',
            baseline='middle',
            dx=3,  # Offset from bar end
            fontSize=9,
            color=ColorPalette.TEXT,
        ).encode(
            x='count:Q',
            y=alt.Y(f'{column}:N', sort='-x'),
            text='count:Q',
        )
        chart = bars + text
    else:
        chart = bars

    # Compact dimensions for 6-per-slide layout
    chart = chart.properties(
        title=self._process_title(title),
        width=width or 200,
        height=height or calculated_height,
    )

    chart = self._save_plot(chart, title)
    return chart
def plot_speaking_style_ranking_correlation(
    self,
    style_color: str,
    style_traits: list[str],
    data: pl.LazyFrame | pl.DataFrame | None = None,
    title: str | None = None,
    width: int | str | None = None,
    height: int | None = None,
) -> alt.Chart:
    """Plot correlation between Speaking Style Trait Scores (1-5) and Voice Ranking Points (0-3).

    Args:
        style_color: Speaking-style color name; used in the default title and
            in the "no data" placeholder message.
        style_traits: Trait labels matched against the ``Right_Anchor`` column.
            A ``|`` inside a label is replaced by a newline for display.
        data: Frame with ``Right_Anchor``, ``score`` and ``Ranking_Points``
            columns; it is passed through ``self._ensure_dataframe``.
        title: Chart title. When omitted, a default naming ``style_color`` is
            generated.
        width: Chart width (default 800).
        height: Chart height (default 350).

    Returns:
        A bar chart with one bar per trait (green when the correlation is
        >= 0, red otherwise), or a text-only placeholder chart when no trait
        has more than one complete observation.
    """
    df = self._ensure_dataframe(data)

    if title is None:
        title = f"Speaking style {style_color} and voice ranking points correlations"

    trait_correlations = []
    for i, trait in enumerate(style_traits):
        valid_data = (
            df.filter(pl.col("Right_Anchor") == trait)
            .select(["score", "Ranking_Points"])
            .drop_nulls()
        )
        # A correlation needs at least two complete observations.
        if valid_data.height <= 1:
            continue

        corr_val = valid_data.select(pl.corr("score", "Ranking_Points")).item()
        # An undefined correlation may come back as None or NaN
        # (e.g. a constant column); plot both as 0.0.
        if corr_val is None or math.isnan(corr_val):
            corr_val = 0.0

        trait_correlations.append({
            "trait_display": trait.replace('|', '\n'),
            "trait_index": f"Trait {i+1}",
            "correlation": corr_val,
        })

    if not trait_correlations:
        return alt.Chart(
            pd.DataFrame({'text': [f"No data for {style_color} Style"]})
        ).mark_text().encode(text='text:N')

    plot_df = pl.DataFrame(trait_correlations).to_pandas()

    # Conditional color based on sign
    chart = alt.Chart(plot_df).mark_bar().encode(
        x=alt.X('trait_display:N', title=None, axis=alt.Axis(labelAngle=0)),
        y=alt.Y('correlation:Q', title='Correlation', scale=alt.Scale(domain=[-1, 1])),
        color=alt.condition(
            alt.datum.correlation >= 0,
            alt.value('green'),
            alt.value('red'),
        ),
        tooltip=[
            alt.Tooltip('trait_display:N', title='Trait'),
            alt.Tooltip('correlation:Q', format='.2f'),
        ],
    ).properties(
        title=self._process_title(title),
        width=width or 800,
        height=height or 350,
    )

    chart = self._save_plot(chart, title)
    return chart
def plot_traits_wordcloud(
    self,
    data: pl.LazyFrame | pl.DataFrame | None = None,
    column: str = 'Top_3_Traits',
    title: str = "Most Prominent Personality Traits",
    width: int = 1600,
    height: int = 800,
    background_color: str = 'white',
    random_state: int = 23,
):
    """Create a word cloud visualization of personality traits from survey data.

    Args:
        data: Polars DataFrame or LazyFrame containing trait data
        column: Name of column containing comma-separated traits
        title: Title for the word cloud
        width: Width of the word cloud image in pixels
        height: Height of the word cloud image in pixels
        background_color: Background color for the word cloud
        random_state: Random seed for reproducible word cloud generation (default: 23)

    Returns:
        matplotlib.figure.Figure: The word cloud figure for display in
        notebooks. When the column yields no traits, a figure containing only
        the title and a "No data" note is returned and nothing is saved.
    """
    import matplotlib.pyplot as plt
    from wordcloud import WordCloud
    from collections import Counter
    import random

    df = self._ensure_dataframe(data)

    # Split each comma-separated cell, trim whitespace, and drop empty tokens
    # (e.g. from a trailing comma) so they are not counted as a trait.
    trait_freq = Counter(
        trait
        for row in df[column].drop_nulls()
        for trait in (part.strip() for part in row.split(','))
        if trait
    )

    if not trait_freq:
        # Nothing to lay out; return a placeholder figure instead of letting
        # the word cloud generation fail on an empty frequency table.
        fig, ax = plt.subplots(figsize=(width / 100, height / 100), dpi=100)
        ax.axis('off')
        ax.set_title(title, fontsize=16, pad=20, color=ColorPalette.TEXT)
        ax.text(0.5, 0.5, 'No data', ha='center', va='center', transform=ax.transAxes)
        return fig

    # Dedicated generator for color selection: reproducible for a given
    # random_state without reseeding the process-wide `random` module.
    color_rng = random.Random(random_state)
    palette = (
        ColorPalette.PRIMARY,
        ColorPalette.RANK_1,
        ColorPalette.RANK_2,
        ColorPalette.RANK_3,
    )

    # Color function using JPMC colors
    def color_func(word, font_size, position, orientation, random_state=None, **kwargs):
        return color_rng.choice(palette)

    # Generate word cloud
    wordcloud = WordCloud(
        width=width,
        height=height,
        background_color=background_color,
        color_func=color_func,
        relative_scaling=0.5,
        min_font_size=10,
        prefer_horizontal=0.7,
        collocations=False,  # Treat each word independently
        random_state=random_state,  # Seed for reproducible layout
    ).generate_from_frequencies(trait_freq)

    # Create matplotlib figure
    fig, ax = plt.subplots(figsize=(width / 100, height / 100), dpi=100)
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(title, fontsize=16, pad=20, color=ColorPalette.TEXT)
    plt.tight_layout(pad=0)

    # Save figure if directory specified (using same pattern as other plots)
    if getattr(self, 'fig_save_dir', None):
        # Add filter slug subfolder
        save_path = Path(self.fig_save_dir) / self._get_filter_slug()
        save_path.mkdir(parents=True, exist_ok=True)

        # Use _sanitize_filename for consistency
        filepath = save_path / f"{self._sanitize_filename(title)}.png"

        # Save as PNG at high resolution
        fig.savefig(filepath, dpi=300, bbox_inches='tight', facecolor='white')
        print(f"Word cloud saved to: {filepath}")

    return fig
def plot_character_trait_frequency(
    self,
    data: pl.LazyFrame | pl.DataFrame | None = None,
    title: str = "Trait Frequency per Brand Character",
    x_label: str = "Trait",
    y_label: str = "Frequency (Times Chosen)",
    height: int | None = None,
    width: int | str | None = None,
) -> alt.Chart:
    """Grouped bar chart of how often each trait was chosen for each character.

    Input must carry the columns Character, Trait and Count (as produced by
    transform_character_trait_frequency). Traits are ordered on the x axis by
    their summed Count, largest first, and each character gets its own bar
    within a trait group. Clicking a legend entry dims the other characters.

    Returns:
        alt.Chart: the grouped bar chart, or a text-only placeholder chart
        when a required column is missing.
    """
    frame = self._ensure_dataframe(data)

    # Guard: bail out with a message chart if the schema is not what we expect.
    if not {'Character', 'Trait', 'Count'} <= set(frame.columns):
        message = pd.DataFrame({'text': ['Data must have Character, Trait, Count columns']})
        return alt.Chart(message).mark_text().encode(text='text:N')

    # Altair consumes pandas; convert unless the frame already is pandas.
    plot_df = frame.to_pandas() if hasattr(frame, 'to_pandas') else frame

    # X-axis order: traits by overall frequency, most frequent first.
    per_trait = plot_df.groupby('Trait')['Count'].sum()
    trait_order = per_trait.sort_values(ascending=False).index.tolist()

    # Characters in order of first appearance drive the color scale.
    characters = plot_df['Character'].unique().tolist()
    character_colors = alt.Color(
        'Character:N',
        scale=alt.Scale(
            domain=characters,
            range=ColorPalette.CATEGORICAL[:len(characters)],
        ),
        legend=alt.Legend(orient='top', direction='horizontal', title='Character'),
    )

    # Legend-bound selection: clicking a character fades the others.
    legend_pick = alt.selection_point(fields=['Character'], bind='legend')

    grouped_bars = alt.Chart(plot_df).mark_bar().encode(
        x=alt.X(
            'Trait:N',
            title=x_label,
            sort=trait_order,
            axis=alt.Axis(labelAngle=-45, labelLimit=200),
        ),
        y=alt.Y('Count:Q', title=y_label),
        xOffset='Character:N',
        color=character_colors,
        opacity=alt.condition(legend_pick, alt.value(1), alt.value(0.2)),
        tooltip=[
            alt.Tooltip('Character:N', title='Character'),
            alt.Tooltip('Trait:N', title='Trait'),
            alt.Tooltip('Count:Q', title='Frequency'),
        ],
    )

    sized = grouped_bars.add_params(legend_pick).properties(
        title=self._process_title(title),
        width=width or 900,
        height=height or getattr(self, 'plot_height', 400),
    )
    return self._save_plot(sized, title)
def plot_single_character_trait_frequency(
    self,
    data: pl.LazyFrame | pl.DataFrame | None = None,
    character_name: str = "Character",
    bar_color: str = ColorPalette.PRIMARY,
    highlight_color: str = ColorPalette.NEUTRAL,
    title: str | None = None,
    x_label: str = "Trait",
    y_label: str = "Frequency",
    trait_sort_order: list[str] | None = None,
    height: int | None = None,
    width: int | str | None = None,
) -> alt.Chart:
    """Create a horizontal bar plot of trait frequency for a single character.

    Intended to be called once per character (e.g. four times for a slide),
    each time with a different color pair. Original/expected traits are drawn
    in ``highlight_color``, all other traits in ``bar_color``.

    Args:
        data: DataFrame with columns ['trait', 'count', 'is_original'] as
            produced by transform_character_trait_frequency()
        character_name: Name of the character (for title). E.g., "Bank Teller"
        bar_color: Main bar color for non-original traits.
        highlight_color: Color for original/expected traits.
        title: Custom title. If None, auto-generates from character_name
        x_label: Label for the trait axis (drawn on the vertical axis).
        y_label: Label for the frequency axis (drawn on the horizontal axis).
        trait_sort_order: Optional list of traits for consistent ordering
            across all character plots; traits not in the list are appended,
            sorted by count descending. If None, sorts by count descending.
        height: Chart height (default: ``self.plot_height`` or 450).
        width: Chart width (default 400).

    Returns:
        alt.Chart: Altair bar chart, or a text-only placeholder chart when a
        required column is missing.
    """
    df = self._ensure_dataframe(data)

    # Ensure we have the expected columns
    required_cols = {'trait', 'count', 'is_original'}
    if not required_cols.issubset(df.columns):
        return alt.Chart(pd.DataFrame({
            'text': ['Data must have trait, count, is_original columns']
        })).mark_text().encode(text='text:N')

    # Convert to pandas for Altair
    plot_df = df.to_pandas() if hasattr(df, 'to_pandas') else df

    # Determine sort order (first entry = bar drawn at the top)
    if trait_sort_order is not None:
        # Use provided order, append any missing traits at the end (sorted by count)
        known_traits = set(trait_sort_order)
        extra_traits = (
            plot_df[~plot_df['trait'].isin(known_traits)]
            .sort_values('count', ascending=False)['trait']
            .tolist()
        )
        sort_order = list(trait_sort_order) + extra_traits
    else:
        # Default: sort by count descending
        sort_order = plot_df.sort_values('count', ascending=False)['trait'].tolist()

    # Category column for color encoding; assign() returns a new frame so the
    # frame we were handed is not modified in place.
    plot_df = plot_df.assign(
        category=plot_df['is_original'].map({
            True: 'Original Trait',
            False: 'Other Trait',
        })
    )

    # Generate title if not provided
    if title is None:
        title = f"{character_name}<br>Trait Selection Frequency"

    # Build title config with sort order note as subtitle
    if trait_sort_order:
        sort_note = "Sorted by total frequency across all characters"
    else:
        sort_note = "Sorted by frequency (descending)"
    title_config = {
        'text': self._process_title(title),
        'subtitle': sort_note,
        'subtitleColor': 'gray',
        'subtitleFontSize': 10,
        'anchor': 'start',
    }

    # HORIZONTAL bar chart with conditional coloring. A discrete y axis lays
    # out its sort list from top to bottom, so the descending list is passed
    # as-is to put the highest-ranked trait at the top; reversing it would
    # place that trait at the bottom.
    bars = alt.Chart(plot_df).mark_bar().encode(
        y=alt.Y('trait:N', title=x_label, sort=sort_order, axis=alt.Axis(labelLimit=200)),
        x=alt.X('count:Q', title=y_label),
        color=alt.Color(
            'category:N',
            scale=alt.Scale(
                domain=['Original Trait', 'Other Trait'],
                range=[highlight_color, bar_color],
            ),
            legend=alt.Legend(
                orient='top',
                direction='horizontal',
                title=None,
            ),
        ),
        tooltip=[
            alt.Tooltip('trait:N', title='Trait'),
            alt.Tooltip('count:Q', title='Frequency'),
            alt.Tooltip('category:N', title='Type'),
        ],
    )

    # Add count labels on bars (to the right of bars for horizontal)
    text = alt.Chart(plot_df).mark_text(
        dx=12,
        color='black',
        fontSize=10,
        align='left',
    ).encode(
        y=alt.Y('trait:N', sort=sort_order),
        x=alt.X('count:Q'),
        text=alt.Text('count:Q'),
    )

    chart = (bars + text).properties(
        title=title_config,
        width=width or 400,
        height=height or getattr(self, 'plot_height', 450),
    )

    chart = self._save_plot(chart, title)
    return chart
def plot_significance_heatmap(
    self,
    pairwise_df: pl.LazyFrame | pl.DataFrame | None = None,
    metadata: dict | None = None,
    title: str = "Pairwise Statistical Significance<br>(Adjusted p-values)",
    show_p_values: bool = True,
    show_effect_size: bool = False,
    height: int | None = None,
    width: int | None = None,
) -> alt.Chart:
    """Create a heatmap showing pairwise statistical significance between groups.

    Every group found in ``group1``/``group2`` becomes a row and a column.
    Each off-diagonal cell is colored by the significance bucket of the
    adjusted p-value of that pair (looked up in either order); diagonal cells
    are marked 'Self' and pairs without a comparison row are marked 'N/A'.

    Args:
        pairwise_df: Output from compute_pairwise_significance().
            Expected columns: ['group1', 'group2', 'p_value', 'p_adjusted',
            'significant']; 'effect_size' and 'rank1_pct1'/'rank1_pct2' are
            used when present.
        metadata: Metadata dict from compute_pairwise_significance() (optional).
            The keys 'test_type', 'overall_p_value' and 'correction' are read
            to build the subtitle.
        title: Chart title (supports <br> for line breaks)
        show_p_values: Whether to display p-values as text annotations. When
            False (and show_effect_size is False) the cells show the effect
            size, else the Rank 1 % difference, else a dash.
        show_effect_size: Whether to display effect sizes instead of p-values
        height: Chart height (default: auto-sized based on groups)
        width: Chart width (default: auto-sized based on groups)

    Returns:
        alt.Chart: Altair heatmap chart, or a text-only "No data" chart when
        the input has no rows.
    """
    df = self._ensure_dataframe(pairwise_df)

    if df.is_empty():
        # No comparisons: the matrix would have no cells (and no columns).
        return alt.Chart(pd.DataFrame({'text': ['No data']})).mark_text().encode(text='text:N')

    # Get unique groups
    all_groups = sorted(set(df['group1'].to_list()) | set(df['group2'].to_list()))
    n_groups = len(all_groups)

    has_effect_col = 'effect_size' in df.columns
    has_rank_pcts = 'rank1_pct1' in df.columns and 'rank1_pct2' in df.columns

    # Index the first comparison row of every unordered pair once, so (A, B)
    # and (B, A) resolve to the same row without filtering the frame per cell.
    first_comparison = {}
    for rec in df.iter_rows(named=True):
        first_comparison.setdefault(frozenset((rec['group1'], rec['group2'])), rec)

    def alt_label(eff, pct_diff):
        """Cell text used when p-values are not shown."""
        if eff is not None:
            return f'{eff:.2f}'
        if pct_diff is not None:
            return f'{pct_diff:.1f}%'
        return '—'

    def classify(p_adj):
        """Return (significance category, p-value label) for an adjusted p."""
        if p_adj is None:
            return 'N/A', 'N/A'
        if p_adj < 0.001:
            return 'p < 0.001', '<.001'
        if p_adj < 0.01:
            return 'p < 0.01', f'{p_adj:.3f}'
        if p_adj < 0.05:
            return 'p < 0.05', f'{p_adj:.3f}'
        return 'n.s.', f'{p_adj:.2f}'

    # Create symmetric matrix data for heatmap
    heatmap_data = []
    for row_group in all_groups:
        for col_group in all_groups:
            # Defaults describe a pair with no comparison available.
            cell = {
                'row': row_group,
                'col': col_group,
                'p_adjusted': None,
                'p_value': None,
                'significant': None,
                'effect_size': None,
                'rank1_pct_diff': None,
                'text_label': 'N/A',
                'sig_category': 'N/A',
            }

            if row_group == col_group:
                # Diagonal - self comparison
                cell['text_label'] = '—'
                cell['sig_category'] = 'Self'
                heatmap_data.append(cell)
                continue

            rec = first_comparison.get(frozenset((row_group, col_group)))
            if rec is not None:
                p_adj = rec['p_adjusted']
                eff = rec['effect_size'] if has_effect_col else None

                # For ranking data, we can show Rank 1 % difference
                pct_diff = None
                if has_rank_pcts:
                    pct1, pct2 = rec['rank1_pct1'], rec['rank1_pct2']
                    if pct1 is not None and pct2 is not None:
                        pct_diff = abs(pct1 - pct2)

                sig_cat, p_label = classify(p_adj)
                # show_effect_size always wins; otherwise a missing p-value
                # is labelled 'N/A' and hidden p-values fall back to alt_label.
                if show_effect_size or (p_adj is not None and not show_p_values):
                    text = alt_label(eff, pct_diff)
                else:
                    text = p_label

                cell.update({
                    'p_adjusted': p_adj,
                    'p_value': rec['p_value'],
                    'significant': rec['significant'],
                    'effect_size': eff,
                    'rank1_pct_diff': pct_diff,
                    'text_label': text,
                    'sig_category': sig_cat,
                })

            heatmap_data.append(cell)

    heatmap_df = pl.DataFrame(heatmap_data).to_pandas()

    # Define color scale for significance categories
    sig_domain = ['p < 0.001', 'p < 0.01', 'p < 0.05', 'n.s.', 'Self', 'N/A']
    sig_range = [
        ColorPalette.SIG_STRONG,    # p < 0.001
        ColorPalette.SIG_MODERATE,  # p < 0.01
        ColorPalette.SIG_WEAK,      # p < 0.05
        ColorPalette.SIG_NONE,      # not significant
        ColorPalette.SIG_DIAGONAL,  # diagonal (self)
        ColorPalette.NEUTRAL,       # N/A
    ]

    # Build tooltip fields based on available data
    tooltip_fields = [
        alt.Tooltip('row:N', title='Group 1'),
        alt.Tooltip('col:N', title='Group 2'),
        alt.Tooltip('p_value:Q', title='p-value', format='.4f'),
        alt.Tooltip('p_adjusted:Q', title='Adjusted p', format='.4f'),
    ]

    # Only add effect_size if it has non-null values (continuous data)
    if heatmap_df['effect_size'].notna().any():
        tooltip_fields.append(alt.Tooltip('effect_size:Q', title='Effect Size', format='.3f'))

    # Rank info for ranking data: bound to the numeric difference itself, not
    # to text_label, which holds the p-value text whenever p-values are shown.
    if has_rank_pcts:
        tooltip_fields.append(alt.Tooltip('rank1_pct_diff:Q', title='Rank 1 % Diff', format='.1f'))

    # Calculate dimensions
    cell_size = 45
    auto_size = n_groups * cell_size + 100
    chart_width = width or auto_size
    chart_height = height or auto_size

    # Base heatmap
    heatmap = alt.Chart(heatmap_df).mark_rect(stroke='white', strokeWidth=1).encode(
        x=alt.X('col:N', title=None, sort=all_groups,
                axis=alt.Axis(labelAngle=-45, labelLimit=150)),
        y=alt.Y('row:N', title=None, sort=all_groups,
                axis=alt.Axis(labelLimit=150)),
        color=alt.Color(
            'sig_category:N',
            scale=alt.Scale(domain=sig_domain, range=sig_range),
            legend=alt.Legend(
                title='Significance',
                orient='right',
                direction='vertical',
            ),
        ),
        tooltip=tooltip_fields,
    )

    # Text annotations
    if show_p_values or show_effect_size:
        # White text on the two strongest significance colors, black elsewhere
        heatmap_df['text_color'] = heatmap_df['sig_category'].apply(
            lambda x: 'white' if x in ['p < 0.001', 'p < 0.01'] else 'black'
        )

        text = alt.Chart(heatmap_df).mark_text(
            fontSize=9,
            fontWeight='normal',
        ).encode(
            x=alt.X('col:N', sort=all_groups),
            y=alt.Y('row:N', sort=all_groups),
            text='text_label:N',
            color=alt.Color('text_color:N', scale=None),
        )
        chart = heatmap + text
    else:
        chart = heatmap

    # Build subtitle with test info
    subtitle_lines = []
    if metadata:
        test_info = f"Test: {metadata.get('test_type', 'N/A')}"
        if metadata.get('overall_p_value') is not None:
            test_info += f" | Overall p={metadata['overall_p_value']:.4f}"
        correction = metadata.get('correction', 'none')
        if correction != 'none':
            test_info += f" | Correction: {correction}"
        subtitle_lines.append(test_info)

    title_config = {
        'text': self._process_title(title),
        'subtitleColor': 'gray',
        'subtitleFontSize': 10,
        'anchor': 'start',
    }
    # Only set 'subtitle' when there is one, so no null value is handed to
    # the title spec.
    if subtitle_lines:
        title_config['subtitle'] = subtitle_lines

    chart = chart.properties(
        title=title_config,
        width=chart_width,
        height=chart_height,
    )

    chart = self._save_plot(chart, title)
    return chart
def plot_significance_summary(
    self,
    pairwise_df: pl.LazyFrame | pl.DataFrame | None = None,
    metadata: dict | None = None,
    title: str = "Significant Differences Summary<br>(Groups with significantly different means)",
    height: int | None = None,
    width: int | None = None,
) -> alt.Chart:
    """Create a summary bar chart showing which groups have significant differences.

    Each group gets one bar whose height is the number of comparison rows
    flagged ``significant`` in which it takes part (as group1 or group2).
    When the input carries ``mean1``/``mean2`` (continuous data) or
    ``rank1_pct1``/``rank1_pct2`` (ranking data), the group's mean or Rank 1
    percentage is printed above its bar.

    Args:
        pairwise_df: Output from compute_pairwise_significance() or
            compute_ranking_significance().
        metadata: Metadata dict from the significance computation (optional);
            only 'alpha' is read, for the subtitle.
        title: Chart title
        height: Chart height (default: ``self.plot_height`` or 400)
        width: Chart width (default 800)

    Returns:
        alt.Chart: Altair bar chart with significance count per group, or a
        text-only "No data" chart when the input has no rows.
    """
    from collections import Counter

    df = self._ensure_dataframe(pairwise_df)

    if df.is_empty():
        return alt.Chart(pd.DataFrame({'text': ['No data']})).mark_text().encode(text='text:N')

    # Detect data type: continuous (has mean1/mean2) vs ranking (has rank1_pct1/rank1_pct2)
    has_means = 'mean1' in df.columns
    has_ranks = 'rank1_pct1' in df.columns

    if has_means:
        score_cols = ('mean1', 'mean2')
        tooltip_title = 'Mean Score'
        text_format, text_suffix = '.1f', ''
    elif has_ranks:
        score_cols = ('rank1_pct1', 'rank1_pct2')
        tooltip_title = 'Rank 1 %'
        text_format, text_suffix = '.0f', '%'
    else:
        score_cols = None
        tooltip_title = 'Score'
        text_format, text_suffix = '.0f', ''

    # Single pass over the comparisons: collect the groups, count significant
    # rows per group, and remember the first score seen for each group in
    # each position.
    groups = set()
    sig_counts = Counter()
    score_as_group1 = {}
    score_as_group2 = {}
    for rec in df.iter_rows(named=True):
        g1, g2 = rec['group1'], rec['group2']
        groups.update((g1, g2))
        if rec['significant']:
            sig_counts[g1] += 1
            sig_counts[g2] += 1
        if score_cols is not None:
            score_as_group1.setdefault(g1, rec[score_cols[0]])
            score_as_group2.setdefault(g2, rec[score_cols[1]])

    summary_data = []
    for group in sorted(groups):
        # Prefer the value recorded where the group appears as group1; fall
        # back to its group2 value only if it never appears as group1.
        if group in score_as_group1:
            score_val = score_as_group1[group]
        else:
            score_val = score_as_group2.get(group)

        summary_data.append({
            'group': group,
            'sig_count': sig_counts[group],
            'score': score_val,
            # Pre-formatted label so the '%' suffix can be appended for
            # ranking data.
            'score_text': '' if score_val is None else f"{score_val:{text_format}}{text_suffix}",
        })

    summary_df = (
        pl.DataFrame(summary_data)
        .sort('score', descending=True, nulls_last=True)
        .to_pandas()
    )

    # Create layered chart: bars for sig_count, text for score
    bars = alt.Chart(summary_df).mark_bar(color=ColorPalette.PRIMARY).encode(
        x=alt.X('group:N', title='Group', sort='-y'),
        y=alt.Y('sig_count:Q', title='# of Significant Differences'),
        tooltip=[
            alt.Tooltip('group:N', title='Group'),
            alt.Tooltip('sig_count:Q', title='Sig. Differences'),
            alt.Tooltip('score:Q', title=tooltip_title, format='.1f'),
        ],
    )

    # Only add text labels if we have scores
    if summary_df['score'].notna().any():
        text = alt.Chart(summary_df).mark_text(
            dy=-8,
            color='black',
            fontSize=9,
        ).encode(
            x=alt.X('group:N', sort='-y'),
            y=alt.Y('sig_count:Q'),
            text=alt.Text('score_text:N'),
        )
        chart_layers = bars + text
    else:
        chart_layers = bars

    # Build subtitle
    subtitle = None
    if metadata:
        alpha = metadata.get('alpha', 0.05)
        if has_means:
            subtitle = f"Mean scores shown above bars | α={alpha}"
        elif has_ranks:
            subtitle = f"Rank 1 % shown above bars | α={alpha}"
        else:
            subtitle = f"α={alpha}"

    title_config = {
        'text': self._process_title(title),
        'subtitleColor': 'gray',
        'subtitleFontSize': 10,
        'anchor': 'start',
    }
    # Only set 'subtitle' when there is one, so no null value is handed to
    # the title spec.
    if subtitle is not None:
        title_config['subtitle'] = subtitle

    chart = chart_layers.properties(
        title=title_config,
        width=width or 800,
        height=height or getattr(self, 'plot_height', 400),
    )

    chart = self._save_plot(chart, title)
    return chart