"""Plotting functions for Voice Branding analysis using Altair.""" import re import math import textwrap from pathlib import Path import altair as alt import pandas as pd import polars as pl from theme import ColorPalette from reference import VOICE_GENDER_MAPPING import hashlib class QualtricsPlotsMixin: """Mixin class for plotting functions in QualtricsSurvey.""" def _process_title(self, title: str) -> str | list[str]: """Process title to handle
tags for Altair.""" if isinstance(title, str) and '
' in title: return title.split('
') return title def _sanitize_filename(self, title: str) -> str: """Convert plot title to a safe filename.""" # Remove HTML tags clean = re.sub(r'<[^>]+>', ' ', title) # Replace special characters with underscores clean = re.sub(r'[^\w\s-]', '', clean) # Replace whitespace with underscores clean = re.sub(r'\s+', '_', clean.strip()) # Remove consecutive underscores clean = re.sub(r'_+', '_', clean) # Lowercase and limit length return clean.lower()[:100] def _get_filter_slug(self) -> str: """Generate a directory-friendly slug based on active filters.""" parts = [] # Mapping of attribute name to (short_code, value, options_attr) filters = [ ('age', 'Age', getattr(self, 'filter_age', None), 'options_age'), ('gender', 'Gen', getattr(self, 'filter_gender', None), 'options_gender'), ('consumer', 'Cons', getattr(self, 'filter_consumer', None), 'options_consumer'), ('ethnicity', 'Eth', getattr(self, 'filter_ethnicity', None), 'options_ethnicity'), ('income', 'Inc', getattr(self, 'filter_income', None), 'options_income'), ('business_owner', 'BizOwn', getattr(self, 'filter_business_owner', None), 'options_business_owner'), ('employment_status', 'Emp', getattr(self, 'filter_employment_status', None), 'options_employment_status'), ('personal_products', 'Prod', getattr(self, 'filter_personal_products', None), 'options_personal_products'), ('ai_user', 'AI', getattr(self, 'filter_ai_user', None), 'options_ai_user'), ('investable_assets', 'InvAsts', getattr(self, 'filter_investable_assets', None), 'options_investable_assets'), ('industry', 'Ind', getattr(self, 'filter_industry', None), 'options_industry'), ] for _, short_code, value, options_attr in filters: if value is None: continue # Ensure value is a list for uniform handling if not isinstance(value, list): value = [value] if len(value) == 0: continue # Check if all options are selected (equivalent to no filter) # We compare the set of selected values to the set of all available options master_list = getattr(self, options_attr, None) if master_list and set(value) == set(master_list): continue if len(value) > 3: # If more than 3 options selected, create a hash of the sorted values # This ensures uniqueness properly while keeping the slug short sorted_vals = sorted([str(v) for v in value]) vals_str = "".join(sorted_vals) # Create short 6-char hash val_hash = hashlib.md5(vals_str.encode()).hexdigest()[:6] val_str = f"{len(value)}_grps_{val_hash}" else: # Join values with '+' clean_values = [] for v in value: # Simple sanitization: keep alphanum and hyphens/dots, remove others s = str(v) # Remove special chars that might be problematic in dir names s = re.sub(r'[^\w\-\.]', '', s) clean_values.append(s) val_str = "+".join(clean_values) parts.append(f"{short_code}-{val_str}") # Append straight-liner threshold if set sl_threshold = getattr(self, 'sl_threshold', None) if sl_threshold is not None: parts.append(f"SL-gte{sl_threshold}") if not parts: return "All_Respondents" return "_".join(parts) def _get_filter_description(self) -> str: """Generate a human-readable description of active filters. Includes sample size (from _last_sample_size) prepended to the filter text. Format: "Sample size: | Filters: ..." or "Sample size: " if no filters. """ parts = [] # Mapping of attribute name to (display_name, value, options_attr) filters = [ ('Age', getattr(self, 'filter_age', None), 'options_age'), ('Gender', getattr(self, 'filter_gender', None), 'options_gender'), ('Consumer', getattr(self, 'filter_consumer', None), 'options_consumer'), ('Ethnicity', getattr(self, 'filter_ethnicity', None), 'options_ethnicity'), ('Income', getattr(self, 'filter_income', None), 'options_income'), ('Business Owner', getattr(self, 'filter_business_owner', None), 'options_business_owner'), ('Employment Status', getattr(self, 'filter_employment_status', None), 'options_employment_status'), ('Personal Products', getattr(self, 'filter_personal_products', None), 'options_personal_products'), ('AI User', getattr(self, 'filter_ai_user', None), 'options_ai_user'), ('Investable Assets', getattr(self, 'filter_investable_assets', None), 'options_investable_assets'), ('Industry', getattr(self, 'filter_industry', None), 'options_industry'), ] for display_name, value, options_attr in filters: if value is None: continue # Ensure value is a list for uniform handling if not isinstance(value, list): value = [value] if len(value) == 0: continue # Check if all options are selected (equivalent to no filter) master_list = getattr(self, options_attr, None) if master_list and set(value) == set(master_list): continue # Special handling for Ethnicity: detect single-value ethnicity filters # When filtering by one ethnicity (e.g., "White or Caucasian"), multiple options # may be selected (all options containing that value). Display just the common value # only if ALL options containing that value are selected. if display_name.lower() == 'ethnicity' and len(value) > 1 and master_list: # Find common individual ethnicity values across all selected options # Each option may be comma-separated (e.g., "White or Caucasian, Hispanic or Latino") value_sets = [ set(v.strip() for v in opt.split(',')) for opt in value ] # Intersect all sets to find common values common_values = value_sets[0] for vs in value_sets[1:]: common_values = common_values.intersection(vs) # If exactly one common value, check if ALL options containing it are selected if len(common_values) == 1: common_val = common_values.pop() # Find all options in master list that contain this common value all_options_with_value = [ opt for opt in master_list if common_val in [v.strip() for v in opt.split(',')] ] # Only simplify if we selected ALL options containing this value if set(value) == set(all_options_with_value): val_str = common_val else: clean_values = [str(v) for v in value] val_str = ", ".join(clean_values) else: # No single common value - fall back to full list clean_values = [str(v) for v in value] val_str = ", ".join(clean_values) else: # Use original values for display (full list) clean_values = [str(v) for v in value] val_str = ", ".join(clean_values) # Use UPPERCASE for category name to distinguish from values parts.append(f"{display_name.upper()}: {val_str}") # Get sample size from the filtered dataset (not from transformed plot data) sample_size = self._get_filtered_sample_size() sample_prefix = f"Sample size: {sample_size}" if sample_size is not None else "" # Append straight-liner threshold if set sl_threshold = getattr(self, 'sl_threshold', None) if sl_threshold is not None: parts.append(f"STRAIGHT-LINER EXCL: ≥{sl_threshold} question groups") if not parts: # No filters active - return just sample size (or empty string if no sample size) return sample_prefix # Join with clear separator - double space for visual break filter_text = "Filters: " + " — ".join(parts) if sample_prefix: return f"{sample_prefix} | {filter_text}" return filter_text def _add_filter_footnote(self, chart: alt.Chart) -> alt.Chart: """Add a footnote with active filters to the chart. Uses chart subtitle for filter text to avoid layout issues with vconcat. Returns the modified chart (or original if no filters). """ filter_text = self._get_filter_description() # Skip if no filters active - return original chart if not filter_text: return chart # Wrap text into multiple lines at ~100 chars, but don't break mid-word max_line_length = 100 words = filter_text.split() lines = [] current_line = "" for word in words: test_line = f"{current_line} {word}".strip() if current_line else word if len(test_line) <= max_line_length: current_line = test_line else: if current_line: lines.append(current_line) current_line = word if current_line: lines.append(current_line) # Get existing title from chart spec chart_spec = chart.to_dict() existing_title = chart_spec.get('title', '') # Handle different title formats (string vs dict vs list) if isinstance(existing_title, (str, list)): title_config = { 'text': existing_title, 'subtitle': lines, 'subtitleColor': 'gray', 'subtitleFontSize': 10, 'anchor': 'start', } elif isinstance(existing_title, dict): title_config = existing_title.copy() title_config['subtitle'] = lines title_config['subtitleColor'] = 'gray' title_config['subtitleFontSize'] = 10 title_config['anchor'] = 'start' else: # No existing title, just add filters as subtitle title_config = { 'text': '', 'subtitle': lines, 'subtitleColor': 'gray', 'subtitleFontSize': 10, 'anchor': 'start', } return chart.properties(title=title_config) def _save_plot(self, chart: alt.Chart, title: str, filename: str | None = None, skip_footnote: bool = False) -> alt.Chart: """Save chart to PNG file if fig_save_dir is set. Args: chart: The Altair chart to save title: Chart title (used for filename if filename not provided) filename: Optional explicit filename (without extension). If provided, this is used instead of deriving from title. skip_footnote: If True, skip adding filter footnote (use when footnote was already added to a sub-chart before vconcat). Returns the (potentially modified) chart with filter footnote added. """ # Add filter footnote - returns combined chart if filters active if not skip_footnote: chart = self._add_filter_footnote(chart) if hasattr(self, 'fig_save_dir') and self.fig_save_dir: path = Path(self.fig_save_dir) # Add filter slug subfolder filter_slug = self._get_filter_slug() path = path / filter_slug if not path.exists(): path.mkdir(parents=True, exist_ok=True) # Use explicit filename if provided, otherwise derive from title base_name = filename if filename else self._sanitize_filename(title) filename = f"{base_name}.png" filepath = path / filename # Use vl_convert directly with theme config for consistent rendering import vl_convert as vlc from theme import jpmc_altair_theme # Get chart spec and theme config chart_spec = chart.to_dict() theme_config = jpmc_altair_theme()['config'] png_data = vlc.vegalite_to_png( vl_spec=chart_spec, scale=2.0, ppi=72, config=theme_config ) with open(filepath, 'wb') as f: f.write(png_data) print(f"Saved plot to {filepath}") return chart def _ensure_dataframe(self, data: pl.LazyFrame | pl.DataFrame | None) -> pl.DataFrame: """Ensure data is an eager DataFrame, collecting if necessary.""" df = data if data is not None else getattr(self, 'data_filtered', None) if df is None: raise ValueError("No data provided and self.data_filtered is None.") if isinstance(df, pl.LazyFrame): df = df.collect() return df def _get_filtered_sample_size(self) -> int | None: """Get the sample size from the filtered dataset (self.data_filtered). This returns the number of respondents in the filtered dataset, not the size of any transformed/aggregated data passed to plot functions. """ data_filtered = getattr(self, 'data_filtered', None) if data_filtered is None: return None if isinstance(data_filtered, pl.LazyFrame): return data_filtered.select(pl.len()).collect().item() return data_filtered.height def _clean_voice_label(self, col_name: str) -> str: """Extract and clean voice name from column name for display. Handles patterns like: - 'Voice_Scale__The_Coach' -> 'The Coach' - 'Character_Ranking_The_Coach' -> 'The Coach' - 'Top_3_Voices_ranking__Familiar_Friend' -> 'Familiar Friend' """ # First split by __ if present label = col_name.split('__')[-1] if '__' in col_name else col_name # Remove common prefixes label = label.replace('Character_Ranking_', '') label = label.replace('Top_3_Voices_ranking_', '') # Replace underscores with spaces label = label.replace('_', ' ').strip() return label def _get_voice_gender(self, voice_label: str) -> str: """Get the gender of a voice from its label. Parameters: voice_label: Voice label (e.g., 'V14', 'Voice 14', etc.) Returns: 'Male' or 'Female', defaults to 'Male' if not found """ # Extract voice code (e.g., 'V14' from 'Voice 14' or 'V14') voice_code = None # Try to find VXX pattern match = re.search(r'V(\d+)', voice_label) if match: voice_code = f"V{match.group(1)}" else: # Try to extract number and prepend V match = re.search(r'(\d+)', voice_label) if match: voice_code = f"V{match.group(1)}" if voice_code and voice_code in VOICE_GENDER_MAPPING: return VOICE_GENDER_MAPPING[voice_code] return "Male" # Default to Male if unknown def _get_gender_color(self, gender: str, color_type: str = "primary") -> str: """Get the appropriate color based on gender. Parameters: gender: 'Male' or 'Female' color_type: One of 'primary', 'rank_1', 'rank_2', 'rank_3', 'neutral' Returns: Hex color string """ color_map = { "Male": { "primary": ColorPalette.GENDER_MALE, "rank_1": ColorPalette.GENDER_MALE_RANK_1, "rank_2": ColorPalette.GENDER_MALE_RANK_2, "rank_3": ColorPalette.GENDER_MALE_RANK_3, "neutral": ColorPalette.GENDER_MALE_NEUTRAL, }, "Female": { "primary": ColorPalette.GENDER_FEMALE, "rank_1": ColorPalette.GENDER_FEMALE_RANK_1, "rank_2": ColorPalette.GENDER_FEMALE_RANK_2, "rank_3": ColorPalette.GENDER_FEMALE_RANK_3, "neutral": ColorPalette.GENDER_FEMALE_NEUTRAL, } } return color_map.get(gender, color_map["Male"]).get(color_type, ColorPalette.PRIMARY) def plot_average_scores_with_counts( self, data: pl.LazyFrame | pl.DataFrame | None = None, title: str = "General Impression (1-10)\nPer Voice with Number of Participants Who Rated It", filename: str | None = None, x_label: str = "Stimuli", y_label: str = "Average General Impression Rating (1-10)", color: str = ColorPalette.PRIMARY, height: int | None = None, width: int | str | None = None, domain: list[float] | None = None, color_gender: bool = False, ) -> alt.Chart: """Create a bar plot showing average scores and count of non-null values for each column. Parameters: filename: Optional explicit filename (without extension) for saving. color_gender: If True, color bars by voice gender (blue=male, pink=female). """ df = self._ensure_dataframe(data) # Calculate stats for each column (exclude _recordId) stats = [] for col in [c for c in df.columns if c != '_recordId']: avg_score = df[col].mean() non_null_count = df[col].drop_nulls().len() label = self._clean_voice_label(col) gender = self._get_voice_gender(label) if color_gender else None stats.append({ 'voice': label, 'average': avg_score, 'count': non_null_count, 'gender': gender }) # Convert to pandas for Altair (sort by average descending) stats_df = pl.DataFrame(stats).sort('average', descending=True).to_pandas() if domain is None: domain = [stats_df['average'].min(), stats_df['average'].max()] # Base bar chart - use y2 to explicitly start bars at domain minimum if color_gender: bars = alt.Chart(stats_df).mark_bar().encode( x=alt.X('voice:N', title=x_label, sort='-y', axis=alt.Axis(grid=False)), y=alt.Y('average:Q', title=y_label, scale=alt.Scale(domain=domain), axis=alt.Axis(grid=True)), y2=alt.datum(domain[0]), # Bars start at domain minimum (bottom edge) color=alt.Color('gender:N', scale=alt.Scale(domain=['Male', 'Female'], range=[ColorPalette.GENDER_MALE, ColorPalette.GENDER_FEMALE]), legend=alt.Legend(orient='top', direction='horizontal', title='Gender')), tooltip=[ alt.Tooltip('voice:N', title='Voice'), alt.Tooltip('average:Q', title='Average', format='.2f'), alt.Tooltip('count:Q', title='Count'), alt.Tooltip('gender:N', title='Gender') ] ) # Text overlay - inherit color from bars via mark_text text = bars.mark_text(dy=-5, fontSize=10).encode( text=alt.Text('count:Q') ) else: bars = alt.Chart(stats_df).mark_bar(color=color).encode( x=alt.X('voice:N', title=x_label, sort='-y', axis=alt.Axis(grid=False)), y=alt.Y('average:Q', title=y_label, scale=alt.Scale(domain=domain), axis=alt.Axis(grid=True)), y2=alt.datum(domain[0]), # Bars start at domain minimum (bottom edge) tooltip=[ alt.Tooltip('voice:N', title='Voice'), alt.Tooltip('average:Q', title='Average', format='.2f'), alt.Tooltip('count:Q', title='Count') ] ) # Text overlay for counts text = alt.Chart(stats_df).mark_text( dy=-5, color='black', fontSize=10 ).encode( x=alt.X('voice:N', sort='-y'), y=alt.Y('average:Q'), text=alt.Text('count:Q') ) # Combine layers chart = (bars + text).properties( title=self._process_title(title), width=width or 800, height=height or getattr(self, 'plot_height', 400) ) chart = self._save_plot(chart, title, filename=filename) return chart def plot_top3_ranking_distribution( self, data: pl.LazyFrame | pl.DataFrame | None = None, title: str = "Top 3 Rankings Distribution\nCount of 1st, 2nd, and 3rd Place Votes per Voice", x_label: str = "Voices", y_label: str = "Number of Mentions in Top 3", height: int | None = None, width: int | str | None = None, ) -> alt.Chart: """Create a stacked bar chart showing how often each voice was ranked 1st, 2nd, or 3rd.""" df = self._ensure_dataframe(data) # Calculate stats per column stats = [] for col in [c for c in df.columns if c != '_recordId']: rank1 = df.filter(pl.col(col) == 1).height rank2 = df.filter(pl.col(col) == 2).height rank3 = df.filter(pl.col(col) == 3).height total = rank1 + rank2 + rank3 if total > 0: label = self._clean_voice_label(col) # Add 3 rows (one per rank) stats.append({'voice': label, 'rank': 'Rank 1 (1st Choice)', 'count': rank1, 'total': total}) stats.append({'voice': label, 'rank': 'Rank 2 (2nd Choice)', 'count': rank2, 'total': total}) stats.append({'voice': label, 'rank': 'Rank 3 (3rd Choice)', 'count': rank3, 'total': total}) # Convert to long format, sort by total stats_df = pl.DataFrame(stats).to_pandas() # Compute explicit sort order by total (descending) sort_order = stats_df.drop_duplicates('voice').sort_values('total', ascending=False)['voice'].tolist() # Interactive legend selection - click to filter selection = alt.selection_point(fields=['rank'], bind='legend') # Create stacked bar chart with interactive legend bars = alt.Chart(stats_df).mark_bar().encode( x=alt.X('voice:N', title=x_label, sort=sort_order, axis=alt.Axis(grid=False)), y=alt.Y('count:Q', title=y_label, stack='zero', axis=alt.Axis(grid=True)), color=alt.Color('rank:N', scale=alt.Scale(domain=['Rank 1 (1st Choice)', 'Rank 2 (2nd Choice)', 'Rank 3 (3rd Choice)'], range=[ColorPalette.RANK_1, ColorPalette.RANK_2, ColorPalette.RANK_3]), legend=alt.Legend(orient='top', direction='horizontal', title=None)), order=alt.Order('rank:N', sort='ascending'), opacity=alt.condition(selection, alt.value(1), alt.value(0.2)), tooltip=[ alt.Tooltip('voice:N', title='Voice'), alt.Tooltip('rank:N', title='Rank'), alt.Tooltip('count:Q', title='Count') ] ) # Text layer showing totals on top of bars text = alt.Chart(stats_df).transform_filter( alt.datum.rank == 'Rank 1 (1st Choice)' ).mark_text(dy=-10, color='black').encode( x=alt.X('voice:N', sort=sort_order), y=alt.Y('total:Q'), text=alt.Text('total:Q') ) chart = alt.layer(bars, text).add_params(selection).properties( title=self._process_title(title), width=width or 800, height=height or getattr(self, 'plot_height', 400) ) chart = self._save_plot(chart, title) return chart def plot_ranking_distribution( self, data: pl.LazyFrame | pl.DataFrame | None = None, title: str = "Rankings Distribution\n(1st to 3rd Place)", x_label: str = "Item", y_label: str = "Number of Votes", height: int | None = None, width: int | str | None = None, color_gender: bool = False, ) -> alt.Chart: """Create a stacked bar chart showing the distribution of rankings (1st to 3rd). Parameters: color_gender: If True, color bars by voice gender with rank intensity (blue shades=male, pink shades=female). """ df = self._ensure_dataframe(data) stats = [] ranking_cols = [c for c in df.columns if c != '_recordId'] for col in ranking_cols: r1 = df.filter(pl.col(col) == 1).height r2 = df.filter(pl.col(col) == 2).height r3 = df.filter(pl.col(col) == 3).height # r4 = df.filter(pl.col(col) == 4).height total = r1 + r2 + r3 if total > 0: label = self._clean_voice_label(col) gender = self._get_voice_gender(label) if color_gender else None stats.append({'item': label, 'rank': 'Rank 1 (Best)', 'count': r1, 'total': total, 'gender': gender, 'rank_order': 1}) stats.append({'item': label, 'rank': 'Rank 2', 'count': r2, 'total': total, 'gender': gender, 'rank_order': 2}) stats.append({'item': label, 'rank': 'Rank 3', 'count': r3, 'total': total, 'gender': gender, 'rank_order': 3}) # stats.append({'item': label, 'rank': 'Rank 4 (Worst)', 'count': r4, 'total': total, 'gender': gender, 'rank_order': 4}) if not stats: return alt.Chart(pd.DataFrame({'text': ['No data']})).mark_text().encode(text='text:N') stats_df = pl.DataFrame(stats).to_pandas() # Interactive legend selection - click to filter selection = alt.selection_point(fields=['rank'], bind='legend') # Compute explicit sort order by total (descending) sort_order = stats_df.drop_duplicates('item').sort_values('total', ascending=False)['item'].tolist() if color_gender: # Add gender_rank column for combined color encoding stats_df['gender_rank'] = stats_df['gender'] + ' - ' + stats_df['rank'] # Define combined domain and range for gender + rank domain = [ 'Male - Rank 1 (Best)', 'Male - Rank 2', 'Male - Rank 3', 'Female - Rank 1 (Best)', 'Female - Rank 2', 'Female - Rank 3' ] range_colors = [ ColorPalette.GENDER_MALE_RANK_1, ColorPalette.GENDER_MALE_RANK_2, ColorPalette.GENDER_MALE_RANK_3, ColorPalette.GENDER_FEMALE_RANK_1, ColorPalette.GENDER_FEMALE_RANK_2, ColorPalette.GENDER_FEMALE_RANK_3 ] bars = alt.Chart(stats_df).mark_bar().encode( x=alt.X('item:N', title=x_label, sort=sort_order, axis=alt.Axis(grid=False)), y=alt.Y('count:Q', title=y_label, stack='zero', axis=alt.Axis(grid=True)), color=alt.Color('gender_rank:N', scale=alt.Scale(domain=domain, range=range_colors), legend=alt.Legend(orient='top', direction='horizontal', title=None, columns=3)), order=alt.Order('rank_order:Q', sort='ascending'), opacity=alt.condition(selection, alt.value(1), alt.value(0.2)), tooltip=[ alt.Tooltip('item:N', title='Item'), alt.Tooltip('rank:N', title='Rank'), alt.Tooltip('count:Q', title='Count'), alt.Tooltip('gender:N', title='Gender') ] ) else: bars = alt.Chart(stats_df).mark_bar().encode( x=alt.X('item:N', title=x_label, sort=sort_order, axis=alt.Axis(grid=False)), y=alt.Y('count:Q', title=y_label, stack='zero', axis=alt.Axis(grid=True)), color=alt.Color('rank:N', scale=alt.Scale(domain=['Rank 1 (Best)', 'Rank 2', 'Rank 3'], range=[ColorPalette.RANK_1, ColorPalette.RANK_2, ColorPalette.RANK_3]), legend=alt.Legend(orient='top', direction='horizontal', title=None)), order=alt.Order('rank_order:Q', sort='ascending'), opacity=alt.condition(selection, alt.value(1), alt.value(0.2)), tooltip=[ alt.Tooltip('item:N', title='Item'), alt.Tooltip('rank:N', title='Rank'), alt.Tooltip('count:Q', title='Count') ] ) # Text layer showing totals on top of bars if color_gender: # Create a separate chart for totals with gender coloring text_df = stats_df.drop_duplicates('item')[['item', 'total', 'gender']] text = alt.Chart(text_df).mark_text(dy=-10).encode( x=alt.X('item:N', sort=sort_order), y=alt.Y('total:Q'), text=alt.Text('total:Q'), color=alt.condition( alt.datum.gender == 'Female', alt.value(ColorPalette.GENDER_FEMALE), alt.value(ColorPalette.GENDER_MALE) ) ) else: text = alt.Chart(stats_df).transform_filter( alt.datum.rank_order == 1 ).mark_text(dy=-10, color='black').encode( x=alt.X('item:N', sort=sort_order), y=alt.Y('total:Q'), text=alt.Text('total:Q') ) chart = alt.layer(bars, text).add_params(selection).properties( title=self._process_title(title), width=width or 800, height=height or getattr(self, 'plot_height', 400) ) chart = self._save_plot(chart, title) return chart def plot_most_ranked_1( self, data: pl.LazyFrame | pl.DataFrame | None = None, title: str = "Most Popular Choice\n(Number of Times Ranked 1st)", x_label: str = "Item", y_label: str = "Count of 1st Place Rankings", height: int | None = None, width: int | str | None = None, color_gender: bool = False, ) -> alt.Chart: """Create a bar chart showing which item was ranked #1 the most. Top 3 highlighted. Parameters: color_gender: If True, color bars by voice gender with highlight/neutral intensity (blue shades=male, pink shades=female). """ df = self._ensure_dataframe(data) stats = [] ranking_cols = [c for c in df.columns if c != '_recordId'] for col in ranking_cols: count_rank_1 = df.filter(pl.col(col) == 1).height label = self._clean_voice_label(col) gender = self._get_voice_gender(label) if color_gender else None stats.append({'item': label, 'count': count_rank_1, 'gender': gender}) # Convert and sort stats_df = pl.DataFrame(stats).sort('count', descending=True) sort_order = stats_df['item'].to_list() # Add rank column for coloring (1-3 vs 4+) stats_df = stats_df.with_row_index('rank_index') stats_df = stats_df.with_columns( pl.when(pl.col('rank_index') < 3) .then(pl.lit('Top 3')) .otherwise(pl.lit('Other')) .alias('category') ).to_pandas() if color_gender: # Add gender_category column for combined color encoding stats_df['gender_category'] = stats_df['gender'] + ' - ' + stats_df['category'] # Define combined domain and range for gender + category domain = ['Male - Top 3', 'Male - Other', 'Female - Top 3', 'Female - Other'] range_colors = [ ColorPalette.GENDER_MALE, ColorPalette.GENDER_MALE_NEUTRAL, ColorPalette.GENDER_FEMALE, ColorPalette.GENDER_FEMALE_NEUTRAL ] bars = alt.Chart(stats_df).mark_bar().encode( x=alt.X('item:N', title=x_label, sort=sort_order, axis=alt.Axis(grid=False)), y=alt.Y('count:Q', title=y_label, axis=alt.Axis(grid=True)), color=alt.Color('gender_category:N', scale=alt.Scale(domain=domain, range=range_colors), legend=alt.Legend(orient='top', direction='horizontal', title=None)), tooltip=[ alt.Tooltip('item:N', title='Item'), alt.Tooltip('count:Q', title='1st Place Votes'), alt.Tooltip('gender:N', title='Gender') ] ) # Create text layer with gender coloring using conditional text = alt.Chart(stats_df).mark_text(dy=-5, fontSize=10).encode( x=alt.X('item:N', sort=sort_order), y=alt.Y('count:Q'), text=alt.Text('count:Q'), color=alt.condition( alt.datum.gender == 'Female', alt.value(ColorPalette.GENDER_FEMALE), alt.value(ColorPalette.GENDER_MALE) ) ) chart = (bars + text).properties( title=self._process_title(title), width=width or 800, height=height or getattr(self, 'plot_height', 400) ) else: # Bar chart with conditional color bars = alt.Chart(stats_df).mark_bar().encode( x=alt.X('item:N', title=x_label, sort=sort_order, axis=alt.Axis(grid=False)), y=alt.Y('count:Q', title=y_label, axis=alt.Axis(grid=True)), color=alt.Color('category:N', scale=alt.Scale(domain=['Top 3', 'Other'], range=[ColorPalette.PRIMARY, ColorPalette.NEUTRAL]), legend=None), tooltip=[ alt.Tooltip('item:N', title='Item'), alt.Tooltip('count:Q', title='1st Place Votes') ] ) # Text overlay for counts text = alt.Chart(stats_df).mark_text( dy=-5, color='black', fontSize=10 ).encode( x=alt.X('item:N', sort=sort_order), y=alt.Y('count:Q'), text=alt.Text('count:Q') ) chart = (bars + text).properties( title=self._process_title(title), width=width or 800, height=height or getattr(self, 'plot_height', 400) ) chart = self._save_plot(chart, title) return chart def plot_weighted_ranking_score( self, data: pl.LazyFrame | pl.DataFrame | None = None, title: str = "Weighted Popularity Score\n(1st=3pts, 2nd=2pts, 3rd=1pt)", filename: str | None = None, x_label: str = "Character Personality", y_label: str = "Total Weighted Score", color: str = ColorPalette.PRIMARY, height: int | None = None, width: int | str | None = None, color_gender: bool = False, ) -> alt.Chart: """Create a bar chart showing the weighted ranking score for each character. Parameters: filename: Optional explicit filename (without extension) for saving. color_gender: If True, color bars by voice gender (blue=male, pink=female). """ weighted_df = self._ensure_dataframe(data).to_pandas() weighted_df.sort_values('Weighted Score', ascending=False, inplace=True) sort_order = weighted_df['Character'].tolist() if color_gender: # Add gender column based on Character name weighted_df['gender'] = weighted_df['Character'].apply(self._get_voice_gender) # Bar chart with gender coloring bars = alt.Chart(weighted_df).mark_bar().encode( x=alt.X('Character:N', title=x_label, sort=sort_order, axis=alt.Axis(grid=False)), y=alt.Y('Weighted Score:Q', title=y_label, axis=alt.Axis(grid=True)), color=alt.Color('gender:N', scale=alt.Scale(domain=['Male', 'Female'], range=[ColorPalette.GENDER_MALE, ColorPalette.GENDER_FEMALE]), legend=alt.Legend(orient='top', direction='horizontal', title='Gender')), tooltip=[ alt.Tooltip('Character:N'), alt.Tooltip('Weighted Score:Q', title='Score'), alt.Tooltip('gender:N', title='Gender') ] ) else: # Bar chart bars = alt.Chart(weighted_df).mark_bar(color=color).encode( x=alt.X('Character:N', title=x_label, sort=sort_order, axis=alt.Axis(grid=False)), y=alt.Y('Weighted Score:Q', title=y_label, axis=alt.Axis(grid=True)), tooltip=[ alt.Tooltip('Character:N'), alt.Tooltip('Weighted Score:Q', title='Score') ] ) # Text overlay text = bars.mark_text( dy=-5, color='black', fontSize=11 ).encode( text='Weighted Score:Q' ) chart = (bars + text).properties( title=self._process_title(title), width=width or 800, height=height or getattr(self, 'plot_height', 400) ) chart = self._save_plot(chart, title, filename=filename) return chart def plot_voice_selection_counts( self, data: pl.LazyFrame | pl.DataFrame | None = None, target_column: str = "8_Combined", title: str = "Most Frequently Chosen Voices\n(Top 8 Highlighted)", x_label: str = "Voice", y_label: str = "Number of Times Chosen", height: int | None = None, width: int | str | None = None, color_gender: bool = False, ) -> alt.Chart: """Create a bar plot showing the frequency of voice selections. Parameters: color_gender: If True, color bars by voice gender with highlight/neutral intensity (blue shades=male, pink shades=female). """ df = self._ensure_dataframe(data) if target_column not in df.columns: return alt.Chart(pd.DataFrame({'text': [f"Column '{target_column}' not found"]})).mark_text().encode(text='text:N') # Process data: split, explode, count stats_df = ( df.select(pl.col(target_column)) .drop_nulls() .with_columns(pl.col(target_column).str.split(",")) .explode(target_column) .with_columns(pl.col(target_column).str.strip_chars()) .filter(pl.col(target_column) != "") .group_by(target_column) .agg(pl.len().alias("count")) .sort("count", descending=True) .with_row_index('rank_index') .with_columns( pl.when(pl.col('rank_index') < 8) .then(pl.lit('Top 8')) .otherwise(pl.lit('Other')) .alias('category') ) .to_pandas() ) # Compute explicit sort order by count (descending) sort_order = stats_df.sort_values('count', ascending=False)[target_column].tolist() # Add gender column for all cases when color_gender is True (needed for text layer) if color_gender: stats_df['gender'] = stats_df[target_column].apply(self._get_voice_gender) # Add gender_category column for combined color encoding stats_df['gender_category'] = stats_df['gender'] + ' - ' + stats_df['category'] # Define combined domain and range for gender + category domain = ['Male - Top 8', 'Male - Other', 'Female - Top 8', 'Female - Other'] range_colors = [ ColorPalette.GENDER_MALE, ColorPalette.GENDER_MALE_NEUTRAL, ColorPalette.GENDER_FEMALE, ColorPalette.GENDER_FEMALE_NEUTRAL ] bars = alt.Chart(stats_df).mark_bar().encode( x=alt.X(f'{target_column}:N', title=x_label, sort=sort_order, axis=alt.Axis(grid=False)), y=alt.Y('count:Q', title=y_label, axis=alt.Axis(grid=True)), color=alt.Color('gender_category:N', scale=alt.Scale(domain=domain, range=range_colors), legend=alt.Legend(orient='top', direction='horizontal', title=None)), tooltip=[ alt.Tooltip(f'{target_column}:N', title='Voice'), alt.Tooltip('count:Q', title='Selections'), alt.Tooltip('gender:N', title='Gender') ] ) # Text layer with gender coloring using conditional text = alt.Chart(stats_df).mark_text(dy=-10).encode( x=alt.X(f'{target_column}:N', sort=sort_order), y=alt.Y('count:Q'), text=alt.Text('count:Q'), color=alt.condition( alt.datum.gender == 'Female', alt.value(ColorPalette.GENDER_FEMALE), alt.value(ColorPalette.GENDER_MALE) ) ) else: bars = alt.Chart(stats_df).mark_bar().encode( x=alt.X(f'{target_column}:N', title=x_label, sort=sort_order, axis=alt.Axis(grid=False)), y=alt.Y('count:Q', title=y_label, axis=alt.Axis(grid=True)), color=alt.Color('category:N', scale=alt.Scale(domain=['Top 8', 'Other'], range=[ColorPalette.PRIMARY, ColorPalette.NEUTRAL]), legend=None), tooltip=[ alt.Tooltip(f'{target_column}:N', title='Voice'), alt.Tooltip('count:Q', title='Selections') ] ) # Text layer with black color text = alt.Chart(stats_df).mark_text(dy=-10, color='black').encode( x=alt.X(f'{target_column}:N', sort=sort_order), y=alt.Y('count:Q'), text=alt.Text('count:Q') ) chart = alt.layer(bars, text).properties( title=self._process_title(title), width=width or 800, height=height or getattr(self, 'plot_height', 400) ) chart = self._save_plot(chart, title) return chart def plot_top3_selection_counts( self, data: pl.LazyFrame | pl.DataFrame | None = None, target_column: str = "3_Ranked", title: str = "Most Frequently Chosen Top 3 Voices\n(Top 3 Highlighted)", x_label: str = "Voice", y_label: str = "Count of Mentions in Top 3", height: int | None = None, width: int | str | None = None, color_gender: bool = False, ) -> alt.Chart: """Question: Which 3 voices are chosen the most out of 18? Parameters: color_gender: If True, color bars by voice gender with highlight/neutral intensity (blue shades=male, pink shades=female). """ df = self._ensure_dataframe(data) if target_column not in df.columns: return alt.Chart(pd.DataFrame({'text': [f"Column '{target_column}' not found"]})).mark_text().encode(text='text:N') stats_df = ( df.select(pl.col(target_column)) .drop_nulls() .with_columns(pl.col(target_column).str.split(",")) .explode(target_column) .with_columns(pl.col(target_column).str.strip_chars()) .filter(pl.col(target_column) != "") .group_by(target_column) .agg(pl.len().alias("count")) .sort("count", descending=True) .with_row_index('rank_index') .with_columns( pl.when(pl.col('rank_index') < 3) .then(pl.lit('Top 3')) .otherwise(pl.lit('Other')) .alias('category') ) .to_pandas() ) # Compute explicit sort order by count (descending) sort_order = stats_df.sort_values('count', ascending=False)[target_column].tolist() # Add gender column for all cases when color_gender is True (needed for text layer) if color_gender: stats_df['gender'] = stats_df[target_column].apply(self._get_voice_gender) # Add gender_category column for combined color encoding stats_df['gender_category'] = stats_df['gender'] + ' - ' + stats_df['category'] # Define combined domain and range for gender + category domain = ['Male - Top 3', 'Male - Other', 'Female - Top 3', 'Female - Other'] range_colors = [ ColorPalette.GENDER_MALE, ColorPalette.GENDER_MALE_NEUTRAL, ColorPalette.GENDER_FEMALE, ColorPalette.GENDER_FEMALE_NEUTRAL ] bars = alt.Chart(stats_df).mark_bar().encode( x=alt.X(f'{target_column}:N', title=x_label, sort=sort_order, axis=alt.Axis(grid=False)), y=alt.Y('count:Q', title=y_label, axis=alt.Axis(grid=True)), color=alt.Color('gender_category:N', scale=alt.Scale(domain=domain, range=range_colors), legend=alt.Legend(orient='top', direction='horizontal', title=None)), tooltip=[ alt.Tooltip(f'{target_column}:N', title='Voice'), alt.Tooltip('count:Q', title='In Top 3'), alt.Tooltip('gender:N', title='Gender') ] ) # Text layer with gender coloring using conditional text = alt.Chart(stats_df).mark_text(dy=-10).encode( x=alt.X(f'{target_column}:N', sort=sort_order), y=alt.Y('count:Q'), text=alt.Text('count:Q'), color=alt.condition( alt.datum.gender == 'Female', alt.value(ColorPalette.GENDER_FEMALE), alt.value(ColorPalette.GENDER_MALE) ) ) else: bars = alt.Chart(stats_df).mark_bar().encode( x=alt.X(f'{target_column}:N', title=x_label, sort=sort_order, axis=alt.Axis(grid=False)), y=alt.Y('count:Q', title=y_label, axis=alt.Axis(grid=True)), color=alt.Color('category:N', scale=alt.Scale(domain=['Top 3', 'Other'], range=[ColorPalette.PRIMARY, ColorPalette.NEUTRAL]), legend=None), tooltip=[ alt.Tooltip(f'{target_column}:N', title='Voice'), alt.Tooltip('count:Q', title='In Top 3') ] ) # Text layer with black color text = alt.Chart(stats_df).mark_text(dy=-10, color='black').encode( x=alt.X(f'{target_column}:N', sort=sort_order), y=alt.Y('count:Q'), text=alt.Text('count:Q') ) chart = alt.layer(bars, text).properties( title=self._process_title(title), width=width or 800, height=height or getattr(self, 'plot_height', 400) ) chart = self._save_plot(chart, title) return chart def plot_speaking_style_trait_scores( self, data: pl.LazyFrame | pl.DataFrame | None = None, trait_description: str = None, left_anchor: str = None, right_anchor: str = None, title: str = "Speaking Style Trait Analysis", height: int | None = None, width: int | str | None = None, color_gender: bool = False, ) -> alt.Chart: """Plot scores for a single speaking style trait across multiple voices.""" df = self._ensure_dataframe(data) if df.is_empty(): return alt.Chart(pd.DataFrame({'text': ['No data']})).mark_text().encode(text='text:N') required_cols = ["Voice", "score"] if not all(col in df.columns for col in required_cols): return alt.Chart(pd.DataFrame({'text': ['Missing required columns']})).mark_text().encode(text='text:N') # Calculate stats: Mean, Count stats = ( df.filter(pl.col("score").is_not_null()) .group_by("Voice") .agg([ pl.col("score").mean().alias("mean_score"), pl.col("score").count().alias("count") ]) .sort("mean_score", descending=False) # Ascending for bottom-to-top display .to_pandas() ) # Extract anchors from data if not provided if (left_anchor is None or right_anchor is None) and "Left_Anchor" in df.columns: head = df.filter(pl.col("Left_Anchor").is_not_null()).head(1) if not head.is_empty(): if left_anchor is None: left_anchor = head["Left_Anchor"][0] if right_anchor is None: right_anchor = head["Right_Anchor"][0] if trait_description is None: if left_anchor and right_anchor: trait_description = f"{left_anchor.split('|')[0]} vs. {right_anchor.split('|')[0]}" elif "Description" in df.columns: head = df.filter(pl.col("Description").is_not_null()).head(1) trait_description = head["Description"][0] if not head.is_empty() else "" else: trait_description = "" if color_gender: stats['gender'] = stats['Voice'].apply(self._get_voice_gender) bars = alt.Chart(stats).mark_bar().encode( x=alt.X('mean_score:Q', title='Average Score (1-5)', scale=alt.Scale(domain=[1, 5]), axis=alt.Axis(grid=True)), x2=alt.datum(1), # Bars start at x=1 (left edge of domain) y=alt.Y('Voice:N', title='Voice', sort='-x', axis=alt.Axis(grid=False)), color=alt.Color('gender:N', scale=alt.Scale(domain=['Male', 'Female'], range=[ColorPalette.GENDER_MALE, ColorPalette.GENDER_FEMALE]), legend=alt.Legend(orient='top', direction='horizontal', title='Gender')), tooltip=[ alt.Tooltip('Voice:N'), alt.Tooltip('mean_score:Q', title='Average', format='.2f'), alt.Tooltip('count:Q', title='Count'), alt.Tooltip('gender:N', title='Gender') ] ) text = alt.Chart(stats).mark_text( align='left', baseline='middle', dx=5, fontSize=12 ).encode( x='mean_score:Q', y=alt.Y('Voice:N', sort='-x'), text='count:Q', color=alt.condition( alt.datum.gender == 'Female', alt.value(ColorPalette.GENDER_FEMALE), alt.value(ColorPalette.GENDER_MALE) ) ) else: # Horizontal bar chart - use x2 to explicitly start bars at x=1 bars = alt.Chart(stats).mark_bar(color=ColorPalette.PRIMARY).encode( x=alt.X('mean_score:Q', title='Average Score (1-5)', scale=alt.Scale(domain=[1, 5]), axis=alt.Axis(grid=True)), x2=alt.datum(1), # Bars start at x=1 (left edge of domain) y=alt.Y('Voice:N', title='Voice', sort='-x', axis=alt.Axis(grid=False)), tooltip=[ alt.Tooltip('Voice:N'), alt.Tooltip('mean_score:Q', title='Average', format='.2f'), alt.Tooltip('count:Q', title='Count') ] ) # Count text at end of bars text = alt.Chart(stats).mark_text( align='left', baseline='middle', color='black', fontSize=12, dx=5 ).encode( x='mean_score:Q', y=alt.Y('Voice:N', sort='-x'), text='count:Q' ) # Combine layers chart = (bars + text).properties( title={ "text": self._process_title(title), "subtitle": [trait_description, "(Numbers near bars indicate respondent count)"] }, width=width or 800, height=height or getattr(self, 'plot_height', 400) ) chart = self._save_plot(chart, title) return chart def plot_speaking_style_trait_scores_comparison( self, data_all: pl.LazyFrame | pl.DataFrame, data_clean: pl.LazyFrame | pl.DataFrame, trait_description: str = None, title: str = "Speaking Style Trait Analysis (Comparison)", height: int | None = None, width: int | str | None = None, ) -> alt.Chart: """Plot scores comparing All Respondents vs Cleaned data (excl. straight-liners) in grouped bars.""" # Helper to process each dataframe def get_stats(d, group_label): df = self._ensure_dataframe(d) if df.is_empty(): return None return ( df.filter(pl.col("score").is_not_null()) .group_by("Voice") .agg([ pl.col("score").mean().alias("mean_score"), pl.col("score").count().alias("count") ]) .with_columns(pl.lit(group_label).alias("dataset")) .to_pandas() ) stats_all = get_stats(data_all, "All Respondents") stats_clean = get_stats(data_clean, "Excl. Straight-Liners") if stats_all is None or stats_clean is None: return alt.Chart(pd.DataFrame({'text': ['No data']})).mark_text().encode(text='text:N') # Combine stats = pd.concat([stats_all, stats_clean]) # Determine sort order using "All Respondents" data (Desc) sort_order = stats_all.sort_values('mean_score', ascending=False)['Voice'].tolist() # Add gender and combined category for color stats['gender'] = stats['Voice'].apply(self._get_voice_gender) stats['color_group'] = stats.apply( lambda x: f"{x['gender']} - {x['dataset']}", axis=1 ) # Define Color Scale domain = [ 'Male - All Respondents', 'Male - Excl. Straight-Liners', 'Female - All Respondents', 'Female - Excl. Straight-Liners' ] range_colors = [ ColorPalette.GENDER_MALE, ColorPalette.GENDER_MALE_RANK_3, ColorPalette.GENDER_FEMALE, ColorPalette.GENDER_FEMALE_RANK_3 ] # Base chart base = alt.Chart(stats).encode( y=alt.Y('Voice:N', title='Voice', sort=sort_order, axis=alt.Axis(grid=False)), ) bars = base.mark_bar().encode( x=alt.X('mean_score:Q', title='Average Score (1-5)', scale=alt.Scale(domain=[1, 5]), axis=alt.Axis(grid=True)), x2=alt.datum(1), yOffset=alt.YOffset('dataset:N', sort=['All Respondents', 'Excl. Straight-Liners']), color=alt.Color('color_group:N', scale=alt.Scale(domain=domain, range=range_colors), legend=alt.Legend(title='Dataset', orient='top', columns=2)), tooltip=[ alt.Tooltip('Voice:N'), alt.Tooltip('dataset:N', title='Dataset'), alt.Tooltip('mean_score:Q', title='Average', format='.2f'), alt.Tooltip('count:Q', title='Count'), alt.Tooltip('gender:N', title='Gender') ] ) text = base.mark_text(align='left', baseline='middle', dx=5, fontSize=9).encode( x=alt.X('mean_score:Q'), yOffset=alt.YOffset('dataset:N', sort=['All Respondents', 'Excl. Straight-Liners']), text=alt.Text('count:Q'), color=alt.Color('color_group:N', scale=alt.Scale(domain=domain, range=range_colors), legend=None) ) chart = (bars + text).properties( title={ "text": self._process_title(title), "subtitle": [trait_description if trait_description else "", "(Lighter shade = Straight-liners removed)"] }, width=width or 800, height=height or getattr(self, 'plot_height', 600) ) chart = self._save_plot(chart, title) return chart def plot_speaking_style_scale_correlation( self, style_color: str, style_traits: list[str], data: pl.LazyFrame | pl.DataFrame | None = None, title: str | None = None, filename: str | None = None, width: int | str | None = None, height: int | None = None, ) -> alt.Chart: """Plots correlation between Speaking Style Trait Scores (1-5) and Voice Scale (1-10). Args: filename: Optional explicit filename (without extension) for saving. If not provided, filename is derived from title. """ df = self._ensure_dataframe(data) if title is None: title = f"Speaking style and voice scale 1-10 correlations" trait_correlations = [] # Calculate correlations for i, trait in enumerate(style_traits): subset = df.filter(pl.col("Right_Anchor") == trait) valid_data = subset.select(["score", "Voice_Scale_Score"]).drop_nulls() if valid_data.height > 1: corr_val = valid_data.select(pl.corr("score", "Voice_Scale_Score")).item() # Wrap trait text at '|' for display trait_display = trait.replace('|', '\n') trait_correlations.append({ "trait_display": trait_display, "trait_index": f"Trait {i+1}", "correlation": corr_val if corr_val is not None else 0.0 }) if not trait_correlations: return alt.Chart(pd.DataFrame({'text': [f"No data for {style_color} Style"]})).mark_text().encode(text='text:N') plot_df = pl.DataFrame(trait_correlations).to_pandas() # Conditional color based on sign chart = alt.Chart(plot_df).mark_bar().encode( x=alt.X('trait_display:N', title=None, axis=alt.Axis(labelAngle=0, grid=False)), y=alt.Y('correlation:Q', title='Correlation', scale=alt.Scale(domain=[-1, 1]), axis=alt.Axis(grid=True)), color=alt.condition( alt.datum.correlation >= 0, alt.value('green'), alt.value('red') ), tooltip=[ alt.Tooltip('trait_display:N', title='Trait'), alt.Tooltip('correlation:Q', format='.2f') ] ).properties( title=self._process_title(title), width=width or 800, height=height or 350 ) chart = self._save_plot(chart, title, filename=filename) return chart def _create_gender_correlation_legend(self) -> alt.Chart: """Create a custom legend for gender correlation plots with dual-color swatches. Horizontal layout below the chart: [■][■] Male [■][■] Female """ # Horizontal layout: Male at x=0-2, Female at x=5-7 (gap for whitespace) legend_data = pd.DataFrame([ {"x": 0, "color": ColorPalette.CORR_MALE_POSITIVE}, {"x": 1, "color": ColorPalette.CORR_MALE_NEGATIVE}, {"x": 5, "color": ColorPalette.CORR_FEMALE_POSITIVE}, {"x": 6, "color": ColorPalette.CORR_FEMALE_NEGATIVE}, ]) # Color blocks blocks = alt.Chart(legend_data).mark_rect(width=12, height=12).encode( x=alt.X('x:Q', axis=None, scale=alt.Scale(domain=[0, 9])), y=alt.value(6), color=alt.Color('color:N', scale=None), ) # Labels positioned after each pair of blocks label_data = pd.DataFrame([ {"x": 2.3, "label": "Male"}, {"x": 7.3, "label": "Female"}, ]) labels = alt.Chart(label_data).mark_text(align='left', baseline='middle', fontSize=11).encode( x=alt.X('x:Q', axis=None, scale=alt.Scale(domain=[0, 9])), y=alt.value(6), text='label:N' ) legend = (blocks + labels).properties(width=200, height=20) return legend def plot_speaking_style_scale_correlation_by_gender( self, style_color: str, style_traits: list[str], data_male: pl.LazyFrame | pl.DataFrame, data_female: pl.LazyFrame | pl.DataFrame, title: str | None = None, filename: str | None = None, width: int | str | None = None, height: int | None = None, ) -> alt.Chart: """Plots correlation between Speaking Style Trait Scores and Voice Scale, with grouped bars comparing male vs female voices. Args: style_color: The speaking style color (e.g., "Green", "Blue") style_traits: List of traits for this style data_male: DataFrame filtered to male voices only data_female: DataFrame filtered to female voices only title: Chart title filename: Optional explicit filename for saving width: Chart width in pixels height: Chart height in pixels Returns: Altair chart with grouped bars (male/female) per trait """ df_male = self._ensure_dataframe(data_male) df_female = self._ensure_dataframe(data_female) if title is None: title = f"Speaking style {style_color} and voice scale 1-10 correlations (by Voice Gender)" trait_correlations = [] for i, trait in enumerate(style_traits): trait_display = trait.replace('|', '\n') # Male correlation subset_m = df_male.filter(pl.col("Right_Anchor") == trait) valid_m = subset_m.select(["score", "Voice_Scale_Score"]).drop_nulls() if valid_m.height > 1: corr_m = valid_m.select(pl.corr("score", "Voice_Scale_Score")).item() corr_val = corr_m if corr_m is not None else 0.0 trait_correlations.append({ "trait_display": trait_display, "Gender": "Male", "correlation": corr_val, "color_key": "Male_Pos" if corr_val >= 0 else "Male_Neg" }) # Female correlation subset_f = df_female.filter(pl.col("Right_Anchor") == trait) valid_f = subset_f.select(["score", "Voice_Scale_Score"]).drop_nulls() if valid_f.height > 1: corr_f = valid_f.select(pl.corr("score", "Voice_Scale_Score")).item() corr_val = corr_f if corr_f is not None else 0.0 trait_correlations.append({ "trait_display": trait_display, "Gender": "Female", "correlation": corr_val, "color_key": "Female_Pos" if corr_val >= 0 else "Female_Neg" }) if not trait_correlations: return alt.Chart(pd.DataFrame({'text': [f"No data for {style_color} Style"]})).mark_text().encode(text='text:N') plot_df = pl.DataFrame(trait_correlations).to_pandas() main_chart = alt.Chart(plot_df).mark_bar().encode( x=alt.X('trait_display:N', title=None, axis=alt.Axis(labelAngle=0, grid=False)), xOffset='Gender:N', y=alt.Y('correlation:Q', title='Correlation', scale=alt.Scale(domain=[-1, 1]), axis=alt.Axis(grid=True)), color=alt.Color('color_key:N', scale=alt.Scale( domain=['Male_Pos', 'Female_Pos', 'Male_Neg', 'Female_Neg'], range=[ColorPalette.CORR_MALE_POSITIVE, ColorPalette.CORR_FEMALE_POSITIVE, ColorPalette.CORR_MALE_NEGATIVE, ColorPalette.CORR_FEMALE_NEGATIVE] ), legend=None), tooltip=[ alt.Tooltip('trait_display:N', title='Trait'), alt.Tooltip('Gender:N'), alt.Tooltip('correlation:Q', format='.3f') ] ).properties( title=self._process_title(title), width=width or 800, height=height or 350 ) # Add filter footnote to main chart before combining with legend main_chart = self._add_filter_footnote(main_chart) # Add custom legend below the chart legend = self._create_gender_correlation_legend() chart = alt.vconcat(main_chart, legend, spacing=10).resolve_scale(color='independent') chart = self._save_plot(chart, title, filename=filename, skip_footnote=True) return chart def plot_speaking_style_ranking_correlation_by_gender( self, style_color: str, style_traits: list[str], data_male: pl.LazyFrame | pl.DataFrame, data_female: pl.LazyFrame | pl.DataFrame, title: str | None = None, filename: str | None = None, width: int | str | None = None, height: int | None = None, ) -> alt.Chart: """Plots correlation between Speaking Style Trait Scores and Voice Ranking Points, with grouped bars comparing male vs female voices. Args: style_color: The speaking style color (e.g., "Green", "Blue") style_traits: List of traits for this style data_male: DataFrame filtered to male voices only data_female: DataFrame filtered to female voices only title: Chart title filename: Optional explicit filename for saving width: Chart width in pixels height: Chart height in pixels Returns: Altair chart with grouped bars (male/female) per trait """ df_male = self._ensure_dataframe(data_male) df_female = self._ensure_dataframe(data_female) if title is None: title = f"Speaking style {style_color} and voice ranking points correlations (by Voice Gender)" trait_correlations = [] for i, trait in enumerate(style_traits): trait_display = trait.replace('|', '\n') # Male correlation subset_m = df_male.filter(pl.col("Right_Anchor") == trait) valid_m = subset_m.select(["score", "Ranking_Points"]).drop_nulls() if valid_m.height > 1: corr_m = valid_m.select(pl.corr("score", "Ranking_Points")).item() corr_val = corr_m if corr_m is not None else 0.0 trait_correlations.append({ "trait_display": trait_display, "Gender": "Male", "correlation": corr_val, "color_key": "Male_Pos" if corr_val >= 0 else "Male_Neg" }) # Female correlation subset_f = df_female.filter(pl.col("Right_Anchor") == trait) valid_f = subset_f.select(["score", "Ranking_Points"]).drop_nulls() if valid_f.height > 1: corr_f = valid_f.select(pl.corr("score", "Ranking_Points")).item() corr_val = corr_f if corr_f is not None else 0.0 trait_correlations.append({ "trait_display": trait_display, "Gender": "Female", "correlation": corr_val, "color_key": "Female_Pos" if corr_val >= 0 else "Female_Neg" }) if not trait_correlations: return alt.Chart(pd.DataFrame({'text': [f"No data for {style_color} Style"]})).mark_text().encode(text='text:N') plot_df = pl.DataFrame(trait_correlations).to_pandas() main_chart = alt.Chart(plot_df).mark_bar().encode( x=alt.X('trait_display:N', title='Speaking Style Trait', axis=alt.Axis(labelAngle=0, grid=False)), xOffset='Gender:N', y=alt.Y('correlation:Q', title='Correlation', scale=alt.Scale(domain=[-1, 1]), axis=alt.Axis(grid=True)), color=alt.Color('color_key:N', scale=alt.Scale( domain=['Male_Pos', 'Female_Pos', 'Male_Neg', 'Female_Neg'], range=[ColorPalette.CORR_MALE_POSITIVE, ColorPalette.CORR_FEMALE_POSITIVE, ColorPalette.CORR_MALE_NEGATIVE, ColorPalette.CORR_FEMALE_NEGATIVE] ), legend=None), tooltip=[ alt.Tooltip('trait_display:N', title='Trait'), alt.Tooltip('Gender:N'), alt.Tooltip('correlation:Q', format='.3f') ] ).properties( title=self._process_title(title), width=width or 800, height=height or 350 ) # Add filter footnote to main chart before combining with legend main_chart = self._add_filter_footnote(main_chart) # Add custom legend below the chart legend = self._create_gender_correlation_legend() chart = alt.vconcat(main_chart, legend, spacing=10).resolve_scale(color='independent') chart = self._save_plot(chart, title, filename=filename, skip_footnote=True) return chart def plot_speaking_style_color_correlation( self, data: pl.LazyFrame | pl.DataFrame | None = None, title: str = "Speaking Style and Voice Scale 1-10 Correlations
(Average by Color)", filename: str | None = None, width: int | str | None = None, height: int | None = None, ) -> alt.Chart: """Plot high-level correlation showing one bar per speaking style color. Original use-case: "I want to create high-level correlation plots between 'green, blue, orange, red' speaking styles and the 'voice scale scores'. I want to go to one plot with one bar for each color." Args: data: DataFrame with columns [Color, correlation, n_traits] from utils.transform_speaking_style_color_correlation title: Chart title (supports
for line breaks) filename: Optional explicit filename (without extension) for saving. If not provided, filename is derived from title. width: Chart width in pixels height: Chart height in pixels Returns: Altair chart with one bar per speaking style color """ df = self._ensure_dataframe(data) # Conditional color based on sign (matches plot_speaking_style_correlation) chart = alt.Chart(df.to_pandas()).mark_bar().encode( x=alt.X('Color:N', title=None, axis=alt.Axis(labelAngle=0, grid=False), sort=["Green", "Blue", "Orange", "Red"]), y=alt.Y('correlation:Q', title='Average Correlation', scale=alt.Scale(domain=[-1, 1]), axis=alt.Axis(grid=True)), color=alt.condition( alt.datum.correlation >= 0, alt.value('green'), alt.value('red') ), tooltip=[ alt.Tooltip('Color:N', title='Speaking Style'), alt.Tooltip('correlation:Q', format='.3f', title='Avg Correlation'), alt.Tooltip('n_traits:Q', title='# Traits') ] ).properties( title=self._process_title(title), width=width or 400, height=height or 350 ) chart = self._save_plot(chart, title, filename=filename) return chart def plot_speaking_style_color_correlation_by_gender( self, data_male: pl.LazyFrame | pl.DataFrame, data_female: pl.LazyFrame | pl.DataFrame, speaking_styles: dict[str, list[str]], target_column: str = "Voice_Scale_Score", title: str = "Speaking Style Colors Correlation (by Voice Gender)", filename: str | None = None, width: int | str | None = None, height: int | None = None, ) -> alt.Chart: """Plot correlation by speaking style color with grouped bars for male vs female voices. Args: data_male: DataFrame filtered to male voices only data_female: DataFrame filtered to female voices only speaking_styles: Dictionary mapping color names to their constituent traits target_column: The column to correlate against ("Voice_Scale_Score" or "Ranking_Points") title: Chart title filename: Optional explicit filename for saving width: Chart width in pixels height: Chart height in pixels Returns: Altair chart with grouped bars (male/female) per color """ import utils df_male = self._ensure_dataframe(data_male) df_female = self._ensure_dataframe(data_female) # Get correlations for each gender color_corr_male, _ = utils.transform_speaking_style_color_correlation( df_male, speaking_styles, target_column=target_column ) color_corr_female, _ = utils.transform_speaking_style_color_correlation( df_female, speaking_styles, target_column=target_column ) # Add gender column and color_key based on correlation sign color_corr_male = color_corr_male.with_columns([ pl.lit("Male").alias("Gender"), pl.when(pl.col("correlation") >= 0) .then(pl.lit("Male_Pos")) .otherwise(pl.lit("Male_Neg")) .alias("color_key") ]) color_corr_female = color_corr_female.with_columns([ pl.lit("Female").alias("Gender"), pl.when(pl.col("correlation") >= 0) .then(pl.lit("Female_Pos")) .otherwise(pl.lit("Female_Neg")) .alias("color_key") ]) combined = pl.concat([color_corr_male, color_corr_female]) main_chart = alt.Chart(combined.to_pandas()).mark_bar().encode( x=alt.X('Color:N', title='Speaking Style Color', axis=alt.Axis(labelAngle=0, grid=False), sort=["Green", "Blue", "Orange", "Red"]), xOffset='Gender:N', y=alt.Y('correlation:Q', title='Average Correlation', scale=alt.Scale(domain=[-1, 1]), axis=alt.Axis(grid=True)), color=alt.Color('color_key:N', scale=alt.Scale( domain=['Male_Pos', 'Female_Pos', 'Male_Neg', 'Female_Neg'], range=[ColorPalette.CORR_MALE_POSITIVE, ColorPalette.CORR_FEMALE_POSITIVE, ColorPalette.CORR_MALE_NEGATIVE, ColorPalette.CORR_FEMALE_NEGATIVE] ), legend=None), tooltip=[ alt.Tooltip('Color:N', title='Speaking Style'), alt.Tooltip('Gender:N'), alt.Tooltip('correlation:Q', format='.3f', title='Avg Correlation'), alt.Tooltip('n_traits:Q', title='# Traits') ] ).properties( title=self._process_title(title), width=width or 400, height=height or 350 ) # Add filter footnote to main chart before combining with legend main_chart = self._add_filter_footnote(main_chart) # Add custom legend below the chart legend = self._create_gender_correlation_legend() chart = alt.vconcat(main_chart, legend, spacing=10).resolve_scale(color='independent') chart = self._save_plot(chart, title, filename=filename, skip_footnote=True) return chart def plot_demographic_distribution( self, column: str, data: pl.LazyFrame | pl.DataFrame | None = None, title: str | None = None, height: int | None = None, width: int | str | None = None, show_counts: bool = True, ) -> alt.Chart: """Create a horizontal bar chart showing the distribution of respondents by a demographic column. Designed to be compact so multiple charts (approx. 6) can fit on one slide. Uses horizontal bars for better readability with many categories. Parameters: column: The column name to analyze (e.g., 'Age', 'Gender', 'Race/Ethnicity'). data: Optional DataFrame. If None, uses self.data_filtered. title: Chart title. If None, auto-generates based on column name. height: Chart height in pixels (default: auto-sized based on categories). width: Chart width in pixels (default: 280 for compact layout). show_counts: If True, display count labels on the bars. Returns: alt.Chart: An Altair horizontal bar chart showing the distribution. """ df = self._ensure_dataframe(data) if column not in df.columns: return alt.Chart(pd.DataFrame({'text': [f"Column '{column}' not found"]})).mark_text().encode(text='text:N') # Count values in the column, including nulls stats_df = ( df.select(pl.col(column)) .with_columns(pl.col(column).fill_null("(No Response)")) .group_by(column) .agg(pl.len().alias("count")) .to_pandas() ) # Apply sorting logic if column == 'Age': # Custom sort for Age ranges # Example values: "18 to 21 years", "25 to 34 years", "70 years or more" # Extract first number to sort by stats_df['sort_key'] = stats_df[column].apply( lambda x: int(re.search(r'\d+', str(x)).group()) if re.search(r'\d+', str(x)) else 999 ) # Use EncodingSortField for Age to avoid schema issues with list-based labels sort_order = alt.EncodingSortField(field="sort_key", order="ascending") else: # Default sort by count descending sort_order = '-x' if stats_df.empty: return alt.Chart(pd.DataFrame({'text': ['No data']})).mark_text().encode(text='text:N') # Calculate percentages total = stats_df['count'].sum() stats_df['percentage'] = (stats_df['count'] / total * 100).round(1) # Clean y-labels by replacing underscores and wrapping long text import textwrap stats_df['clean_label'] = stats_df[column].astype(str).str.replace('_', ' ').apply( lambda x: textwrap.wrap(x, width=25) if isinstance(x, str) else [str(x)] ) # Calculate max lines for height adjustment max_lines = stats_df['clean_label'].apply(len).max() if not stats_df.empty else 1 # Generate title if not provided if title is None: clean_col = column.replace('_', ' ').replace('/', ' / ') title = f"Distribution: {clean_col}" # Calculate appropriate height based on number of categories and wrapping num_categories = len(stats_df) bar_height = max(20, max_lines * 15) # pixels per bar, scale with lines calculated_height = max(120, num_categories * bar_height + 40) # min 120px, +40 for title/padding # Horizontal bar chart - categories on Y axis, counts on X axis bars = alt.Chart(stats_df).mark_bar(color=ColorPalette.PRIMARY).encode( x=alt.X('count:Q', title='Count', axis=alt.Axis(grid=True)), y=alt.Y('clean_label:N', title=None, sort=sort_order, axis=alt.Axis(labelLimit=300, grid=False)), tooltip=[ alt.Tooltip('clean_label:N', title=column.replace('_', ' ')), alt.Tooltip('count:Q', title='Count'), alt.Tooltip('percentage:Q', title='Percentage', format='.1f') ] ) # Add count labels at end of bars if show_counts: text = alt.Chart(stats_df).mark_text( align='left', baseline='middle', dx=3, # Offset from bar end fontSize=9, color=ColorPalette.TEXT ).encode( x='count:Q', y=alt.Y('clean_label:N', sort=sort_order), text='count:Q' ) chart = (bars + text) else: chart = bars # Compact dimensions for 6-per-slide layout chart = chart.properties( title=self._process_title(title), width=width or 200, height=height or calculated_height ) chart = self._save_plot(chart, title) return chart def plot_speaking_style_ranking_correlation( self, style_color: str, style_traits: list[str], data: pl.LazyFrame | pl.DataFrame | None = None, title: str | None = None, filename: str | None = None, width: int | str | None = None, height: int | None = None, ) -> alt.Chart: """Plots correlation between Speaking Style Trait Scores (1-5) and Voice Ranking Points (0-3). Args: filename: Optional explicit filename (without extension) for saving. If not provided, filename is derived from title. """ df = self._ensure_dataframe(data) if title is None: title = f"Speaking style {style_color} and voice ranking points correlations" trait_correlations = [] for i, trait in enumerate(style_traits): subset = df.filter(pl.col("Right_Anchor") == trait) valid_data = subset.select(["score", "Ranking_Points"]).drop_nulls() if valid_data.height > 1: corr_val = valid_data.select(pl.corr("score", "Ranking_Points")).item() trait_display = trait.replace('|', '\n') trait_correlations.append({ "trait_display": trait_display, "trait_index": f"Trait {i+1}", "correlation": corr_val if corr_val is not None else 0.0 }) if not trait_correlations: return alt.Chart(pd.DataFrame({'text': [f"No data for {style_color} Style"]})).mark_text().encode(text='text:N') plot_df = pl.DataFrame(trait_correlations).to_pandas() chart = alt.Chart(plot_df).mark_bar().encode( x=alt.X('trait_display:N', title=None, axis=alt.Axis(labelAngle=0, grid=False)), y=alt.Y('correlation:Q', title='Correlation', scale=alt.Scale(domain=[-1, 1]), axis=alt.Axis(grid=True)), color=alt.condition( alt.datum.correlation >= 0, alt.value('green'), alt.value('red') ), tooltip=[ alt.Tooltip('trait_display:N', title='Trait'), alt.Tooltip('correlation:Q', format='.2f') ] ).properties( title=self._process_title(title), width=width or 800, height=height or 350 ) chart = self._save_plot(chart, title, filename=filename) return chart def plot_traits_wordcloud( self, data: pl.LazyFrame | pl.DataFrame | None = None, column: str = 'Top_3_Traits', title: str = "Most Prominent Personality Traits", width: int = 1600, height: int = 800, background_color: str = 'white', random_state: int = 23, ): """Create a word cloud visualization of personality traits from survey data. Args: data: Polars DataFrame or LazyFrame containing trait data column: Name of column containing comma-separated traits title: Title for the word cloud width: Width of the word cloud image in pixels height: Height of the word cloud image in pixels background_color: Background color for the word cloud random_state: Random seed for reproducible word cloud generation (default: 23) Returns: matplotlib.figure.Figure: The word cloud figure for display in notebooks """ import matplotlib.pyplot as plt from wordcloud import WordCloud from collections import Counter import random df = self._ensure_dataframe(data) # Extract and split traits traits_list = [] for row in df[column].drop_nulls(): # Split by comma and clean whitespace traits = [trait.strip() for trait in row.split(',')] traits_list.extend(traits) # Create frequency dictionary trait_freq = Counter(traits_list) # Handle empty data gracefully - return empty figure with message if not trait_freq: fig, ax = plt.subplots(figsize=(width/100, height/100), dpi=100) ax.text(0.5, 0.5, "No trait data available for current filter", ha='center', va='center', fontsize=14, color='gray', transform=ax.transAxes) ax.axis('off') ax.set_title(title, fontsize=16, pad=20, color=ColorPalette.TEXT) return fig # Set random seed for color selection random.seed(random_state) # Color function using JPMC colors def color_func(word, font_size, position, orientation, random_state=None, **kwargs): colors = [ ColorPalette.PRIMARY, ColorPalette.RANK_1, ColorPalette.RANK_2, ColorPalette.RANK_3, ] return random.choice(colors) # Generate word cloud wordcloud = WordCloud( width=width, height=height, background_color=background_color, color_func=color_func, relative_scaling=0.5, min_font_size=10, prefer_horizontal=0.7, collocations=False, # Treat each word independently random_state=random_state # Seed for reproducible layout ).generate_from_frequencies(trait_freq) # Create matplotlib figure fig, ax = plt.subplots(figsize=(width/100, height/100), dpi=100) ax.imshow(wordcloud, interpolation='bilinear') ax.axis('off') # Add title with filter subtitle (similar to _add_filter_footnote for Altair charts) filter_text = self._get_filter_description() if filter_text: # Wrap filter text to prevent excessively long lines wrapped_lines = textwrap.wrap(filter_text, width=100) wrapped_text = '\n'.join(wrapped_lines) # Use suptitle for main title (auto-positioned above axes) fig.suptitle(title, fontsize=16, color=ColorPalette.TEXT, y=1.02) # Use ax.set_title for filter text (positioned relative to axes, not figure) ax.set_title(wrapped_text, fontsize=10, color='lightgrey', loc='left', pad=5) else: ax.set_title(title, fontsize=16, pad=20, color=ColorPalette.TEXT) plt.tight_layout(pad=0) # Save figure if directory specified (using same pattern as other plots) if hasattr(self, 'fig_save_dir') and self.fig_save_dir: save_path = Path(self.fig_save_dir) # Add filter slug subfolder filter_slug = self._get_filter_slug() save_path = save_path / filter_slug if not save_path.exists(): save_path.mkdir(parents=True, exist_ok=True) # Use _sanitize_filename for consistency filename = f"{self._sanitize_filename(title)}.png" filepath = save_path / filename # Save as PNG at high resolution fig.savefig(filepath, dpi=300, bbox_inches='tight', facecolor='white') print(f"Word cloud saved to: {filepath}") return fig def plot_character_trait_frequency( self, data: pl.LazyFrame | pl.DataFrame | None = None, title: str = "Trait Frequency per Brand Character", x_label: str = "Trait", y_label: str = "Frequency (Times Chosen)", height: int | None = None, width: int | str | None = None, ) -> alt.Chart: """Create a grouped bar plot showing how often each trait is chosen per character. Original request: "I need a bar plot that shows the frequency of the times each trait is chosen per brand character" Expects data with columns: Character, Trait, Count (as produced by transform_character_trait_frequency). """ df = self._ensure_dataframe(data) # Ensure we have the expected columns required_cols = {'Character', 'Trait', 'Count'} if not required_cols.issubset(set(df.columns)): return alt.Chart(pd.DataFrame({'text': ['Data must have Character, Trait, Count columns']})).mark_text().encode(text='text:N') # Convert to pandas for Altair plot_df = df.to_pandas() if hasattr(df, 'to_pandas') else df # Calculate total per trait for sorting (traits with highest overall frequency first) trait_totals = plot_df.groupby('Trait')['Count'].sum().sort_values(ascending=False) trait_order = trait_totals.index.tolist() # Get unique characters for color mapping characters = plot_df['Character'].unique().tolist() # Interactive legend selection - click to filter selection = alt.selection_point(fields=['Character'], bind='legend') chart = alt.Chart(plot_df).mark_bar().encode( x=alt.X('Trait:N', title=x_label, sort=trait_order, axis=alt.Axis(labelAngle=-45, labelLimit=200, grid=False)), y=alt.Y('Count:Q', title=y_label, axis=alt.Axis(grid=True)), xOffset='Character:N', color=alt.Color('Character:N', scale=alt.Scale(domain=characters, range=ColorPalette.CATEGORICAL[:len(characters)]), legend=alt.Legend(orient='top', direction='horizontal', title='Character')), opacity=alt.condition(selection, alt.value(1), alt.value(0.2)), tooltip=[ alt.Tooltip('Character:N', title='Character'), alt.Tooltip('Trait:N', title='Trait'), alt.Tooltip('Count:Q', title='Frequency') ] ).add_params(selection).properties( title=self._process_title(title), width=width or 900, height=height or getattr(self, 'plot_height', 400) ) chart = self._save_plot(chart, title) return chart def plot_single_character_trait_frequency( self, data: pl.LazyFrame | pl.DataFrame | None = None, character_name: str = "Character", bar_color: str = ColorPalette.PRIMARY, highlight_color: str = ColorPalette.NEUTRAL, title: str | None = None, x_label: str = "Trait", y_label: str = "Frequency", trait_sort_order: list[str] | None = None, height: int | None = None, width: int | str | None = None, ) -> alt.Chart: """Create a bar plot showing trait frequency for a single character. Original request: "I need a bar plot that shows the frequency of the times each trait is chosen per brand character. The function should be generalized so that it can be used 4 times, once for each character. Each character should use a slightly different color. Original traits should be highlighted." This function creates one plot per character. Call it 4 times (once per character) to generate all plots for a slide. Args: data: DataFrame with columns ['trait', 'count', 'is_original'] as produced by transform_character_trait_frequency() character_name: Name of the character (for title). E.g., "Bank Teller" bar_color: Main bar color for non-original traits. Use ColorPalette constants like ColorPalette.CHARACTER_BANK_TELLER highlight_color: Lighter color for original/expected traits. Use the matching highlight like ColorPalette.CHARACTER_BANK_TELLER_HIGHLIGHT title: Custom title. If None, auto-generates from character_name x_label: X-axis label y_label: Y-axis label trait_sort_order: Optional list of traits for consistent sorting across all character plots. If None, sorts by count descending. height: Chart height width: Chart width Returns: alt.Chart: Altair bar chart """ df = self._ensure_dataframe(data) # Ensure we have the expected columns required_cols = {'trait', 'count', 'is_original'} if not required_cols.issubset(set(df.columns)): return alt.Chart(pd.DataFrame({ 'text': ['Data must have trait, count, is_original columns'] })).mark_text().encode(text='text:N') # Convert to pandas for Altair plot_df = df.to_pandas() if hasattr(df, 'to_pandas') else df # Determine sort order if trait_sort_order is not None: # Use provided order, append any missing traits at the end (sorted by count) known_traits = set(trait_sort_order) extra_traits = plot_df[~plot_df['trait'].isin(known_traits)].sort_values( 'count', ascending=False )['trait'].tolist() sort_order = trait_sort_order + extra_traits else: # Default: sort by count descending sort_order = plot_df.sort_values('count', ascending=False)['trait'].tolist() # Create category column for color encoding plot_df['category'] = plot_df['is_original'].map({ True: 'Original Trait', False: 'Other Trait' }) # Generate title if not provided if title is None: title = f"{character_name}
Trait Selection Frequency" # Build title config with sort order note as subtitle sort_note = "Sorted by total frequency across all characters" if trait_sort_order else "Sorted by frequency (descending)" title_text = self._process_title(title) title_config = { 'text': title_text, 'subtitle': sort_note, 'subtitleColor': 'gray', 'subtitleFontSize': 10, 'anchor': 'start', } # Create HORIZONTAL bar chart with conditional coloring # Reverse sort order for horizontal bars (highest at top) reversed_sort = list(reversed(sort_order)) bars = alt.Chart(plot_df).mark_bar().encode( y=alt.Y('trait:N', title=x_label, sort=reversed_sort, axis=alt.Axis(labelLimit=200, grid=False)), x=alt.X('count:Q', title=y_label, axis=alt.Axis(grid=True)), color=alt.Color('category:N', scale=alt.Scale( domain=['Original Trait', 'Other Trait'], range=[highlight_color, bar_color] ), legend=alt.Legend( orient='top', direction='horizontal', title=None )), tooltip=[ alt.Tooltip('trait:N', title='Trait'), alt.Tooltip('count:Q', title='Frequency'), alt.Tooltip('category:N', title='Type') ] ) # Add count labels on bars (to the right of bars for horizontal) text = alt.Chart(plot_df).mark_text( dx=12, color='black', fontSize=10, align='left' ).encode( y=alt.Y('trait:N', sort=reversed_sort), x=alt.X('count:Q'), text=alt.Text('count:Q') ) chart = (bars + text).properties( title=title_config, width=width or 400, height=height or getattr(self, 'plot_height', 450) ) chart = self._save_plot(chart, title) return chart def plot_significance_heatmap( self, pairwise_df: pl.LazyFrame | pl.DataFrame | None = None, metadata: dict | None = None, title: str = "Pairwise Statistical Significance
(Adjusted p-values)", show_p_values: bool = True, show_effect_size: bool = False, height: int | None = None, width: int | None = None, ) -> alt.Chart: """Create a heatmap showing pairwise statistical significance between groups. Original use-case: "I need to test for statistical significance and present this in a logical manner - as a heatmap or similar visualization." This function visualizes the output of compute_pairwise_significance() as a color-coded heatmap where color intensity indicates significance level. Args: pairwise_df: Output from compute_pairwise_significance(). Expected columns: ['group1', 'group2', 'p_value', 'p_adjusted', 'significant'] metadata: Metadata dict from compute_pairwise_significance() (optional). Used to add test information to the plot subtitle. title: Chart title (supports
for line breaks) show_p_values: Whether to display p-values as text annotations show_effect_size: Whether to display effect sizes instead of p-values height: Chart height (default: auto-sized based on groups) width: Chart width (default: auto-sized based on groups) Returns: alt.Chart: Altair heatmap chart """ df = self._ensure_dataframe(pairwise_df) # Get unique groups all_groups = sorted(set(df['group1'].to_list() + df['group2'].to_list())) n_groups = len(all_groups) # Create symmetric matrix data for heatmap # We need both directions (A,B) and (B,A) for the full matrix heatmap_data = [] for row_group in all_groups: for col_group in all_groups: if row_group == col_group: # Diagonal - self comparison heatmap_data.append({ 'row': row_group, 'col': col_group, 'p_adjusted': None, 'p_value': None, 'significant': None, 'effect_size': None, 'text_label': '—', 'sig_category': 'Self', }) else: # Find the comparison (could be in either order) match = df.filter( ((pl.col('group1') == row_group) & (pl.col('group2') == col_group)) | ((pl.col('group1') == col_group) & (pl.col('group2') == row_group)) ) if match.height > 0: p_adj = match['p_adjusted'][0] p_val = match['p_value'][0] sig = match['significant'][0] eff = match['effect_size'][0] if 'effect_size' in match.columns else None # For ranking data, we can show Rank 1 % difference has_rank_pcts = 'rank1_pct1' in match.columns and 'rank1_pct2' in match.columns if has_rank_pcts: pct_diff = abs(match['rank1_pct1'][0] - match['rank1_pct2'][0]) else: pct_diff = None # Helper to get display text when not showing p-values def get_alt_text(): if eff is not None: return f'{eff:.2f}' elif pct_diff is not None: return f'{pct_diff:.1f}%' else: return '—' # Categorize significance level if p_adj is None: sig_cat = 'N/A' text = 'N/A' elif p_adj < 0.001: sig_cat = 'p < 0.001' text = '<.001' if show_p_values else get_alt_text() elif p_adj < 0.01: sig_cat = 'p < 0.01' text = f'{p_adj:.3f}' if show_p_values else get_alt_text() elif p_adj < 0.05: sig_cat = 'p < 0.05' text = f'{p_adj:.3f}' if show_p_values else get_alt_text() else: sig_cat = 'n.s.' text = f'{p_adj:.2f}' if show_p_values else get_alt_text() if show_effect_size: text = get_alt_text() heatmap_data.append({ 'row': row_group, 'col': col_group, 'p_adjusted': p_adj, 'p_value': p_val, 'significant': sig, 'effect_size': eff, 'text_label': text, 'sig_category': sig_cat, }) else: heatmap_data.append({ 'row': row_group, 'col': col_group, 'p_adjusted': None, 'p_value': None, 'significant': None, 'effect_size': None, 'text_label': 'N/A', 'sig_category': 'N/A', }) heatmap_df = pl.DataFrame(heatmap_data).to_pandas() # Define color scale for significance categories sig_domain = ['p < 0.001', 'p < 0.01', 'p < 0.05', 'n.s.', 'Self', 'N/A'] sig_range = [ ColorPalette.SIG_STRONG, # p < 0.001 ColorPalette.SIG_MODERATE, # p < 0.01 ColorPalette.SIG_WEAK, # p < 0.05 ColorPalette.SIG_NONE, # not significant ColorPalette.SIG_DIAGONAL, # diagonal (self) ColorPalette.NEUTRAL, # N/A ] # Build tooltip fields based on available data tooltip_fields = [ alt.Tooltip('row:N', title='Group 1'), alt.Tooltip('col:N', title='Group 2'), alt.Tooltip('p_value:Q', title='p-value', format='.4f'), alt.Tooltip('p_adjusted:Q', title='Adjusted p', format='.4f'), ] # Only add effect_size if it has non-null values (continuous data) has_effect_size = 'effect_size' in heatmap_df.columns and heatmap_df['effect_size'].notna().any() if has_effect_size: tooltip_fields.append(alt.Tooltip('effect_size:Q', title='Effect Size', format='.3f')) # Add rank info for ranking data has_rank_pcts = 'rank1_pct1' in df.columns if isinstance(df, pl.DataFrame) else False if has_rank_pcts: tooltip_fields.append(alt.Tooltip('text_label:N', title='Rank 1 % Diff')) # Calculate dimensions cell_size = 45 auto_size = n_groups * cell_size + 100 chart_width = width or auto_size chart_height = height or auto_size # Base heatmap heatmap = alt.Chart(heatmap_df).mark_rect(stroke='white', strokeWidth=1).encode( x=alt.X('col:N', title=None, sort=all_groups, axis=alt.Axis(labelAngle=-45, labelLimit=150, grid=False)), y=alt.Y('row:N', title=None, sort=all_groups, axis=alt.Axis(labelLimit=150, grid=False)), color=alt.Color('sig_category:N', scale=alt.Scale(domain=sig_domain, range=sig_range), legend=alt.Legend( title='Significance', orient='right', direction='vertical' )), tooltip=tooltip_fields ) # Text annotations if show_p_values or show_effect_size: # Add a column for text color based on significance heatmap_df['text_color'] = heatmap_df['sig_category'].apply( lambda x: 'white' if x in ['p < 0.001', 'p < 0.01'] else 'black' ) text = alt.Chart(heatmap_df).mark_text( fontSize=9, fontWeight='normal' ).encode( x=alt.X('col:N', sort=all_groups), y=alt.Y('row:N', sort=all_groups), text='text_label:N', color=alt.Color('text_color:N', scale=None), ) chart = (heatmap + text) else: chart = heatmap # Build subtitle with test info subtitle_lines = [] if metadata: test_info = f"Test: {metadata.get('test_type', 'N/A')}" if metadata.get('overall_p_value') is not None: test_info += f" | Overall p={metadata['overall_p_value']:.4f}" correction = metadata.get('correction', 'none') if correction != 'none': test_info += f" | Correction: {correction}" subtitle_lines.append(test_info) title_config = { 'text': self._process_title(title), 'subtitle': subtitle_lines if subtitle_lines else None, 'subtitleColor': 'gray', 'subtitleFontSize': 10, 'anchor': 'start', } chart = chart.properties( title=title_config, width=chart_width, height=chart_height, ) chart = self._save_plot(chart, title) return chart def plot_significance_summary( self, pairwise_df: pl.LazyFrame | pl.DataFrame | None = None, metadata: dict | None = None, title: str = "Significant Differences Summary
(Groups with significantly different means)", height: int | None = None, width: int | None = None, ) -> alt.Chart: """Create a summary bar chart showing which groups have significant differences. This shows each group with a count of how many other groups it differs from significantly, plus the mean score or Rank 1 percentage for reference. Args: pairwise_df: Output from compute_pairwise_significance() or compute_ranking_significance(). metadata: Metadata dict from the significance computation (optional). title: Chart title height: Chart height width: Chart width Returns: alt.Chart: Altair bar chart with significance count per group """ df = self._ensure_dataframe(pairwise_df) # Detect data type: continuous (has mean1/mean2) vs ranking (has rank1_pct1/rank1_pct2) has_means = 'mean1' in df.columns has_ranks = 'rank1_pct1' in df.columns # Count significant differences per group sig_df = df.filter(pl.col('significant') == True) # Count for each group (appears as either group1 or group2) group1_counts = sig_df.group_by('group1').agg(pl.len().alias('count')) group2_counts = sig_df.group_by('group2').agg(pl.len().alias('count')) # Combine counts all_groups = sorted(set(df['group1'].to_list() + df['group2'].to_list())) summary_data = [] for group in all_groups: count1 = group1_counts.filter(pl.col('group1') == group)['count'].to_list() count2 = group2_counts.filter(pl.col('group2') == group)['count'].to_list() total_sig = (count1[0] if count1 else 0) + (count2[0] if count2 else 0) # Get score for this group from pairwise data if has_means: # Continuous data - use means scores = df.filter(pl.col('group1') == group)['mean1'].to_list() if not scores: scores = df.filter(pl.col('group2') == group)['mean2'].to_list() score_val = scores[0] if scores else None score_label = 'mean' elif has_ranks: # Ranking data - use Rank 1 percentage scores = df.filter(pl.col('group1') == group)['rank1_pct1'].to_list() if not scores: scores = df.filter(pl.col('group2') == group)['rank1_pct2'].to_list() score_val = scores[0] if scores else None score_label = 'rank1_pct' else: score_val = None score_label = 'score' summary_data.append({ 'group': group, 'sig_count': total_sig, 'score': score_val, }) summary_df = pl.DataFrame(summary_data).sort('score', descending=True, nulls_last=True).to_pandas() # Create layered chart: bars for sig_count, text for score tooltip_title = 'Mean Score' if has_means else 'Rank 1 %' if has_ranks else 'Score' bars = alt.Chart(summary_df).mark_bar(color=ColorPalette.PRIMARY).encode( x=alt.X('group:N', title='Group', sort='-y', axis=alt.Axis(grid=False)), y=alt.Y('sig_count:Q', title='# of Significant Differences', axis=alt.Axis(grid=True)), tooltip=[ alt.Tooltip('group:N', title='Group'), alt.Tooltip('sig_count:Q', title='Sig. Differences'), alt.Tooltip('score:Q', title=tooltip_title, format='.1f'), ] ) # Only add text labels if we have scores if summary_df['score'].notna().any(): text_format = '.1f' if has_means else '.0f' text_suffix = '%' if has_ranks else '' text = alt.Chart(summary_df).mark_text( dy=-8, color='black', fontSize=9 ).encode( x=alt.X('group:N', sort='-y'), y=alt.Y('sig_count:Q'), text=alt.Text('score:Q', format=text_format) ) chart_layers = bars + text else: chart_layers = bars # Build subtitle subtitle = None if metadata: if has_means: subtitle = f"Mean scores shown above bars | α={metadata.get('alpha', 0.05)}" elif has_ranks: subtitle = f"Rank 1 % shown above bars | α={metadata.get('alpha', 0.05)}" else: subtitle = f"α={metadata.get('alpha', 0.05)}" title_config = { 'text': self._process_title(title), 'subtitle': subtitle, 'subtitleColor': 'gray', 'subtitleFontSize': 10, 'anchor': 'start', } chart = chart_layers.properties( title=title_config, width=width or 800, height=height or getattr(self, 'plot_height', 400), ) chart = self._save_plot(chart, title) return chart def plot_straight_liner_repeat_offenders( self, cumulative_df: pl.DataFrame | pd.DataFrame, title: str = "Straight-Liner Repeat Offenders\n(Cumulative Distribution)", height: int | None = None, width: int | str | None = None, total_respondents: int | None = None, ) -> alt.Chart: """Plot the cumulative distribution of straight-liner repeat offenders. Shows how many respondents straight-lined at N or more question groups, for every observed threshold. Parameters: cumulative_df: DataFrame with columns ``threshold`` (int), ``count`` (int) and ``pct`` (float, 0-100). Each row represents "≥ threshold question groups". title: Chart title. height: Chart height in pixels. width: Chart width in pixels. total_respondents: If provided, shown in the subtitle for context. Returns: The Altair chart object (already saved if ``fig_save_dir`` is configured). """ if isinstance(cumulative_df, pl.DataFrame): plot_df = cumulative_df.to_pandas() else: plot_df = cumulative_df.copy() # Build readable x-axis labels ("≥1", "≥2", …) plot_df["label"] = plot_df["threshold"].apply(lambda t: f"≥{t}") # Explicit sort order so Altair keeps ascending threshold sort_order = plot_df.sort_values("threshold")["label"].tolist() # --- Bars: respondent count --- bars = alt.Chart(plot_df).mark_bar( color=ColorPalette.PRIMARY ).encode( x=alt.X( "label:N", title="Number of Straight-Lined Question Groups", sort=sort_order, axis=alt.Axis(grid=False), ), y=alt.Y( "count:Q", title="Number of Respondents", axis=alt.Axis(grid=True), ), tooltip=[ alt.Tooltip("label:N", title="Threshold"), alt.Tooltip("count:Q", title="Respondents"), alt.Tooltip("pct:Q", title="% of Total", format=".1f"), ], ) # --- Text: count + percentage above each bar --- text = alt.Chart(plot_df).mark_text( dy=-10, color="black", fontSize=11 ).encode( x=alt.X("label:N", sort=sort_order), y=alt.Y("count:Q"), text=alt.Text("count_label:N"), ) # Build a combined label column "N (xx.x%)" plot_df["count_label"] = plot_df.apply( lambda r: f"{int(r['count'])} ({r['pct']:.1f}%)", axis=1 ) # Rebuild text layer with the updated df text = alt.Chart(plot_df).mark_text( dy=-10, color="black", fontSize=11 ).encode( x=alt.X("label:N", sort=sort_order), y=alt.Y("count:Q"), text=alt.Text("count_label:N"), ) # --- Subtitle --- subtitle_parts = [] if total_respondents is not None: subtitle_parts.append( f"Total respondents: {total_respondents}" ) subtitle_parts.append( "Each bar shows how many respondents straight-lined " "at least that many question groups" ) subtitle = " | ".join(subtitle_parts) title_config = { "text": self._process_title(title), "subtitle": subtitle, "subtitleColor": "gray", "subtitleFontSize": 10, "anchor": "start", } chart = alt.layer(bars, text).properties( title=title_config, width=width or 800, height=height or getattr(self, "plot_height", 400), ) chart = self._save_plot(chart, title) return chart def plot_straight_liner_per_question( self, per_question_df: pl.DataFrame | pd.DataFrame, title: str = "Straight-Lining Frequency per Question Group", height: int | None = None, width: int | str | None = None, total_respondents: int | None = None, ) -> alt.Chart: """Plot how often each question group is straight-lined. Parameters: per_question_df: DataFrame with columns ``question`` (str, human-readable name), ``count`` (int) and ``pct`` (float, 0-100). Sorted descending by count. title: Chart title. height: Chart height in pixels. width: Chart width in pixels. total_respondents: Shown in subtitle for context. Returns: The Altair chart (saved if ``fig_save_dir`` is set). """ if isinstance(per_question_df, pl.DataFrame): plot_df = per_question_df.to_pandas() else: plot_df = per_question_df.copy() # Sort order: largest count at top. Altair y-axis nominal sort places # the first list element at the top, so descending order is correct. sort_order = plot_df.sort_values("count", ascending=False)["question"].tolist() # Combined label "N (xx.x%)" plot_df["count_label"] = plot_df.apply( lambda r: f"{int(r['count'])} ({r['pct']:.1f}%)", axis=1 ) # --- Horizontal Bars --- bars = alt.Chart(plot_df).mark_bar( color=ColorPalette.PRIMARY, ).encode( y=alt.Y( "question:N", title=None, sort=sort_order, axis=alt.Axis(grid=False, labelLimit=250, labelAngle=0), ), x=alt.X( "count:Q", title="Number of Straight-Liners", axis=alt.Axis(grid=True), ), tooltip=[ alt.Tooltip("question:N", title="Question"), alt.Tooltip("count:Q", title="Straight-Liners"), alt.Tooltip("pct:Q", title="% of Respondents", format=".1f"), ], ) # --- Text labels to the right of bars --- text = alt.Chart(plot_df).mark_text( align="left", dx=4, color="black", fontSize=10, ).encode( y=alt.Y("question:N", sort=sort_order), x=alt.X("count:Q"), text=alt.Text("count_label:N"), ) # --- Subtitle --- subtitle_parts = [] if total_respondents is not None: subtitle_parts.append(f"Total respondents: {total_respondents}") subtitle_parts.append( "Count and share of respondents who straight-lined each question group" ) subtitle = " | ".join(subtitle_parts) title_config = { "text": self._process_title(title), "subtitle": subtitle, "subtitleColor": "gray", "subtitleFontSize": 10, "anchor": "start", } # Scale height with number of questions for readable bar spacing n_questions = len(plot_df) auto_height = max(400, n_questions * 22) chart = alt.layer(bars, text).properties( title=title_config, width=width or 700, height=height or auto_height, ) chart = self._save_plot(chart, title) return chart