"""Plotting functions for Voice Branding analysis using Altair.""" import re import math from pathlib import Path import altair as alt import pandas as pd import polars as pl from theme import ColorPalette import hashlib class QualtricsPlotsMixin: """Mixin class for plotting functions in QualtricsSurvey.""" def _process_title(self, title: str) -> str | list[str]: """Process title to handle
tags for Altair.""" if isinstance(title, str) and '
' in title: return title.split('
') return title def _sanitize_filename(self, title: str) -> str: """Convert plot title to a safe filename.""" # Remove HTML tags clean = re.sub(r'<[^>]+>', ' ', title) # Replace special characters with underscores clean = re.sub(r'[^\w\s-]', '', clean) # Replace whitespace with underscores clean = re.sub(r'\s+', '_', clean.strip()) # Remove consecutive underscores clean = re.sub(r'_+', '_', clean) # Lowercase and limit length return clean.lower()[:100] def _get_filter_slug(self) -> str: """Generate a directory-friendly slug based on active filters.""" parts = [] # Mapping of attribute name to (short_code, value, options_attr) filters = [ ('age', 'Age', getattr(self, 'filter_age', None), 'options_age'), ('gender', 'Gen', getattr(self, 'filter_gender', None), 'options_gender'), ('consumer', 'Cons', getattr(self, 'filter_consumer', None), 'options_consumer'), ('ethnicity', 'Eth', getattr(self, 'filter_ethnicity', None), 'options_ethnicity'), ('income', 'Inc', getattr(self, 'filter_income', None), 'options_income'), ] for _, short_code, value, options_attr in filters: if value is None: continue # Ensure value is a list for uniform handling if not isinstance(value, list): value = [value] if len(value) == 0: continue # Check if all options are selected (equivalent to no filter) # We compare the set of selected values to the set of all available options master_list = getattr(self, options_attr, None) if master_list and set(value) == set(master_list): continue if len(value) > 3: # If more than 3 options selected, create a hash of the sorted values # This ensures uniqueness properly while keeping the slug short sorted_vals = sorted([str(v) for v in value]) vals_str = "".join(sorted_vals) # Create short 6-char hash val_hash = hashlib.md5(vals_str.encode()).hexdigest()[:6] val_str = f"{len(value)}_grps_{val_hash}" else: # Join values with '+' clean_values = [] for v in value: # Simple sanitization: keep alphanum and hyphens/dots, remove others s = str(v) # Remove special chars that might be problematic in dir names s = re.sub(r'[^\w\-\.]', '', s) clean_values.append(s) val_str = "+".join(clean_values) parts.append(f"{short_code}-{val_str}") if not parts: return "All_Respondents" return "_".join(parts) def _get_filter_description(self) -> str: """Generate a human-readable description of active filters.""" parts = [] # Mapping of attribute name to (display_name, value, options_attr) filters = [ ('Age', getattr(self, 'filter_age', None), 'options_age'), ('Gender', getattr(self, 'filter_gender', None), 'options_gender'), ('Consumer', getattr(self, 'filter_consumer', None), 'options_consumer'), ('Ethnicity', getattr(self, 'filter_ethnicity', None), 'options_ethnicity'), ('Income', getattr(self, 'filter_income', None), 'options_income'), ] for display_name, value, options_attr in filters: if value is None: continue # Ensure value is a list for uniform handling if not isinstance(value, list): value = [value] if len(value) == 0: continue # Check if all options are selected (equivalent to no filter) master_list = getattr(self, options_attr, None) if master_list and set(value) == set(master_list): continue # Use original values for display (full list) clean_values = [str(v) for v in value] val_str = ", ".join(clean_values) # Use UPPERCASE for category name to distinguish from values parts.append(f"{display_name.upper()}: {val_str}") if not parts: return "" # Join with clear separator - double space for visual break return "Filters: " + " — ".join(parts) def _add_filter_footnote(self, chart: alt.Chart) -> alt.Chart: """Add a footnote with active filters to the chart. Uses chart subtitle for filter text to avoid layout issues with vconcat. Returns the modified chart (or original if no filters). """ filter_text = self._get_filter_description() # Skip if no filters active - return original chart if not filter_text: return chart # Wrap text into multiple lines at ~100 chars, but don't break mid-word max_line_length = 100 words = filter_text.split() lines = [] current_line = "" for word in words: test_line = f"{current_line} {word}".strip() if current_line else word if len(test_line) <= max_line_length: current_line = test_line else: if current_line: lines.append(current_line) current_line = word if current_line: lines.append(current_line) # Get existing title from chart spec chart_spec = chart.to_dict() existing_title = chart_spec.get('title', '') # Handle different title formats (string vs dict vs list) if isinstance(existing_title, (str, list)): title_config = { 'text': existing_title, 'subtitle': lines, 'subtitleColor': 'gray', 'subtitleFontSize': 10, 'anchor': 'start', } elif isinstance(existing_title, dict): title_config = existing_title.copy() title_config['subtitle'] = lines title_config['subtitleColor'] = 'gray' title_config['subtitleFontSize'] = 10 title_config['anchor'] = 'start' else: # No existing title, just add filters as subtitle title_config = { 'text': '', 'subtitle': lines, 'subtitleColor': 'gray', 'subtitleFontSize': 10, 'anchor': 'start', } return chart.properties(title=title_config) def _save_plot(self, chart: alt.Chart, title: str) -> alt.Chart: """Save chart to PNG file if fig_save_dir is set. Returns the (potentially modified) chart with filter footnote added. """ # Add filter footnote - returns combined chart if filters active chart = self._add_filter_footnote(chart) if hasattr(self, 'fig_save_dir') and self.fig_save_dir: path = Path(self.fig_save_dir) # Add filter slug subfolder filter_slug = self._get_filter_slug() path = path / filter_slug if not path.exists(): path.mkdir(parents=True, exist_ok=True) filename = f"{self._sanitize_filename(title)}.png" filepath = path / filename # Use vl_convert directly with theme config for consistent rendering import vl_convert as vlc from theme import jpmc_altair_theme # Get chart spec and theme config chart_spec = chart.to_dict() theme_config = jpmc_altair_theme()['config'] png_data = vlc.vegalite_to_png( vl_spec=chart_spec, scale=2.0, ppi=72, config=theme_config ) with open(filepath, 'wb') as f: f.write(png_data) return chart def _ensure_dataframe(self, data: pl.LazyFrame | pl.DataFrame | None) -> pl.DataFrame: """Ensure data is an eager DataFrame, collecting if necessary.""" df = data if data is not None else getattr(self, 'data_filtered', None) if df is None: raise ValueError("No data provided and self.data_filtered is None.") if isinstance(df, pl.LazyFrame): return df.collect() return df def _clean_voice_label(self, col_name: str) -> str: """Extract and clean voice name from column name for display. Handles patterns like: - 'Voice_Scale__The_Coach' -> 'The Coach' - 'Character_Ranking_The_Coach' -> 'The Coach' - 'Top_3_Voices_ranking__Familiar_Friend' -> 'Familiar Friend' """ # First split by __ if present label = col_name.split('__')[-1] if '__' in col_name else col_name # Remove common prefixes label = label.replace('Character_Ranking_', '') label = label.replace('Top_3_Voices_ranking_', '') # Replace underscores with spaces label = label.replace('_', ' ').strip() return label def plot_average_scores_with_counts( self, data: pl.LazyFrame | pl.DataFrame | None = None, title: str = "General Impression (1-10)\nPer Voice with Number of Participants Who Rated It", x_label: str = "Stimuli", y_label: str = "Average General Impression Rating (1-10)", color: str = ColorPalette.PRIMARY, height: int | None = None, width: int | str | None = None, domain: list[float] | None = None, ) -> alt.Chart: """Create a bar plot showing average scores and count of non-null values for each column.""" df = self._ensure_dataframe(data) # Calculate stats for each column (exclude _recordId) stats = [] for col in [c for c in df.columns if c != '_recordId']: avg_score = df[col].mean() non_null_count = df[col].drop_nulls().len() label = self._clean_voice_label(col) stats.append({ 'voice': label, 'average': avg_score, 'count': non_null_count }) # Convert to pandas for Altair (sort by average descending) stats_df = pl.DataFrame(stats).sort('average', descending=True).to_pandas() if domain is None: domain = [stats_df['average'].min(), stats_df['average'].max()] # Base bar chart bars = alt.Chart(stats_df).mark_bar(color=color).encode( x=alt.X('voice:N', title=x_label, sort='-y'), y=alt.Y('average:Q', title=y_label, scale=alt.Scale(domain=domain)), tooltip=[ alt.Tooltip('voice:N', title='Voice'), alt.Tooltip('average:Q', title='Average', format='.2f'), alt.Tooltip('count:Q', title='Count') ] ) # Text overlay for counts text = alt.Chart(stats_df).mark_text( dy=-5, color='black', fontSize=10 ).encode( x=alt.X('voice:N', sort='-y'), y=alt.Y('average:Q'), text=alt.Text('count:Q') ) # Combine layers chart = (bars + text).properties( title=self._process_title(title), width=width or 800, height=height or getattr(self, 'plot_height', 400) ) chart = self._save_plot(chart, title) return chart def plot_top3_ranking_distribution( self, data: pl.LazyFrame | pl.DataFrame | None = None, title: str = "Top 3 Rankings Distribution\nCount of 1st, 2nd, and 3rd Place Votes per Voice", x_label: str = "Voices", y_label: str = "Number of Mentions in Top 3", height: int | None = None, width: int | str | None = None, ) -> alt.Chart: """Create a stacked bar chart showing how often each voice was ranked 1st, 2nd, or 3rd.""" df = self._ensure_dataframe(data) # Calculate stats per column stats = [] for col in [c for c in df.columns if c != '_recordId']: rank1 = df.filter(pl.col(col) == 1).height rank2 = df.filter(pl.col(col) == 2).height rank3 = df.filter(pl.col(col) == 3).height total = rank1 + rank2 + rank3 if total > 0: label = self._clean_voice_label(col) # Add 3 rows (one per rank) stats.append({'voice': label, 'rank': 'Rank 1 (1st Choice)', 'count': rank1, 'total': total}) stats.append({'voice': label, 'rank': 'Rank 2 (2nd Choice)', 'count': rank2, 'total': total}) stats.append({'voice': label, 'rank': 'Rank 3 (3rd Choice)', 'count': rank3, 'total': total}) # Convert to long format, sort by total stats_df = pl.DataFrame(stats).to_pandas() # Interactive legend selection - click to filter selection = alt.selection_point(fields=['rank'], bind='legend') # Create stacked bar chart with interactive legend chart = alt.Chart(stats_df).mark_bar().encode( x=alt.X('voice:N', title=x_label, sort=alt.EncodingSortField(field='total', op='sum', order='descending')), y=alt.Y('count:Q', title=y_label, stack='zero'), color=alt.Color('rank:N', scale=alt.Scale(domain=['Rank 1 (1st Choice)', 'Rank 2 (2nd Choice)', 'Rank 3 (3rd Choice)'], range=[ColorPalette.RANK_1, ColorPalette.RANK_2, ColorPalette.RANK_3]), legend=alt.Legend(orient='top', direction='horizontal', title=None)), order=alt.Order('rank:N', sort='ascending'), opacity=alt.condition(selection, alt.value(1), alt.value(0.2)), tooltip=[ alt.Tooltip('voice:N', title='Voice'), alt.Tooltip('rank:N', title='Rank'), alt.Tooltip('count:Q', title='Count') ] ).add_params(selection).properties( title=self._process_title(title), width=width or 800, height=height or getattr(self, 'plot_height', 400) ) chart = self._save_plot(chart, title) return chart def plot_ranking_distribution( self, data: pl.LazyFrame | pl.DataFrame | None = None, title: str = "Rankings Distribution\n(1st to 4th Place)", x_label: str = "Item", y_label: str = "Number of Votes", height: int | None = None, width: int | str | None = None, ) -> alt.Chart: """Create a stacked bar chart showing the distribution of rankings (1st to 4th).""" df = self._ensure_dataframe(data) stats = [] ranking_cols = [c for c in df.columns if c != '_recordId'] for col in ranking_cols: r1 = df.filter(pl.col(col) == 1).height r2 = df.filter(pl.col(col) == 2).height r3 = df.filter(pl.col(col) == 3).height r4 = df.filter(pl.col(col) == 4).height total = r1 + r2 + r3 + r4 if total > 0: label = self._clean_voice_label(col) stats.append({'item': label, 'rank': 'Rank 1 (Best)', 'count': r1, 'rank1': r1}) stats.append({'item': label, 'rank': 'Rank 2', 'count': r2, 'rank1': r1}) stats.append({'item': label, 'rank': 'Rank 3', 'count': r3, 'rank1': r1}) stats.append({'item': label, 'rank': 'Rank 4 (Worst)', 'count': r4, 'rank1': r1}) if not stats: return alt.Chart(pd.DataFrame({'text': ['No data']})).mark_text().encode(text='text:N') stats_df = pl.DataFrame(stats).to_pandas() # Interactive legend selection - click to filter selection = alt.selection_point(fields=['rank'], bind='legend') chart = alt.Chart(stats_df).mark_bar().encode( x=alt.X('item:N', title=x_label, sort=alt.EncodingSortField(field='rank1', order='descending')), y=alt.Y('count:Q', title=y_label, stack='zero'), color=alt.Color('rank:N', scale=alt.Scale(domain=['Rank 1 (Best)', 'Rank 2', 'Rank 3', 'Rank 4 (Worst)'], range=[ColorPalette.RANK_1, ColorPalette.RANK_2, ColorPalette.RANK_3, ColorPalette.RANK_4]), legend=alt.Legend(orient='top', direction='horizontal', title=None)), order=alt.Order('rank:N', sort='ascending'), opacity=alt.condition(selection, alt.value(1), alt.value(0.2)), tooltip=[ alt.Tooltip('item:N', title='Item'), alt.Tooltip('rank:N', title='Rank'), alt.Tooltip('count:Q', title='Count') ] ).add_params(selection).properties( title=self._process_title(title), width=width or 800, height=height or getattr(self, 'plot_height', 400) ) chart = self._save_plot(chart, title) return chart def plot_most_ranked_1( self, data: pl.LazyFrame | pl.DataFrame | None = None, title: str = "Most Popular Choice\n(Number of Times Ranked 1st)", x_label: str = "Item", y_label: str = "Count of 1st Place Rankings", height: int | None = None, width: int | str | None = None, ) -> alt.Chart: """Create a bar chart showing which item was ranked #1 the most. Top 3 highlighted.""" df = self._ensure_dataframe(data) stats = [] ranking_cols = [c for c in df.columns if c != '_recordId'] for col in ranking_cols: count_rank_1 = df.filter(pl.col(col) == 1).height label = self._clean_voice_label(col) stats.append({'item': label, 'count': count_rank_1}) # Convert and sort stats_df = pl.DataFrame(stats).sort('count', descending=True) # Add rank column for coloring (1-3 vs 4+) stats_df = stats_df.with_row_index('rank_index') stats_df = stats_df.with_columns( pl.when(pl.col('rank_index') < 3) .then(pl.lit('Top 3')) .otherwise(pl.lit('Other')) .alias('category') ).to_pandas() # Bar chart with conditional color chart = alt.Chart(stats_df).mark_bar().encode( x=alt.X('item:N', title=x_label, sort='-y'), y=alt.Y('count:Q', title=y_label), color=alt.Color('category:N', scale=alt.Scale(domain=['Top 3', 'Other'], range=[ColorPalette.PRIMARY, ColorPalette.NEUTRAL]), legend=None), tooltip=[ alt.Tooltip('item:N', title='Item'), alt.Tooltip('count:Q', title='1st Place Votes') ] ).properties( title=self._process_title(title), width=width or 800, height=height or getattr(self, 'plot_height', 400) ) chart = self._save_plot(chart, title) return chart def plot_weighted_ranking_score( self, data: pl.LazyFrame | pl.DataFrame | None = None, title: str = "Weighted Popularity Score\n(1st=3pts, 2nd=2pts, 3rd=1pt)", x_label: str = "Character Personality", y_label: str = "Total Weighted Score", color: str = ColorPalette.PRIMARY, height: int | None = None, width: int | str | None = None, ) -> alt.Chart: """Create a bar chart showing the weighted ranking score for each character.""" weighted_df = self._ensure_dataframe(data).to_pandas() # Bar chart bars = alt.Chart(weighted_df).mark_bar(color=color).encode( x=alt.X('Character:N', title=x_label, sort='-y'), y=alt.Y('Weighted Score:Q', title=y_label), tooltip=[ alt.Tooltip('Character:N'), alt.Tooltip('Weighted Score:Q', title='Score') ] ) # Text overlay text = bars.mark_text( dy=-5, color='white', fontSize=11 ).encode( text='Weighted Score:Q' ) chart = (bars + text).properties( title=self._process_title(title), width=width or 800, height=height or getattr(self, 'plot_height', 400) ) chart = self._save_plot(chart, title) return chart def plot_voice_selection_counts( self, data: pl.LazyFrame | pl.DataFrame | None = None, target_column: str = "8_Combined", title: str = "Most Frequently Chosen Voices\n(Top 8 Highlighted)", x_label: str = "Voice", y_label: str = "Number of Times Chosen", height: int | None = None, width: int | str | None = None, ) -> alt.Chart: """Create a bar plot showing the frequency of voice selections.""" df = self._ensure_dataframe(data) if target_column not in df.columns: return alt.Chart(pd.DataFrame({'text': [f"Column '{target_column}' not found"]})).mark_text().encode(text='text:N') # Process data: split, explode, count stats_df = ( df.select(pl.col(target_column)) .drop_nulls() .with_columns(pl.col(target_column).str.split(",")) .explode(target_column) .with_columns(pl.col(target_column).str.strip_chars()) .filter(pl.col(target_column) != "") .group_by(target_column) .agg(pl.len().alias("count")) .sort("count", descending=True) .with_row_index('rank_index') .with_columns( pl.when(pl.col('rank_index') < 8) .then(pl.lit('Top 8')) .otherwise(pl.lit('Other')) .alias('category') ) .to_pandas() ) chart = alt.Chart(stats_df).mark_bar().encode( x=alt.X(f'{target_column}:N', title=x_label, sort='-y'), y=alt.Y('count:Q', title=y_label), color=alt.Color('category:N', scale=alt.Scale(domain=['Top 8', 'Other'], range=[ColorPalette.PRIMARY, ColorPalette.NEUTRAL]), legend=None), tooltip=[ alt.Tooltip(f'{target_column}:N', title='Voice'), alt.Tooltip('count:Q', title='Selections') ] ).properties( title=self._process_title(title), width=width or 800, height=height or getattr(self, 'plot_height', 400) ) chart = self._save_plot(chart, title) return chart def plot_top3_selection_counts( self, data: pl.LazyFrame | pl.DataFrame | None = None, target_column: str = "3_Ranked", title: str = "Most Frequently Chosen Top 3 Voices\n(Top 3 Highlighted)", x_label: str = "Voice", y_label: str = "Count of Mentions in Top 3", height: int | None = None, width: int | str | None = None, ) -> alt.Chart: """Question: Which 3 voices are chosen the most out of 18?""" df = self._ensure_dataframe(data) if target_column not in df.columns: return alt.Chart(pd.DataFrame({'text': [f"Column '{target_column}' not found"]})).mark_text().encode(text='text:N') stats_df = ( df.select(pl.col(target_column)) .drop_nulls() .with_columns(pl.col(target_column).str.split(",")) .explode(target_column) .with_columns(pl.col(target_column).str.strip_chars()) .filter(pl.col(target_column) != "") .group_by(target_column) .agg(pl.len().alias("count")) .sort("count", descending=True) .with_row_index('rank_index') .with_columns( pl.when(pl.col('rank_index') < 3) .then(pl.lit('Top 3')) .otherwise(pl.lit('Other')) .alias('category') ) .to_pandas() ) chart = alt.Chart(stats_df).mark_bar().encode( x=alt.X(f'{target_column}:N', title=x_label, sort='-y'), y=alt.Y('count:Q', title=y_label), color=alt.Color('category:N', scale=alt.Scale(domain=['Top 3', 'Other'], range=[ColorPalette.PRIMARY, ColorPalette.NEUTRAL]), legend=None), tooltip=[ alt.Tooltip(f'{target_column}:N', title='Voice'), alt.Tooltip('count:Q', title='In Top 3') ] ).properties( title=self._process_title(title), width=width or 800, height=height or getattr(self, 'plot_height', 400) ) chart = self._save_plot(chart, title) return chart def plot_speaking_style_trait_scores( self, data: pl.LazyFrame | pl.DataFrame | None = None, trait_description: str = None, left_anchor: str = None, right_anchor: str = None, title: str = "Speaking Style Trait Analysis", height: int | None = None, width: int | str | None = None, ) -> alt.Chart: """Plot scores for a single speaking style trait across multiple voices.""" df = self._ensure_dataframe(data) if df.is_empty(): return alt.Chart(pd.DataFrame({'text': ['No data']})).mark_text().encode(text='text:N') required_cols = ["Voice", "score"] if not all(col in df.columns for col in required_cols): return alt.Chart(pd.DataFrame({'text': ['Missing required columns']})).mark_text().encode(text='text:N') # Calculate stats: Mean, Count stats = ( df.filter(pl.col("score").is_not_null()) .group_by("Voice") .agg([ pl.col("score").mean().alias("mean_score"), pl.col("score").count().alias("count") ]) .sort("mean_score", descending=False) # Ascending for bottom-to-top display .to_pandas() ) # Extract anchors from data if not provided if (left_anchor is None or right_anchor is None) and "Left_Anchor" in df.columns: head = df.filter(pl.col("Left_Anchor").is_not_null()).head(1) if not head.is_empty(): if left_anchor is None: left_anchor = head["Left_Anchor"][0] if right_anchor is None: right_anchor = head["Right_Anchor"][0] if trait_description is None: if left_anchor and right_anchor: trait_description = f"{left_anchor.split('|')[0]} vs. {right_anchor.split('|')[0]}" elif "Description" in df.columns: head = df.filter(pl.col("Description").is_not_null()).head(1) trait_description = head["Description"][0] if not head.is_empty() else "" else: trait_description = "" # Horizontal bar chart - use x2 to explicitly start bars at x=1 bars = alt.Chart(stats).mark_bar(color=ColorPalette.PRIMARY).encode( x=alt.X('mean_score:Q', title='Average Score (1-5)', scale=alt.Scale(domain=[1, 5])), x2=alt.datum(1), # Bars start at x=1 (left edge of domain) y=alt.Y('Voice:N', title='Voice', sort='-x'), tooltip=[ alt.Tooltip('Voice:N'), alt.Tooltip('mean_score:Q', title='Average', format='.2f'), alt.Tooltip('count:Q', title='Count') ] ) # Count text at end of bars (right-aligned inside bar) text = alt.Chart(stats).mark_text( align='right', baseline='middle', color='white', fontSize=12, dx=-5 # Slight padding from bar end ).encode( x='mean_score:Q', y=alt.Y('Voice:N', sort='-x'), text='count:Q' ) # Combine layers chart = (bars + text).properties( title={ "text": self._process_title(title), "subtitle": [trait_description, "(Numbers on bars indicate respondent count)"] }, width=width or 800, height=height or getattr(self, 'plot_height', 400) ) chart = self._save_plot(chart, title) return chart def plot_speaking_style_correlation( self, style_color: str, style_traits: list[str], data: pl.LazyFrame | pl.DataFrame | None = None, title: str | None = None, width: int | str | None = None, height: int | None = None, ) -> alt.Chart: """Plots correlation between Speaking Style Trait Scores (1-5) and Voice Scale (1-10).""" df = self._ensure_dataframe(data) if title is None: title = f"Speaking style and voice scale 1-10 correlations" trait_correlations = [] # Calculate correlations for i, trait in enumerate(style_traits): subset = df.filter(pl.col("Right_Anchor") == trait) valid_data = subset.select(["score", "Voice_Scale_Score"]).drop_nulls() if valid_data.height > 1: corr_val = valid_data.select(pl.corr("score", "Voice_Scale_Score")).item() # Wrap trait text at '|' for display trait_display = trait.replace('|', '\n') trait_correlations.append({ "trait_display": trait_display, "trait_index": f"Trait {i+1}", "correlation": corr_val if corr_val is not None else 0.0 }) if not trait_correlations: return alt.Chart(pd.DataFrame({'text': [f"No data for {style_color} Style"]})).mark_text().encode(text='text:N') plot_df = pl.DataFrame(trait_correlations).to_pandas() # Conditional color based on sign chart = alt.Chart(plot_df).mark_bar().encode( x=alt.X('trait_display:N', title=None, axis=alt.Axis(labelAngle=0)), y=alt.Y('correlation:Q', title='Correlation', scale=alt.Scale(domain=[-1, 1])), color=alt.condition( alt.datum.correlation >= 0, alt.value('green'), alt.value('red') ), tooltip=[ alt.Tooltip('trait_display:N', title='Trait'), alt.Tooltip('correlation:Q', format='.2f') ] ).properties( title=self._process_title(title), width=width or 800, height=height or 350 ) chart = self._save_plot(chart, title) return chart def plot_demographic_distribution( self, column: str, data: pl.LazyFrame | pl.DataFrame | None = None, title: str | None = None, height: int | None = None, width: int | str | None = None, show_counts: bool = True, ) -> alt.Chart: """Create a horizontal bar chart showing the distribution of respondents by a demographic column. Designed to be compact so multiple charts (approx. 6) can fit on one slide. Uses horizontal bars for better readability with many categories. Parameters: column: The column name to analyze (e.g., 'Age', 'Gender', 'Race/Ethnicity'). data: Optional DataFrame. If None, uses self.data_filtered. title: Chart title. If None, auto-generates based on column name. height: Chart height in pixels (default: auto-sized based on categories). width: Chart width in pixels (default: 280 for compact layout). show_counts: If True, display count labels on the bars. Returns: alt.Chart: An Altair horizontal bar chart showing the distribution. """ df = self._ensure_dataframe(data) if column not in df.columns: return alt.Chart(pd.DataFrame({'text': [f"Column '{column}' not found"]})).mark_text().encode(text='text:N') # Count values in the column, including nulls stats_df = ( df.select(pl.col(column)) .with_columns(pl.col(column).fill_null("(No Response)")) .group_by(column) .agg(pl.len().alias("count")) .sort("count", descending=True) .to_pandas() ) if stats_df.empty: return alt.Chart(pd.DataFrame({'text': ['No data']})).mark_text().encode(text='text:N') # Calculate percentages total = stats_df['count'].sum() stats_df['percentage'] = (stats_df['count'] / total * 100).round(1) # Generate title if not provided if title is None: clean_col = column.replace('_', ' ').replace('/', ' / ') title = f"Distribution: {clean_col}" # Calculate appropriate height based on number of categories num_categories = len(stats_df) bar_height = 18 # pixels per bar calculated_height = max(120, num_categories * bar_height + 40) # min 120px, +40 for title/padding # Horizontal bar chart - categories on Y axis, counts on X axis bars = alt.Chart(stats_df).mark_bar(color=ColorPalette.PRIMARY).encode( x=alt.X('count:Q', title='Count', axis=alt.Axis(grid=False)), y=alt.Y(f'{column}:N', title=None, sort='-x', axis=alt.Axis(labelLimit=150)), tooltip=[ alt.Tooltip(f'{column}:N', title=column.replace('_', ' ')), alt.Tooltip('count:Q', title='Count'), alt.Tooltip('percentage:Q', title='Percentage', format='.1f') ] ) # Add count labels at end of bars if show_counts: text = alt.Chart(stats_df).mark_text( align='left', baseline='middle', dx=3, # Offset from bar end fontSize=9, color=ColorPalette.TEXT ).encode( x='count:Q', y=alt.Y(f'{column}:N', sort='-x'), text='count:Q' ) chart = (bars + text) else: chart = bars # Compact dimensions for 6-per-slide layout chart = chart.properties( title=self._process_title(title), width=width or 200, height=height or calculated_height ) chart = self._save_plot(chart, title) return chart def plot_speaking_style_ranking_correlation( self, style_color: str, style_traits: list[str], data: pl.LazyFrame | pl.DataFrame | None = None, title: str | None = None, width: int | str | None = None, height: int | None = None, ) -> alt.Chart: """Plots correlation between Speaking Style Trait Scores (1-5) and Voice Ranking Points (0-3).""" df = self._ensure_dataframe(data) if title is None: title = f"Speaking style {style_color} and voice ranking points correlations" trait_correlations = [] for i, trait in enumerate(style_traits): subset = df.filter(pl.col("Right_Anchor") == trait) valid_data = subset.select(["score", "Ranking_Points"]).drop_nulls() if valid_data.height > 1: corr_val = valid_data.select(pl.corr("score", "Ranking_Points")).item() trait_display = trait.replace('|', '\n') trait_correlations.append({ "trait_display": trait_display, "trait_index": f"Trait {i+1}", "correlation": corr_val if corr_val is not None else 0.0 }) if not trait_correlations: return alt.Chart(pd.DataFrame({'text': [f"No data for {style_color} Style"]})).mark_text().encode(text='text:N') plot_df = pl.DataFrame(trait_correlations).to_pandas() chart = alt.Chart(plot_df).mark_bar().encode( x=alt.X('trait_display:N', title=None, axis=alt.Axis(labelAngle=0)), y=alt.Y('correlation:Q', title='Correlation', scale=alt.Scale(domain=[-1, 1])), color=alt.condition( alt.datum.correlation >= 0, alt.value('green'), alt.value('red') ), tooltip=[ alt.Tooltip('trait_display:N', title='Trait'), alt.Tooltip('correlation:Q', format='.2f') ] ).properties( title=self._process_title(title), width=width or 800, height=height or 350 ) chart = self._save_plot(chart, title) return chart def plot_traits_wordcloud( self, data: pl.LazyFrame | pl.DataFrame | None = None, column: str = 'Top_3_Traits', title: str = "Most Prominent Personality Traits", width: int = 1600, height: int = 800, background_color: str = 'white', random_state: int = 23, ): """Create a word cloud visualization of personality traits from survey data. Args: data: Polars DataFrame or LazyFrame containing trait data column: Name of column containing comma-separated traits title: Title for the word cloud width: Width of the word cloud image in pixels height: Height of the word cloud image in pixels background_color: Background color for the word cloud random_state: Random seed for reproducible word cloud generation (default: 23) Returns: matplotlib.figure.Figure: The word cloud figure for display in notebooks """ import matplotlib.pyplot as plt from wordcloud import WordCloud from collections import Counter import random df = self._ensure_dataframe(data) # Extract and split traits traits_list = [] for row in df[column].drop_nulls(): # Split by comma and clean whitespace traits = [trait.strip() for trait in row.split(',')] traits_list.extend(traits) # Create frequency dictionary trait_freq = Counter(traits_list) # Set random seed for color selection random.seed(random_state) # Color function using JPMC colors def color_func(word, font_size, position, orientation, random_state=None, **kwargs): colors = [ ColorPalette.PRIMARY, ColorPalette.RANK_1, ColorPalette.RANK_2, ColorPalette.RANK_3, ] return random.choice(colors) # Generate word cloud wordcloud = WordCloud( width=width, height=height, background_color=background_color, color_func=color_func, relative_scaling=0.5, min_font_size=10, prefer_horizontal=0.7, collocations=False, # Treat each word independently random_state=random_state # Seed for reproducible layout ).generate_from_frequencies(trait_freq) # Create matplotlib figure fig, ax = plt.subplots(figsize=(width/100, height/100), dpi=100) ax.imshow(wordcloud, interpolation='bilinear') ax.axis('off') ax.set_title(title, fontsize=16, pad=20, color=ColorPalette.TEXT) plt.tight_layout(pad=0) # Save figure if directory specified (using same pattern as other plots) if hasattr(self, 'fig_save_dir') and self.fig_save_dir: save_path = Path(self.fig_save_dir) # Add filter slug subfolder filter_slug = self._get_filter_slug() save_path = save_path / filter_slug if not save_path.exists(): save_path.mkdir(parents=True, exist_ok=True) # Use _sanitize_filename for consistency filename = f"{self._sanitize_filename(title)}.png" filepath = save_path / filename # Save as PNG at high resolution fig.savefig(filepath, dpi=300, bbox_inches='tight', facecolor='white') print(f"Word cloud saved to: {filepath}") return fig