demographics section done

This commit is contained in:
2026-02-02 09:04:29 +01:00
parent 6b3fcb2f43
commit d770645d8e
6 changed files with 265 additions and 14 deletions

View File

@@ -1,6 +1,7 @@
"""Plotting functions for Voice Branding analysis using Altair."""
import re
import math
from pathlib import Path
import altair as alt
@@ -728,8 +729,6 @@ class JPMCPlotsMixin:
},
width=width or 800,
height=height or getattr(self, 'plot_height', 400)
).configure_view(
strokeWidth=0 # Remove frame which might obscure labels
)
chart = self._save_plot(chart, title)
@@ -794,6 +793,101 @@ class JPMCPlotsMixin:
chart = self._save_plot(chart, title)
return chart
def plot_demographic_distribution(
    self,
    column: str,
    data: pl.LazyFrame | pl.DataFrame | None = None,
    title: str | None = None,
    height: int | None = None,
    width: int | str | None = None,
    show_counts: bool = True,
) -> alt.Chart:
    """Create a horizontal bar chart of respondent counts for a demographic column.

    Designed to be compact so multiple charts (approx. 6) can fit on one
    slide. Categories are placed on the Y axis and sorted by descending
    count; bars are horizontal so long category labels stay readable.

    Null values are counted as their own category, labelled
    "(No Response)". All category values are converted to strings before
    counting, so numeric or boolean columns are charted the same way as
    text columns.

    Parameters:
        column: The column name to analyze (e.g., 'Age', 'Gender', 'Race/Ethnicity').
        data: Optional frame to analyze. It is resolved through
            ``self._ensure_dataframe``; NOTE(review): the fallback used when
            this is None (presumably ``self.data_filtered``) lives in that
            helper -- confirm there.
        title: Chart title. If None, one is generated from the column name
            ("Distribution: <column>", with '_' replaced by a space and
            '/' padded with spaces).
        height: Chart height in pixels. If not given, it is computed as
            18 px per category plus 40 px of padding, with a 120 px minimum.
        width: Chart width in pixels (default: 200 for compact layout).
        show_counts: If True, display count labels at the end of the bars.

    Returns:
        alt.Chart: The bar chart (layered with a text layer when
        ``show_counts`` is True), as returned by ``self._save_plot``.
        If ``column`` is missing or there are no rows to count, a
        text-only placeholder chart is returned instead and nothing is
        passed to ``self._save_plot``.
    """
    df = self._ensure_dataframe(data)

    # Guard: unknown column -> placeholder chart carrying the message.
    if column not in df.columns:
        return alt.Chart(pd.DataFrame({'text': [f"Column '{column}' not found"]})).mark_text().encode(text='text:N')

    # Count values in the column, including nulls.
    # Cast to string *before* filling nulls: filling a numeric/boolean column
    # with a string placeholder is dtype-dependent in Polars, whereas a string
    # column always accepts it. The chart treats the field as nominal anyway.
    stats_df = (
        df.select(pl.col(column).cast(pl.Utf8).fill_null("(No Response)"))
        .group_by(column)
        .agg(pl.len().alias("count"))
        .sort("count", descending=True)
        .to_pandas()
    )

    # Guard: nothing to count -> placeholder chart.
    if stats_df.empty:
        return alt.Chart(pd.DataFrame({'text': ['No data']})).mark_text().encode(text='text:N')

    # Share of respondents per category, in percent (one decimal);
    # only surfaced through the tooltip.
    total = stats_df['count'].sum()
    stats_df['percentage'] = (stats_df['count'] / total * 100).round(1)

    # Generate title if not provided
    if title is None:
        clean_col = column.replace('_', ' ').replace('/', ' / ')
        title = f"Distribution: {clean_col}"

    # Scale the default height with the number of categories so bars keep
    # a constant thickness regardless of how many categories there are.
    num_categories = len(stats_df)
    bar_height = 18  # pixels per bar
    calculated_height = max(120, num_categories * bar_height + 40)  # min 120px, +40 for title/padding

    # Horizontal bar chart - categories on Y axis, counts on X axis
    bars = alt.Chart(stats_df).mark_bar(color=ColorPalette.PRIMARY).encode(
        x=alt.X('count:Q', title='Count', axis=alt.Axis(grid=False)),
        y=alt.Y(f'{column}:N', title=None, sort='-x', axis=alt.Axis(labelLimit=150)),
        tooltip=[
            alt.Tooltip(f'{column}:N', title=column.replace('_', ' ')),
            alt.Tooltip('count:Q', title='Count'),
            alt.Tooltip('percentage:Q', title='Percentage', format='.1f'),
        ],
    )

    if show_counts:
        # Count labels at the end of the bars; the Y encoding repeats the
        # same sort as the bar layer so both layers agree on category order.
        text = alt.Chart(stats_df).mark_text(
            align='left',
            baseline='middle',
            dx=3,  # Offset from bar end
            fontSize=9,
            color=ColorPalette.TEXT,
        ).encode(
            x='count:Q',
            y=alt.Y(f'{column}:N', sort='-x'),
            text='count:Q',
        )
        chart = (bars + text)
    else:
        chart = bars

    # Compact dimensions for 6-per-slide layout
    chart = chart.properties(
        title=self._process_title(title),
        width=width or 200,
        height=height or calculated_height,
    )

    chart = self._save_plot(chart, title)
    return chart
def plot_speaking_style_ranking_correlation(
self,
style_color: str,