demographics section done

This commit is contained in:
2026-02-02 09:04:29 +01:00
parent 6b3fcb2f43
commit d770645d8e
6 changed files with 265 additions and 14 deletions

View File

@@ -22,7 +22,6 @@ def _():
initial_path="./data/exports", multiple=False, restrict_navigation=True, filetypes=[".csv"], label="Select 'Labels' File"
)
file_browser
return (file_browser,)
@@ -117,7 +116,7 @@ def _(data_validated):
data = data_validated
data.collect()
return
return (data,)
@app.cell(hide_code=True)
@@ -130,6 +129,81 @@ def _():
return
@app.cell
def _(S, data):
    # S.get_demographics returns an indexable result; its first element is
    # collected here so the table can be displayed and reused by later cells.
    _demographics_lazy = S.get_demographics(data)[0]
    demographics = _demographics_lazy.collect()
    demographics
    return (demographics,)
@app.cell(hide_code=True)
def _():
# Section heading for the cells that check the records missing 'Consumer' data.
mo.md(r"""
## Lucia confirmation missing 'Consumer' data
""")
return
@app.cell
def _(demographics):
    # Collect the record IDs of every respondent whose 'Consumer' value is null.
    _missing_consumer = demographics.filter(pl.col('Consumer').is_null())
    demographics_no_consumer = _missing_consumer['_recordId'].to_list()
    return (demographics_no_consumer,)
@app.cell
def _(data_all, demographics_no_consumer):
    # Sanity check of Lucia's statement: every response that lacks a
    # 'Consumer' type in the demographics must have answered QID4 with 'Yes'
    # (i.e. be a business owner).
    _missing_consumer_rows = data_all.filter(
        pl.col('_recordId').is_in(demographics_no_consumer)
    ).collect()
    _is_business_owner = _missing_consumer_rows['QID4'] == 'Yes'
    assert all(_is_business_owner), "Not all respondents with missing 'Consumer' are business owners."
    return
@app.cell
def _(data_all):
    # Converse check: among respondents with QID4 == 'Yes' (business owners),
    # the only distinct 'Consumer' value allowed is null.
    _owner_consumer_values = (
        data_all.filter(pl.col('QID4') == 'Yes').collect()['Consumer'].unique()
    )
    assert all(_value is None for _value in _owner_consumer_values), "Not all business owners are missing 'Consumer type' in demographics."
    return
@app.cell(hide_code=True)
def _():
# Section heading for the demographic distribution charts.
mo.md(r"""
## Demographic Distributions
""")
return
@app.cell
def _():
    # Demographic columns to chart. 'Race/Ethnicity' is currently excluded.
    # 'Bussiness_Owner' is spelled as the column is named in the data; the
    # plotting cell corrects the spelling only for the chart title.
    demo_plot_cols = ['Age', 'Gender', 'Bussiness_Owner', 'Consumer']
    return (demo_plot_cols,)
@app.cell
def _(S, demo_plot_cols, demographics):
    # Build one markdown document: a heading followed by one embedded
    # Altair chart per demographic column.
    _sections = ["\n## Demographic Distributions\n"]
    for c in demo_plot_cols:
        # Human-readable column label for the title (fixes the column's spelling).
        _label = c.replace('Bussiness', 'Business').replace('_', ' ')
        _fig = S.plot_demographic_distribution(
            data=demographics,
            column=c,
            title=f"{_label} Distribution of Survey Respondents",
        )
        _sections.append(f"{mo.ui.altair_chart(_fig)}\n\n")
    mo.md("".join(_sections))
    return
@app.cell(hide_code=True)
def _():
mo.md(r"""

70
04_PPTX_Update_Images.py Normal file
View File

@@ -0,0 +1,70 @@
import marimo
__generated_with = "0.19.2"
app = marimo.App(width="medium")
with app.setup:
import marimo as mo
from pathlib import Path
import utils
@app.cell
def _():
# Section heading: tagging the existing images with alt-text.
mo.md(r"""
# Tag existing images with Alt-Text
Based on image content
""")
return
@app.cell
def _():
    # Paths for the tagging step: the deck to read, the tagged copy to write,
    # and the directory holding the exported figures.
    TAG_IMAGE_DIR = Path('figures/OneDrive_2026-01-28/')
    TAG_SOURCE = Path('data/reports/Perception-Research-Report.pptx')
    TAG_TARGET = Path('data/reports/Perception-Research-Report_tagged.pptx')
    return (TAG_IMAGE_DIR, TAG_SOURCE, TAG_TARGET)
@app.cell
def _(TAG_IMAGE_DIR, TAG_SOURCE, TAG_TARGET):
    # Run the alt-text tagging helper on TAG_SOURCE using the figures in
    # TAG_IMAGE_DIR; the result goes to TAG_TARGET.
    # NOTE(review): exact matching rules live in utils.update_ppt_alt_text.
    utils.update_ppt_alt_text(
        ppt_path=TAG_SOURCE,
        image_source_dir=TAG_IMAGE_DIR,
        output_path=TAG_TARGET,
    )
    return
@app.cell(hide_code=True)
def _():
# Section heading: replacing images that are located via their alt-text.
mo.md(r"""
# Replace Images using Alt-Text
""")
return
@app.cell
def _():
    # Input deck and output location used by the image-replacement step.
    REPLACE_SOURCE, REPLACE_TARGET = (
        Path('data/test_replace_source.pptx'),
        Path('data/test_replace_target.pptx'),
    )
    return (REPLACE_SOURCE, REPLACE_TARGET)
@app.cell
def _():
    # Image used by the replacement cell below. The Path(...) call must be
    # closed: with the parenthesis missing, marimo stores the cell as
    # unparsable and IMAGE_FILE is never defined for downstream cells.
    IMAGE_FILE = Path('figures/OneDrive_2026-01-28/Cons-Early_Professional/cold_distant_approachable_familiar_warm.png')
    return (IMAGE_FILE,)
@app.cell
def _(IMAGE_FILE, REPLACE_SOURCE, REPLACE_TARGET):
    # The tag identifying the picture to swap is derived from the new
    # image's own path, then handed to the replacement helper.
    _alt_tag = utils.image_alt_text_generator(IMAGE_FILE)
    utils.pptx_replace_named_image(
        presentation_path=REPLACE_SOURCE,
        target_tag=_alt_tag,
        new_image_path=IMAGE_FILE,
        save_path=REPLACE_TARGET,
    )
    return
# Run the notebook app when this file is executed directly as a script.
if __name__ == "__main__":
app.run()

View File

@@ -42,14 +42,6 @@ def _(survey):
return
app._unparsable_cell(
r"""
data.
""",
name="_"
)
@app.cell
def _(mo):
mo.md(r"""

View File

@@ -1,6 +1,7 @@
"""Plotting functions for Voice Branding analysis using Altair."""
import re
import math
from pathlib import Path
import altair as alt
@@ -728,8 +729,6 @@ class JPMCPlotsMixin:
},
width=width or 800,
height=height or getattr(self, 'plot_height', 400)
).configure_view(
strokeWidth=0 # Remove frame which might obscure labels
)
chart = self._save_plot(chart, title)
@@ -794,6 +793,101 @@ class JPMCPlotsMixin:
chart = self._save_plot(chart, title)
return chart
def plot_demographic_distribution(
    self,
    column: str,
    data: pl.LazyFrame | pl.DataFrame | None = None,
    title: str | None = None,
    height: int | None = None,
    width: int | str | None = None,
    show_counts: bool = True,
) -> alt.Chart:
    """Create a horizontal bar chart showing the distribution of respondents by a demographic column.

    Designed to be compact so multiple charts (approx. 6) can fit on one slide.
    Uses horizontal bars for better readability with many categories.

    Parameters:
        column: The column name to analyze (e.g., 'Age', 'Gender', 'Race/Ethnicity').
        data: Optional LazyFrame/DataFrame. It is resolved through
            self._ensure_dataframe (presumably falling back to the instance's
            own data when None - confirm against that helper).
        title: Chart title. If None, auto-generated from the column name.
        height: Chart height in pixels (default: 18 px per category + 40 px
            padding, but never less than 120).
        width: Chart width in pixels (default: 200 for compact layout).
        show_counts: If True, display count labels at the end of the bars.

    Returns:
        The chart as returned by self._save_plot: the bars alone, or the bars
        layered with count labels when show_counts is True. When the column
        is missing or there is nothing to count, a text-only placeholder
        chart is returned instead (and nothing is saved).
    """
    df = self._ensure_dataframe(data)

    if column not in df.columns:
        return alt.Chart(pd.DataFrame({'text': [f"Column '{column}' not found"]})).mark_text().encode(text='text:N')

    # Count values in the column, including nulls. The column is cast to
    # string first so the "(No Response)" label can stand in for nulls
    # regardless of the column's dtype; the axis is nominal anyway.
    stats_df = (
        df.select(pl.col(column).cast(pl.Utf8).fill_null("(No Response)"))
        .group_by(column)
        .agg(pl.len().alias("count"))
        .sort("count", descending=True)
        .to_pandas()
    )

    if stats_df.empty:
        return alt.Chart(pd.DataFrame({'text': ['No data']})).mark_text().encode(text='text:N')

    # Calculate percentages (shown in the tooltip only)
    total = stats_df['count'].sum()
    stats_df['percentage'] = (stats_df['count'] / total * 100).round(1)

    # Generate title if not provided
    if title is None:
        clean_col = column.replace('_', ' ').replace('/', ' / ')
        title = f"Distribution: {clean_col}"

    # Calculate appropriate height based on number of categories
    num_categories = len(stats_df)
    bar_height = 18  # pixels per bar
    calculated_height = max(120, num_categories * bar_height + 40)  # min 120px, +40 for title/padding

    # Horizontal bar chart - categories on Y axis, counts on X axis
    bars = alt.Chart(stats_df).mark_bar(color=ColorPalette.PRIMARY).encode(
        x=alt.X('count:Q', title='Count', axis=alt.Axis(grid=False)),
        y=alt.Y(f'{column}:N', title=None, sort='-x', axis=alt.Axis(labelLimit=150)),
        tooltip=[
            alt.Tooltip(f'{column}:N', title=column.replace('_', ' ')),
            alt.Tooltip('count:Q', title='Count'),
            alt.Tooltip('percentage:Q', title='Percentage', format='.1f'),
        ],
    )

    if show_counts:
        # Add count labels at end of bars; the same y sort keeps the labels
        # aligned with their bars.
        text = alt.Chart(stats_df).mark_text(
            align='left',
            baseline='middle',
            dx=3,  # Offset from bar end
            fontSize=9,
            color=ColorPalette.TEXT,
        ).encode(
            x='count:Q',
            y=alt.Y(f'{column}:N', sort='-x'),
            text='count:Q',
        )
        chart = bars + text
    else:
        chart = bars

    # Compact dimensions for 6-per-slide layout
    chart = chart.properties(
        title=self._process_title(title),
        width=width or 200,
        height=height or calculated_height,
    )

    chart = self._save_plot(chart, title)
    return chart
def plot_speaking_style_ranking_correlation(
self,
style_color: str,

View File

@@ -24,6 +24,20 @@ class ColorPalette:
GRID = "lightgray"
BACKGROUND = "white"
# Extended palette for categorical charts (e.g., pie charts with many categories)
# Ten hex colours. Per the inline labels, the first six mirror other named
# palette entries (PRIMARY, RANK_1-RANK_4, NEUTRAL).
# NOTE(review): keep these in sync with those constants if they change - confirm against the class.
CATEGORICAL = [
"#0077B6", # PRIMARY - Medium Blue
"#004C6D", # RANK_1 - Dark Blue
"#008493", # RANK_2 - Teal
"#5AAE95", # RANK_3 - Sea Green
"#9E9E9E", # RANK_4 - Grey
"#D3D3D3", # NEUTRAL - Light Grey
"#003049", # Dark Navy
"#669BBC", # Light Steel Blue
"#A8DADC", # Pale Cyan
"#457B9D", # Steel Blue
]
def jpmc_altair_theme():
"""JPMC brand theme for Altair charts."""

View File

@@ -13,8 +13,12 @@ from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE
def image_alt_text_generator(fpath):
def image_alt_text_generator(fpath, include_dataset_dirname=False) -> str:
"""convert image file path to alt text
Args:
fpath (str or Path): path to image file, must start with 'figures/'
include_dataset_dirname (bool): whether to include the dataset directory name in the alt text. Recommended to keep False, so that the images do not get tied to a specific dataset export. (Defeats the purpose of assigning alt text to be able to update images when new datasets are exported.)
"""
if not isinstance(fpath, Path):
@@ -23,7 +27,10 @@ def image_alt_text_generator(fpath):
fparts = fpath.parts
assert fparts[0] == 'figures', "Image file path must start with 'figures'"
return Path('/'.join(fparts[2:])).as_posix()
if include_dataset_dirname:
return Path('/'.join(fparts[1:])).as_posix()
else:
return Path('/'.join(fparts[2:])).as_posix()
def pptx_replace_named_image(presentation_path, target_tag, new_image_path, save_path):
"""