demographics section done

This commit is contained in:
2026-02-02 09:04:29 +01:00
parent 6b3fcb2f43
commit d770645d8e
6 changed files with 265 additions and 14 deletions

View File

@@ -22,7 +22,6 @@ def _():
initial_path="./data/exports", multiple=False, restrict_navigation=True, filetypes=[".csv"], label="Select 'Labels' File" initial_path="./data/exports", multiple=False, restrict_navigation=True, filetypes=[".csv"], label="Select 'Labels' File"
) )
file_browser file_browser
return (file_browser,) return (file_browser,)
@@ -117,7 +116,7 @@ def _(data_validated):
data = data_validated data = data_validated
data.collect() data.collect()
return return (data,)
@app.cell(hide_code=True) @app.cell(hide_code=True)
@@ -130,6 +129,81 @@ def _():
return return
@app.cell
def _(S, data):
    # get_demographics() returns an indexable result; its first element is
    # materialised with .collect() and shown as the cell output.
    _demographics_query = S.get_demographics(data)[0]
    demographics = _demographics_query.collect()
    demographics
    return (demographics,)
@app.cell(hide_code=True)
def _():
    # Section heading for the cells that check which respondents lack a 'Consumer' value.
    mo.md(r"""
    ## Lucia confirmation missing 'Consumer' data
    """)
    return
@app.cell
def _(demographics):
    # Record IDs of every respondent whose 'Consumer' value is null.
    _consumer_is_missing = pl.col('Consumer').is_null()
    _rows_without_consumer = demographics.filter(_consumer_is_missing)
    demographics_no_consumer = _rows_without_consumer['_recordId'].to_list()
    return (demographics_no_consumer,)
@app.cell
def _(data_all, demographics_no_consumer):
    # Sanity check of Lucia's statement: every response that lacks a
    # 'Consumer' type must have answered QID4 with 'Yes' (business owner).
    _responses_without_consumer = data_all.filter(
        pl.col('_recordId').is_in(demographics_no_consumer)
    ).collect()
    _answered_yes = _responses_without_consumer['QID4'] == 'Yes'
    assert all(_answered_yes), "Not all respondents with missing 'Consumer' are business owners."
    return
@app.cell
def _(data_all):
    # Converse check: respondents who answered QID4 with 'Yes' (business
    # owners) must have no 'Consumer' value at all.
    _business_owners = data_all.filter(pl.col('QID4') == 'Yes').collect()
    _distinct_consumer_values = _business_owners['Consumer'].unique()
    assert all(_value is None for _value in _distinct_consumer_values), "Not all business owners are missing 'Consumer type' in demographics."
    return
@app.cell(hide_code=True)
def _():
    # Section heading for the demographic distribution plots that follow.
    mo.md(r"""
    ## Demographic Distributions
    """)
    return
@app.cell
def _():
    # Demographic columns to plot, one chart per entry.
    # 'Bussiness_Owner' is spelled this way on purpose: it is used as the
    # column name, and the plotting cell corrects the spelling only for the
    # chart title. 'Race/Ethnicity' is currently excluded from the plots.
    demo_plot_cols = ['Age', 'Gender', 'Bussiness_Owner', 'Consumer']
    return (demo_plot_cols,)
@app.cell
def _(S, demo_plot_cols, demographics):
    # Render one distribution chart per demographic column and show them all
    # in a single markdown output.
    #
    # Every temporary here is underscore-prefixed so it stays local to this
    # cell. A bare loop variable would become a notebook-wide definition and
    # clash with any other cell that defines the same name.
    _content = "\n## Demographic Distributions\n"
    for _col in demo_plot_cols:
        # Correct the misspelled column name for display purposes only.
        _label = _col.replace('Bussiness', 'Business').replace('_', ' ')
        _fig = S.plot_demographic_distribution(
            data=demographics,
            column=_col,
            title=f"{_label} Distribution of Survey Respondents",
        )
        _content += f"{mo.ui.altair_chart(_fig)}\n\n"
    mo.md(_content)
    return
@app.cell(hide_code=True) @app.cell(hide_code=True)
def _(): def _():
mo.md(r""" mo.md(r"""

70
04_PPTX_Update_Images.py Normal file
View File

@@ -0,0 +1,70 @@
import marimo
__generated_with = "0.19.2"
app = marimo.App(width="medium")
with app.setup:
import marimo as mo
from pathlib import Path
import utils
@app.cell
def _():
    # Section heading for the alt-text tagging step below.
    mo.md(r"""
    # Tag existing images with Alt-Text
    Based on image content
    """)
    return
@app.cell
def _():
    # Input deck, tagged output deck, and the image directory passed to the
    # alt-text tagging call.
    _reports_dir = Path('data/reports')
    TAG_SOURCE = _reports_dir / 'Perception-Research-Report.pptx'
    TAG_TARGET = _reports_dir / 'Perception-Research-Report_tagged.pptx'
    TAG_IMAGE_DIR = Path('figures/OneDrive_2026-01-28/')
    return TAG_IMAGE_DIR, TAG_SOURCE, TAG_TARGET
@app.cell
def _(TAG_IMAGE_DIR, TAG_SOURCE, TAG_TARGET):
    # Run the alt-text tagging helper on the source deck; the result path is
    # passed as output_path.
    utils.update_ppt_alt_text(
        ppt_path=TAG_SOURCE,
        image_source_dir=TAG_IMAGE_DIR,
        output_path=TAG_TARGET,
    )
    return
@app.cell(hide_code=True)
def _():
    # Section heading for the image replacement step below.
    mo.md(r"""
    # Replace Images using Alt-Text
    """)
    return
@app.cell
def _():
    # Test decks for the replace-by-alt-text step: input and output file.
    _data_dir = Path('data')
    REPLACE_SOURCE = _data_dir / 'test_replace_source.pptx'
    REPLACE_TARGET = _data_dir / 'test_replace_target.pptx'
    return REPLACE_SOURCE, REPLACE_TARGET
@app.cell
def _():
    # Image used by the replacement cell below. The Path(...) call was
    # missing its closing parenthesis, so marimo stored the cell as
    # unparsable and IMAGE_FILE was never defined.
    IMAGE_FILE = Path('figures/OneDrive_2026-01-28/Cons-Early_Professional/cold_distant_approachable_familiar_warm.png')
    return (IMAGE_FILE,)
@app.cell
def _(IMAGE_FILE, REPLACE_SOURCE, REPLACE_TARGET):
    # Derive the alt-text tag from the image path, then swap the picture
    # carrying that tag in the source deck and save the result.
    _alt_text_tag = utils.image_alt_text_generator(IMAGE_FILE)
    utils.pptx_replace_named_image(
        presentation_path=REPLACE_SOURCE,
        target_tag=_alt_text_tag,
        new_image_path=IMAGE_FILE,
        save_path=REPLACE_TARGET,
    )
    return
if __name__ == "__main__":
app.run()

View File

@@ -42,14 +42,6 @@ def _(survey):
return return
app._unparsable_cell(
r"""
data.
""",
name="_"
)
@app.cell @app.cell
def _(mo): def _(mo):
mo.md(r""" mo.md(r"""

View File

@@ -1,6 +1,7 @@
"""Plotting functions for Voice Branding analysis using Altair.""" """Plotting functions for Voice Branding analysis using Altair."""
import re import re
import math
from pathlib import Path from pathlib import Path
import altair as alt import altair as alt
@@ -728,8 +729,6 @@ class JPMCPlotsMixin:
}, },
width=width or 800, width=width or 800,
height=height or getattr(self, 'plot_height', 400) height=height or getattr(self, 'plot_height', 400)
).configure_view(
strokeWidth=0 # Remove frame which might obscure labels
) )
chart = self._save_plot(chart, title) chart = self._save_plot(chart, title)
@@ -794,6 +793,101 @@ class JPMCPlotsMixin:
chart = self._save_plot(chart, title) chart = self._save_plot(chart, title)
return chart return chart
def plot_demographic_distribution(
    self,
    column: str,
    data: pl.LazyFrame | pl.DataFrame | None = None,
    title: str | None = None,
    height: int | None = None,
    width: int | str | None = None,
    show_counts: bool = True,
) -> alt.Chart:
    """Create a horizontal bar chart showing the distribution of respondents by a demographic column.

    Designed to be compact so multiple charts (approx. 6) can fit on one slide.
    Uses horizontal bars for better readability with many categories.

    Parameters:
        column: The column name to analyze (e.g., 'Age', 'Gender', 'Race/Ethnicity').
        data: Optional LazyFrame/DataFrame. Passed to ``self._ensure_dataframe``,
            which decides what is used when this is None.
        title: Chart title. If None, auto-generates based on column name.
        height: Chart height in pixels (default: auto-sized based on categories,
            18px per bar plus 40px padding, with a minimum of 120px).
        width: Chart width in pixels (default: 200 for compact layout).
        show_counts: If True, display count labels at the end of the bars.

    Returns:
        An Altair chart showing the distribution: the bar chart, layered with
        the count labels when ``show_counts`` is True. If the column is missing
        or there are no rows, a text-only placeholder chart is returned and
        nothing is saved.
    """
    df = self._ensure_dataframe(data)

    if column not in df.columns:
        return alt.Chart(pd.DataFrame({'text': [f"Column '{column}' not found"]})).mark_text().encode(text='text:N')

    # Count values in the column, including nulls.
    # Cast to string first so the "(No Response)" placeholder can also be
    # filled into non-string columns (numeric, boolean, categorical).
    stats_df = (
        df.select(pl.col(column).cast(pl.Utf8).fill_null("(No Response)"))
        .group_by(column)
        .agg(pl.len().alias("count"))
        .sort("count", descending=True)
        .to_pandas()
    )

    if stats_df.empty:
        return alt.Chart(pd.DataFrame({'text': ['No data']})).mark_text().encode(text='text:N')

    # Calculate percentages (shown in the tooltip only)
    total = stats_df['count'].sum()
    stats_df['percentage'] = (stats_df['count'] / total * 100).round(1)

    # Generate title if not provided
    if title is None:
        clean_col = column.replace('_', ' ').replace('/', ' / ')
        title = f"Distribution: {clean_col}"

    # Calculate appropriate height based on number of categories
    num_categories = len(stats_df)
    bar_height = 18  # pixels per bar
    calculated_height = max(120, num_categories * bar_height + 40)  # min 120px, +40 for title/padding

    # Horizontal bar chart - categories on Y axis, counts on X axis
    bars = alt.Chart(stats_df).mark_bar(color=ColorPalette.PRIMARY).encode(
        x=alt.X('count:Q', title='Count', axis=alt.Axis(grid=False)),
        y=alt.Y(f'{column}:N', title=None, sort='-x', axis=alt.Axis(labelLimit=150)),
        tooltip=[
            alt.Tooltip(f'{column}:N', title=column.replace('_', ' ')),
            alt.Tooltip('count:Q', title='Count'),
            alt.Tooltip('percentage:Q', title='Percentage', format='.1f')
        ]
    )

    # Add count labels at end of bars
    if show_counts:
        text = alt.Chart(stats_df).mark_text(
            align='left',
            baseline='middle',
            dx=3,  # Offset from bar end
            fontSize=9,
            color=ColorPalette.TEXT
        ).encode(
            x='count:Q',
            # Same sort as the bar layer so labels line up with their bars
            y=alt.Y(f'{column}:N', sort='-x'),
            text='count:Q'
        )
        chart = (bars + text)
    else:
        chart = bars

    # Compact dimensions for 6-per-slide layout
    chart = chart.properties(
        title=self._process_title(title),
        width=width or 200,
        height=height or calculated_height
    )

    chart = self._save_plot(chart, title)
    return chart
def plot_speaking_style_ranking_correlation( def plot_speaking_style_ranking_correlation(
self, self,
style_color: str, style_color: str,

View File

@@ -24,6 +24,20 @@ class ColorPalette:
GRID = "lightgray" GRID = "lightgray"
BACKGROUND = "white" BACKGROUND = "white"
# Extended palette for categorical charts (e.g., pie charts with many categories)
CATEGORICAL = [
"#0077B6", # PRIMARY - Medium Blue
"#004C6D", # RANK_1 - Dark Blue
"#008493", # RANK_2 - Teal
"#5AAE95", # RANK_3 - Sea Green
"#9E9E9E", # RANK_4 - Grey
"#D3D3D3", # NEUTRAL - Light Grey
"#003049", # Dark Navy
"#669BBC", # Light Steel Blue
"#A8DADC", # Pale Cyan
"#457B9D", # Steel Blue
]
def jpmc_altair_theme(): def jpmc_altair_theme():
"""JPMC brand theme for Altair charts.""" """JPMC brand theme for Altair charts."""

View File

@@ -13,8 +13,12 @@ from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE from pptx.enum.shapes import MSO_SHAPE_TYPE
def image_alt_text_generator(fpath): def image_alt_text_generator(fpath, include_dataset_dirname=False) -> str:
"""convert image file path to alt text """convert image file path to alt text
Args:
fpath (str or Path): path to image file, must start with 'figures/'
include_dataset_dirname (bool): whether to include the dataset directory name in the alt text. Recommended to keep False, so that the images do not get tied to a specific dataset export. (Defeats the purpose of assigning alt text to be able to update images when new datasets are exported.)
""" """
if not isinstance(fpath, Path): if not isinstance(fpath, Path):
@@ -23,7 +27,10 @@ def image_alt_text_generator(fpath):
fparts = fpath.parts fparts = fpath.parts
assert fparts[0] == 'figures', "Image file path must start with 'figures'" assert fparts[0] == 'figures', "Image file path must start with 'figures'"
return Path('/'.join(fparts[2:])).as_posix() if include_dataset_dirname:
return Path('/'.join(fparts[1:])).as_posix()
else:
return Path('/'.join(fparts[2:])).as_posix()
def pptx_replace_named_image(presentation_path, target_tag, new_image_path, save_path): def pptx_replace_named_image(presentation_path, target_tag, new_image_path, save_path):
""" """