demographics section done

2026-02-02 09:04:29 +01:00
parent 6b3fcb2f43
commit d770645d8e
6 changed files with 265 additions and 14 deletions
--- a/03_quant_report.py
+++ b/03_quant_report.py
@@ -22,7 +22,6 @@ def _():
        initial_path="./data/exports", multiple=False, restrict_navigation=True, filetypes=[".csv"], label="Select 'Labels' File"
    )
    file_browser
-
    return (file_browser,)


@@ -117,7 +116,7 @@ def _(data_validated):
    data = data_validated

    data.collect()
-    return
+    return (data,)


@app.cell(hide_code=True)
@@ -130,6 +129,81 @@ def _():
    return


+@app.cell
+def _(S, data):
+    demographics = S.get_demographics(data)[0].collect()
+    demographics
+    return (demographics,)
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    ## Lucia confirmation missing 'Consumer' data
+    """)
+    return
+
+
+@app.cell
+def _(demographics):
+    # Demographics where 'Consumer' is null
+    demographics_no_consumer = demographics.filter(pl.col('Consumer').is_null())['_recordId'].to_list()
+    # demographics_no_consumer
+    return (demographics_no_consumer,)
+
+
+@app.cell
+def _(data_all, demographics_no_consumer):
+    # check if the responses with missing 'Consumer type' in demographics are all business owners as Lucia mentioned
+    assert all(data_all.filter(pl.col('_recordId').is_in(demographics_no_consumer)).collect()['QID4'] == 'Yes'), "Not all respondents with missing 'Consumer' are business owners."
+    return
+
+
+@app.cell
+def _(data_all):
+    # Check if all business owners are missing a 'Consumer type' in demographics
+    assert all([a is None for a in data_all.filter(pl.col('QID4') == 'Yes').collect()['Consumer'].unique()]) , "Not all business owners are missing 'Consumer type' in demographics."
+    return
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    ## Demographic Distributions
+    """)
+    return
+
+
+@app.cell
+def _():
+    demo_plot_cols = [
+        'Age',
+        'Gender',
+        # 'Race/Ethnicity',
+        'Bussiness_Owner',
+        'Consumer'
+    ]
+    return (demo_plot_cols,)
+
+
+@app.cell
+def _(S, demo_plot_cols, demographics):
+    # Render one distribution chart per demographic column under a shared heading.
+    # Parts are kept unindented and joined with blank lines so the heading is not
+    # left with a 4-space indent (a Markdown code block) next to the chart HTML.
+    _parts = ["## Demographic Distributions"]
+    for c in demo_plot_cols:
+        _fig = S.plot_demographic_distribution(
+            data=demographics,
+            column=c,
+            title=f"{c.replace('Bussiness', 'Business').replace('_', ' ')} Distribution of Survey Respondents"
+        )
+        _parts.append(f"{mo.ui.altair_chart(_fig)}")
+
+    mo.md("\n\n".join(_parts))
+    return
+
+
@app.cell(hide_code=True)
 def _():
    mo.md(r"""
--- a/04_PPTX_Update_Images.py
+++ b/04_PPTX_Update_Images.py
@@ -0,0 +1,70 @@
+import marimo
+
+__generated_with = "0.19.2"
+app = marimo.App(width="medium")
+
+with app.setup:
+    import marimo as mo
+    from pathlib import Path
+    import utils
+
+
+@app.cell
+def _():
+    mo.md(r"""
+    # Tag existing images with Alt-Text
+
+    Based on image content
+    """)
+    return
+
+
+@app.cell
+def _():
+    TAG_SOURCE = Path('data/reports/Perception-Research-Report.pptx')
+    TAG_TARGET = Path('data/reports/Perception-Research-Report_tagged.pptx')
+    TAG_IMAGE_DIR = Path('figures/OneDrive_2026-01-28/')
+    return TAG_IMAGE_DIR, TAG_SOURCE, TAG_TARGET
+
+
+@app.cell
+def _(TAG_IMAGE_DIR, TAG_SOURCE, TAG_TARGET):
+    utils.update_ppt_alt_text(ppt_path=TAG_SOURCE, image_source_dir=TAG_IMAGE_DIR, output_path=TAG_TARGET)
+    return
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    # Replace Images using Alt-Text
+    """)
+    return
+
+
+@app.cell
+def _():
+    REPLACE_SOURCE = Path('data/test_replace_source.pptx')
+    REPLACE_TARGET = Path('data/test_replace_target.pptx')
+    return REPLACE_SOURCE, REPLACE_TARGET
+
+
+@app.cell
+def _():
+    # Image used to exercise the alt-text based replacement below. This cell was
+    # saved as unparsable because the Path(...) call lacked its closing parenthesis.
+    IMAGE_FILE = Path('figures/OneDrive_2026-01-28/Cons-Early_Professional/cold_distant_approachable_familiar_warm.png')
+    return (IMAGE_FILE,)
+
+
+@app.cell
+def _(IMAGE_FILE, REPLACE_SOURCE, REPLACE_TARGET):
+    utils.pptx_replace_named_image(
+        presentation_path=REPLACE_SOURCE,
+        target_tag=utils.image_alt_text_generator(IMAGE_FILE),
+        new_image_path=IMAGE_FILE,
+        save_path=REPLACE_TARGET)
+    return
+
+
+if __name__ == "__main__":
+    app.run()
--- a/99_example_ingest_qualtrics_export.py
+++ b/99_example_ingest_qualtrics_export.py
@@ -42,14 +42,6 @@ def _(survey):
    return


-app._unparsable_cell(
-    r"""
-    data.
-    """,
-    name="_"
-)
-
-
@app.cell
 def _(mo):
    mo.md(r"""
--- a/plots.py
+++ b/plots.py
@@ -1,6 +1,7 @@
 """Plotting functions for Voice Branding analysis using Altair."""

 import re
+import math
 from pathlib import Path

 import altair as alt
@@ -728,8 +729,6 @@ class JPMCPlotsMixin:
            },
            width=width or 800,
            height=height or getattr(self, 'plot_height', 400)
-        ).configure_view(
-            strokeWidth=0  # Remove frame which might obscure labels
        )

        chart = self._save_plot(chart, title)
@@ -794,6 +793,101 @@ class JPMCPlotsMixin:
        chart = self._save_plot(chart, title)
        return chart

+    def plot_demographic_distribution(
+        self,
+        column: str,
+        data: pl.LazyFrame | pl.DataFrame | None = None,
+        title: str | None = None,
+        height: int | None = None,
+        width: int | str | None = None,
+        show_counts: bool = True,
+    ) -> alt.Chart:
+        """Create a horizontal bar chart showing the distribution of respondents by a demographic column.
+
+        Designed to be compact so multiple charts (approx. 6) can fit on one slide.
+        Uses horizontal bars for better readability with many categories.
+
+        Parameters:
+            column: The column name to analyze (e.g., 'Age', 'Gender', 'Race/Ethnicity').
+            data: Optional DataFrame. If None, uses self.data_filtered.
+            title: Chart title. If None, auto-generates based on column name.
+            height: Chart height in pixels (default: auto-sized based on categories).
+            width: Chart width in pixels (default: 200 for compact layout).
+            show_counts: If True, display count labels on the bars.
+
+        Returns:
+            alt.Chart: An Altair horizontal bar chart showing the distribution.
+        """
+        df = self._ensure_dataframe(data)
+
+        if column not in df.columns:
+            return alt.Chart(pd.DataFrame({'text': [f"Column '{column}' not found"]})).mark_text().encode(text='text:N')
+
+        # Count values in the column, including nulls
+        stats_df = (
+            df.select(pl.col(column))
+            .with_columns(pl.col(column).fill_null("(No Response)"))
+            .group_by(column)
+            .agg(pl.len().alias("count"))
+            .sort("count", descending=True)
+            .to_pandas()
+        )
+
+        if stats_df.empty:
+            return alt.Chart(pd.DataFrame({'text': ['No data']})).mark_text().encode(text='text:N')
+
+        # Calculate percentages
+        total = stats_df['count'].sum()
+        stats_df['percentage'] = (stats_df['count'] / total * 100).round(1)
+
+        # Generate title if not provided
+        if title is None:
+            clean_col = column.replace('_', ' ').replace('/', ' / ')
+            title = f"Distribution: {clean_col}"
+
+        # Calculate appropriate height based on number of categories
+        num_categories = len(stats_df)
+        bar_height = 18  # pixels per bar
+        calculated_height = max(120, num_categories * bar_height + 40)  # min 120px, +40 for title/padding
+
+        # Horizontal bar chart - categories on Y axis, counts on X axis
+        bars = alt.Chart(stats_df).mark_bar(color=ColorPalette.PRIMARY).encode(
+            x=alt.X('count:Q', title='Count', axis=alt.Axis(grid=False)),
+            y=alt.Y(f'{column}:N', title=None, sort='-x', axis=alt.Axis(labelLimit=150)),
+            tooltip=[
+                alt.Tooltip(f'{column}:N', title=column.replace('_', ' ')),
+                alt.Tooltip('count:Q', title='Count'),
+                alt.Tooltip('percentage:Q', title='Percentage', format='.1f')
+            ]
+        )
+
+        # Add count labels at end of bars
+        if show_counts:
+            text = alt.Chart(stats_df).mark_text(
+                align='left',
+                baseline='middle',
+                dx=3,  # Offset from bar end
+                fontSize=9,
+                color=ColorPalette.TEXT
+            ).encode(
+                x='count:Q',
+                y=alt.Y(f'{column}:N', sort='-x'),
+                text='count:Q'
+            )
+            chart = (bars + text)
+        else:
+            chart = bars
+
+        # Compact dimensions for 6-per-slide layout
+        chart = chart.properties(
+            title=self._process_title(title),
+            width=width or 200,
+            height=height or calculated_height
+        )
+
+        chart = self._save_plot(chart, title)
+        return chart
+
    def plot_speaking_style_ranking_correlation(
        self,
        style_color: str,
--- a/theme.py
+++ b/theme.py
@@ -24,6 +24,20 @@ class ColorPalette:
    GRID = "lightgray"
    BACKGROUND = "white"

+    # Extended palette for categorical charts (e.g., pie charts with many categories)
+    CATEGORICAL = [
+        "#0077B6",  # PRIMARY - Medium Blue
+        "#004C6D",  # RANK_1 - Dark Blue
+        "#008493",  # RANK_2 - Teal
+        "#5AAE95",  # RANK_3 - Sea Green
+        "#9E9E9E",  # RANK_4 - Grey
+        "#D3D3D3",  # NEUTRAL - Light Grey
+        "#003049",  # Dark Navy
+        "#669BBC",  # Light Steel Blue
+        "#A8DADC",  # Pale Cyan
+        "#457B9D",  # Steel Blue
+    ]
+

 def jpmc_altair_theme():
    """JPMC brand theme for Altair charts."""
--- a/utils.py
+++ b/utils.py
@@ -13,8 +13,12 @@ from pptx import Presentation
 from pptx.enum.shapes import MSO_SHAPE_TYPE


-def image_alt_text_generator(fpath):
+def image_alt_text_generator(fpath, include_dataset_dirname=False) -> str:
    """convert image file path to alt text
+    
+    Args:
+        fpath (str or Path): path to image file, must start with 'figures/'
+        include_dataset_dirname (bool): whether to include the dataset directory name in the alt text. Recommended to keep False so that images are not tied to a specific dataset export; including it would defeat the purpose of alt-text tagging, which is to let images be updated when new datasets are exported.
    """

    if not isinstance(fpath, Path):
@@ -23,7 +27,10 @@ def image_alt_text_generator(fpath):
    fparts = fpath.parts
    assert fparts[0] == 'figures', "Image file path must start with 'figures'"

-    return Path('/'.join(fparts[2:])).as_posix()
+    if include_dataset_dirname:
+        return Path('/'.join(fparts[1:])).as_posix()
+    else:
+        return Path('/'.join(fparts[2:])).as_posix()

 def pptx_replace_named_image(presentation_path, target_tag, new_image_path, save_path):
    """