demographics section done

2026-02-02 09:04:29 +01:00
parent 6b3fcb2f43
commit d770645d8e
6 changed files with 265 additions and 14 deletions
--- a/03_quant_report.py
+++ b/03_quant_report.py
@@ -22,7 +22,6 @@ def _():
        initial_path="./data/exports", multiple=False, restrict_navigation=True, filetypes=[".csv"], label="Select 'Labels' File"
    )
    file_browser
    return (file_browser,)
@@ -117,7 +116,7 @@ def _(data_validated):
    data = data_validated
    data.collect()
-    return
+    return (data,)
@app.cell(hide_code=True)
@@ -130,6 +129,81 @@ def _():
    return
@app.cell
 def _(S, data):
    # Take the first element of what S.get_demographics(data) returns and
    # materialize it with .collect().
    demographics = S.get_demographics(data)[0].collect()
    # Bare expression kept as the last statement before the return
    # (presumably so marimo renders the frame as the cell output).
    demographics
    return (demographics,)
@app.cell(hide_code=True)
 def _():
    # Markdown heading for the checks on respondents with no 'Consumer' value.
    mo.md(r"""
    ## Lucia confirmation missing 'Consumer' data
    """)
    return
@app.cell
 def _(demographics):
    # Collect the record IDs of every respondent whose 'Consumer' value is null.
    _missing_consumer_rows = demographics.filter(pl.col('Consumer').is_null())
    demographics_no_consumer = _missing_consumer_rows['_recordId'].to_list()
    # To inspect the list, add `demographics_no_consumer` as the last expression.
    return (demographics_no_consumer,)
@app.cell
 def _(data_all, demographics_no_consumer):
    # Sanity check for Lucia's statement: every response whose 'Consumer' value
    # is missing in the demographics must belong to a business owner (QID4 == 'Yes').
    _missing_consumer_responses = data_all.filter(
        pl.col('_recordId').is_in(demographics_no_consumer)
    ).collect()
    _is_business_owner = _missing_consumer_responses['QID4'] == 'Yes'
    # Python-level all() is used on purpose: a null comparison result is falsy
    # and therefore fails the check.
    assert all(_is_business_owner), "Not all respondents with missing 'Consumer' are business owners."
    return
@app.cell
 def _(data_all):
    # Converse check: every business owner (QID4 == 'Yes') must have a null
    # 'Consumer' value, i.e. the only distinct value among them is None.
    _business_owners = data_all.filter(pl.col('QID4') == 'Yes').collect()
    _distinct_consumer_values = _business_owners['Consumer'].unique()
    assert all(_value is None for _value in _distinct_consumer_values), "Not all business owners are missing 'Consumer type' in demographics."
    return
@app.cell(hide_code=True)
 def _():
    # Markdown heading for the demographic distribution section.
    mo.md(r"""
    ## Demographic Distributions
    """)
    return
@app.cell
 def _():
    # Demographic columns to chart, one distribution plot per entry.
    # NOTE(review): 'Bussiness_Owner' is presumably spelled this way in the data
    # itself (the plotting cell corrects the spelling for chart titles only) --
    # confirm before renaming.
    demo_plot_cols = [
        'Age',
        'Gender',
        # 'Race/Ethnicity',
        'Bussiness_Owner',
        'Consumer'
    ]
    return (demo_plot_cols,)
@app.cell
 def _(S, demo_plot_cols, demographics):
    # Build one markdown document: the section heading followed by one
    # distribution chart per configured demographic column.
    _content = """
    ## Demographic Distributions
    """
    # The loop variable is underscore-prefixed so marimo keeps it local to this
    # cell instead of registering it as a notebook-wide definition that would
    # clash with any other cell defining the same name.
    for _col in demo_plot_cols:
        _fig = S.plot_demographic_distribution(
            data=demographics,
            column=_col,
            # Display-only cleanup: fix the 'Bussiness' spelling and drop underscores.
            title=f"{_col.replace('Bussiness', 'Business').replace('_', ' ')} Distribution of Survey Respondents"
        )
        _content += f"""{mo.ui.altair_chart(_fig)}\n\n"""
    mo.md(_content)
    return
@app.cell(hide_code=True)
 def _():
    mo.md(r"""
--- a/04_PPTX_Update_Images.py
+++ b/04_PPTX_Update_Images.py
@@ -0,0 +1,70 @@
 import marimo
 __generated_with = "0.19.2"
 app = marimo.App(width="medium")
 with app.setup:
    import marimo as mo
    from pathlib import Path
    import utils
@app.cell
 def _():
    # Markdown heading describing the alt-text tagging step.
    mo.md(r"""
    # Tag existing images with Alt-Text
    Based on image content
    """)
    return
@app.cell
 def _():
    # Paths for the tagging step: the deck to read, the path passed downstream
    # as the output deck, and the directory holding the exported figure images.
    TAG_SOURCE = Path('data/reports/Perception-Research-Report.pptx')
    TAG_TARGET = Path('data/reports/Perception-Research-Report_tagged.pptx')
    TAG_IMAGE_DIR = Path('figures/OneDrive_2026-01-28/')
    return TAG_IMAGE_DIR, TAG_SOURCE, TAG_TARGET
@app.cell
 def _(TAG_IMAGE_DIR, TAG_SOURCE, TAG_TARGET):
    # Run utils.update_ppt_alt_text on the source deck with the image directory,
    # passing TAG_TARGET as the output path.
    # NOTE(review): presumably writes a copy of the deck with alt text set on
    # matching pictures -- confirm against the helper in utils.
    utils.update_ppt_alt_text(ppt_path=TAG_SOURCE, image_source_dir=TAG_IMAGE_DIR, output_path=TAG_TARGET)
    return
@app.cell(hide_code=True)
 def _():
    # Markdown heading for the image-replacement step.
    mo.md(r"""
    # Replace Images using Alt-Text
    """)
    return
@app.cell
 def _():
    # Input deck and save path used by the image-replacement cell.
    REPLACE_SOURCE = Path('data/test_replace_source.pptx')
    REPLACE_TARGET = Path('data/test_replace_target.pptx')
    return REPLACE_SOURCE, REPLACE_TARGET
 @app.cell
 def _():
    # Image used by the replacement cell, both as the new picture and as the
    # input from which the alt-text tag is generated.
    IMAGE_FILE = Path('figures/OneDrive_2026-01-28/Cons-Early_Professional/cold_distant_approachable_familiar_warm.png')
    return (IMAGE_FILE,)
@app.cell
 def _(IMAGE_FILE, REPLACE_SOURCE, REPLACE_TARGET):
    # Call utils.pptx_replace_named_image on REPLACE_SOURCE, using the tag that
    # utils.image_alt_text_generator derives from IMAGE_FILE's path as the
    # target and IMAGE_FILE itself as the new picture; REPLACE_TARGET is the
    # save path.
    # NOTE(review): presumably the picture whose alt text equals the tag is
    # swapped -- confirm against the helper in utils.
    utils.pptx_replace_named_image(
        presentation_path=REPLACE_SOURCE,
        target_tag=utils.image_alt_text_generator(IMAGE_FILE),
        new_image_path=IMAGE_FILE,
        save_path=REPLACE_TARGET)
    return
 if __name__ == "__main__":
    app.run()
--- a/99_example_ingest_qualtrics_export.py
+++ b/99_example_ingest_qualtrics_export.py
@@ -42,14 +42,6 @@ def _(survey):
    return
 app._unparsable_cell(
    r"""
    data.
    """,
    name="_"
 )
@app.cell
 def _(mo):
    mo.md(r"""
--- a/plots.py
+++ b/plots.py
@@ -1,6 +1,7 @@
 """Plotting functions for Voice Branding analysis using Altair."""
 import re
 import math
 from pathlib import Path
 import altair as alt
@@ -728,8 +729,6 @@ class JPMCPlotsMixin:
            },
            width=width or 800,
            height=height or getattr(self, 'plot_height', 400)
        ).configure_view(
            strokeWidth=0  # Remove frame which might obscure labels
        )
        chart = self._save_plot(chart, title)
@@ -794,6 +793,101 @@ class JPMCPlotsMixin:
        chart = self._save_plot(chart, title)
        return chart
    def plot_demographic_distribution(
        self,
        column: str,
        data: pl.LazyFrame | pl.DataFrame | None = None,
        title: str | None = None,
        height: int | None = None,
        width: int | str | None = None,
        show_counts: bool = True,
    ) -> alt.Chart:
        """Create a horizontal bar chart showing the distribution of respondents by a demographic column.

        Designed to be compact so multiple charts (approx. 6) can fit on one slide.
        Uses horizontal bars for better readability with many categories.

        Parameters:
            column: The column name to analyze (e.g., 'Age', 'Gender', 'Race/Ethnicity').
            data: Optional DataFrame. If None, uses self.data_filtered
                (resolved through ``self._ensure_dataframe``).
            title: Chart title. If None, auto-generates based on column name.
            height: Chart height in pixels (default: auto-sized based on the
                number of categories, never less than 120).
            width: Chart width in pixels (default: 200 for compact layout).
            show_counts: If True, display count labels on the bars.

        Returns:
            The chart as returned by ``self._save_plot``: bars only, or bars
            layered with count labels when ``show_counts`` is True. If
            ``column`` is missing or there are no rows, a text-only placeholder
            chart is returned instead and ``_save_plot`` is not called.
        """
        df = self._ensure_dataframe(data)

        if column not in df.columns:
            return alt.Chart(pd.DataFrame({'text': [f"Column '{column}' not found"]})).mark_text().encode(text='text:N')

        # Count values in the column, including nulls. The column is cast to
        # string first so the "(No Response)" placeholder can also be filled
        # into non-string columns (boolean, numeric, categorical).
        stats_df = (
            df.select(pl.col(column).cast(pl.Utf8).fill_null("(No Response)"))
            .group_by(column)
            .agg(pl.len().alias("count"))
            .sort("count", descending=True)
            .to_pandas()
        )

        if stats_df.empty:
            return alt.Chart(pd.DataFrame({'text': ['No data']})).mark_text().encode(text='text:N')

        # Calculate percentages (share of all rows, rounded to one decimal)
        total = stats_df['count'].sum()
        stats_df['percentage'] = (stats_df['count'] / total * 100).round(1)

        # Generate title if not provided
        if title is None:
            clean_col = column.replace('_', ' ').replace('/', ' / ')
            title = f"Distribution: {clean_col}"

        # Calculate appropriate height based on number of categories
        num_categories = len(stats_df)
        bar_height = 18  # pixels per bar
        calculated_height = max(120, num_categories * bar_height + 40)  # min 120px, +40 for title/padding

        # Horizontal bar chart - categories on Y axis, counts on X axis
        bars = alt.Chart(stats_df).mark_bar(color=ColorPalette.PRIMARY).encode(
            x=alt.X('count:Q', title='Count', axis=alt.Axis(grid=False)),
            y=alt.Y(f'{column}:N', title=None, sort='-x', axis=alt.Axis(labelLimit=150)),
            tooltip=[
                alt.Tooltip(f'{column}:N', title=column.replace('_', ' ')),
                alt.Tooltip('count:Q', title='Count'),
                alt.Tooltip('percentage:Q', title='Percentage', format='.1f')
            ]
        )

        # Add count labels at end of bars
        if show_counts:
            text = alt.Chart(stats_df).mark_text(
                align='left',
                baseline='middle',
                dx=3,  # Offset from bar end
                fontSize=9,
                color=ColorPalette.TEXT
            ).encode(
                x='count:Q',
                # Same sort as the bars so labels line up with their bar
                y=alt.Y(f'{column}:N', sort='-x'),
                text='count:Q'
            )
            chart = (bars + text)
        else:
            chart = bars

        # Compact dimensions for 6-per-slide layout
        chart = chart.properties(
            title=self._process_title(title),
            width=width or 200,
            height=height or calculated_height
        )

        chart = self._save_plot(chart, title)
        return chart
    def plot_speaking_style_ranking_correlation(
        self,
        style_color: str,
--- a/theme.py
+++ b/theme.py
@@ -24,6 +24,20 @@ class ColorPalette:
    GRID = "lightgray"
    BACKGROUND = "white"
    # Extended palette for categorical charts (e.g., pie charts with many categories)
    # NOTE(review): the first six entries are labelled with other palette names
    # (PRIMARY, RANK_1..RANK_4, NEUTRAL); presumably they repeat those
    # constants' hex values -- confirm they are kept in sync if a named color changes.
    CATEGORICAL = [
        "#0077B6",  # PRIMARY - Medium Blue
        "#004C6D",  # RANK_1 - Dark Blue
        "#008493",  # RANK_2 - Teal
        "#5AAE95",  # RANK_3 - Sea Green
        "#9E9E9E",  # RANK_4 - Grey
        "#D3D3D3",  # NEUTRAL - Light Grey
        "#003049",  # Dark Navy
        "#669BBC",  # Light Steel Blue
        "#A8DADC",  # Pale Cyan
        "#457B9D",  # Steel Blue
    ]
 def jpmc_altair_theme():
    """JPMC brand theme for Altair charts."""
--- a/utils.py
+++ b/utils.py
@@ -13,8 +13,12 @@ from pptx import Presentation
 from pptx.enum.shapes import MSO_SHAPE_TYPE
-def image_alt_text_generator(fpath):
+def image_alt_text_generator(fpath, include_dataset_dirname=False) -> str:
    """convert image file path to alt text
    Args:
        fpath (str or Path): path to image file, must start with 'figures/'
        include_dataset_dirname (bool): whether to include the dataset directory name in the alt text. Recommended to keep False, so that the images do not get tied to a specific dataset export. (Defeats the purpose of assigning alt text to be able to update images when new datasets are exported.)
    """
    if not isinstance(fpath, Path):
@@ -23,7 +27,10 @@ def image_alt_text_generator(fpath):
    fparts = fpath.parts
    assert fparts[0] == 'figures', "Image file path must start with 'figures'"
-    return Path('/'.join(fparts[2:])).as_posix()
+    if include_dataset_dirname:
        return Path('/'.join(fparts[1:])).as_posix()
    else:
        return Path('/'.join(fparts[2:])).as_posix()
 def pptx_replace_named_image(presentation_path, target_tag, new_image_path, save_path):
    """