fix sample size

This commit is contained in:
2026-02-03 20:48:34 +01:00
parent 9a587dcc4c
commit 36280a6ff8
3 changed files with 186 additions and 122 deletions

View File

@@ -26,9 +26,9 @@ def _():
@app.cell @app.cell
def _(): def _():
TAG_SOURCE = Path('data/reports/Perception-Research-Report_2-2_3-2-18-15.pptx') TAG_SOURCE = Path('data/reports/Perception-Research-Report_3-2-26_20-00.pptx')
# TAG_TARGET = Path('data/reports/Perception-Research-Report_2-2_tagged.pptx') # TAG_TARGET = Path('data/reports/Perception-Research-Report_2-2_tagged.pptx')
TAG_IMAGE_DIR = Path('figures/2-3-26_Copy-2-2-26') TAG_IMAGE_DIR = Path('figures/debug')
return TAG_IMAGE_DIR, TAG_SOURCE return TAG_IMAGE_DIR, TAG_SOURCE
@@ -52,7 +52,7 @@ def _():
@app.cell @app.cell
def _(): def _():
REPLACE_SOURCE = Path('data/reports/Perception-Research-Report_2-2_3-2-18-15.pptx') REPLACE_SOURCE = Path('data/reports/Perception-Research-Report_3-2-26_20-00.pptx')
# REPLACE_TARGET = Path('data/reports/Perception-Research-Report_2-2_updated.pptx') # REPLACE_TARGET = Path('data/reports/Perception-Research-Report_2-2_updated.pptx')
NEW_IMAGES_DIR = Path('figures/debug') NEW_IMAGES_DIR = Path('figures/debug')

View File

@@ -178,8 +178,8 @@ class QualtricsPlotsMixin:
# Use UPPERCASE for category name to distinguish from values # Use UPPERCASE for category name to distinguish from values
parts.append(f"{display_name.upper()}: {val_str}") parts.append(f"{display_name.upper()}: {val_str}")
# Get sample size (stored by _ensure_dataframe) # Get sample size from the filtered dataset (not from transformed plot data)
sample_size = getattr(self, '_last_sample_size', None) sample_size = self._get_filtered_sample_size()
sample_prefix = f"Sample size: {sample_size}" if sample_size is not None else "" sample_prefix = f"Sample size: {sample_size}" if sample_size is not None else ""
if not parts: if not parts:
@@ -297,10 +297,7 @@ class QualtricsPlotsMixin:
return chart return chart
def _ensure_dataframe(self, data: pl.LazyFrame | pl.DataFrame | None) -> pl.DataFrame: def _ensure_dataframe(self, data: pl.LazyFrame | pl.DataFrame | None) -> pl.DataFrame:
"""Ensure data is an eager DataFrame, collecting if necessary. """Ensure data is an eager DataFrame, collecting if necessary."""
Also stores the sample size on self._last_sample_size for use in filter descriptions.
"""
df = data if data is not None else getattr(self, 'data_filtered', None) df = data if data is not None else getattr(self, 'data_filtered', None)
if df is None: if df is None:
raise ValueError("No data provided and self.data_filtered is None.") raise ValueError("No data provided and self.data_filtered is None.")
@@ -308,10 +305,22 @@ class QualtricsPlotsMixin:
if isinstance(df, pl.LazyFrame): if isinstance(df, pl.LazyFrame):
df = df.collect() df = df.collect()
# Store sample size for filter description
self._last_sample_size = df.height
return df return df
def _get_filtered_sample_size(self) -> int | None:
    """Return the row count of ``self.data_filtered``, or None if it is unset.

    The count always comes from the filtered dataset held on the instance
    (one row per respondent, per the original description), never from the
    transformed or aggregated frame handed to an individual plot function.
    """
    filtered = getattr(self, 'data_filtered', None)
    if filtered is None:
        return None

    if not isinstance(filtered, pl.LazyFrame):
        # Eager frame: the row count is directly available.
        return filtered.height

    # Lazy frame: run a length query rather than reading `.height`.
    row_count = filtered.select(pl.len()).collect()
    return row_count.item()
def _clean_voice_label(self, col_name: str) -> str: def _clean_voice_label(self, col_name: str) -> str:
"""Extract and clean voice name from column name for display. """Extract and clean voice name from column name for display.
@@ -681,7 +690,7 @@ class QualtricsPlotsMixin:
ColorPalette.GENDER_FEMALE, ColorPalette.GENDER_FEMALE_NEUTRAL ColorPalette.GENDER_FEMALE, ColorPalette.GENDER_FEMALE_NEUTRAL
] ]
chart = alt.Chart(stats_df).mark_bar().encode( bars = alt.Chart(stats_df).mark_bar().encode(
x=alt.X('item:N', title=x_label, sort='-y'), x=alt.X('item:N', title=x_label, sort='-y'),
y=alt.Y('count:Q', title=y_label), y=alt.Y('count:Q', title=y_label),
color=alt.Color('gender_category:N', color=alt.Color('gender_category:N',
@@ -692,14 +701,27 @@ class QualtricsPlotsMixin:
alt.Tooltip('count:Q', title='1st Place Votes'), alt.Tooltip('count:Q', title='1st Place Votes'),
alt.Tooltip('gender:N', title='Gender') alt.Tooltip('gender:N', title='Gender')
] ]
).properties( )
# Text overlay for counts
text = alt.Chart(stats_df).mark_text(
dy=-5,
color='black',
fontSize=10
).encode(
x=alt.X('item:N', sort='-y'),
y=alt.Y('count:Q'),
text=alt.Text('count:Q')
)
chart = (bars + text).properties(
title=self._process_title(title), title=self._process_title(title),
width=width or 800, width=width or 800,
height=height or getattr(self, 'plot_height', 400) height=height or getattr(self, 'plot_height', 400)
) )
else: else:
# Bar chart with conditional color # Bar chart with conditional color
chart = alt.Chart(stats_df).mark_bar().encode( bars = alt.Chart(stats_df).mark_bar().encode(
x=alt.X('item:N', title=x_label, sort='-y'), x=alt.X('item:N', title=x_label, sort='-y'),
y=alt.Y('count:Q', title=y_label), y=alt.Y('count:Q', title=y_label),
color=alt.Color('category:N', color=alt.Color('category:N',
@@ -710,7 +732,20 @@ class QualtricsPlotsMixin:
alt.Tooltip('item:N', title='Item'), alt.Tooltip('item:N', title='Item'),
alt.Tooltip('count:Q', title='1st Place Votes') alt.Tooltip('count:Q', title='1st Place Votes')
] ]
).properties( )
# Text overlay for counts
text = alt.Chart(stats_df).mark_text(
dy=-5,
color='black',
fontSize=10
).encode(
x=alt.X('item:N', sort='-y'),
y=alt.Y('count:Q'),
text=alt.Text('count:Q')
)
chart = (bars + text).properties(
title=self._process_title(title), title=self._process_title(title),
width=width or 800, width=width or 800,
height=height or getattr(self, 'plot_height', 400) height=height or getattr(self, 'plot_height', 400)
@@ -769,7 +804,7 @@ class QualtricsPlotsMixin:
# Text overlay # Text overlay
text = bars.mark_text( text = bars.mark_text(
dy=-5, dy=-5,
color='white', color='black',
fontSize=11 fontSize=11
).encode( ).encode(
text='Weighted Score:Q' text='Weighted Score:Q'

View File

@@ -12,6 +12,8 @@ Runs 03_quant_report.script.py for each single-filter combination:
Usage: Usage:
uv run python run_filter_combinations.py uv run python run_filter_combinations.py
uv run python run_filter_combinations.py --dry-run # Preview combinations without running uv run python run_filter_combinations.py --dry-run # Preview combinations without running
uv run python run_filter_combinations.py --category age # Only run age combinations
uv run python run_filter_combinations.py --category consumer # Only run consumer segment combinations
""" """
import subprocess import subprocess
@@ -31,22 +33,33 @@ QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_P
REPORT_SCRIPT = Path(__file__).parent / '03_quant_report.script.py' REPORT_SCRIPT = Path(__file__).parent / '03_quant_report.script.py'
def get_filter_combinations(survey: QualtricsSurvey) -> list[dict]: def get_filter_combinations(survey: QualtricsSurvey, category: str = None) -> list[dict]:
""" """
Generate all single-filter combinations. Generate all single-filter combinations.
Each combination isolates ONE filter value while keeping all others at "all selected". Each combination isolates ONE filter value while keeping all others at "all selected".
Returns list of dicts with filter kwargs for each run.
Args:
survey: QualtricsSurvey instance with loaded data
category: Optional filter category to limit combinations to.
Valid values: 'all', 'age', 'gender', 'ethnicity', 'income', 'consumer',
'business_owner', 'ai_user', 'investable_assets', 'industry'
If None or 'all', generates all combinations.
Returns:
List of dicts with filter kwargs for each run.
""" """
combinations = [] combinations = []
# Add "All Respondents" run (no filters = all options selected) # Add "All Respondents" run (no filters = all options selected)
if not category or category == 'all':
combinations.append({ combinations.append({
'name': 'All_Respondents', 'name': 'All_Respondents',
'filters': {} # Empty = use defaults (all selected) 'filters': {} # Empty = use defaults (all selected)
}) })
# Age groups - one at a time # Age groups - one at a time
if not category or category in ['all', 'age']:
for age in survey.options_age: for age in survey.options_age:
combinations.append({ combinations.append({
'name': f'Age-{age}', 'name': f'Age-{age}',
@@ -54,6 +67,7 @@ def get_filter_combinations(survey: QualtricsSurvey) -> list[dict]:
}) })
# Gender - one at a time # Gender - one at a time
if not category or category in ['all', 'gender']:
for gender in survey.options_gender: for gender in survey.options_gender:
combinations.append({ combinations.append({
'name': f'Gender-{gender}', 'name': f'Gender-{gender}',
@@ -61,6 +75,7 @@ def get_filter_combinations(survey: QualtricsSurvey) -> list[dict]:
}) })
# Ethnicity - grouped by individual values # Ethnicity - grouped by individual values
if not category or category in ['all', 'ethnicity']:
# Ethnicity options are comma-separated (e.g., "White or Caucasian, Hispanic or Latino") # Ethnicity options are comma-separated (e.g., "White or Caucasian, Hispanic or Latino")
# Create filters that include ALL options containing each individual ethnicity value # Create filters that include ALL options containing each individual ethnicity value
ethnicity_values = set() ethnicity_values = set()
@@ -81,6 +96,7 @@ def get_filter_combinations(survey: QualtricsSurvey) -> list[dict]:
}) })
# Income - one at a time # Income - one at a time
if not category or category in ['all', 'income']:
for income in survey.options_income: for income in survey.options_income:
combinations.append({ combinations.append({
'name': f'Income-{income}', 'name': f'Income-{income}',
@@ -88,6 +104,7 @@ def get_filter_combinations(survey: QualtricsSurvey) -> list[dict]:
}) })
# Consumer segments - combine _A and _B options, and also include standalone # Consumer segments - combine _A and _B options, and also include standalone
if not category or category in ['all', 'consumer']:
# Group options by base name (removing _A/_B suffix) # Group options by base name (removing _A/_B suffix)
consumer_groups = {} consumer_groups = {}
for consumer in survey.options_consumer: for consumer in survey.options_consumer:
@@ -117,6 +134,7 @@ def get_filter_combinations(survey: QualtricsSurvey) -> list[dict]:
}) })
# Business Owner - one at a time # Business Owner - one at a time
if not category or category in ['all', 'business_owner']:
for business_owner in survey.options_business_owner: for business_owner in survey.options_business_owner:
combinations.append({ combinations.append({
'name': f'BusinessOwner-{business_owner}', 'name': f'BusinessOwner-{business_owner}',
@@ -124,6 +142,7 @@ def get_filter_combinations(survey: QualtricsSurvey) -> list[dict]:
}) })
# AI User - one at a time # AI User - one at a time
if not category or category in ['all', 'ai_user']:
for ai_user in survey.options_ai_user: for ai_user in survey.options_ai_user:
combinations.append({ combinations.append({
'name': f'AIUser-{ai_user}', 'name': f'AIUser-{ai_user}',
@@ -145,6 +164,7 @@ def get_filter_combinations(survey: QualtricsSurvey) -> list[dict]:
}) })
# Investable Assets - one at a time # Investable Assets - one at a time
if not category or category in ['all', 'investable_assets']:
for investable_assets in survey.options_investable_assets: for investable_assets in survey.options_investable_assets:
combinations.append({ combinations.append({
'name': f'Assets-{investable_assets}', 'name': f'Assets-{investable_assets}',
@@ -152,6 +172,7 @@ def get_filter_combinations(survey: QualtricsSurvey) -> list[dict]:
}) })
# Industry - one at a time # Industry - one at a time
if not category or category in ['all', 'industry']:
for industry in survey.options_industry: for industry in survey.options_industry:
combinations.append({ combinations.append({
'name': f'Industry-{industry}', 'name': f'Industry-{industry}',
@@ -207,6 +228,13 @@ def main():
import argparse import argparse
parser = argparse.ArgumentParser(description='Run quant report for all filter combinations') parser = argparse.ArgumentParser(description='Run quant report for all filter combinations')
parser.add_argument('--dry-run', action='store_true', help='Preview combinations without running') parser.add_argument('--dry-run', action='store_true', help='Preview combinations without running')
parser.add_argument(
'--category',
choices=['all', 'age', 'gender', 'ethnicity', 'income', 'consumer',
'business_owner', 'ai_user', 'investable_assets', 'industry'],
default='all',
help='Filter category to run combinations for (default: all)'
)
args = parser.parse_args() args = parser.parse_args()
# Load survey to get available filter options # Load survey to get available filter options
@@ -214,9 +242,10 @@ def main():
survey = QualtricsSurvey(RESULTS_FILE, QSF_FILE) survey = QualtricsSurvey(RESULTS_FILE, QSF_FILE)
survey.load_data() # Populates options_* attributes survey.load_data() # Populates options_* attributes
# Generate all combinations # Generate combinations for specified category
combinations = get_filter_combinations(survey) combinations = get_filter_combinations(survey, category=args.category)
print(f"Generated {len(combinations)} filter combinations") category_desc = f" for category '{args.category}'" if args.category != 'all' else ''
print(f"Generated {len(combinations)} filter combinations{category_desc}")
if args.dry_run: if args.dry_run:
print("\nDRY RUN - Commands that would be executed:") print("\nDRY RUN - Commands that would be executed:")