fix sample size

This commit is contained in:
2026-02-03 20:48:34 +01:00
parent 9a587dcc4c
commit 36280a6ff8
3 changed files with 186 additions and 122 deletions

View File

@@ -26,9 +26,9 @@ def _():
@app.cell @app.cell
def _(): def _():
TAG_SOURCE = Path('data/reports/Perception-Research-Report_2-2_3-2-18-15.pptx') TAG_SOURCE = Path('data/reports/Perception-Research-Report_3-2-26_20-00.pptx')
# TAG_TARGET = Path('data/reports/Perception-Research-Report_2-2_tagged.pptx') # TAG_TARGET = Path('data/reports/Perception-Research-Report_2-2_tagged.pptx')
TAG_IMAGE_DIR = Path('figures/2-3-26_Copy-2-2-26') TAG_IMAGE_DIR = Path('figures/debug')
return TAG_IMAGE_DIR, TAG_SOURCE return TAG_IMAGE_DIR, TAG_SOURCE
@@ -52,7 +52,7 @@ def _():
@app.cell @app.cell
def _(): def _():
REPLACE_SOURCE = Path('data/reports/Perception-Research-Report_2-2_3-2-18-15.pptx') REPLACE_SOURCE = Path('data/reports/Perception-Research-Report_3-2-26_20-00.pptx')
# REPLACE_TARGET = Path('data/reports/Perception-Research-Report_2-2_updated.pptx') # REPLACE_TARGET = Path('data/reports/Perception-Research-Report_2-2_updated.pptx')
NEW_IMAGES_DIR = Path('figures/debug') NEW_IMAGES_DIR = Path('figures/debug')

View File

@@ -178,8 +178,8 @@ class QualtricsPlotsMixin:
# Use UPPERCASE for category name to distinguish from values # Use UPPERCASE for category name to distinguish from values
parts.append(f"{display_name.upper()}: {val_str}") parts.append(f"{display_name.upper()}: {val_str}")
# Get sample size (stored by _ensure_dataframe) # Get sample size from the filtered dataset (not from transformed plot data)
sample_size = getattr(self, '_last_sample_size', None) sample_size = self._get_filtered_sample_size()
sample_prefix = f"Sample size: {sample_size}" if sample_size is not None else "" sample_prefix = f"Sample size: {sample_size}" if sample_size is not None else ""
if not parts: if not parts:
@@ -297,10 +297,7 @@ class QualtricsPlotsMixin:
return chart return chart
def _ensure_dataframe(self, data: pl.LazyFrame | pl.DataFrame | None) -> pl.DataFrame: def _ensure_dataframe(self, data: pl.LazyFrame | pl.DataFrame | None) -> pl.DataFrame:
"""Ensure data is an eager DataFrame, collecting if necessary. """Ensure data is an eager DataFrame, collecting if necessary."""
Also stores the sample size on self._last_sample_size for use in filter descriptions.
"""
df = data if data is not None else getattr(self, 'data_filtered', None) df = data if data is not None else getattr(self, 'data_filtered', None)
if df is None: if df is None:
raise ValueError("No data provided and self.data_filtered is None.") raise ValueError("No data provided and self.data_filtered is None.")
@@ -308,9 +305,21 @@ class QualtricsPlotsMixin:
if isinstance(df, pl.LazyFrame): if isinstance(df, pl.LazyFrame):
df = df.collect() df = df.collect()
# Store sample size for filter description
self._last_sample_size = df.height
return df return df
def _get_filtered_sample_size(self) -> int | None:
    """Return the row count of ``self.data_filtered``, or ``None`` if it is unset.

    The count comes from the filtered dataset itself, not from whatever
    transformed/aggregated frame was handed to a plot function.

    Returns:
        ``None`` when the instance has no ``data_filtered`` attribute or the
        attribute is ``None``; otherwise the number of rows in that frame.
    """
    frame = getattr(self, 'data_filtered', None)
    if frame is None:
        return None

    is_lazy = isinstance(frame, pl.LazyFrame)
    # A LazyFrame is counted by collecting a length-only selection;
    # anything else is expected to expose ``.height`` directly.
    return frame.select(pl.len()).collect().item() if is_lazy else frame.height
def _clean_voice_label(self, col_name: str) -> str: def _clean_voice_label(self, col_name: str) -> str:
"""Extract and clean voice name from column name for display. """Extract and clean voice name from column name for display.
@@ -681,7 +690,7 @@ class QualtricsPlotsMixin:
ColorPalette.GENDER_FEMALE, ColorPalette.GENDER_FEMALE_NEUTRAL ColorPalette.GENDER_FEMALE, ColorPalette.GENDER_FEMALE_NEUTRAL
] ]
chart = alt.Chart(stats_df).mark_bar().encode( bars = alt.Chart(stats_df).mark_bar().encode(
x=alt.X('item:N', title=x_label, sort='-y'), x=alt.X('item:N', title=x_label, sort='-y'),
y=alt.Y('count:Q', title=y_label), y=alt.Y('count:Q', title=y_label),
color=alt.Color('gender_category:N', color=alt.Color('gender_category:N',
@@ -692,14 +701,27 @@ class QualtricsPlotsMixin:
alt.Tooltip('count:Q', title='1st Place Votes'), alt.Tooltip('count:Q', title='1st Place Votes'),
alt.Tooltip('gender:N', title='Gender') alt.Tooltip('gender:N', title='Gender')
] ]
).properties( )
# Text overlay for counts
text = alt.Chart(stats_df).mark_text(
dy=-5,
color='black',
fontSize=10
).encode(
x=alt.X('item:N', sort='-y'),
y=alt.Y('count:Q'),
text=alt.Text('count:Q')
)
chart = (bars + text).properties(
title=self._process_title(title), title=self._process_title(title),
width=width or 800, width=width or 800,
height=height or getattr(self, 'plot_height', 400) height=height or getattr(self, 'plot_height', 400)
) )
else: else:
# Bar chart with conditional color # Bar chart with conditional color
chart = alt.Chart(stats_df).mark_bar().encode( bars = alt.Chart(stats_df).mark_bar().encode(
x=alt.X('item:N', title=x_label, sort='-y'), x=alt.X('item:N', title=x_label, sort='-y'),
y=alt.Y('count:Q', title=y_label), y=alt.Y('count:Q', title=y_label),
color=alt.Color('category:N', color=alt.Color('category:N',
@@ -710,7 +732,20 @@ class QualtricsPlotsMixin:
alt.Tooltip('item:N', title='Item'), alt.Tooltip('item:N', title='Item'),
alt.Tooltip('count:Q', title='1st Place Votes') alt.Tooltip('count:Q', title='1st Place Votes')
] ]
).properties( )
# Text overlay for counts
text = alt.Chart(stats_df).mark_text(
dy=-5,
color='black',
fontSize=10
).encode(
x=alt.X('item:N', sort='-y'),
y=alt.Y('count:Q'),
text=alt.Text('count:Q')
)
chart = (bars + text).properties(
title=self._process_title(title), title=self._process_title(title),
width=width or 800, width=width or 800,
height=height or getattr(self, 'plot_height', 400) height=height or getattr(self, 'plot_height', 400)
@@ -769,7 +804,7 @@ class QualtricsPlotsMixin:
# Text overlay # Text overlay
text = bars.mark_text( text = bars.mark_text(
dy=-5, dy=-5,
color='white', color='black',
fontSize=11 fontSize=11
).encode( ).encode(
text='Weighted Score:Q' text='Weighted Score:Q'

View File

@@ -12,6 +12,8 @@ Runs 03_quant_report.script.py for each single-filter combination:
Usage: Usage:
uv run python run_filter_combinations.py uv run python run_filter_combinations.py
uv run python run_filter_combinations.py --dry-run # Preview combinations without running uv run python run_filter_combinations.py --dry-run # Preview combinations without running
uv run python run_filter_combinations.py --category age # Only run age combinations
uv run python run_filter_combinations.py --category consumer # Only run consumer segment combinations
""" """
import subprocess import subprocess
@@ -31,132 +33,151 @@ QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_P
REPORT_SCRIPT = Path(__file__).parent / '03_quant_report.script.py' REPORT_SCRIPT = Path(__file__).parent / '03_quant_report.script.py'
def get_filter_combinations(survey: QualtricsSurvey) -> list[dict]: def get_filter_combinations(survey: QualtricsSurvey, category: str = None) -> list[dict]:
""" """
Generate all single-filter combinations. Generate all single-filter combinations.
Each combination isolates ONE filter value while keeping all others at "all selected". Each combination isolates ONE filter value while keeping all others at "all selected".
Returns list of dicts with filter kwargs for each run.
Args:
survey: QualtricsSurvey instance with loaded data
category: Optional filter category to limit combinations to.
Valid values: 'all', 'age', 'gender', 'ethnicity', 'income', 'consumer',
'business_owner', 'ai_user', 'investable_assets', 'industry'
If None or 'all', generates all combinations.
Returns:
List of dicts with filter kwargs for each run.
""" """
combinations = [] combinations = []
# Add "All Respondents" run (no filters = all options selected) # Add "All Respondents" run (no filters = all options selected)
combinations.append({ if not category or category == 'all':
'name': 'All_Respondents', combinations.append({
'filters': {} # Empty = use defaults (all selected) 'name': 'All_Respondents',
}) 'filters': {} # Empty = use defaults (all selected)
})
# Age groups - one at a time # Age groups - one at a time
for age in survey.options_age: if not category or category in ['all', 'age']:
combinations.append({ for age in survey.options_age:
'name': f'Age-{age}',
'filters': {'age': [age]}
})
# Gender - one at a time
for gender in survey.options_gender:
combinations.append({
'name': f'Gender-{gender}',
'filters': {'gender': [gender]}
})
# Ethnicity - grouped by individual values
# Ethnicity options are comma-separated (e.g., "White or Caucasian, Hispanic or Latino")
# Create filters that include ALL options containing each individual ethnicity value
ethnicity_values = set()
for ethnicity_option in survey.options_ethnicity:
# Split by comma and strip whitespace
values = [v.strip() for v in ethnicity_option.split(',')]
ethnicity_values.update(values)
for ethnicity_value in sorted(ethnicity_values):
# Find all options that contain this value
matching_options = [
opt for opt in survey.options_ethnicity
if ethnicity_value in [v.strip() for v in opt.split(',')]
]
combinations.append({
'name': f'Ethnicity-{ethnicity_value}',
'filters': {'ethnicity': matching_options}
})
# Income - one at a time
for income in survey.options_income:
combinations.append({
'name': f'Income-{income}',
'filters': {'income': [income]}
})
# Consumer segments - combine _A and _B options, and also include standalone
# Group options by base name (removing _A/_B suffix)
consumer_groups = {}
for consumer in survey.options_consumer:
# Check if ends with _A or _B
if consumer.endswith('_A') or consumer.endswith('_B'):
base_name = consumer[:-2] # Remove last 2 chars (_A or _B)
if base_name not in consumer_groups:
consumer_groups[base_name] = []
consumer_groups[base_name].append(consumer)
else:
# Not an _A/_B option, keep as-is
consumer_groups[consumer] = [consumer]
# Add combined _A+_B options
for base_name, options in consumer_groups.items():
if len(options) > 1: # Only combine if there are multiple (_A and _B)
combinations.append({ combinations.append({
'name': f'Consumer-{base_name}', 'name': f'Age-{age}',
'filters': {'consumer': options} 'filters': {'age': [age]}
}) })
# Add standalone options (including individual _A and _B) # Gender - one at a time
for consumer in survey.options_consumer: if not category or category in ['all', 'gender']:
combinations.append({ for gender in survey.options_gender:
'name': f'Consumer-{consumer}', combinations.append({
'filters': {'consumer': [consumer]} 'name': f'Gender-{gender}',
}) 'filters': {'gender': [gender]}
})
# Ethnicity - grouped by individual values
if not category or category in ['all', 'ethnicity']:
# Ethnicity options are comma-separated (e.g., "White or Caucasian, Hispanic or Latino")
# Create filters that include ALL options containing each individual ethnicity value
ethnicity_values = set()
for ethnicity_option in survey.options_ethnicity:
# Split by comma and strip whitespace
values = [v.strip() for v in ethnicity_option.split(',')]
ethnicity_values.update(values)
for ethnicity_value in sorted(ethnicity_values):
# Find all options that contain this value
matching_options = [
opt for opt in survey.options_ethnicity
if ethnicity_value in [v.strip() for v in opt.split(',')]
]
combinations.append({
'name': f'Ethnicity-{ethnicity_value}',
'filters': {'ethnicity': matching_options}
})
# Income - one at a time
if not category or category in ['all', 'income']:
for income in survey.options_income:
combinations.append({
'name': f'Income-{income}',
'filters': {'income': [income]}
})
# Consumer segments - combine _A and _B options, and also include standalone
if not category or category in ['all', 'consumer']:
# Group options by base name (removing _A/_B suffix)
consumer_groups = {}
for consumer in survey.options_consumer:
# Check if ends with _A or _B
if consumer.endswith('_A') or consumer.endswith('_B'):
base_name = consumer[:-2] # Remove last 2 chars (_A or _B)
if base_name not in consumer_groups:
consumer_groups[base_name] = []
consumer_groups[base_name].append(consumer)
else:
# Not an _A/_B option, keep as-is
consumer_groups[consumer] = [consumer]
# Add combined _A+_B options
for base_name, options in consumer_groups.items():
if len(options) > 1: # Only combine if there are multiple (_A and _B)
combinations.append({
'name': f'Consumer-{base_name}',
'filters': {'consumer': options}
})
# Add standalone options (including individual _A and _B)
for consumer in survey.options_consumer:
combinations.append({
'name': f'Consumer-{consumer}',
'filters': {'consumer': [consumer]}
})
# Business Owner - one at a time # Business Owner - one at a time
for business_owner in survey.options_business_owner: if not category or category in ['all', 'business_owner']:
combinations.append({ for business_owner in survey.options_business_owner:
'name': f'BusinessOwner-{business_owner}', combinations.append({
'filters': {'business_owner': [business_owner]} 'name': f'BusinessOwner-{business_owner}',
}) 'filters': {'business_owner': [business_owner]}
})
# AI User - one at a time # AI User - one at a time
for ai_user in survey.options_ai_user: if not category or category in ['all', 'ai_user']:
for ai_user in survey.options_ai_user:
combinations.append({
'name': f'AIUser-{ai_user}',
'filters': {'ai_user': [ai_user]}
})
# AI user daily, more than once daily, en multiple times a week = frequent
combinations.append({ combinations.append({
'name': f'AIUser-{ai_user}', 'name': 'AIUser-Frequent',
'filters': {'ai_user': [ai_user]} 'filters': {'ai_user': [
'Daily', 'More than once daily', 'Multiple times per week'
]}
})
combinations.append({
'name': 'AIUser-Infrequent',
'filters': {'ai_user': [
'Once a month', 'Less than once a month', 'Once a week'
]}
}) })
# AI user daily, more than once daily, en multiple times a week = frequent
combinations.append({
'name': 'AIUser-Frequent',
'filters': {'ai_user': [
'Daily', 'More than once daily', 'Multiple times per week'
]}
})
combinations.append({
'name': 'AIUser-Infrequent',
'filters': {'ai_user': [
'Once a month', 'Less than once a month', 'Once a week'
]}
})
# Investable Assets - one at a time # Investable Assets - one at a time
for investable_assets in survey.options_investable_assets: if not category or category in ['all', 'investable_assets']:
combinations.append({ for investable_assets in survey.options_investable_assets:
'name': f'Assets-{investable_assets}', combinations.append({
'filters': {'investable_assets': [investable_assets]} 'name': f'Assets-{investable_assets}',
}) 'filters': {'investable_assets': [investable_assets]}
})
# Industry - one at a time # Industry - one at a time
for industry in survey.options_industry: if not category or category in ['all', 'industry']:
combinations.append({ for industry in survey.options_industry:
'name': f'Industry-{industry}', combinations.append({
'filters': {'industry': [industry]} 'name': f'Industry-{industry}',
}) 'filters': {'industry': [industry]}
})
return combinations return combinations
@@ -207,6 +228,13 @@ def main():
import argparse import argparse
parser = argparse.ArgumentParser(description='Run quant report for all filter combinations') parser = argparse.ArgumentParser(description='Run quant report for all filter combinations')
parser.add_argument('--dry-run', action='store_true', help='Preview combinations without running') parser.add_argument('--dry-run', action='store_true', help='Preview combinations without running')
parser.add_argument(
'--category',
choices=['all', 'age', 'gender', 'ethnicity', 'income', 'consumer',
'business_owner', 'ai_user', 'investable_assets', 'industry'],
default='all',
help='Filter category to run combinations for (default: all)'
)
args = parser.parse_args() args = parser.parse_args()
# Load survey to get available filter options # Load survey to get available filter options
@@ -214,9 +242,10 @@ def main():
survey = QualtricsSurvey(RESULTS_FILE, QSF_FILE) survey = QualtricsSurvey(RESULTS_FILE, QSF_FILE)
survey.load_data() # Populates options_* attributes survey.load_data() # Populates options_* attributes
# Generate all combinations # Generate combinations for specified category
combinations = get_filter_combinations(survey) combinations = get_filter_combinations(survey, category=args.category)
print(f"Generated {len(combinations)} filter combinations") category_desc = f" for category '{args.category}'" if args.category != 'all' else ''
print(f"Generated {len(combinations)} filter combinations{category_desc}")
if args.dry_run: if args.dry_run:
print("\nDRY RUN - Commands that would be executed:") print("\nDRY RUN - Commands that would be executed:")