fix sample size

2026-02-03 20:48:34 +01:00
parent 9a587dcc4c
commit 36280a6ff8
3 changed files with 186 additions and 122 deletions
--- a/04_PPTX_Update_Images.py
+++ b/04_PPTX_Update_Images.py
@@ -26,9 +26,9 @@ def _():

@app.cell
 def _():
-    TAG_SOURCE = Path('data/reports/Perception-Research-Report_2-2_3-2-18-15.pptx')
+    TAG_SOURCE = Path('data/reports/Perception-Research-Report_3-2-26_20-00.pptx')
    # TAG_TARGET = Path('data/reports/Perception-Research-Report_2-2_tagged.pptx')
-    TAG_IMAGE_DIR = Path('figures/2-3-26_Copy-2-2-26')
+    TAG_IMAGE_DIR = Path('figures/debug')
    return TAG_IMAGE_DIR, TAG_SOURCE


@@ -52,7 +52,7 @@ def _():

@app.cell
 def _():
-    REPLACE_SOURCE = Path('data/reports/Perception-Research-Report_2-2_3-2-18-15.pptx')
+    REPLACE_SOURCE = Path('data/reports/Perception-Research-Report_3-2-26_20-00.pptx')
    # REPLACE_TARGET = Path('data/reports/Perception-Research-Report_2-2_updated.pptx')

    NEW_IMAGES_DIR = Path('figures/debug')
--- a/plots.py
+++ b/plots.py
@@ -178,8 +178,8 @@ class QualtricsPlotsMixin:
            # Use UPPERCASE for category name to distinguish from values
            parts.append(f"{display_name.upper()}: {val_str}")
        
-        # Get sample size (stored by _ensure_dataframe)
-        sample_size = getattr(self, '_last_sample_size', None)
+        # Get sample size from the filtered dataset (not from transformed plot data)
+        sample_size = self._get_filtered_sample_size()
        sample_prefix = f"Sample size: {sample_size}" if sample_size is not None else ""
        
        if not parts:
@@ -297,10 +297,7 @@ class QualtricsPlotsMixin:
        return chart

    def _ensure_dataframe(self, data: pl.LazyFrame | pl.DataFrame | None) -> pl.DataFrame:
-        """Ensure data is an eager DataFrame, collecting if necessary.
-        
-        Also stores the sample size on self._last_sample_size for use in filter descriptions.
-        """
+        """Ensure data is an eager DataFrame, collecting if necessary."""
        df = data if data is not None else getattr(self, 'data_filtered', None)
        if df is None:
             raise ValueError("No data provided and self.data_filtered is None.")
@@ -308,9 +305,21 @@ class QualtricsPlotsMixin:
        if isinstance(df, pl.LazyFrame):
            df = df.collect()
        
-        # Store sample size for filter description
-        self._last_sample_size = df.height
        return df
+    
+    def _get_filtered_sample_size(self) -> int | None:
+        """Get the sample size from the filtered dataset (self.data_filtered).
+        
+        This returns the number of respondents in the filtered dataset,
+        not the size of any transformed/aggregated data passed to plot functions.
+        """
+        data_filtered = getattr(self, 'data_filtered', None)
+        if data_filtered is None:
+            return None
+        
+        if isinstance(data_filtered, pl.LazyFrame):
+            return data_filtered.select(pl.len()).collect().item()
+        return data_filtered.height

    def _clean_voice_label(self, col_name: str) -> str:
        """Extract and clean voice name from column name for display.
@@ -681,7 +690,7 @@ class QualtricsPlotsMixin:
                ColorPalette.GENDER_FEMALE, ColorPalette.GENDER_FEMALE_NEUTRAL
            ]
            
-            chart = alt.Chart(stats_df).mark_bar().encode(
+            bars = alt.Chart(stats_df).mark_bar().encode(
                x=alt.X('item:N', title=x_label, sort='-y'),
                y=alt.Y('count:Q', title=y_label),
                color=alt.Color('gender_category:N',
@@ -692,14 +701,27 @@ class QualtricsPlotsMixin:
                    alt.Tooltip('count:Q', title='1st Place Votes'),
                    alt.Tooltip('gender:N', title='Gender')
                ]
-            ).properties(
+            )
+            
+            # Text overlay for counts
+            text = alt.Chart(stats_df).mark_text(
+                dy=-5,
+                color='black',
+                fontSize=10
+            ).encode(
+                x=alt.X('item:N', sort='-y'),
+                y=alt.Y('count:Q'),
+                text=alt.Text('count:Q')
+            )
+            
+            chart = (bars + text).properties(
                title=self._process_title(title),
                width=width or 800,
                height=height or getattr(self, 'plot_height', 400)
            )
        else:
            # Bar chart with conditional color
-            chart = alt.Chart(stats_df).mark_bar().encode(
+            bars = alt.Chart(stats_df).mark_bar().encode(
                x=alt.X('item:N', title=x_label, sort='-y'),
                y=alt.Y('count:Q', title=y_label),
                color=alt.Color('category:N',
@@ -710,7 +732,20 @@ class QualtricsPlotsMixin:
                    alt.Tooltip('item:N', title='Item'),
                    alt.Tooltip('count:Q', title='1st Place Votes')
                ]
-            ).properties(
+            )
+            
+            # Text overlay for counts
+            text = alt.Chart(stats_df).mark_text(
+                dy=-5,
+                color='black',
+                fontSize=10
+            ).encode(
+                x=alt.X('item:N', sort='-y'),
+                y=alt.Y('count:Q'),
+                text=alt.Text('count:Q')
+            )
+            
+            chart = (bars + text).properties(
                title=self._process_title(title),
                width=width or 800,
                height=height or getattr(self, 'plot_height', 400)
@@ -769,7 +804,7 @@ class QualtricsPlotsMixin:
        # Text overlay
        text = bars.mark_text(
            dy=-5,
-            color='white',
+            color='black',
            fontSize=11
        ).encode(
            text='Weighted Score:Q'
--- a/run_filter_combinations.py
+++ b/run_filter_combinations.py
@@ -12,6 +12,8 @@ Runs 03_quant_report.script.py for each single-filter combination:
 Usage:
    uv run python run_filter_combinations.py
    uv run python run_filter_combinations.py --dry-run  # Preview combinations without running
+    uv run python run_filter_combinations.py --category age  # Only run age combinations
+    uv run python run_filter_combinations.py --category consumer  # Only run consumer segment combinations
 """

 import subprocess
@@ -31,132 +33,151 @@ QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_P
 REPORT_SCRIPT = Path(__file__).parent / '03_quant_report.script.py'


-def get_filter_combinations(survey: QualtricsSurvey) -> list[dict]:
+def get_filter_combinations(survey: QualtricsSurvey, category: str = None) -> list[dict]:
    """
    Generate all single-filter combinations.
    
    Each combination isolates ONE filter value while keeping all others at "all selected".
-    Returns list of dicts with filter kwargs for each run.
+    
+    Args:
+        survey: QualtricsSurvey instance with loaded data
+        category: Optional filter category to limit combinations to.
+                  Valid values: 'all', 'age', 'gender', 'ethnicity', 'income', 'consumer',
+                               'business_owner', 'ai_user', 'investable_assets', 'industry'
+                  If None or 'all', generates all combinations.
+    
+    Returns:
+        List of dicts with filter kwargs for each run.
    """
    combinations = []
    
    # Add "All Respondents" run (no filters = all options selected)
-    combinations.append({
-        'name': 'All_Respondents',
-        'filters': {}  # Empty = use defaults (all selected)
-    })
+    if not category or category == 'all':
+        combinations.append({
+            'name': 'All_Respondents',
+            'filters': {}  # Empty = use defaults (all selected)
+        })
    
    # Age groups - one at a time
-    for age in survey.options_age:
-        combinations.append({
-            'name': f'Age-{age}',
-            'filters': {'age': [age]}
-        })
-    
-    # Gender - one at a time
-    for gender in survey.options_gender:
-        combinations.append({
-            'name': f'Gender-{gender}',
-            'filters': {'gender': [gender]}
-        })
-    
-    # Ethnicity - grouped by individual values
-    # Ethnicity options are comma-separated (e.g., "White or Caucasian, Hispanic or Latino")
-    # Create filters that include ALL options containing each individual ethnicity value
-    ethnicity_values = set()
-    for ethnicity_option in survey.options_ethnicity:
-        # Split by comma and strip whitespace
-        values = [v.strip() for v in ethnicity_option.split(',')]
-        ethnicity_values.update(values)
-    
-    for ethnicity_value in sorted(ethnicity_values):
-        # Find all options that contain this value
-        matching_options = [
-            opt for opt in survey.options_ethnicity 
-            if ethnicity_value in [v.strip() for v in opt.split(',')]
-        ]
-        combinations.append({
-            'name': f'Ethnicity-{ethnicity_value}',
-            'filters': {'ethnicity': matching_options}
-        })
-    
-    # Income - one at a time
-    for income in survey.options_income:
-        combinations.append({
-            'name': f'Income-{income}',
-            'filters': {'income': [income]}
-        })
-    
-    # Consumer segments - combine _A and _B options, and also include standalone
-    # Group options by base name (removing _A/_B suffix)
-    consumer_groups = {}
-    for consumer in survey.options_consumer:
-        # Check if ends with _A or _B
-        if consumer.endswith('_A') or consumer.endswith('_B'):
-            base_name = consumer[:-2]  # Remove last 2 chars (_A or _B)
-            if base_name not in consumer_groups:
-                consumer_groups[base_name] = []
-            consumer_groups[base_name].append(consumer)
-        else:
-            # Not an _A/_B option, keep as-is
-            consumer_groups[consumer] = [consumer]
-    
-    # Add combined _A+_B options
-    for base_name, options in consumer_groups.items():
-        if len(options) > 1:  # Only combine if there are multiple (_A and _B)
+    if not category or category in ['all', 'age']:
+        for age in survey.options_age:
            combinations.append({
-                'name': f'Consumer-{base_name}',
-                'filters': {'consumer': options}
+                'name': f'Age-{age}',
+                'filters': {'age': [age]}
            })
    
-    # Add standalone options (including individual _A and _B)
-    for consumer in survey.options_consumer:
-        combinations.append({
-            'name': f'Consumer-{consumer}',
-            'filters': {'consumer': [consumer]}
-        })
+    # Gender - one at a time
+    if not category or category in ['all', 'gender']:
+        for gender in survey.options_gender:
+            combinations.append({
+                'name': f'Gender-{gender}',
+                'filters': {'gender': [gender]}
+            })
+    
+    # Ethnicity - grouped by individual values
+    if not category or category in ['all', 'ethnicity']:
+        # Ethnicity options are comma-separated (e.g., "White or Caucasian, Hispanic or Latino")
+        # Create filters that include ALL options containing each individual ethnicity value
+        ethnicity_values = set()
+        for ethnicity_option in survey.options_ethnicity:
+            # Split by comma and strip whitespace
+            values = [v.strip() for v in ethnicity_option.split(',')]
+            ethnicity_values.update(values)
+        
+        for ethnicity_value in sorted(ethnicity_values):
+            # Find all options that contain this value
+            matching_options = [
+                opt for opt in survey.options_ethnicity 
+                if ethnicity_value in [v.strip() for v in opt.split(',')]
+            ]
+            combinations.append({
+                'name': f'Ethnicity-{ethnicity_value}',
+                'filters': {'ethnicity': matching_options}
+            })
+    
+    # Income - one at a time
+    if not category or category in ['all', 'income']:
+        for income in survey.options_income:
+            combinations.append({
+                'name': f'Income-{income}',
+                'filters': {'income': [income]}
+            })
+    
+    # Consumer segments - combine _A and _B options, and also include standalone
+    if not category or category in ['all', 'consumer']:
+        # Group options by base name (removing _A/_B suffix)
+        consumer_groups = {}
+        for consumer in survey.options_consumer:
+            # Check if ends with _A or _B
+            if consumer.endswith('_A') or consumer.endswith('_B'):
+                base_name = consumer[:-2]  # Remove last 2 chars (_A or _B)
+                if base_name not in consumer_groups:
+                    consumer_groups[base_name] = []
+                consumer_groups[base_name].append(consumer)
+            else:
+                # Not an _A/_B option, keep as-is
+                consumer_groups[consumer] = [consumer]
+        
+        # Add combined _A+_B options
+        for base_name, options in consumer_groups.items():
+            if len(options) > 1:  # Only combine if there are multiple (_A and _B)
+                combinations.append({
+                    'name': f'Consumer-{base_name}',
+                    'filters': {'consumer': options}
+                })
+        
+        # Add standalone options (including individual _A and _B)
+        for consumer in survey.options_consumer:
+            combinations.append({
+                'name': f'Consumer-{consumer}',
+                'filters': {'consumer': [consumer]}
+            })
    
    # Business Owner - one at a time
-    for business_owner in survey.options_business_owner:
-        combinations.append({
-            'name': f'BusinessOwner-{business_owner}',
-            'filters': {'business_owner': [business_owner]}
-        })
+    if not category or category in ['all', 'business_owner']:
+        for business_owner in survey.options_business_owner:
+            combinations.append({
+                'name': f'BusinessOwner-{business_owner}',
+                'filters': {'business_owner': [business_owner]}
+            })
    
    # AI User - one at a time
-    for ai_user in survey.options_ai_user:
+    if not category or category in ['all', 'ai_user']:
+        for ai_user in survey.options_ai_user:
+            combinations.append({
+                'name': f'AIUser-{ai_user}',
+                'filters': {'ai_user': [ai_user]}
+            })
+        
+        # Daily, more than once daily, and multiple times per week together count as "frequent" AI users
        combinations.append({
-            'name': f'AIUser-{ai_user}',
-            'filters': {'ai_user': [ai_user]}
+            'name': 'AIUser-Frequent',
+            'filters': {'ai_user': [
+                'Daily', 'More than once daily', 'Multiple times per week'
+            ]}
+        })
+        combinations.append({
+            'name': 'AIUser-Infrequent',
+            'filters': {'ai_user': [
+                'Once a month', 'Less than once a month', 'Once a week'
+            ]}
        })
-    
-    # AI user daily, more than once daily, en multiple times a week = frequent
-    combinations.append({
-        'name': 'AIUser-Frequent',
-        'filters': {'ai_user': [
-            'Daily', 'More than once daily', 'Multiple times per week'
-        ]}
-    })
-    combinations.append({
-        'name': 'AIUser-Infrequent',
-        'filters': {'ai_user': [
-            'Once a month', 'Less than once a month', 'Once a week'
-        ]}
-    })
    
    # Investable Assets - one at a time
-    for investable_assets in survey.options_investable_assets:
-        combinations.append({
-            'name': f'Assets-{investable_assets}',
-            'filters': {'investable_assets': [investable_assets]}
-        })
+    if not category or category in ['all', 'investable_assets']:
+        for investable_assets in survey.options_investable_assets:
+            combinations.append({
+                'name': f'Assets-{investable_assets}',
+                'filters': {'investable_assets': [investable_assets]}
+            })
    
    # Industry - one at a time
-    for industry in survey.options_industry:
-        combinations.append({
-            'name': f'Industry-{industry}',
-            'filters': {'industry': [industry]}
-        })
+    if not category or category in ['all', 'industry']:
+        for industry in survey.options_industry:
+            combinations.append({
+                'name': f'Industry-{industry}',
+                'filters': {'industry': [industry]}
+            })
    
    return combinations

@@ -207,6 +228,13 @@ def main():
    import argparse
    parser = argparse.ArgumentParser(description='Run quant report for all filter combinations')
    parser.add_argument('--dry-run', action='store_true', help='Preview combinations without running')
+    parser.add_argument(
+        '--category',
+        choices=['all', 'age', 'gender', 'ethnicity', 'income', 'consumer', 
+                 'business_owner', 'ai_user', 'investable_assets', 'industry'],
+        default='all',
+        help='Filter category to run combinations for (default: all)'
+    )
    args = parser.parse_args()
    
    # Load survey to get available filter options
@@ -214,9 +242,10 @@ def main():
    survey = QualtricsSurvey(RESULTS_FILE, QSF_FILE)
    survey.load_data()  # Populates options_* attributes
    
-    # Generate all combinations
-    combinations = get_filter_combinations(survey)
-    print(f"Generated {len(combinations)} filter combinations")
+    # Generate combinations for specified category
+    combinations = get_filter_combinations(survey, category=args.category)
+    category_desc = f" for category '{args.category}'" if args.category != 'all' else ''
+    print(f"Generated {len(combinations)} filter combinations{category_desc}")
    
    if args.dry_run:
        print("\nDRY RUN - Commands that would be executed:")