# JPMC-quant/run_filter_combinations.py

#!/usr/bin/env python
"""
Batch runner for quant report with different filter combinations.

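Runs 03_quant_report.script.py once with no filters (all respondents) and once
for each single-filter combination:
- Each age group (with all others active)
- Each gender (with all others active)
- Each ethnicity (with all others active)
- Each income group (with all others active)
- Each consumer segment (with all others active), plus combined _A/_B pairs
- Each business owner, AI user, investable assets and industry option
  (with all others active), plus two grouped AI-usage frequency runs
- Voice ranking completeness (only-missing / exclude-missing)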

Usage:
    uv run python run_filter_combinations.py
    uv run python run_filter_combinations.py --dry-run  # Preview combinations without running
    uv run python run_filter_combinations.py --category age  # Only run age combinations
    uv run python run_filter_combinations.py --category consumer  # Only run consumer segment combinations
"""

import subprocess
import sys
import json
from pathlib import Path

from tqdm import tqdm

from utils import QualtricsSurvey


# Default input files handed to QualtricsSurvey in main(). The original note
# says these are the same paths used in 03_quant_report.script.py.
RESULTS_FILE = (
    'data/exports/2-2-26/'
    'JPMC_Chase Brand Personality_Quant Round 1_February 2, 2026_Labels.csv'
)
QSF_FILE = (
    'data/exports/OneDrive_2026-01-21/Soft Launch Data/'
    'JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'
)

# The report script is resolved as a sibling of this file.
REPORT_SCRIPT = Path(__file__).with_name('03_quant_report.script.py')


def _ethnicity_combinations(ethnicity_options) -> list[dict]:
    """Build one combination per individual ethnicity value.

    Ethnicity options are comma-separated labels (e.g.
    "White or Caucasian, Hispanic or Latino"), so each combination selects
    every option that contains the given individual value.
    """
    # Split each option once instead of re-splitting it for every value.
    split_options = [
        (option, {part.strip() for part in option.split(',')})
        for option in ethnicity_options
    ]
    distinct_values = set().union(*(parts for _, parts in split_options))

    return [
        {
            'name': f'Ethnicity-{value}',
            'filters': {
                'ethnicity': [
                    option for option, parts in split_options if value in parts
                ]
            },
        }
        for value in sorted(distinct_values)
    ]


def _consumer_combinations(consumer_options) -> list[dict]:
    """Build combined _A/_B runs followed by one run per individual option."""
    options = list(consumer_options)

    # Only suffixed options take part in grouping. A plain option whose name
    # happens to equal a base name must not replace the _A/_B group.
    # NOTE(review): in that situation the combined run and the plain option's
    # run share the same name ('Consumer-<base>') - confirm whether such
    # option sets can occur in practice.
    paired = {}
    for option in options:
        if option.endswith(('_A', '_B')):
            paired.setdefault(option[:-2], []).append(option)

    combined = [
        {'name': f'Consumer-{base_name}', 'filters': {'consumer': members}}
        for base_name, members in paired.items()
        if len(members) > 1  # a lone _A or _B has nothing to combine with
    ]
    standalone = [
        {'name': f'Consumer-{option}', 'filters': {'consumer': [option]}}
        for option in options
    ]
    return combined + standalone


def get_filter_combinations(survey: "QualtricsSurvey", category: str = None) -> list[dict]:
    """
    Generate the filter combinations to run the report for.

    Most combinations narrow ONE filter to a single option and leave every
    other filter unset. A few combinations select several options of the same
    filter (ethnicity values, consumer _A/_B pairs, AI-usage frequency groups).

    Args:
        survey: Object exposing the ``options_*`` attributes (``options_age``,
                ``options_gender``, ``options_ethnicity``, ``options_income``,
                ``options_consumer``, ``options_business_owner``,
                ``options_ai_user``, ``options_investable_assets``,
                ``options_industry``). Only the attributes of the selected
                categories are read.
        category: Which section to generate.
                  - None (or any falsy value) or 'all_filters': every section below.
                  - 'all': only the unfiltered 'All_Respondents' run.
                  - 'age', 'gender', 'ethnicity', 'income', 'consumer',
                    'business_owner', 'ai_user', 'investable_assets',
                    'industry', 'voice_ranking': only that section.
                  Any other value yields an empty list.

    Returns:
        List of dicts with a 'name' and a 'filters' mapping
        (filter name -> list of selected options). Voice-ranking entries
        additionally carry a 'voice_ranking_filter' key.
    """
    def selected(section: str) -> bool:
        # No category or 'all_filters' selects every section.
        return not category or category in ('all_filters', section)

    def one_at_a_time(section: str, prefix: str) -> list[dict]:
        # One run per option of survey.options_<section>; the attribute is
        # only read when the section is actually selected.
        if not selected(section):
            return []
        return [
            {'name': f'{prefix}-{option}', 'filters': {section: [option]}}
            for option in getattr(survey, f'options_{section}')
        ]

    combinations = []

    # Unfiltered run: an empty filter dict means no filter flags are passed.
    if selected('all'):
        combinations.append({'name': 'All_Respondents', 'filters': {}})

    combinations += one_at_a_time('age', 'Age')
    combinations += one_at_a_time('gender', 'Gender')

    if selected('ethnicity'):
        combinations += _ethnicity_combinations(survey.options_ethnicity)

    combinations += one_at_a_time('income', 'Income')

    if selected('consumer'):
        combinations += _consumer_combinations(survey.options_consumer)

    combinations += one_at_a_time('business_owner', 'BusinessOwner')
    combinations += one_at_a_time('ai_user', 'AIUser')

    if selected('ai_user'):
        # Grouped usage-frequency runs. These labels are hard-coded and are
        # not checked against survey.options_ai_user.
        # Daily, more than once daily, and multiple times a week = frequent.
        combinations.append({
            'name': 'AIUser-Frequent',
            'filters': {'ai_user': [
                'Daily', 'More than once daily', 'Multiple times per week'
            ]}
        })
        combinations.append({
            'name': 'AIUser-RarelyNever',
            'filters': {'ai_user': [
                'Once a month', 'Less than once a month', 'Once a week', 'Rarely/Never'
            ]}
        })

    combinations += one_at_a_time('investable_assets', 'Assets')
    combinations += one_at_a_time('industry', 'Industry')

    # Voice ranking completeness is not a demographic filter: the mode is kept
    # in a dedicated key so the caller can hand it to run_report separately.
    if selected('voice_ranking'):
        combinations.append({
            'name': 'VoiceRanking-OnlyMissing',
            'filters': {},
            'voice_ranking_filter': 'only-missing',
        })
        combinations.append({
            'name': 'VoiceRanking-ExcludeMissing',
            'filters': {},
            'voice_ranking_filter': 'exclude-missing',
        })

    return combinations


def run_report(filters: dict, name: str = None, dry_run: bool = False, sl_threshold: int = None, voice_ranking_filter: str = None) -> bool:
    """
    Run the report script once with the given filters.

    The script is started with the current Python interpreter, using this
    file's directory as working directory. Each non-empty filter is passed as
    ``--<filter_name> <JSON list>``.

    Args:
        filters: Dict of filter_name -> list of values. Empty lists are skipped.
        name: Name for this filter combination, passed as --filter-name
              (used for .txt description file)
        dry_run: If True, just print the command without running it
        sl_threshold: If set, passed as --sl-threshold (exclude respondents
            with >= N straight-lined question groups)
        voice_ranking_filter: If set, passed as --voice-ranking-filter.
            'only-missing' keeps only respondents missing QID98 data,
            'exclude-missing' removes them.

    Returns:
        True if the script exited with status 0 (or for a dry run),
        False if it exited non-zero or could not be started.
    """
    import shlex  # local import: only needed to render the dry-run preview

    cmd = [sys.executable, str(REPORT_SCRIPT)]

    if name:
        cmd += ['--filter-name', name]
    if sl_threshold is not None:
        cmd += ['--sl-threshold', str(sl_threshold)]
    if voice_ranking_filter is not None:
        cmd += ['--voice-ranking-filter', voice_ranking_filter]

    for filter_name, values in filters.items():
        if values:
            cmd += [f'--{filter_name}', json.dumps(values)]

    if dry_run:
        # The JSON arguments contain spaces and quotes, so quote each argument
        # (POSIX shell rules) to keep the preview copy-pasteable.
        print(f"  Would run: {shlex.join(cmd)}")
        return True

    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            cwd=Path(__file__).parent
        )
    except Exception as e:
        # Best-effort batch runner: report the failure and let the caller
        # carry on with the remaining combinations.
        print(f"\n  ERROR: {e}")
        return False

    if result.returncode != 0:
        # Fall back to stdout when stderr is empty, and keep the END of the
        # output: a Python traceback ends with the actual exception line.
        detail = (result.stderr or result.stdout or '').strip()
        print(f"\n  ERROR (exit code {result.returncode}): {detail[-500:]}")
        return False
    return True


def main():
    """
    Command-line entry point: run the report for every generated combination.

    Parses --dry-run, --category and --sl-threshold, loads the survey to
    discover the available filter options, then either previews the commands
    (dry run) or runs the report script once per combination and prints a
    summary.

    Returns:
        0 if every report succeeded (or for a dry run), 1 if at least one
        report failed. The value is used as the process exit status.
    """
    import argparse
    parser = argparse.ArgumentParser(description='Run quant report for all filter combinations')
    parser.add_argument('--dry-run', action='store_true', help='Preview combinations without running')
    parser.add_argument(
        '--category',
        choices=['all_filters', 'all', 'age', 'gender', 'ethnicity', 'income', 'consumer', 'business_owner', 'ai_user', 'investable_assets', 'industry', 'voice_ranking'],
        default='all_filters',
        help='Filter category to run combinations for (default: all_filters)'
    )
    parser.add_argument('--sl-threshold', type=int, default=None, help='Exclude respondents who straight-lined >= N question groups (passed to report script)')
    args = parser.parse_args()

    # Load survey to get available filter options
    print("Loading survey to get filter options...")
    survey = QualtricsSurvey(RESULTS_FILE, QSF_FILE)
    survey.load_data()  # Populates options_* attributes

    # Generate combinations for specified category
    combinations = get_filter_combinations(survey, category=args.category)
    # Only mention the category when the run is restricted. The default
    # 'all_filters' covers everything, whereas 'all' is the single
    # unfiltered run and therefore is a restriction.
    category_desc = f" for category '{args.category}'" if args.category != 'all_filters' else ''
    print(f"Generated {len(combinations)} filter combinations{category_desc}")

    if args.sl_threshold is not None:
        print(f"Straight-liner threshold: excluding respondents with ≥{args.sl_threshold} straight-lined question groups")

    if args.dry_run:
        print("\nDRY RUN - Commands that would be executed:")
        for combo in combinations:
            print(f"\n{combo['name']}:")
            run_report(combo['filters'], name=combo['name'], dry_run=True, sl_threshold=args.sl_threshold, voice_ranking_filter=combo.get('voice_ranking_filter'))
        return 0

    # Run each combination with progress bar, remembering the ones that fail
    failed = []
    for combo in tqdm(combinations, desc="Running reports", unit="filter"):
        tqdm.write(f"Running: {combo['name']}")
        if not run_report(combo['filters'], name=combo['name'], sl_threshold=args.sl_threshold, voice_ranking_filter=combo.get('voice_ranking_filter')):
            failed.append(combo['name'])

    # Summary
    successful = len(combinations) - len(failed)
    print(f"\n{'='*50}")
    print(f"Completed: {successful}/{len(combinations)} successful")
    if failed:
        print(f"Failed: {', '.join(failed)}")

    # Report failures through the exit status so shells / CI can detect them.
    return 1 if failed else 0


if __name__ == '__main__':
    sys.exit(main())