diff --git a/XX_quant_report.script.py b/XX_quant_report.script.py index 5cf7222..402d42f 100644 --- a/XX_quant_report.script.py +++ b/XX_quant_report.script.py @@ -48,8 +48,9 @@ def parse_cli_args(): parser.add_argument(f'--{filter_name}', type=str, default=None, help=f'JSON list of {filter_name} values') parser.add_argument('--filter-name', type=str, default=None, help='Name for this filter combination (used for .txt description file)') - parser.add_argument('--figures-dir', type=str, default=f'figures/statistical_significance/{Path(RESULTS_FILE).parts[2]}', help='Override the default figures directory') + parser.add_argument('--figures-dir', type=str, default=f'figures/{Path(RESULTS_FILE).parts[2]}', help='Override the default figures directory') parser.add_argument('--best-character', type=str, default="the_coach", help='Slug of the best chosen character (default: "the_coach")') + parser.add_argument('--sl-threshold', type=int, default=None, help='Exclude respondents who straight-lined >= N question groups (e.g. 
3 removes anyone with 3+ straight-lined groups)') # Only parse if running as script (not in Jupyter/interactive) try: @@ -57,7 +58,7 @@ def parse_cli_args(): get_ipython() # noqa: F821 # type: ignore # Return namespace with all filters set to None no_filters = {f: None for f in FILTER_CONFIG} - return argparse.Namespace(**no_filters, filter_name=None, figures_dir=f'figures/statistical_significance/{Path(RESULTS_FILE).parts[2]}', best_character="the_coach") + return argparse.Namespace(**no_filters, filter_name=None, figures_dir=f'figures/{Path(RESULTS_FILE).parts[2]}', best_character="the_coach", sl_threshold=None) except NameError: args = parser.parse_args() # Parse JSON strings to lists @@ -138,6 +139,41 @@ if cli_args.filter_name and S.fig_save_dir: _header += "-" * 80 + "\n" _summary_file.write_text(_header + _summary_line) +# %% Apply straight-liner threshold filter (if specified) +# Removes respondents who straight-lined >= N question groups across +# speaking style and voice scale questions. 
+if cli_args.sl_threshold is not None: + _sl_n = cli_args.sl_threshold + S.sl_threshold = _sl_n # Store on Survey so filter slug/description include it + print(f"Applying straight-liner filter: excluding respondents with ≥{_sl_n} straight-lined question groups...") + _n_before = _d.select(pl.len()).collect().item() + + # Extract question groups with renamed columns for check_straight_liners + _sl_ss_or, _ = S.get_ss_orange_red(_d) + _sl_ss_gb, _ = S.get_ss_green_blue(_d) + _sl_vs, _ = S.get_voice_scale_1_10(_d) + _sl_all_q = _sl_ss_or.join(_sl_ss_gb, on='_recordId').join(_sl_vs, on='_recordId') + + _, _sl_df = check_straight_liners(_sl_all_q, max_score=5) + + if _sl_df is not None and not _sl_df.is_empty(): + # Count straight-lined question groups per respondent + _sl_counts = ( + _sl_df + .group_by("Record ID") + .agg(pl.len().alias("sl_count")) + .filter(pl.col("sl_count") >= _sl_n) + .select(pl.col("Record ID").alias("_recordId")) + ) + # Anti-join to remove offending respondents + _d = _d.collect().join(_sl_counts, on="_recordId", how="anti").lazy() + # Update filtered data on the Survey object so sample size is correct + S.data_filtered = _d + _n_after = _d.select(pl.len()).collect().item() + print(f" Removed {_n_before - _n_after} respondents ({_n_before} → {_n_after})") + else: + print(" No straight-liners detected — no respondents removed.") + # Save to logical variable name for further analysis data = _d data.collect() diff --git a/XX_straight_liners.py b/XX_straight_liners.py index 68f359f..6a6c085 100644 --- a/XX_straight_liners.py +++ b/XX_straight_liners.py @@ -144,6 +144,8 @@ all_questions = ss_or.join(ss_gb, on='_recordId').join(vs, on='_recordId') # Run straight-liner detection across all question groups # max_score=5 catches all speaking-style straight-lining (1-5 scale) # and voice-scale values ≤5 on the 1-10 scale +# Note: sl_threshold is NOT set on S here — this script analyses straight-liners, +# it doesn't filter them out of the dataset. 
print("Running straight-liner detection across all question groups...") sl_report, sl_df = check_straight_liners(all_questions, max_score=5) diff --git a/plots.py b/plots.py index c27c888..f26e216 100644 --- a/plots.py +++ b/plots.py @@ -92,6 +92,11 @@ class QualtricsPlotsMixin: parts.append(f"{short_code}-{val_str}") + # Append straight-liner threshold if set + sl_threshold = getattr(self, 'sl_threshold', None) + if sl_threshold is not None: + parts.append(f"SL-gte{sl_threshold}") + if not parts: return "All_Respondents" @@ -182,6 +187,11 @@ class QualtricsPlotsMixin: sample_size = self._get_filtered_sample_size() sample_prefix = f"Sample size: {sample_size}" if sample_size is not None else "" + # Append straight-liner threshold if set + sl_threshold = getattr(self, 'sl_threshold', None) + if sl_threshold is not None: + parts.append(f"STRAIGHT-LINER EXCL: ≥{sl_threshold} question groups") + if not parts: # No filters active - return just sample size (or empty string if no sample size) return sample_prefix diff --git a/run_filter_combinations.py b/run_filter_combinations.py index 55b7d93..012a7ff 100644 --- a/run_filter_combinations.py +++ b/run_filter_combinations.py @@ -182,7 +182,7 @@ def get_filter_combinations(survey: QualtricsSurvey, category: str = None) -> li return combinations -def run_report(filters: dict, name: str = None, dry_run: bool = False) -> bool: +def run_report(filters: dict, name: str = None, dry_run: bool = False, sl_threshold: int = None) -> bool: """ Run the report script with given filters. 
@@ -190,6 +190,7 @@ def run_report(filters: dict, name: str = None, dry_run: bool = False) -> bool: filters: Dict of filter_name -> list of values name: Name for this filter combination (used for .txt description file) dry_run: If True, just print command without running + sl_threshold: If set, exclude respondents with >= N straight-lined question groups Returns: True if successful, False otherwise @@ -200,6 +201,10 @@ def run_report(filters: dict, name: str = None, dry_run: bool = False) -> bool: if name: cmd.extend(['--filter-name', name]) + # Pass straight-liner threshold if specified + if sl_threshold is not None: + cmd.extend(['--sl-threshold', str(sl_threshold)]) + for filter_name, values in filters.items(): if values: cmd.extend([f'--{filter_name}', json.dumps(values)]) @@ -234,6 +239,7 @@ def main(): default='all_filters', help='Filter category to run combinations for (default: all_filters)' ) + parser.add_argument('--sl-threshold', type=int, default=None, help='Exclude respondents who straight-lined >= N question groups (passed to report script)') args = parser.parse_args() # Load survey to get available filter options @@ -246,11 +252,14 @@ def main(): category_desc = f" for category '{args.category}'" if args.category != 'all' else '' print(f"Generated {len(combinations)} filter combinations{category_desc}") + if args.sl_threshold is not None: + print(f"Straight-liner threshold: excluding respondents with ≥{args.sl_threshold} straight-lined question groups") + if args.dry_run: print("\nDRY RUN - Commands that would be executed:") for combo in combinations: print(f"\n{combo['name']}:") - run_report(combo['filters'], name=combo['name'], dry_run=True) + run_report(combo['filters'], name=combo['name'], dry_run=True, sl_threshold=args.sl_threshold) return # Run each combination with progress bar @@ -259,7 +268,7 @@ def main(): for combo in tqdm(combinations, desc="Running reports", unit="filter"): tqdm.write(f"Running: {combo['name']}") - if 
run_report(combo['filters'], name=combo['name']): + if run_report(combo['filters'], name=combo['name'], sl_threshold=args.sl_threshold): successful += 1 else: failed.append(combo['name'])