SL filter

This commit is contained in:
2026-02-09 17:57:04 +01:00
parent 6c16993cb3
commit 8e181e193a
4 changed files with 62 additions and 5 deletions

View File

@@ -48,8 +48,9 @@ def parse_cli_args():
parser.add_argument(f'--{filter_name}', type=str, default=None, help=f'JSON list of {filter_name} values') parser.add_argument(f'--{filter_name}', type=str, default=None, help=f'JSON list of {filter_name} values')
parser.add_argument('--filter-name', type=str, default=None, help='Name for this filter combination (used for .txt description file)') parser.add_argument('--filter-name', type=str, default=None, help='Name for this filter combination (used for .txt description file)')
parser.add_argument('--figures-dir', type=str, default=f'figures/statistical_significance/{Path(RESULTS_FILE).parts[2]}', help='Override the default figures directory') parser.add_argument('--figures-dir', type=str, default=f'figures/{Path(RESULTS_FILE).parts[2]}', help='Override the default figures directory')
parser.add_argument('--best-character', type=str, default="the_coach", help='Slug of the best chosen character (default: "the_coach")') parser.add_argument('--best-character', type=str, default="the_coach", help='Slug of the best chosen character (default: "the_coach")')
parser.add_argument('--sl-threshold', type=int, default=None, help='Exclude respondents who straight-lined >= N question groups (e.g. 3 removes anyone with 3+ straight-lined groups)')
# Only parse if running as script (not in Jupyter/interactive) # Only parse if running as script (not in Jupyter/interactive)
try: try:
@@ -57,7 +58,7 @@ def parse_cli_args():
get_ipython() # noqa: F821 # type: ignore get_ipython() # noqa: F821 # type: ignore
# Return namespace with all filters set to None # Return namespace with all filters set to None
no_filters = {f: None for f in FILTER_CONFIG} no_filters = {f: None for f in FILTER_CONFIG}
return argparse.Namespace(**no_filters, filter_name=None, figures_dir=f'figures/statistical_significance/{Path(RESULTS_FILE).parts[2]}', best_character="the_coach") return argparse.Namespace(**no_filters, filter_name=None, figures_dir=f'figures/statistical_significance/{Path(RESULTS_FILE).parts[2]}', best_character="the_coach", sl_threshold=None)
except NameError: except NameError:
args = parser.parse_args() args = parser.parse_args()
# Parse JSON strings to lists # Parse JSON strings to lists
@@ -138,6 +139,41 @@ if cli_args.filter_name and S.fig_save_dir:
_header += "-" * 80 + "\n" _header += "-" * 80 + "\n"
_summary_file.write_text(_header + _summary_line) _summary_file.write_text(_header + _summary_line)
# %% Apply straight-liner threshold filter (if specified)
# Removes respondents who straight-lined >= N question groups across
# speaking style and voice scale questions.
#
# Reads:  cli_args.sl_threshold, _d (LazyFrame of the current respondents)
# Writes: S.sl_threshold, S.data_filtered, _d
if cli_args.sl_threshold is not None:
    _sl_n = cli_args.sl_threshold
    # Store on Survey so filter slug/description include it
    S.sl_threshold = _sl_n
    print(f"Applying straight-liner filter: excluding respondents with ≥{_sl_n} straight-lined question groups...")
    _n_before = _d.select(pl.len()).collect().item()

    # Extract question groups with renamed columns for check_straight_liners.
    # Each getter returns a pair; only the first element (the data) is used.
    _sl_ss_or, _ = S.get_ss_orange_red(_d)
    _sl_ss_gb, _ = S.get_ss_green_blue(_d)
    _sl_vs, _ = S.get_voice_scale_1_10(_d)
    _sl_all_q = _sl_ss_or.join(_sl_ss_gb, on='_recordId').join(_sl_vs, on='_recordId')

    # Only the second return value (the per-respondent detail frame) is needed.
    _, _sl_df = check_straight_liners(_sl_all_q, max_score=5)

    if _sl_df is not None and not _sl_df.is_empty():
        # Count straight-lined question groups per respondent.
        # NOTE(review): assumes _sl_df holds one row per (respondent,
        # straight-lined group), keyed by "Record ID" — confirm against
        # check_straight_liners.
        _sl_counts = (
            _sl_df
            .group_by("Record ID")
            .agg(pl.len().alias("sl_count"))
            .filter(pl.col("sl_count") >= _sl_n)
            .select(pl.col("Record ID").alias("_recordId"))
        )

        # Anti-join to remove offending respondents. The data is collected
        # first and turned back into a LazyFrame afterwards so that _d keeps
        # the type the rest of the script expects.
        _d = _d.collect().join(_sl_counts, on="_recordId", how="anti").lazy()

        # Update filtered data on the Survey object so sample size is correct
        S.data_filtered = _d

        _n_after = _d.select(pl.len()).collect().item()
        # Separate the before/after counts; without a separator the two
        # numbers run together and read as a single value.
        print(f" Removed {_n_before - _n_after} respondents ({_n_before} → {_n_after})")
    else:
        print(" No straight-liners detected — no respondents removed.")
# Save to logical variable name for further analysis # Save to logical variable name for further analysis
data = _d data = _d
data.collect() data.collect()

View File

@@ -144,6 +144,8 @@ all_questions = ss_or.join(ss_gb, on='_recordId').join(vs, on='_recordId')
# Run straight-liner detection across all question groups # Run straight-liner detection across all question groups
# max_score=5 catches all speaking-style straight-lining (1-5 scale) # max_score=5 catches all speaking-style straight-lining (1-5 scale)
# and voice-scale values ≤5 on the 1-10 scale # and voice-scale values ≤5 on the 1-10 scale
# Note: sl_threshold is NOT set on S here — this script analyses straight-liners,
# it doesn't filter them out of the dataset.
print("Running straight-liner detection across all question groups...") print("Running straight-liner detection across all question groups...")
sl_report, sl_df = check_straight_liners(all_questions, max_score=5) sl_report, sl_df = check_straight_liners(all_questions, max_score=5)

View File

@@ -92,6 +92,11 @@ class QualtricsPlotsMixin:
parts.append(f"{short_code}-{val_str}") parts.append(f"{short_code}-{val_str}")
# Append straight-liner threshold if set
sl_threshold = getattr(self, 'sl_threshold', None)
if sl_threshold is not None:
parts.append(f"SL-gte{sl_threshold}")
if not parts: if not parts:
return "All_Respondents" return "All_Respondents"
@@ -182,6 +187,11 @@ class QualtricsPlotsMixin:
sample_size = self._get_filtered_sample_size() sample_size = self._get_filtered_sample_size()
sample_prefix = f"Sample size: {sample_size}" if sample_size is not None else "" sample_prefix = f"Sample size: {sample_size}" if sample_size is not None else ""
# Append straight-liner threshold if set
sl_threshold = getattr(self, 'sl_threshold', None)
if sl_threshold is not None:
parts.append(f"STRAIGHT-LINER EXCL: ≥{sl_threshold} question groups")
if not parts: if not parts:
# No filters active - return just sample size (or empty string if no sample size) # No filters active - return just sample size (or empty string if no sample size)
return sample_prefix return sample_prefix

View File

@@ -182,7 +182,7 @@ def get_filter_combinations(survey: QualtricsSurvey, category: str = None) -> li
return combinations return combinations
def run_report(filters: dict, name: str = None, dry_run: bool = False) -> bool: def run_report(filters: dict, name: str = None, dry_run: bool = False, sl_threshold: int = None) -> bool:
""" """
Run the report script with given filters. Run the report script with given filters.
@@ -190,6 +190,7 @@ def run_report(filters: dict, name: str = None, dry_run: bool = False) -> bool:
filters: Dict of filter_name -> list of values filters: Dict of filter_name -> list of values
name: Name for this filter combination (used for .txt description file) name: Name for this filter combination (used for .txt description file)
dry_run: If True, just print command without running dry_run: If True, just print command without running
sl_threshold: If set, exclude respondents with >= N straight-lined question groups
Returns: Returns:
True if successful, False otherwise True if successful, False otherwise
@@ -200,6 +201,10 @@ def run_report(filters: dict, name: str = None, dry_run: bool = False) -> bool:
if name: if name:
cmd.extend(['--filter-name', name]) cmd.extend(['--filter-name', name])
# Pass straight-liner threshold if specified
if sl_threshold is not None:
cmd.extend(['--sl-threshold', str(sl_threshold)])
for filter_name, values in filters.items(): for filter_name, values in filters.items():
if values: if values:
cmd.extend([f'--{filter_name}', json.dumps(values)]) cmd.extend([f'--{filter_name}', json.dumps(values)])
@@ -234,6 +239,7 @@ def main():
default='all_filters', default='all_filters',
help='Filter category to run combinations for (default: all_filters)' help='Filter category to run combinations for (default: all_filters)'
) )
parser.add_argument('--sl-threshold', type=int, default=None, help='Exclude respondents who straight-lined >= N question groups (passed to report script)')
args = parser.parse_args() args = parser.parse_args()
# Load survey to get available filter options # Load survey to get available filter options
@@ -246,11 +252,14 @@ def main():
category_desc = f" for category '{args.category}'" if args.category != 'all' else '' category_desc = f" for category '{args.category}'" if args.category != 'all' else ''
print(f"Generated {len(combinations)} filter combinations{category_desc}") print(f"Generated {len(combinations)} filter combinations{category_desc}")
if args.sl_threshold is not None:
print(f"Straight-liner threshold: excluding respondents with ≥{args.sl_threshold} straight-lined question groups")
if args.dry_run: if args.dry_run:
print("\nDRY RUN - Commands that would be executed:") print("\nDRY RUN - Commands that would be executed:")
for combo in combinations: for combo in combinations:
print(f"\n{combo['name']}:") print(f"\n{combo['name']}:")
run_report(combo['filters'], name=combo['name'], dry_run=True) run_report(combo['filters'], name=combo['name'], dry_run=True, sl_threshold=args.sl_threshold)
return return
# Run each combination with progress bar # Run each combination with progress bar
@@ -259,7 +268,7 @@ def main():
for combo in tqdm(combinations, desc="Running reports", unit="filter"): for combo in tqdm(combinations, desc="Running reports", unit="filter"):
tqdm.write(f"Running: {combo['name']}") tqdm.write(f"Running: {combo['name']}")
if run_report(combo['filters'], name=combo['name']): if run_report(combo['filters'], name=combo['name'], sl_threshold=args.sl_threshold):
successful += 1 successful += 1
else: else:
failed.append(combo['name']) failed.append(combo['name'])