From 2958fed780e4fe746bbdce0d17c897d606e87e67 Mon Sep 17 00:00:00 2001 From: Luigi Maiorano Date: Thu, 29 Jan 2026 18:40:18 +0100 Subject: [PATCH] straightliner validation --- 02_quant_analysis.py | 30 +++++++++++++++++++---------- validation.py | 46 +++++++++++++++++++++++++++++++++++++------- 2 files changed, 59 insertions(+), 17 deletions(-) diff --git a/02_quant_analysis.py b/02_quant_analysis.py index 43db699..63d4690 100644 --- a/02_quant_analysis.py +++ b/02_quant_analysis.py @@ -21,6 +21,7 @@ def _(): SPEAKING_STYLES, calculate_weighted_ranking_scores, check_progress, + check_straight_liners, duration_validation, mo, pl, @@ -58,7 +59,7 @@ def _(JPMCSurvey, QSF_FILE, RESULTS_FILE, mo): @app.cell -def _(Path, RESULTS_FILE, mo): +def _(Path, RESULTS_FILE, data_all, mo): mo.md(f""" --- @@ -66,13 +67,29 @@ def _(Path, RESULTS_FILE, mo): **Dataset:** `{Path(RESULTS_FILE).name}` + **Responses**: `{data_all.collect().shape[0]}` + """) return @app.cell -def _(check_progress, data_all, duration_validation, mo): +def _(): + return + + +@app.cell +def _( + S, + check_progress, + check_straight_liners, + data_all, + duration_validation, + mo, +): + sl_content, sl_df = check_straight_liners(S.get_ss_green_blue(data_all)[0], max_score=5) + mo.md(f""" ## Data Validation @@ -83,19 +100,12 @@ def _(check_progress, data_all, duration_validation, mo): {duration_validation(data_all)} + {sl_content} """) return -@app.cell -def _(mo): - mo.md(r""" - ### ⚠️ ToDo: "straight-liner" detection and removal - """) - return - - @app.cell def _(mo): mo.md(r""" diff --git a/validation.py b/validation.py index 54bfab9..6caef5e 100644 --- a/validation.py +++ b/validation.py @@ -85,6 +85,13 @@ def check_straight_liners(data, max_score=3): if not multi_attribute_groups: return "### Straight-lining Checks: \n\nℹ️ No multi-attribute question groups found." + # Cast all involved columns to Int64 (strict=False) to handle potential string columns + # This prevents "cannot compare string with numeric type" errors + all_group_cols = [col for cols in multi_attribute_groups.values() for col in cols] + data = data.with_columns([ + pl.col(col).cast(pl.Int64, strict=False) for col in all_group_cols + ]) + # Build expressions expressions = [] @@ -120,19 +127,29 @@ def check_straight_liners(data, max_score=3): # Process results into a nice table outliers = [] - for key in multi_attribute_groups.keys(): + for key, group_cols in multi_attribute_groups.items(): flag_col = f"__is_straight__{key}" val_col = f"__val__{key}" filtered = checked_data.filter(pl.col(flag_col)) if filtered.height > 0: - rows = filtered.select(["_recordId", val_col]).rows() - for row in rows: + # Sort group_cols by choice number to ensure order (Choice_1, Choice_2, etc.) + # Assuming format ends with __Choice_X + sorted_group_cols = sorted(group_cols, key=lambda c: int(c.split('__Choice_')[-1])) + + # Select relevant columns: Record ID, Value, and the sorted group columns + subset = filtered.select(["_recordId", val_col] + sorted_group_cols) + + for row in subset.iter_rows(named=True): + # Create ordered list of values, using 'NaN' for missing data + resp_list = [row[c] if row[c] is not None else 'NaN' for c in sorted_group_cols] + outliers.append({ - "Record ID": row[0], + "Record ID": row["_recordId"], "Question Group": key, - "Value": row[1] + "Value": row[val_col], + "Responses": str(resp_list) }) if not outliers: @@ -147,6 +164,21 @@ def check_straight_liners(data, max_score=3): Respondents selected the same value (<= {max_score}) for all attributes in the following groups: {mo.ui.table(outlier_df)} - """ + """, outlier_df - \ No newline at end of file + + +if __name__ == "__main__": + + from utils import JPMCSurvey + + RESULTS_FILE = "data/exports/OneDrive_2026-01-28/1-28-26 Afternoon/JPMC_Chase Brand Personality_Quant Round 1_January 28, 2026_Afternoon_Labels.csv" + QSF_FILE = "data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf" + + S = JPMCSurvey(RESULTS_FILE, QSF_FILE) + data = S.load_data() + + print("Checking Green Blue:") + print(check_straight_liners(S.get_ss_green_blue(data)[0])) + print("Checking Orange Red:") + print(check_straight_liners(S.get_ss_orange_red(data)[0])) \ No newline at end of file