From 2958fed780e4fe746bbdce0d17c897d606e87e67 Mon Sep 17 00:00:00 2001
From: Luigi Maiorano <luigi.maiorano@qumo.io>
Date: Thu, 29 Jan 2026 18:40:18 +0100
Subject: [PATCH] Add straight-liner validation to the quant analysis report

---
 02_quant_analysis.py | 30 +++++++++++++++++++----------
 validation.py        | 46 +++++++++++++++++++++++++++++++++++++-------
 2 files changed, 59 insertions(+), 17 deletions(-)

diff --git a/02_quant_analysis.py b/02_quant_analysis.py
index 43db699..63d4690 100644
--- a/02_quant_analysis.py
+++ b/02_quant_analysis.py
@@ -21,6 +21,7 @@ def _():
         SPEAKING_STYLES,
         calculate_weighted_ranking_scores,
         check_progress,
+        check_straight_liners,
         duration_validation,
         mo,
         pl,
@@ -58,7 +59,7 @@ def _(JPMCSurvey, QSF_FILE, RESULTS_FILE, mo):
 
 
 @app.cell
-def _(Path, RESULTS_FILE, mo):
+def _(Path, RESULTS_FILE, data_all, mo):
     mo.md(f"""
 
     ---
@@ -66,13 +67,29 @@ def _(Path, RESULTS_FILE, mo):
 
     **Dataset:** `{Path(RESULTS_FILE).name}`
 
+    **Responses**: `{data_all.collect().shape[0]}`
+
 
     """)
     return
 
 
 @app.cell
-def _(check_progress, data_all, duration_validation, mo):
+def _(
+    S,
+    check_progress,
+    check_straight_liners,
+    data_all,
+    duration_validation,
+    mo,
+):
+    # check_straight_liners returns a bare markdown string on its early-exit paths and a
+    # (markdown, table) tuple only when outliers are found, so normalise before unpacking.
+    _sl_result = check_straight_liners(S.get_ss_green_blue(data_all)[0], max_score=5)
+    if not isinstance(_sl_result, tuple):
+        _sl_result = (_sl_result, None)
+    sl_content, sl_df = _sl_result
+
     mo.md(f"""
     ## Data Validation
 
@@ -83,19 +100,12 @@ def _(check_progress, data_all, duration_validation, mo):
     {duration_validation(data_all)}
 
 
+    {sl_content}
 
     """)
     return
 
 
-@app.cell
-def _(mo):
-    mo.md(r"""
-    ### ⚠️  ToDo: "straight-liner" detection and removal
-    """)
-    return
-
-
 @app.cell
 def _(mo):
     mo.md(r"""
diff --git a/validation.py b/validation.py
index 54bfab9..6caef5e 100644
--- a/validation.py
+++ b/validation.py
@@ -85,6 +85,13 @@ def check_straight_liners(data, max_score=3):
     if not multi_attribute_groups:
         return "### Straight-lining Checks: \n\nℹ️ No multi-attribute question groups found."
 
+    # Cast every grouped column to Int64 up front to avoid "cannot compare string with numeric type"
+    # errors. NOTE(review): strict=False presumably turns non-numeric text (e.g. label exports) into null - confirm.
+    all_group_cols = [col for cols in multi_attribute_groups.values() for col in cols]
+    data = data.with_columns([
+        pl.col(col).cast(pl.Int64, strict=False) for col in all_group_cols
+    ])
+
     # Build expressions
     expressions = []
     
@@ -120,19 +127,29 @@ def check_straight_liners(data, max_score=3):
     # Process results into a nice table
     outliers = []
     
-    for key in multi_attribute_groups.keys():
+    for key, group_cols in multi_attribute_groups.items():
         flag_col = f"__is_straight__{key}"
         val_col = f"__val__{key}"
         
         filtered = checked_data.filter(pl.col(flag_col))
         
         if filtered.height > 0:
-            rows = filtered.select(["_recordId", val_col]).rows()
-            for row in rows:
+            # Order by numeric __Choice_<n> suffix; names lacking one sort last rather than raising ValueError.
+            suffix_of = {c: c.rpartition('__Choice_')[2] for c in group_cols}
+            sorted_group_cols = sorted(group_cols, key=lambda c: (0, int(suffix_of[c])) if suffix_of[c].isdecimal() else (1, 0))
+            
+            # Select relevant columns: Record ID, Value, and the sorted group columns
+            subset = filtered.select(["_recordId", val_col] + sorted_group_cols)
+            
+            for row in subset.iter_rows(named=True):
+                # Create ordered list of values, using 'NaN' for missing data
+                resp_list = [row[c] if row[c] is not None else 'NaN' for c in sorted_group_cols]
+                
                 outliers.append({
-                    "Record ID": row[0],
+                    "Record ID": row["_recordId"],
                     "Question Group": key,
-                    "Value": row[1]
+                    "Value": row[val_col],
+                    "Responses": str(resp_list)
                 })
 
     if not outliers:
@@ -147,6 +164,21 @@ def check_straight_liners(data, max_score=3):
     Respondents selected the same value (<= {max_score}) for all attributes in the following groups:
     
     {mo.ui.table(outlier_df)}
-    """
+    """, outlier_df
 
-    
\ No newline at end of file
+
+
+if __name__ == "__main__":
+    # Manual smoke test: run the straight-lining check on both colour subsets of a local export.
+    from utils import JPMCSurvey
+
+    RESULTS_FILE = "data/exports/OneDrive_2026-01-28/1-28-26 Afternoon/JPMC_Chase Brand Personality_Quant Round 1_January 28, 2026_Afternoon_Labels.csv"
+    QSF_FILE = "data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf"
+
+    S = JPMCSurvey(RESULTS_FILE, QSF_FILE)
+    data = S.load_data()
+
+    for label, get_subset in (("Green Blue", S.get_ss_green_blue), ("Orange Red", S.get_ss_orange_red)):
+        print(f"Checking {label}:")
+        result = check_straight_liners(get_subset(data)[0])  # str, or (markdown, table) when outliers exist
+        print(*(result if isinstance(result, tuple) else (result,)), sep="\n")