From bc12df28a585053510ac701825ab213b458e325e Mon Sep 17 00:00:00 2001
From: Luigi Maiorano <luigi.maiorano@qumo.io>
Date: Thu, 29 Jan 2026 13:20:32 +0100
Subject: [PATCH] Add straight-liner detection helper (in development)

---
 01_ingest_qualtrics_export.py |  44 +++++++-------
 02_quant_analysis.py          |  49 +++++++++------
 README.md                     |   2 +-
 validation.py                 | 109 ++++++++++++++++++++++++++++++++--
 4 files changed, 160 insertions(+), 44 deletions(-)

diff --git a/01_ingest_qualtrics_export.py b/01_ingest_qualtrics_export.py
index 9ad93ff..e43e744 100644
--- a/01_ingest_qualtrics_export.py
+++ b/01_ingest_qualtrics_export.py
@@ -11,15 +11,13 @@ def _():
     from pathlib import Path
 
     from utils import JPMCSurvey, combine_exclusive_columns
-    from plots import plot_average_scores_with_counts, plot_top3_ranking_distribution
-    return (
-        JPMCSurvey,
-        combine_exclusive_columns,
-        mo,
-        pl,
-        plot_average_scores_with_counts,
-        plot_top3_ranking_distribution,
-    )
+    return JPMCSurvey, combine_exclusive_columns, mo, pl
+
+
+@app.cell
+def _(mo):
+    mo.outline()  # table of contents generated from the notebook's markdown headings
+    return
 
 
 @app.cell
@@ -66,11 +64,10 @@ def _(data, mo, pl):
     def check_progress(data):
         if data.collect().select(pl.col('progress').unique()).shape[0] == 1:
             return mo.md("""## ✅ All responses are complete (progress = 100) """)
-    
+
         return mo.md("## ⚠️ There are incomplete responses (progress < 100) ⚠️")
 
     check_progress(data)
-
     return
 
 
@@ -87,11 +84,11 @@ def _(data, mo, pl):
         std_duration = duration_stats['std_duration'][0]
         upper_outlier_threshold = mean_duration + 3 * std_duration
         lower_outlier_threshold = mean_duration - 3 * std_duration
-    
+
         _d = data.with_columns(
             ((pl.col('duration') > upper_outlier_threshold) | (pl.col('duration') < lower_outlier_threshold)).alias('outlier_duration')
         )
-    
+
         # Show durations with outlier flag is true
         outlier_data = _d.filter(pl.col('outlier_duration') == True).collect()
 
@@ -105,16 +102,16 @@ def _(data, mo, pl):
         - Upper Outlier Threshold (Mean + 3*Std): {upper_outlier_threshold:.2f} seconds
         - Lower Outlier Threshold (Mean - 3*Std): {lower_outlier_threshold:.2f} seconds
         - Number of Outlier Responses: {outlier_data.shape[0]}
-    
+
         Outliers:
-    
+
         {mo.ui.table(outlier_data)}
-    
-    
+
+
         **⚠️ NOTE: These have not been removed from the dataset ⚠️**
-    
+
         """)
-    
+
     duration_validation(data)
     return
 
@@ -229,10 +226,18 @@ def _(mo):
 @app.cell
 def _(data, survey):
     _lf, _choice_map = survey.get_ss_green_blue(data)
+    # Debug preview: print the first rows of the collected frame.
     print(_lf.collect().head())
     return
 
 
+@app.cell
+def _(df):
+    # NOTE(review): no cell visible in this patch defines `df`; looks like leftover debug output — confirm or drop.
+    df
+    return
+
+
 @app.cell
 def _(mo):
     mo.md(r"""
@@ -297,7 +302,6 @@ def _(data, survey):
     traits_refined = survey.get_character_refine(data)[0]
 
     traits_refined.collect()
-
     return (traits_refined,)
 
 
diff --git a/02_quant_analysis.py b/02_quant_analysis.py
index bd9404f..d997e26 100644
--- a/02_quant_analysis.py
+++ b/02_quant_analysis.py
@@ -10,7 +10,7 @@ def _():
     import polars as pl
     from pathlib import Path
 
-    from validation import check_progress, duration_validation
+    from validation import check_progress, duration_validation, check_straight_liners
     from utils import JPMCSurvey, combine_exclusive_columns, calculate_weighted_ranking_scores
     import utils
 
@@ -28,6 +28,18 @@ def _():
     )
 
 
+@app.cell(hide_code=True)
+def _(mo):
+    mo.outline(label="Table of Contents")
+    return
+
+
+@app.cell
+def _():
+    # Section marker only ("Select Dataset"): the file browser cell below does the selection.
+    return
+
+
 @app.cell
 def _(mo):
     file_browser = mo.ui.file_browser(
@@ -63,21 +75,27 @@ def _(JPMCSurvey, QSF_FILE, RESULTS_FILE, mo):
     return S, data_all
 
 
-@app.cell(hide_code=True)
-def _(Path, RESULTS_FILE, data_all, mo):
-    mo.md(f"""
-    ---
-
-    # Load Data
-
-    **Dataset:** `{Path(RESULTS_FILE).name}`
-
-    {mo.ui.table(data_all.collect())}
-    """)
+@app.cell
+def _():
+    # TODO: enable once ready (cell must then take S, data_all, check_straight_liners): check_straight_liners(S.get_ss_green_blue(data_all)[0])
+    return
 
 
 @app.cell(hide_code=True)
+def _(Path, RESULTS_FILE, mo):
+    mo.md(f"""
+
+    ---
+    # Load Data
+
+    **Dataset:** `{Path(RESULTS_FILE).name}`
+
+
+    """)
+    return
+
+
+@app.cell
 def _(check_progress, data_all, duration_validation, mo):
     mo.md(f"""
     ## Data Validation
@@ -87,14 +105,9 @@ def _(check_progress, data_all, duration_validation, mo):
 
 
     {duration_validation(data_all)}
-    """)
-    return
 
 
-@app.cell(hide_code=True)
-def _(mo):
-    mo.md(r"""
-    ### ⚠️  ToDo: "straight-liner" detection and removal
+
     """)
     return
 
diff --git a/README.md b/README.md
index d025e74..3eb074a 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
 Running on Ct-105 for shared access:
 
 ```
-uv run marimo edit --headless --port 8080 --host ct-105.tail44fa00.ts.net
+uv run marimo run 02_quant_analysis.py --headless --port 8080
 ```
\ No newline at end of file
diff --git a/validation.py b/validation.py
index 9828fff..54bfab9 100644
--- a/validation.py
+++ b/validation.py
@@ -36,11 +36,13 @@ def duration_validation(data):
     
     **⚠️ Potential outliers detected based on response duration ⚠️**
     
-    - Mean Duration: {mean_duration:.2f} seconds (approximately {mean_duration/60:.2f} minutes)
-    - Standard Deviation of Duration: {std_duration:.2f} seconds
-    - Upper Outlier Threshold (Mean + 3*Std): {upper_outlier_threshold:.2f} seconds
-    - Lower Outlier Threshold (Mean - 3*Std): {lower_outlier_threshold:.2f} seconds
-    - Number of Outlier Responses: {outlier_data.shape[0]}
+    | Metric | Value |
+    |--------|-------|
+    | Mean Duration | {mean_duration:.2f} seconds (approximately {mean_duration/60:.2f} minutes) |
+    | Standard Deviation of Duration | {std_duration:.2f} seconds |
+    | Upper Outlier Threshold (Mean + 3*Std) | {upper_outlier_threshold:.2f} seconds |
+    | Lower Outlier Threshold (Mean - 3*Std) | {lower_outlier_threshold:.2f} seconds |
+    | Number of Outlier Responses | {outlier_data.shape[0]} |
     
     Outliers:
     
@@ -50,4 +52,101 @@ def duration_validation(data):
     **⚠️ NOTE: These have not been removed from the dataset ⚠️**
     
     """
+
+
+def check_straight_liners(data, max_score=3):
+    """
+    Flag respondents who gave one identical low score to every answered item of a group.
+
+    Columns are grouped by the name prefix captured before ``__Choice_<n>``
+    (pattern ``SS_...__Vxx__Choice_y``); groups with a single column are skipped.
+
+    Args:
+        data: Polars LazyFrame. Needs a ``_recordId`` column once any row is flagged.
+        max_score: Highest straight-lined value that is still flagged; with the
+            default of 3, a row answering all 4s or all 5s is not reported.
+
+    Returns:
+        A markdown string: an info note when no multi-column group exists, a success
+        note when nothing is flagged, otherwise a warning with a ``mo.ui.table``.
+    """
+    import re
+
+    # Detect column groups based on the pattern SS_...__Vxx__Choice_y.
+    schema_names = data.collect_schema().names()
+    # Group 1 (everything up to and including __Vxx) is the group key.
+    # NOTE(review): search() is unanchored, so names with extra text after
+    # "__Choice_<n>" would also match — confirm no such columns exist.
+    pattern = re.compile(r"(.*__V\d+)__Choice_\d+")
+    groups = {}
+    
+    for col in schema_names:
+        match = pattern.search(col)
+        if match:
+            group_key = match.group(1)
+            if group_key not in groups:
+                groups[group_key] = []
+            groups[group_key].append(col)
+            
+    # Keep only groups with more than one column.
+    multi_attribute_groups = {k: v for k, v in groups.items() if len(v) > 1}
+    
+    if not multi_attribute_groups:
+        return "### Straight-lining Checks: \n\nℹ️ No multi-attribute question groups found."
+
+    # Two expressions per group: a boolean flag and the straight-lined value.
+    expressions = []
+    for key, cols in multi_attribute_groups.items():
+        # Row-wise list of the group's values with nulls removed.
+        list_expr = pl.concat_list(cols).list.drop_nulls()
+        
+        # Use .list.min() instead of .list.get(0) to avoid "index out of bounds" on empty lists
+        # If n_unique == 1, min() is the same as the single value.
+        # If list is empty, min() is null, which is safe.
+        safe_val = list_expr.list.min()
+        
+        # Flagged when the list is non-empty, has exactly one distinct value, and that value is <= max_score.
+        is_straight = (
+            (list_expr.list.len() > 0) & 
+            (list_expr.list.n_unique() == 1) & 
+            (safe_val <= max_score)
+        ).alias(f"__is_straight__{key}")
+        
+        value_expr = safe_val.alias(f"__val__{key}")
+        
+        expressions.extend([is_straight, value_expr])
+        
+    # Materialise once; with_columns keeps every original column, though only _recordId and the flag/value columns are read below.
+    checked_data = data.with_columns(expressions).collect()
+    
+    # Flatten flagged rows into one record per (record id, question group).
+    outliers = []
+    for key in multi_attribute_groups.keys():
+        flag_col = f"__is_straight__{key}"
+        val_col = f"__val__{key}"
+        filtered = checked_data.filter(pl.col(flag_col))
+        
+        if filtered.height > 0:
+            rows = filtered.select(["_recordId", val_col]).rows()
+            for row in rows:
+                outliers.append({
+                    "Record ID": row[0],
+                    "Question Group": key,
+                    "Value": row[1]
+                })
+
+    if not outliers:
+        return f"### Straight-lining Checks: \n\n✅ No straight-liners detected (value <= {max_score})"
+        
+    outlier_df = pl.DataFrame(outliers)
+    
+    return f"""### Straight-lining Checks:
+    
+    **⚠️ Potential straight-liners detected ⚠️**
+    
+    Respondents selected the same value (<= {max_score}) for all attributes in the following groups:
+    
+    {mo.ui.table(outlier_df)}
+    """
+
     
\ No newline at end of file