straight line fn dev

2026-01-29 13:20:32 +01:00
parent 70719702ec
commit bc12df28a5
4 changed files with 160 additions and 44 deletions
--- a/validation.py
+++ b/validation.py
@@ -36,11 +36,13 @@ def duration_validation(data):
    
    **⚠️ Potential outliers detected based on response duration ⚠️**
    
-    - Mean Duration: {mean_duration:.2f} seconds (approximately {mean_duration/60:.2f} minutes)
-    - Standard Deviation of Duration: {std_duration:.2f} seconds
-    - Upper Outlier Threshold (Mean + 3*Std): {upper_outlier_threshold:.2f} seconds
-    - Lower Outlier Threshold (Mean - 3*Std): {lower_outlier_threshold:.2f} seconds
-    - Number of Outlier Responses: {outlier_data.shape[0]}
+    | Metric | Value |
+    |--------|-------|
+    | Mean Duration | {mean_duration:.2f} seconds (approximately {mean_duration/60:.2f} minutes) |
+    | Standard Deviation of Duration | {std_duration:.2f} seconds |
+    | Upper Outlier Threshold (Mean + 3*Std) | {upper_outlier_threshold:.2f} seconds |
+    | Lower Outlier Threshold (Mean - 3*Std) | {lower_outlier_threshold:.2f} seconds |
+    | Number of Outlier Responses | {outlier_data.shape[0]} |
    
    Outliers:
    
@@ -50,4 +52,101 @@ def duration_validation(data):
    **⚠️ NOTE: These have not been removed from the dataset ⚠️**
    
    """
+
+
+def check_straight_liners(data, max_score=3):
+    """
+    Report respondents who straight-lined a multi-attribute question group.
+
+    Columns are grouped by the prefix captured from names that match
+    ``<prefix>__V<digits>__Choice_<digits>``. Within every group that has
+    more than one column, a respondent is flagged when all of their non-null
+    answers are identical and that shared value is <= ``max_score``.
+
+    Args:
+        data: Polars LazyFrame. Its ``_recordId`` column is read to identify
+            flagged respondents.
+        max_score: The maximum score that is flagged if straight-lined
+            (e.g., if 4, then 5s are allowed).
+
+    Returns:
+        A Markdown-formatted string: an informational message when no
+        multi-attribute groups exist, a success message when nobody is
+        flagged, or a warning that embeds a ``mo.ui.table`` listing the
+        record ID, question group and repeated value of each flagged answer.
+    """
+    import re
+
+    schema_names = data.collect_schema().names()
+
+    # Group columns by everything up to and including the ``__V<n>`` part,
+    # i.e. names shaped like SS_...__Vxx__Choice_y.
+    pattern = re.compile(r"(.*__V\d+)__Choice_\d+")
+    groups = {}
+    for col in schema_names:
+        match = pattern.search(col)
+        if match:
+            groups.setdefault(match.group(1), []).append(col)
+
+    # Only groups with several attributes/choices can be straight-lined.
+    multi_attribute_groups = {k: v for k, v in groups.items() if len(v) > 1}
+
+    if not multi_attribute_groups:
+        return "### Straight-lining Checks: \n\nℹ️ No multi-attribute question groups found."
+
+    # Two derived columns per group: a boolean flag and the repeated value.
+    expressions = []
+    for key, cols in multi_attribute_groups.items():
+        list_expr = pl.concat_list(cols).list.drop_nulls()
+
+        # Use .list.min() instead of .list.get(0) to avoid "index out of bounds"
+        # on empty lists. When n_unique == 1 the minimum is the single repeated
+        # value; for an empty list it is null, which is safe.
+        safe_val = list_expr.list.min()
+
+        # NOTE(review): a respondent with exactly one non-null answer in the
+        # group also satisfies n_unique == 1 and is flagged - confirm intended.
+        is_straight = (
+            (list_expr.list.len() > 0)
+            & (list_expr.list.n_unique() == 1)
+            & (safe_val <= max_score)
+        )
+
+        expressions.append(is_straight.alias(f"__is_straight__{key}"))
+        expressions.append(safe_val.alias(f"__val__{key}"))
+
+    # Project before collecting: the report only needs _recordId and the check
+    # columns, so there is no reason to materialise every survey column.
+    # _recordId is projected only when present so that a frame without it keeps
+    # failing at the same place as before (the report select below).
+    id_cols = ["_recordId"] if "_recordId" in schema_names else []
+    checked_data = data.select(id_cols + expressions).collect()
+
+    # Flatten the flagged rows of every group into one list of report records.
+    outliers = []
+    for key in multi_attribute_groups:
+        flag_col = f"__is_straight__{key}"
+        val_col = f"__val__{key}"
+
+        flagged = checked_data.filter(pl.col(flag_col))
+        if flagged.height > 0:
+            outliers.extend(
+                {"Record ID": record_id, "Question Group": key, "Value": value}
+                for record_id, value in flagged.select(["_recordId", val_col]).rows()
+            )
+
+    if not outliers:
+        return f"### Straight-lining Checks: \n\n✅ No straight-liners detected (value <= {max_score})"
+
+    outlier_df = pl.DataFrame(outliers)
+
+    return f"""### Straight-lining Checks:
+    
+    **⚠️ Potential straight-liners detected ⚠️**
+    
+    Respondents selected the same value (<= {max_score}) for all attributes in the following groups:
+    
+    {mo.ui.table(outlier_df)}
+    """
+