SL validation complete

2026-01-29 20:39:16 +01:00
parent c1729d4896
commit 8aee09f968
4 changed files with 155 additions and 31 deletions
--- a/validation.py
+++ b/validation.py
@@ -6,9 +6,9 @@ from theme import ColorPalette
 def check_progress(data):
    """Check if all responses are complete based on 'progress' column."""
    if data.collect().select(pl.col('progress').unique()).shape[0] == 1:
-        return """### Responses Complete: \n\n✅ All responses are complete (progress = 100) """
+        return """## Responses Complete: \n\n✅ All responses are complete (progress = 100) """
    
-    return "### Responses Complete: \n\n⚠️ There are incomplete responses (progress < 100) ⚠️"
+    return "## Responses Complete: \n\n⚠️ There are incomplete responses (progress < 100) ⚠️"


 def duration_validation(data):
@@ -31,9 +31,9 @@ def duration_validation(data):
    outlier_data = _d.filter(pl.col('outlier_duration') == True).collect()

    if outlier_data.shape[0] == 0:
-        return "### Duration Outliers: \n\n✅ No duration outliers detected"
+        return "## Duration Outliers: \n\n✅ No duration outliers detected"

-    return f"""### Duration Outliers:
+    return f"""## Duration Outliers:
    
    **⚠️ Potential outliers detected based on response duration ⚠️**
    
@@ -69,13 +69,25 @@ def check_straight_liners(data, max_score=3):
    schema_names = data.collect_schema().names()
    
    # regex groupings
-    pattern = re.compile(r"(.*__V\d+)__Choice_\d+")
+    pattern_choice = re.compile(r"(.*__V\d+)__Choice_\d+")
+    pattern_scale = re.compile(r"Voice_Scale_1_10__V\d+")
+    
    groups = {}
    
    for col in schema_names:
-        match = pattern.search(col)
-        if match:
-            group_key = match.group(1)
+        # Check for Choice pattern (SS_...__Vxx__Choice_y)
+        match_choice = pattern_choice.search(col)
+        if match_choice:
+            group_key = match_choice.group(1)
+            if group_key not in groups:
+                groups[group_key] = []
+            groups[group_key].append(col)
+            continue
+            
+        # Check for Voice Scale pattern (Voice_Scale_1_10__Vxx)
+        # All of these form a single group "Voice_Scale_1_10"
+        if pattern_scale.search(col):
+            group_key = "Voice_Scale_1_10"
            if group_key not in groups:
                groups[group_key] = []
            groups[group_key].append(col)
@@ -86,11 +98,11 @@ def check_straight_liners(data, max_score=3):
    if not multi_attribute_groups:
        return "### Straight-lining Checks: \n\nℹ️ No multi-attribute question groups found."

-    # Cast all involved columns to Int64 (strict=False) to handle potential string columns
-    # This prevents "cannot compare string with numeric type" errors
+    # Cast all involved columns to Float64 (strict=False) to handle potential string columns
+    # and 1-10 scale floats (e.g. 5.5). Float64 covers integers as well.
    all_group_cols = [col for cols in multi_attribute_groups.values() for col in cols]
    data = data.with_columns([
-        pl.col(col).cast(pl.Int64, strict=False) for col in all_group_cols
+        pl.col(col).cast(pl.Float64, strict=False) for col in all_group_cols
    ])

    # Build expressions
@@ -136,9 +148,18 @@ def check_straight_liners(data, max_score=3):
        filtered = checked_data.filter(pl.col(flag_col))
        
        if filtered.height > 0:
-            # Sort group_cols by choice number to ensure order (Choice_1, Choice_2, etc.)
-            # Assuming format ends with __Choice_X
-            sorted_group_cols = sorted(group_cols, key=lambda c: int(c.split('__Choice_')[-1]))
+            # Sort group_cols logic
+            # If Choice columns, sort by choice number.
+            # If Voice Scale columns (no Choice_), sort by Voice ID (Vxx)
+            if all("__Choice_" in c for c in group_cols):
+                 key_func = lambda c: int(c.split('__Choice_')[-1])
+            else:
+                 # Extract digits from Vxx
+                 def key_func(c):
+                     m = re.search(r"__V(\d+)", c)
+                     return int(m.group(1)) if m else 0
+            
+            sorted_group_cols = sorted(group_cols, key=key_func)
            
            # Select relevant columns: Record ID, Value, and the sorted group columns
            subset = filtered.select(["_recordId", val_col] + sorted_group_cols)
@@ -155,7 +176,7 @@ def check_straight_liners(data, max_score=3):
                })

    if not outliers:
-        return f"### Straight-lining Checks: \n\n✅ No straight-liners detected (value <= {max_score})"
+        return f"### Straight-lining Checks: \n\n✅ No straight-liners detected (value <= {max_score})", None
        
    outlier_df = pl.DataFrame(outliers)

@@ -291,13 +312,12 @@ def check_straight_liners(data, max_score=3):
    
    """
    
-    return mo.vstack([
-        mo.md(f"### Straight-lining Checks:\n\n**⚠️ Potential straight-liners detected ⚠️**\n\n"),
+    return (mo.vstack([
+        mo.md(f"**⚠️ Potential straight-liners detected ⚠️**\n\n"),
        mo.ui.table(outlier_df),
        mo.md(analysis_md),
-        mo.md("#### Speaking Style Question Groups"),
        alt.vconcat(chart_pct, chart_dist).resolve_legend(color="independent")
-    ])
+    ]), outlier_df)



@@ -311,7 +331,10 @@ if __name__ == "__main__":
    S = JPMCSurvey(RESULTS_FILE, QSF_FILE)
    data = S.load_data()
    
-    print("Checking Green Blue:")
-    print(check_straight_liners(S.get_ss_green_blue(data)[0]))
-    print("Checking Orange Red:")
-    print(check_straight_liners(S.get_ss_orange_red(data)[0]))
+    # print("Checking Green Blue:")
+    # print(check_straight_liners(S.get_ss_green_blue(data)[0]))
+    # print("Checking Orange Red:")
+    # print(check_straight_liners(S.get_ss_orange_red(data)[0]))
+    
+    print("Checking Voice Scale 1-10:")
+    print(check_straight_liners(S.get_voice_scale_1_10(data)[0]))