Add straight-liner verification for speaking-style (SS) questions

2026-01-29 19:57:29 +01:00
parent 2958fed780
commit c1729d4896
2 changed files with 142 additions and 25 deletions
--- a/02_quant_analysis.py
+++ b/02_quant_analysis.py
@@ -74,11 +74,6 @@ def _(Path, RESULTS_FILE, data_all, mo):
    return


-@app.cell
-def _():
-    return
-
-
@app.cell
 def _(
    S,
@@ -88,7 +83,8 @@ def _(
    duration_validation,
    mo,
 ):
-    sl_content, sl_df = check_straight_liners(S.get_ss_green_blue(data_all)[0], max_score=5)
+    _ss_all = S.get_ss_green_blue(data_all)[0].join(S.get_ss_orange_red(data_all)[0], on='_recordId')
+    sl_content = check_straight_liners(_ss_all, max_score=5)

    mo.md(f"""
    ## Data Validation
@@ -106,18 +102,6 @@ def _(
    return


-@app.cell
-def _(mo):
-    mo.md(r"""
-    ---
-
-    # Data Filter
-
-    Use to select a subset of the data for the following analysis
-    """)
-    return
-
-
@app.cell(hide_code=True)
 def _(S, mo):
    filter_form = mo.md('''
--- a/validation.py
+++ b/validation.py
@@ -1,6 +1,7 @@
 import marimo as mo
 import polars as pl
-
+import altair as alt
+from theme import ColorPalette

 def check_progress(data):
    """Check if all responses are complete based on 'progress' column."""
@@ -115,8 +116,9 @@ def check_straight_liners(data, max_score=3):
        ).alias(f"__is_straight__{key}")
        
        value_expr = safe_val.alias(f"__val__{key}")
+        has_data = (list_expr.list.len() > 0).alias(f"__has_data__{key}")
        
-        expressions.extend([is_straight, value_expr])
+        expressions.extend([is_straight, value_expr, has_data])
        
    # collect data with checks
    # We only need _recordId and the check columns
@@ -156,15 +158,146 @@ def check_straight_liners(data, max_score=3):
        return f"### Straight-lining Checks: \n\n✅ No straight-liners detected (value <= {max_score})"
        
    outlier_df = pl.DataFrame(outliers)
+
+    # --- Analysis & Visualization ---
    
-    return f"""### Straight-lining Checks:
+    total_respondents = checked_data.height
    
-    **⚠️ Potential straight-liners detected ⚠️**
+    # Per-group straight-liner percentages and straight-lined value distribution
+    group_stats = []
+    value_dist_data = []
+
+    # Calculate Straight-Liners for ALL groups found in Data
+    # Condition: Respondent straight-lined ALL questions that they actually answered (ignoring empty/skipped questions)
+    # Logic: For every group G: if G has data (len > 0), then G must be straight.
+    # Also, the respondent must have answered at least one question group.
    
-    Respondents selected the same value (<= {max_score}) for all attributes in the following groups:
+    conditions = []
+    has_any_data_exprs = []
    
-    {mo.ui.table(outlier_df)}
-    """, outlier_df
+    for key in multi_attribute_groups.keys():
+        flag_col = f"__is_straight__{key}"
+        data_col = f"__has_data__{key}"
+        
+        # If has_data is True, is_straight MUST be True for it to count as valid straight-lining behavior for that user.
+        # Equivalent: (not has_data) OR is_straight
+        cond = (~pl.col(data_col)) | pl.col(flag_col)
+        conditions.append(cond)
+        has_any_data_exprs.append(pl.col(data_col))
+
+    all_straight_count = checked_data.filter(
+        pl.all_horizontal(conditions) & pl.any_horizontal(has_any_data_exprs)
+    ).height
+    all_straight_pct = (all_straight_count / total_respondents) * 100
+
+    for key in multi_attribute_groups.keys():
+        flag_col = f"__is_straight__{key}"
+        val_col = f"__val__{key}"
+        
+        # Filter for straight-liners in this specific group
+        sl_sub = checked_data.filter(pl.col(flag_col))
+        count = sl_sub.height
+        pct = (count / total_respondents) * 100
+        
+        group_stats.append({
+            "Question Group": key,
+            "Straight-Liner %": pct,
+            "Count": count
+        })
+        
+        # Get Value Distribution for this group's straight-liners
+        if count > 0:
+            # Group by the Value they straight-lined
+            dist = sl_sub.group_by(val_col).agg(pl.len().alias("count"))
+            for row in dist.iter_rows(named=True):
+                 value_dist_data.append({
+                     "Question Group": key,
+                     "Value": row[val_col],
+                     "Count": row["count"]
+                 })
+
+    stats_df = pl.DataFrame(group_stats)
+    dist_df = pl.DataFrame(value_dist_data)
+    
+    # Plot 1: % of Responses with Straight-Liners per Question
+    # Vertical bars with Count label on top
+    base_pct = alt.Chart(stats_df).encode(
+        x=alt.X("Question Group", sort=alt.EncodingSortField(field="Straight-Liner %", order="descending"))
+    )
+    
+    bars_pct = base_pct.mark_bar(color=ColorPalette.PRIMARY).encode(
+        y=alt.Y("Straight-Liner %:Q", axis=alt.Axis(format=".1f", title="Share of all responses [%]")),
+        tooltip=["Question Group", alt.Tooltip("Straight-Liner %:Q", format=".1f"), "Count"]
+    )
+    
+    text_pct = base_pct.mark_text(dy=-10).encode(
+        y=alt.Y("Straight-Liner %:Q"),
+        text=alt.Text("Count")
+    )
+    
+    chart_pct = (bars_pct + text_pct).properties(
+        title="Share of Responses with Straight-Liners per Question", 
+        width=800,
+        height=300
+    )
+    
+    # Plot 2: Value Distribution (Horizontal Stacked Normalized Bar)
+    # Question Groups sorted by Total Count
+    # Values stacked 1 (left) -> 5 (right)
+    # Legend on top
+    # Total count at bar end
+    
+    # Sort order for Y axis (Question Group) based on total Count (descending)
+    # Explicitly calculate sort order from stats_df to ensure consistency across layers
+    # High counts at the top
+    sorted_groups = stats_df.sort("Count", descending=True)["Question Group"].to_list()
+    
+    # Base chart for Bars
+    # Use JPMC-aligned colors (blues) instead of default categorical rainbow
+    # Remove legend title as per plots.py style
+    bars_dist = alt.Chart(dist_df).mark_bar().encode(
+        y=alt.Y("Question Group", sort=sorted_groups),
+        x=alt.X("Count", stack="normalize", axis=alt.Axis(format="%"), title="Share of SL Responses"),
+        color=alt.Color("Value:O", 
+                        title=None, # explicit removal of title like in plots.py
+                        scale=alt.Scale(scheme="blues"), # Professional blue scale
+                        legend=alt.Legend(orient="top", direction="horizontal")
+                       ),
+        order=alt.Order("Value", sort="ascending"), # Ensures 1 is Left, 5 is Right
+        tooltip=["Question Group", "Value", "Count"]
+    )
+    
+    # Text layer for Total Count (using stats_df which already has totals)
+    # using same sort for Y
+    text_dist = alt.Chart(stats_df).mark_text(align='left', dx=5).encode(
+        y=alt.Y("Question Group", sort=sorted_groups),
+        x=alt.datum(1.0), # Position at 100%
+        text=alt.Text("Count")
+    )
+
+    chart_dist = (bars_dist + text_dist).properties(
+        title="Distribution of Straight-Lined Values",
+        width=800,
+        height=500
+    )
+
+    analysis_md = f"""
+    ### Straight-Lining Analysis
+    
+    *"Straight-lining" is defined here as selecting the same response value for all attributes within a multi-attribute question group.*
+    
+    *   **Total Respondents**: {total_respondents}
+    *   **Respondents straight-lining ALL questions presented to them**: {all_straight_pct:.2f}% ({all_straight_count} respondents)
+    
+    """
+    
+    return mo.vstack([
+        mo.md(f"### Straight-lining Checks:\n\n**⚠️ Potential straight-liners detected ⚠️**\n\n"),
+        mo.ui.table(outlier_df),
+        mo.md(analysis_md),
+        mo.md("#### Speaking Style Question Groups"),
+        alt.vconcat(chart_pct, chart_dist).resolve_legend(color="independent")
+    ])