straight line fn dev

2026-01-29 13:20:32 +01:00
parent 70719702ec
commit bc12df28a5
4 changed files with 160 additions and 44 deletions
--- a/01_ingest_qualtrics_export.py
+++ b/01_ingest_qualtrics_export.py
@@ -11,15 +11,13 @@ def _():
    from pathlib import Path

    from utils import JPMCSurvey, combine_exclusive_columns
-    from plots import plot_average_scores_with_counts, plot_top3_ranking_distribution
-    return (
-        JPMCSurvey,
-        combine_exclusive_columns,
-        mo,
-        pl,
-        plot_average_scores_with_counts,
-        plot_top3_ranking_distribution,
-    )
+    return JPMCSurvey, combine_exclusive_columns, mo, pl
+
+
+@app.cell
+def _(mo):
+    mo.outline()
+    return


@app.cell
@@ -66,11 +64,10 @@ def _(data, mo, pl):
    def check_progress(data):
        if data.collect().select(pl.col('progress').unique()).shape[0] == 1:
            return mo.md("""## ✅ All responses are complete (progress = 100) """)
-    
+
        return mo.md("## ⚠️ There are incomplete responses (progress < 100) ⚠️")

    check_progress(data)
-
    return


@@ -87,11 +84,11 @@ def _(data, mo, pl):
        std_duration = duration_stats['std_duration'][0]
        upper_outlier_threshold = mean_duration + 3 * std_duration
        lower_outlier_threshold = mean_duration - 3 * std_duration
-    
+
        _d = data.with_columns(
            ((pl.col('duration') > upper_outlier_threshold) | (pl.col('duration') < lower_outlier_threshold)).alias('outlier_duration')
        )
-    
+
        # Show durations with outlier flag is true
        outlier_data = _d.filter(pl.col('outlier_duration') == True).collect()

@@ -105,16 +102,16 @@ def _(data, mo, pl):
        - Upper Outlier Threshold (Mean + 3*Std): {upper_outlier_threshold:.2f} seconds
        - Lower Outlier Threshold (Mean - 3*Std): {lower_outlier_threshold:.2f} seconds
        - Number of Outlier Responses: {outlier_data.shape[0]}
-    
+
        Outliers:
-    
+
        {mo.ui.table(outlier_data)}
-    
-    
+
+
        **⚠️ NOTE: These have not been removed from the dataset ⚠️**
-    
+
        """)
-    
+
    duration_validation(data)
    return

@@ -229,10 +226,18 @@ def _(mo):
@app.cell
 def _(data, survey):
    _lf, _choice_map = survey.get_ss_green_blue(data)
+    # _lf.collect()
    print(_lf.collect().head())
    return


+@app.cell
+def _(df):
+
+    df
+    return
+
+
@app.cell
 def _(mo):
    mo.md(r"""
@@ -297,7 +302,6 @@ def _(data, survey):
    traits_refined = survey.get_character_refine(data)[0]

    traits_refined.collect()
-
    return (traits_refined,)


--- a/02_quant_analysis.py
+++ b/02_quant_analysis.py
@@ -10,7 +10,7 @@ def _():
    import polars as pl
    from pathlib import Path

-    from validation import check_progress, duration_validation
+    from validation import check_progress, duration_validation, check_straight_liners
    from utils import JPMCSurvey, combine_exclusive_columns, calculate_weighted_ranking_scores
    import utils

@@ -28,6 +28,18 @@ def _():
    )


+@app.cell(hide_code=True)
+def _(mo):
+    mo.outline(label="Table of Contents")
+    return
+
+
+@app.cell
+def _():
+    # Select Dataset
+    return
+
+
@app.cell
 def _(mo):
    file_browser = mo.ui.file_browser(
@@ -63,21 +75,27 @@ def _(JPMCSurvey, QSF_FILE, RESULTS_FILE, mo):
    return S, data_all


-@app.cell(hide_code=True)
-def _(Path, RESULTS_FILE, data_all, mo):
-    mo.md(f"""
-    ---
-
-    # Load Data
-
-    **Dataset:** `{Path(RESULTS_FILE).name}`
-
-    {mo.ui.table(data_all.collect())}
-    """)
+@app.cell
+def _():
+    # check_straight_liners(S.get_ss_green_blue(data_all)[0])
    return


@app.cell(hide_code=True)
+def _(Path, RESULTS_FILE, mo):
+    mo.md(f"""
+
+    ---
+    # Load Data
+
+    **Dataset:** `{Path(RESULTS_FILE).name}`
+
+
+    """)
+    return
+
+
+@app.cell
 def _(check_progress, data_all, duration_validation, mo):
    mo.md(f"""
    ## Data Validation
@@ -87,14 +105,9 @@ def _(check_progress, data_all, duration_validation, mo):


    {duration_validation(data_all)}
-    """)
-    return


-@app.cell(hide_code=True)
-def _(mo):
-    mo.md(r"""
-    ### ⚠️  ToDo: "straight-liner" detection and removal
+
    """)
    return

--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
 Running on Ct-105 for shared access:

 ```
-uv run marimo edit --headless --port 8080 --host ct-105.tail44fa00.ts.net
+uv run marimo run 02_quant_analysis.py --headless --port 8080
 ```
--- a/validation.py
+++ b/validation.py
@@ -36,11 +36,13 @@ def duration_validation(data):
    
    **⚠️ Potential outliers detected based on response duration ⚠️**
    
-    - Mean Duration: {mean_duration:.2f} seconds (approximately {mean_duration/60:.2f} minutes)
-    - Standard Deviation of Duration: {std_duration:.2f} seconds
-    - Upper Outlier Threshold (Mean + 3*Std): {upper_outlier_threshold:.2f} seconds
-    - Lower Outlier Threshold (Mean - 3*Std): {lower_outlier_threshold:.2f} seconds
-    - Number of Outlier Responses: {outlier_data.shape[0]}
+    | Metric | Value |
+    |--------|-------|
+    | Mean Duration | {mean_duration:.2f} seconds (approximately {mean_duration/60:.2f} minutes) |
+    | Standard Deviation of Duration | {std_duration:.2f} seconds |
+    | Upper Outlier Threshold (Mean + 3*Std) | {upper_outlier_threshold:.2f} seconds |
+    | Lower Outlier Threshold (Mean - 3*Std) | {lower_outlier_threshold:.2f} seconds |
+    | Number of Outlier Responses | {outlier_data.shape[0]} |
    
    Outliers:
    
@@ -50,4 +52,101 @@ def duration_validation(data):
    **⚠️ NOTE: These have not been removed from the dataset ⚠️**
    
    """
+
+
+def check_straight_liners(data, max_score=3, min_answers=2):
+    """
+    Check for straight-lining behavior (selecting same value for all attributes).
+
+    Columns are grouped by the pattern ``<prefix>__V<digits>__Choice_<digits>``;
+    groups consisting of a single column are skipped. A respondent is flagged
+    for a group when, ignoring nulls, they gave at least ``min_answers``
+    answers, all answers are identical, and that shared value is
+    <= ``max_score``.
+
+    Args:
+        data: Polars LazyFrame. Must contain a ``_recordId`` column.
+        max_score: Highest uniform value that is still flagged (e.g., if 4, a
+            respondent answering 5 everywhere is not flagged).
+        min_answers: Minimum number of non-null answers within a group before
+            it can count as straight-lining. A single answered attribute is
+            trivially "all equal", so the default of 2 avoids false positives.
+
+    Returns:
+        A Markdown string: an info note when no multi-attribute groups exist,
+        a success note when nothing is flagged, otherwise a warning followed
+        by a table of (Record ID, Question Group, Value) rows.
+    """
+    import re
+
+    # Group columns by question, e.g. SS_...__Vxx__Choice_y -> key "SS_...__Vxx".
+    pattern = re.compile(r"(.*__V\d+)__Choice_\d+")
+    groups = {}
+    for col in data.collect_schema().names():
+        match = pattern.search(col)
+        if match:
+            groups.setdefault(match.group(1), []).append(col)
+
+    # Straight-lining is only meaningful across several attributes.
+    multi_attribute_groups = {k: v for k, v in groups.items() if len(v) > 1}
+
+    if not multi_attribute_groups:
+        return "### Straight-lining Checks: \n\nℹ️ No multi-attribute question groups found."
+
+    # One boolean flag column and one value column per question group.
+    expressions = []
+    for key, cols in multi_attribute_groups.items():
+        answers = pl.concat_list(cols).list.drop_nulls()
+
+        # min() rather than get(0): it yields null on an empty list instead of
+        # raising, and equals the single shared value when n_unique == 1.
+        shared_value = answers.list.min()
+
+        is_straight = (
+            (answers.list.len() >= min_answers)
+            & (answers.list.n_unique() == 1)
+            & (shared_value <= max_score)
+        ).alias(f"__is_straight__{key}")
+
+        expressions.extend([is_straight, shared_value.alias(f"__val__{key}")])
+
+    # Project only the record id and the check columns so the lazy engine does
+    # not have to materialize every survey column.
+    checked_data = data.select("_recordId", *expressions).collect()
+
+    # Flatten flagged (respondent, group) pairs into rows for the report table.
+    outliers = []
+    for key in multi_attribute_groups:
+        flagged = checked_data.filter(pl.col(f"__is_straight__{key}"))
+        outliers.extend(
+            {"Record ID": record_id, "Question Group": key, "Value": value}
+            for record_id, value in flagged.select(
+                ["_recordId", f"__val__{key}"]
+            ).rows()
+        )
+
+    if not outliers:
+        return f"### Straight-lining Checks: \n\n✅ No straight-liners detected (value <= {max_score})"
+
+    outlier_df = pl.DataFrame(outliers)
+
+    return f"""### Straight-lining Checks:
+    
+    **⚠️ Potential straight-liners detected ⚠️**
+    
+    Respondents selected the same value (<= {max_score}) for all attributes in the following groups:
+    
+    {mo.ui.table(outlier_df)}
+    """
+