straight line fn dev

2026-01-29 13:20:32 +01:00
parent 70719702ec
commit bc12df28a5
4 changed files with 160 additions and 44 deletions
--- a/01_ingest_qualtrics_export.py
+++ b/01_ingest_qualtrics_export.py
@@ -11,15 +11,13 @@ def _():
    from pathlib import Path
    from utils import JPMCSurvey, combine_exclusive_columns
-    from plots import plot_average_scores_with_counts, plot_top3_ranking_distribution
+    return JPMCSurvey, combine_exclusive_columns, mo, pl
-    return (
+
-        JPMCSurvey,
+
-        combine_exclusive_columns,
+@app.cell
-        mo,
+def _(mo):
-        pl,
+    mo.outline()
-        plot_average_scores_with_counts,
+    return
        plot_top3_ranking_distribution,
    )
@app.cell
@@ -66,11 +64,10 @@ def _(data, mo, pl):
    def check_progress(data):
        if data.collect().select(pl.col('progress').unique()).shape[0] == 1:
            return mo.md("""## ✅ All responses are complete (progress = 100) """)
-    
+
        return mo.md("## ⚠️ There are incomplete responses (progress < 100) ⚠️")
    check_progress(data)
    return
@@ -87,11 +84,11 @@ def _(data, mo, pl):
        std_duration = duration_stats['std_duration'][0]
        upper_outlier_threshold = mean_duration + 3 * std_duration
        lower_outlier_threshold = mean_duration - 3 * std_duration
-    
+
        _d = data.with_columns(
            ((pl.col('duration') > upper_outlier_threshold) | (pl.col('duration') < lower_outlier_threshold)).alias('outlier_duration')
        )
-    
+
        # Show durations with outlier flag is true
        outlier_data = _d.filter(pl.col('outlier_duration') == True).collect()
@@ -105,16 +102,16 @@ def _(data, mo, pl):
        - Upper Outlier Threshold (Mean + 3*Std): {upper_outlier_threshold:.2f} seconds
        - Lower Outlier Threshold (Mean - 3*Std): {lower_outlier_threshold:.2f} seconds
        - Number of Outlier Responses: {outlier_data.shape[0]}
-    
+
        Outliers:
-    
+
        {mo.ui.table(outlier_data)}
-    
+
-    
+
        **⚠️ NOTE: These have not been removed from the dataset ⚠️**
-    
+
        """)
-    
+
    duration_validation(data)
    return
@@ -229,10 +226,18 @@ def _(mo):
@app.cell
 def _(data, survey):
    # Inspection cell: unpacks the two values returned by
    # survey.get_ss_green_blue(data), then prints the head of the collected
    # first value. The second value (_choice_map) is not used in this cell.
    # Nothing is exported (bare return).
    _lf, _choice_map = survey.get_ss_green_blue(data)
    # _lf.collect()
    print(_lf.collect().head())
    return
@app.cell
 def _(df):
    # Bare reference to `df`, which this cell receives as a parameter; the
    # cell assigns nothing and exports nothing (bare return).
    # NOTE(review): no cell visible in this view defines `df` -- confirm that
    # another cell in the notebook provides it.
    df
    return
@app.cell
 def _(mo):
    mo.md(r"""
@@ -297,7 +302,6 @@ def _(data, survey):
    traits_refined = survey.get_character_refine(data)[0]
    traits_refined.collect()
    return (traits_refined,)
--- a/02_quant_analysis.py
+++ b/02_quant_analysis.py
@@ -10,7 +10,7 @@ def _():
    import polars as pl
    from pathlib import Path
-    from validation import check_progress, duration_validation
+    from validation import check_progress, duration_validation, check_straight_liners
    from utils import JPMCSurvey, combine_exclusive_columns, calculate_weighted_ranking_scores
    import utils
@@ -28,6 +28,18 @@ def _():
    )
@app.cell(hide_code=True)
 def _(mo):
    # Calls mo.outline with the label "Table of Contents". The result is not
    # assigned or exported; the cell's return is empty.
    mo.outline(label="Table of Contents")
    return
@app.cell
 def _():
    # Placeholder cell: it holds only the section label below, takes no
    # inputs, runs no statements and exports nothing.
    # Select Dataset
    return
@app.cell
 def _(mo):
    file_browser = mo.ui.file_browser(
@@ -63,21 +75,27 @@ def _(JPMCSurvey, QSF_FILE, RESULTS_FILE, mo):
    return S, data_all
-@app.cell(hide_code=True)
+@app.cell
-def _(Path, RESULTS_FILE, data_all, mo):
+def _():
-    mo.md(f"""
+    # check_straight_liners(S.get_ss_green_blue(data_all)[0])
    ---
    # Load Data
    **Dataset:** `{Path(RESULTS_FILE).name}`
    {mo.ui.table(data_all.collect())}
    """)
    return
@app.cell(hide_code=True)
 def _(Path, RESULTS_FILE, mo):
    # Builds the "Load Data" markdown section via mo.md. Only the final path
    # component of RESULTS_FILE (Path(...).name) is interpolated, not the
    # full path. Nothing is exported (bare return).
    mo.md(f"""
    ---
    # Load Data
    **Dataset:** `{Path(RESULTS_FILE).name}`
    """)
    return
@app.cell
 def _(check_progress, data_all, duration_validation, mo):
    mo.md(f"""
    ## Data Validation
@@ -87,14 +105,9 @@ def _(check_progress, data_all, duration_validation, mo):
    {duration_validation(data_all)}
    """)
    return
-@app.cell(hide_code=True)
+
 def _(mo):
    mo.md(r"""
    ### ⚠️  ToDo: "straight-liner" detection and removal
    """)
    return
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
 Running on Ct-105 for shared access:
 ```
-uv run marimo edit --headless --port 8080 --host ct-105.tail44fa00.ts.net
+uv run marimo run 02_quant_analysis.py --headless --port 8080
 ```
--- a/validation.py
+++ b/validation.py
@@ -36,11 +36,13 @@ def duration_validation(data):
    **⚠️ Potential outliers detected based on response duration ⚠️**
-    - Mean Duration: {mean_duration:.2f} seconds (approximately {mean_duration/60:.2f} minutes)
+    | Metric | Value |
-    - Standard Deviation of Duration: {std_duration:.2f} seconds
+    |--------|-------|
-    - Upper Outlier Threshold (Mean + 3*Std): {upper_outlier_threshold:.2f} seconds
+    | Mean Duration | {mean_duration:.2f} seconds (approximately {mean_duration/60:.2f} minutes) |
-    - Lower Outlier Threshold (Mean - 3*Std): {lower_outlier_threshold:.2f} seconds
+    | Standard Deviation of Duration | {std_duration:.2f} seconds |
-    - Number of Outlier Responses: {outlier_data.shape[0]}
+    | Upper Outlier Threshold (Mean + 3*Std) | {upper_outlier_threshold:.2f} seconds |
    | Lower Outlier Threshold (Mean - 3*Std) | {lower_outlier_threshold:.2f} seconds |
    | Number of Outlier Responses | {outlier_data.shape[0]} |
    Outliers:
@@ -50,4 +52,101 @@ def duration_validation(data):
    **⚠️ NOTE: These have not been removed from the dataset ⚠️**
    """
 def check_straight_liners(data, max_score=3):
    """
    Report respondents who gave one identical answer across a question group.

    Columns are grouped by the part of their name that precedes
    ``__Choice_<digits>`` (that prefix must end in ``__V<digits>``). Only
    groups holding more than one column are checked. For each such group a
    row is flagged when, after dropping nulls, it has at least one answer,
    all remaining answers are the same value, and that value is
    <= ``max_score``.

    Relies on the names ``pl`` and ``mo`` being available at module scope
    (their imports are not part of this view).

    Args:
        data: Polars LazyFrame. A ``_recordId`` column is read whenever at
            least one row is flagged.
        max_score: Highest straight-lined value that is still flagged
            (e.g., if 4, then 5s are allowed).

    Returns:
        A Markdown string: an informational message when no multi-column
        group exists, a success message when nothing is flagged, or a
        warning that embeds ``mo.ui.table`` of the flagged
        (Record ID, Question Group, Value) entries.
    """
    import re

    choice_pattern = re.compile(r"(.*__V\d+)__Choice_\d+")

    # Bucket every matching column under its question-group prefix.
    columns_by_group = {}
    for column_name in data.collect_schema().names():
        found = choice_pattern.search(column_name)
        if found is None:
            continue
        columns_by_group.setdefault(found.group(1), []).append(column_name)

    # A group with a single column offers nothing to compare against.
    candidate_groups = {
        group: members
        for group, members in columns_by_group.items()
        if len(members) > 1
    }
    if not candidate_groups:
        return "### Straight-lining Checks: \n\nℹ️ No multi-attribute question groups found."

    # Two derived columns per group: a boolean flag and the shared value.
    check_exprs = []
    for group, members in candidate_groups.items():
        answers = pl.concat_list(members).list.drop_nulls()
        # list.min() rather than list.get(0): on an empty list min() is null
        # instead of an out-of-bounds error, and when there is exactly one
        # distinct value min() is that value.
        shared_value = answers.list.min()
        has_answers = answers.list.len() > 0
        single_value = answers.list.n_unique() == 1
        within_limit = shared_value <= max_score
        check_exprs.append(
            (has_answers & single_value & within_limit).alias(f"__is_straight__{group}")
        )
        check_exprs.append(shared_value.alias(f"__val__{group}"))

    # Materialise the frame once with every check column attached.
    evaluated = data.with_columns(check_exprs).collect()

    # Flatten the flagged rows of each group into report records.
    outliers = []
    for group in candidate_groups:
        flagged = evaluated.filter(pl.col(f"__is_straight__{group}"))
        if not flagged.height > 0:
            continue
        for record_id, value in flagged.select(["_recordId", f"__val__{group}"]).rows():
            outliers.append({
                "Record ID": record_id,
                "Question Group": group,
                "Value": value
            })

    if not outliers:
        return f"### Straight-lining Checks: \n\n✅ No straight-liners detected (value <= {max_score})"

    outlier_df = pl.DataFrame(outliers)
    return f"""### Straight-lining Checks:
    **⚠️ Potential straight-liners detected ⚠️**
    Respondents selected the same value (<= {max_score}) for all attributes in the following groups:
    {mo.ui.table(outlier_df)}
    """