From bc12df28a585053510ac701825ab213b458e325e Mon Sep 17 00:00:00 2001 From: Luigi Maiorano Date: Thu, 29 Jan 2026 13:20:32 +0100 Subject: [PATCH] straight line fn dev --- 01_ingest_qualtrics_export.py | 44 +++++++------- 02_quant_analysis.py | 49 +++++++++------ README.md | 2 +- validation.py | 109 ++++++++++++++++++++++++++++++++-- 4 files changed, 160 insertions(+), 44 deletions(-) diff --git a/01_ingest_qualtrics_export.py b/01_ingest_qualtrics_export.py index 9ad93ff..e43e744 100644 --- a/01_ingest_qualtrics_export.py +++ b/01_ingest_qualtrics_export.py @@ -11,15 +11,13 @@ def _(): from pathlib import Path from utils import JPMCSurvey, combine_exclusive_columns - from plots import plot_average_scores_with_counts, plot_top3_ranking_distribution - return ( - JPMCSurvey, - combine_exclusive_columns, - mo, - pl, - plot_average_scores_with_counts, - plot_top3_ranking_distribution, - ) + return JPMCSurvey, combine_exclusive_columns, mo, pl + + +@app.cell +def _(mo): + mo.outline() + return @app.cell @@ -66,11 +64,10 @@ def _(data, mo, pl): def check_progress(data): if data.collect().select(pl.col('progress').unique()).shape[0] == 1: return mo.md("""## ✅ All responses are complete (progress = 100) """) - + return mo.md("## ⚠️ There are incomplete responses (progress < 100) ⚠️") check_progress(data) - return @@ -87,11 +84,11 @@ def _(data, mo, pl): std_duration = duration_stats['std_duration'][0] upper_outlier_threshold = mean_duration + 3 * std_duration lower_outlier_threshold = mean_duration - 3 * std_duration - + _d = data.with_columns( ((pl.col('duration') > upper_outlier_threshold) | (pl.col('duration') < lower_outlier_threshold)).alias('outlier_duration') ) - + # Show durations with outlier flag is true outlier_data = _d.filter(pl.col('outlier_duration') == True).collect() @@ -105,16 +102,16 @@ def _(data, mo, pl): - Upper Outlier Threshold (Mean + 3*Std): {upper_outlier_threshold:.2f} seconds - Lower Outlier Threshold (Mean - 3*Std): {lower_outlier_threshold:.2f} seconds - Number of Outlier Responses: {outlier_data.shape[0]} - + Outliers: - + {mo.ui.table(outlier_data)} - - + + **⚠️ NOTE: These have not been removed from the dataset ⚠️** - + """) - + duration_validation(data) return @@ -229,10 +226,18 @@ def _(mo): @app.cell def _(data, survey): _lf, _choice_map = survey.get_ss_green_blue(data) + # _lf.collect() print(_lf.collect().head()) return +@app.cell +def _(df): + + df + return + + @app.cell def _(mo): mo.md(r""" @@ -297,7 +302,6 @@ def _(data, survey): traits_refined = survey.get_character_refine(data)[0] traits_refined.collect() - return (traits_refined,) diff --git a/02_quant_analysis.py b/02_quant_analysis.py index bd9404f..d997e26 100644 --- a/02_quant_analysis.py +++ b/02_quant_analysis.py @@ -10,7 +10,7 @@ def _(): import polars as pl from pathlib import Path - from validation import check_progress, duration_validation + from validation import check_progress, duration_validation, check_straight_liners from utils import JPMCSurvey, combine_exclusive_columns, calculate_weighted_ranking_scores import utils @@ -28,6 +28,18 @@ def _(): ) +@app.cell(hide_code=True) +def _(mo): + mo.outline(label="Table of Contents") + return + + +@app.cell +def _(): + # Select Dataset + return + + @app.cell def _(mo): file_browser = mo.ui.file_browser( @@ -63,21 +75,27 @@ def _(JPMCSurvey, QSF_FILE, RESULTS_FILE, mo): return S, data_all -@app.cell(hide_code=True) -def _(Path, RESULTS_FILE, data_all, mo): - mo.md(f""" - --- - - # Load Data - - **Dataset:** `{Path(RESULTS_FILE).name}` - - {mo.ui.table(data_all.collect())} - """) +@app.cell +def _(): + # check_straight_liners(S.get_ss_green_blue(data_all)[0]) return @app.cell(hide_code=True) +def _(Path, RESULTS_FILE, mo): + mo.md(f""" + + --- + # Load Data + + **Dataset:** `{Path(RESULTS_FILE).name}` + + + """) + return + + +@app.cell def _(check_progress, data_all, duration_validation, mo): mo.md(f""" ## Data Validation @@ -87,14 +105,9 @@ def _(check_progress, data_all, duration_validation, mo): {duration_validation(data_all)} - """) - return -@app.cell(hide_code=True) -def _(mo): - mo.md(r""" - ### ⚠️ ToDo: "straight-liner" detection and removal + """) return diff --git a/README.md b/README.md index d025e74..3eb074a 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ Running on Ct-105 for shared access: ``` -uv run marimo edit --headless --port 8080 --host ct-105.tail44fa00.ts.net +uv run marimo run 02_quant_analysis.py --headless --port 8080 ``` \ No newline at end of file diff --git a/validation.py b/validation.py index 9828fff..54bfab9 100644 --- a/validation.py +++ b/validation.py @@ -36,11 +36,13 @@ def duration_validation(data): **⚠️ Potential outliers detected based on response duration ⚠️** - - Mean Duration: {mean_duration:.2f} seconds (approximately {mean_duration/60:.2f} minutes) - - Standard Deviation of Duration: {std_duration:.2f} seconds - - Upper Outlier Threshold (Mean + 3*Std): {upper_outlier_threshold:.2f} seconds - - Lower Outlier Threshold (Mean - 3*Std): {lower_outlier_threshold:.2f} seconds - - Number of Outlier Responses: {outlier_data.shape[0]} + | Metric | Value | + |--------|-------| + | Mean Duration | {mean_duration:.2f} seconds (approximately {mean_duration/60:.2f} minutes) | + | Standard Deviation of Duration | {std_duration:.2f} seconds | + | Upper Outlier Threshold (Mean + 3*Std) | {upper_outlier_threshold:.2f} seconds | + | Lower Outlier Threshold (Mean - 3*Std) | {lower_outlier_threshold:.2f} seconds | + | Number of Outlier Responses | {outlier_data.shape[0]} | Outliers: @@ -50,4 +52,101 @@ def duration_validation(data): **⚠️ NOTE: These have not been removed from the dataset ⚠️** """ + + +def check_straight_liners(data, max_score=3): + """ + Check for straight-lining behavior (selecting same value for all attributes). + + Args: + data: Polars LazyFrame + max_score: The maximum score that is flagged if straight-lined (e.g., if 4, then 5s are allowed). + """ + import re + + # detect columns groups based on pattern SS_...__Vxx__Choice_y + schema_names = data.collect_schema().names() + + # regex groupings + pattern = re.compile(r"(.*__V\d+)__Choice_\d+") + groups = {} + + for col in schema_names: + match = pattern.search(col) + if match: + group_key = match.group(1) + if group_key not in groups: + groups[group_key] = [] + groups[group_key].append(col) + + # Filter for groups with multiple attributes/choices + multi_attribute_groups = {k: v for k, v in groups.items() if len(v) > 1} + + if not multi_attribute_groups: + return "### Straight-lining Checks: \n\nℹ️ No multi-attribute question groups found." + + # Build expressions + expressions = [] + + for key, cols in multi_attribute_groups.items(): + # Logic: + # 1. Create list of values + # 2. Drop nulls + # 3. Check if all remaining are equal (n_unique == 1) AND value <= max_score + + list_expr = pl.concat_list(cols).list.drop_nulls() + + # Use .list.min() instead of .list.get(0) to avoid "index out of bounds" on empty lists + # If n_unique == 1, min() is the same as the single value. + # If list is empty, min() is null, which is safe. + safe_val = list_expr.list.min() + + is_straight = ( + (list_expr.list.len() > 0) & + (list_expr.list.n_unique() == 1) & + (safe_val <= max_score) + ).alias(f"__is_straight__{key}") + + value_expr = safe_val.alias(f"__val__{key}") + + expressions.extend([is_straight, value_expr]) + + # collect data with checks + # We only need _recordId and the check columns + # We do with_columns then select implicitly/explicitly via filter/select later. + + checked_data = data.with_columns(expressions).collect() + + # Process results into a nice table + outliers = [] + + for key in multi_attribute_groups.keys(): + flag_col = f"__is_straight__{key}" + val_col = f"__val__{key}" + + filtered = checked_data.filter(pl.col(flag_col)) + + if filtered.height > 0: + rows = filtered.select(["_recordId", val_col]).rows() + for row in rows: + outliers.append({ + "Record ID": row[0], + "Question Group": key, + "Value": row[1] + }) + + if not outliers: + return f"### Straight-lining Checks: \n\n✅ No straight-liners detected (value <= {max_score})" + + outlier_df = pl.DataFrame(outliers) + + return f"""### Straight-lining Checks: + + **⚠️ Potential straight-liners detected ⚠️** + + Respondents selected the same value (<= {max_score}) for all attributes in the following groups: + + {mo.ui.table(outlier_df)} + """ + \ No newline at end of file