# JPMC-quant/validation.py

import marimo as mo
import polars as pl


def check_progress(data):
    """Report whether every response is complete, judged by the 'progress' column.

    Args:
        data: Frame with a 'progress' column. It is collected here, so it is
            expected to be a Polars LazyFrame.

    Returns:
        A markdown string: the success message when the only distinct value in
        'progress' is 100, otherwise the incomplete-responses warning.
    """
    # Reduce to the distinct progress values before collecting so the full
    # frame does not need to be materialised just for this check.
    distinct_progress = (
        data.select(pl.col('progress').unique()).collect().to_series().to_list()
    )

    # Having a single distinct value is not sufficient on its own: that value
    # must actually be 100. An empty frame (no values) or a frame with nulls
    # (None in the list) falls through to the warning below.
    if distinct_progress == [100]:
        return """### Responses Complete: \n\n✅ All responses are complete (progress = 100) """

    return "### Responses Complete: \n\n⚠️ There are incomplete responses (progress < 100) ⚠️"


def duration_validation(data):
    """Flag responses whose 'duration' lies more than 3 standard deviations from the mean.

    Args:
        data: Frame with a 'duration' column. It is collected here, so it is
            expected to be a Polars LazyFrame. The report text labels the
            values as seconds.

    Returns:
        A markdown string. One of:
        - an informational message when the mean or standard deviation cannot
          be computed (e.g. no rows, or too few non-null durations),
        - a success message when no row falls outside mean +/- 3*std,
        - a report with the summary statistics and a table of the outlier rows.
        The outlier rows are only reported; nothing is removed from ``data``.
    """
    # Summary statistics are computed in a single lazy pass.
    duration_stats = data.select(
        pl.col('duration').mean().alias('mean_duration'),
        pl.col('duration').std().alias('std_duration')
    ).collect()
    mean_duration = duration_stats['mean_duration'][0]
    std_duration = duration_stats['std_duration'][0]

    # Either statistic comes back as None when there is not enough data to
    # compute it; the threshold arithmetic below would then raise TypeError.
    if mean_duration is None or std_duration is None:
        return "### Duration Outliers: \n\nℹ️ Not enough duration data to check for outliers."

    upper_outlier_threshold = mean_duration + 3 * std_duration
    lower_outlier_threshold = mean_duration - 3 * std_duration

    # Keep the flag as a column so it is visible in the outlier table.
    flagged = data.with_columns(
        (
            (pl.col('duration') > upper_outlier_threshold)
            | (pl.col('duration') < lower_outlier_threshold)
        ).alias('outlier_duration')
    )

    # Only the rows whose outlier flag is true.
    outlier_data = flagged.filter(pl.col('outlier_duration')).collect()

    if outlier_data.shape[0] == 0:
        return "### Duration Outliers: \n\n✅ No duration outliers detected"

    return f"""### Duration Outliers:

    **⚠️ Potential outliers detected based on response duration ⚠️**

    | Metric | Value |
    |--------|-------|
    | Mean Duration | {mean_duration:.2f} seconds (approximately {mean_duration/60:.2f} minutes) |
    | Standard Deviation of Duration | {std_duration:.2f} seconds |
    | Upper Outlier Threshold (Mean + 3*Std) | {upper_outlier_threshold:.2f} seconds |
    | Lower Outlier Threshold (Mean - 3*Std) | {lower_outlier_threshold:.2f} seconds |
    | Number of Outlier Responses | {outlier_data.shape[0]} |

    Outliers:

    {mo.ui.table(outlier_data)}


    **⚠️ NOTE: These have not been removed from the dataset ⚠️**

    """


def check_straight_liners(data, max_score=3):
    """
    Flag respondents who gave one identical answer across a whole question group.

    Columns are grouped by the part of their name captured before
    ``__Choice_<digits>`` (names matching ``...__V<digits>__Choice_<digits>``).
    Only groups with more than one column are examined. Within a group, a
    respondent is flagged when their non-null answers all share a single value
    and that value is <= max_score.

    Args:
        data: Polars LazyFrame
        max_score: Highest repeated value that is still flagged
            (e.g., with 4, a straight-lined 5 is not flagged).

    Returns:
        A markdown string: an informational message when no multi-column group
        exists, a success message when nobody is flagged, or a warning with a
        table of (Record ID, Question Group, Value) for each flagged case.
    """
    import re

    # Group column names by the text captured before "__Choice_<n>".
    choice_pattern = re.compile(r"(.*__V\d+)__Choice_\d+")
    columns_by_group = {}
    for name in data.collect_schema().names():
        found = choice_pattern.search(name)
        if found is None:
            continue
        columns_by_group.setdefault(found.group(1), []).append(name)

    # A group needs at least two columns for "same answer everywhere" to mean anything.
    multi_attribute_groups = {
        group: members
        for group, members in columns_by_group.items()
        if len(members) >= 2
    }

    if len(multi_attribute_groups) == 0:
        return "### Straight-lining Checks: \n\nℹ️ No multi-attribute question groups found."

    # Two derived columns per group: a boolean flag and the repeated value.
    check_exprs = []
    for group, members in multi_attribute_groups.items():
        answers = pl.concat_list(members).list.drop_nulls()

        # min() is used rather than taking the first element: on an empty list
        # it yields null instead of failing, and when every answer is equal it
        # is exactly that answer.
        repeated_value = answers.list.min()

        flag = (
            (answers.list.len() > 0)
            & (answers.list.n_unique() == 1)
            & (repeated_value <= max_score)
        )

        check_exprs.append(flag.alias(f"__is_straight__{group}"))
        check_exprs.append(repeated_value.alias(f"__val__{group}"))

    checked = data.with_columns(check_exprs).collect()

    # Flatten the per-group hits into one list of report rows.
    flagged_rows = []
    for group in multi_attribute_groups:
        hits = checked.filter(pl.col(f"__is_straight__{group}"))
        if hits.height == 0:
            continue
        flagged_rows.extend(
            {"Record ID": record_id, "Question Group": group, "Value": value}
            for record_id, value in hits.select(["_recordId", f"__val__{group}"]).rows()
        )

    if len(flagged_rows) == 0:
        return f"### Straight-lining Checks: \n\n✅ No straight-liners detected (value <= {max_score})"

    summary_table = pl.DataFrame(flagged_rows)

    return f"""### Straight-lining Checks:

    **⚠️ Potential straight-liners detected ⚠️**

    Respondents selected the same value (<= {max_score}) for all attributes in the following groups:

    {mo.ui.table(summary_table)}
    """