"""Data-quality checks for response data held in Polars LazyFrames.

Each public function takes a LazyFrame and returns a Markdown string
summarising the outcome of one check.
"""

import re

import marimo as mo
import polars as pl

# Matches columns named like ``<prefix>__V<digits>__Choice_<digits>``; group 1
# (everything up to and including ``__V<digits>``) identifies the question group.
_CHOICE_COLUMN_PATTERN = re.compile(r"(.*__V\d+)__Choice_\d+")


def check_progress(data: pl.LazyFrame) -> str:
    """Check whether every response has ``progress`` equal to 100.

    Args:
        data: Polars LazyFrame with a ``progress`` column.

    Returns:
        A Markdown string: a success message when there is at least one row
        and every row has progress equal to 100, otherwise a warning. A null
        or non-numeric progress value counts as incomplete, and an empty
        frame yields the warning.
    """
    # Non-strict cast: values that cannot be read as numbers become null
    # (treated as incomplete below) instead of raising.
    progress = pl.col("progress").cast(pl.Float64, strict=False)

    # Aggregate lazily so only two scalars are materialised.
    summary = data.select(
        pl.len().alias("n_rows"),
        (progress == 100).fill_null(False).sum().alias("n_complete"),
    ).collect()
    n_rows = summary["n_rows"][0]
    n_complete = summary["n_complete"][0]

    # Compare against 100 explicitly: a column that is uniformly, say, 50 has
    # a single unique value but is not complete.
    if n_rows > 0 and n_complete == n_rows:
        return "### Responses Complete: \n\n✅ All responses are complete (progress = 100)"
    return "### Responses Complete: \n\n⚠️ There are incomplete responses (progress < 100) ⚠️"


def duration_validation(data: pl.LazyFrame) -> str:
    """Flag responses whose ``duration`` is more than 3 standard deviations from the mean.

    Args:
        data: Polars LazyFrame with a numeric ``duration`` column. The report
            text labels the values as seconds.

    Returns:
        A Markdown string. When outliers exist it contains a summary table of
        the statistics and thresholds plus an embedded ``mo.ui.table`` of the
        outlier rows; otherwise a short status message. Rows are only
        reported, never removed.
    """
    duration_stats = data.select(
        pl.col("duration").mean().alias("mean_duration"),
        pl.col("duration").std().alias("std_duration"),
    ).collect()
    mean_duration = duration_stats["mean_duration"][0]
    std_duration = duration_stats["std_duration"][0]

    # With no rows the mean is null, and with fewer than two values the sample
    # standard deviation is null; thresholds cannot be computed in either case.
    if mean_duration is None or std_duration is None:
        return "### Duration Outliers: \n\nℹ️ Not enough duration data to check for outliers."

    upper_outlier_threshold = mean_duration + 3 * std_duration
    lower_outlier_threshold = mean_duration - 3 * std_duration

    is_outlier = (pl.col("duration") > upper_outlier_threshold) | (
        pl.col("duration") < lower_outlier_threshold
    )
    # Keep the flag column so it is visible in the rendered table.
    outlier_data = (
        data.with_columns(is_outlier.alias("outlier_duration"))
        .filter(pl.col("outlier_duration"))
        .collect()
    )

    if outlier_data.height == 0:
        return "### Duration Outliers: \n\n✅ No duration outliers detected"

    # Built line by line so no source indentation leaks into the Markdown.
    report_lines = [
        "### Duration Outliers:",
        "",
        "**⚠️ Potential outliers detected based on response duration ⚠️**",
        "",
        "| Metric | Value |",
        "|--------|-------|",
        f"| Mean Duration | {mean_duration:.2f} seconds "
        f"(approximately {mean_duration / 60:.2f} minutes) |",
        f"| Standard Deviation of Duration | {std_duration:.2f} seconds |",
        f"| Upper Outlier Threshold (Mean + 3*Std) | {upper_outlier_threshold:.2f} seconds |",
        f"| Lower Outlier Threshold (Mean - 3*Std) | {lower_outlier_threshold:.2f} seconds |",
        f"| Number of Outlier Responses | {outlier_data.height} |",
        "",
        "Outliers:",
        "",
        f"{mo.ui.table(outlier_data)}",
        "",
        "**⚠️ NOTE: These have not been removed from the dataset ⚠️**",
        "",
    ]
    return "\n".join(report_lines)


def _group_choice_columns(column_names):
    """Map each question-group key to its choice columns, keeping groups with 2+ columns."""
    groups = {}
    for name in column_names:
        match = _CHOICE_COLUMN_PATTERN.search(name)
        if match:
            groups.setdefault(match.group(1), []).append(name)
    return {key: cols for key, cols in groups.items() if len(cols) > 1}


def check_straight_liners(data: pl.LazyFrame, max_score: int = 3) -> str:
    """Check for straight-lining (the same value selected for all attributes of a group).

    Question groups are detected from column names matching
    ``<prefix>__V<digits>__Choice_<digits>``; columns sharing the same
    ``<prefix>__V<digits>`` part form one group.

    Args:
        data: Polars LazyFrame. Must contain a ``_recordId`` column in
            addition to the choice columns.
        max_score: The maximum score that is flagged if straight-lined
            (e.g., if 4, then straight-lined 5s are allowed).

    Returns:
        A Markdown string. When straight-liners are found it embeds a
        ``mo.ui.table`` with one row per (record, question group) listing the
        repeated value; otherwise a short status message.
    """
    multi_attribute_groups = _group_choice_columns(data.collect_schema().names())
    if not multi_attribute_groups:
        return "### Straight-lining Checks: \n\nℹ️ No multi-attribute question groups found."

    expressions = []
    for key, cols in multi_attribute_groups.items():
        answers = pl.concat_list(cols).list.drop_nulls()
        # min() rather than get(0): it is null for an empty list instead of
        # erroring, and equals the single value whenever n_unique == 1.
        answer_value = answers.list.min()
        # NOTE(review): a respondent with exactly one non-null answer in the
        # group is also flagged here — confirm that is intended.
        is_straight = (
            (answers.list.len() > 0)
            & (answers.list.n_unique() == 1)
            & (answer_value <= max_score)
        )
        expressions.append(is_straight.alias(f"__is_straight__{key}"))
        expressions.append(answer_value.alias(f"__val__{key}"))

    # Only the record id and the derived check columns are needed downstream,
    # so avoid materialising every original column.
    checked_data = data.select("_recordId", *expressions).collect()

    outliers = []
    for key in multi_attribute_groups:
        flagged_rows = (
            checked_data.filter(pl.col(f"__is_straight__{key}"))
            .select("_recordId", f"__val__{key}")
            .iter_rows()
        )
        outliers.extend(
            {"Record ID": record_id, "Question Group": key, "Value": value}
            for record_id, value in flagged_rows
        )

    if not outliers:
        return (
            "### Straight-lining Checks: \n\n"
            f"✅ No straight-liners detected (value <= {max_score})"
        )

    outlier_df = pl.DataFrame(outliers)
    report_lines = [
        "### Straight-lining Checks:",
        "",
        "**⚠️ Potential straight-liners detected ⚠️**",
        "",
        f"Respondents selected the same value (<= {max_score}) "
        "for all attributes in the following groups:",
        "",
        f"{mo.ui.table(outlier_df)}",
        "",
    ]
    return "\n".join(report_lines)