import marimo as mo
import polars as pl


def check_progress(data: pl.LazyFrame) -> mo.Html:
    """Report whether every response has a 'progress' value of 100.

    Args:
        data: Lazy frame of responses; must contain a 'progress' column.

    Returns:
        A marimo markdown element: a success message when every row has
        progress equal to 100, otherwise a warning. An empty frame, null
        values, or values that cannot be read as numbers all produce the
        warning.
    """
    # Select before collecting so only the one needed column is materialized.
    # The non-strict cast turns unparsable values into nulls instead of raising.
    progress = (
        data.select(pl.col('progress').cast(pl.Float64, strict=False))
        .collect()
        .get_column('progress')
    )
    n_rows = progress.len()

    # A single distinct value is not enough (every row could be 50, or null):
    # each row must actually equal 100. Nulls compare as null and are not
    # counted by sum(), so they make the check fail.
    n_complete = (progress == 100).sum() if n_rows else 0
    if n_rows > 0 and n_complete == n_rows:
        return mo.md("""### Responses Complete: \n\n✅ All responses are complete (progress = 100)
        """)

    return mo.md("### Responses Complete: \n\n⚠️ There are incomplete responses (progress < 100) ⚠️")


def duration_validation(data: pl.LazyFrame, n_std: float = 3) -> mo.Html:
    """Flag responses whose 'duration' lies far from the mean duration.

    A response is an outlier when its duration is strictly above
    ``mean + n_std * std`` or strictly below ``mean - n_std * std``.
    Flagged rows are only reported; the input data is not modified.

    Args:
        data: Lazy frame of responses; must contain a 'duration' column.
        n_std: Number of standard deviations from the mean that defines
            the outlier thresholds. Defaults to 3.

    Returns:
        A marimo markdown element: a "no outliers" message, a summary with
        a table of the flagged rows, or a notice that the thresholds could
        not be computed because mean or standard deviation is undefined.
    """
    duration = pl.col('duration')

    stats = data.select(
        duration.mean().alias('mean_duration'),
        duration.std().alias('std_duration'),
    ).collect()
    mean_duration, std_duration = stats.row(0)

    # Mean is null for an empty column and the sample standard deviation is
    # null with fewer than two values; thresholds cannot be derived then.
    if mean_duration is None or std_duration is None:
        return mo.md("### Duration Outliers: \n\n⚠️ Not enough duration data to check for outliers ⚠️")

    upper_outlier_threshold = mean_duration + n_std * std_duration
    lower_outlier_threshold = mean_duration - n_std * std_duration

    flagged = data.with_columns(
        (
            (duration > upper_outlier_threshold)
            | (duration < lower_outlier_threshold)
        ).alias('outlier_duration')
    )

    # Keep only the rows whose outlier flag is true.
    outlier_data = flagged.filter(pl.col('outlier_duration')).collect()

    if outlier_data.height == 0:
        return mo.md("### Duration Outliers: \n\n✅ No duration outliers detected")

    return mo.md(f"""
    ### Duration Outliers:

    **⚠️ Potential outliers detected based on response duration ⚠️**

    - Mean Duration: {mean_duration:.2f} seconds (approximately {mean_duration/60:.2f} minutes)
    - Standard Deviation of Duration: {std_duration:.2f} seconds
    - Upper Outlier Threshold (Mean + {n_std:g}*Std): {upper_outlier_threshold:.2f} seconds
    - Lower Outlier Threshold (Mean - {n_std:g}*Std): {lower_outlier_threshold:.2f} seconds
    - Number of Outlier Responses: {outlier_data.height}

    Outliers:

    {mo.ui.table(outlier_data)}

    **⚠️ NOTE: These have not been removed from the dataset ⚠️**
    """)