setup complete framework of analysis
This commit is contained in:
54
validation.py
Normal file
54
validation.py
Normal file
@@ -0,0 +1,54 @@
|
||||
import marimo as mo
|
||||
import polars as pl
|
||||
|
||||
|
||||
def check_progress(data):
    """Report whether every response has reached 100% progress.

    Args:
        data: Polars LazyFrame of survey responses containing a 'progress'
            column (assumed numeric, with 100 meaning finished -- confirm
            against the survey export).

    Returns:
        A marimo markdown element: a success message when every row has
        progress == 100, otherwise a warning that incomplete responses exist.
        A frame with no rows has no incomplete responses and is reported as
        complete.
    """
    # Compare against 100 explicitly. Checking only that the column holds a
    # single distinct value would also accept a dataset where every response
    # is stuck at the same partial progress (e.g. all rows at 50).
    # Null progress is treated as incomplete. Only the aggregated boolean is
    # collected, so the full frame is never materialised here.
    all_complete = (
        data.select((pl.col('progress') == 100).fill_null(False).all())
        .collect()
        .item()
    )

    if all_complete:
        return mo.md("""### Responses Complete: \n\n✅ All responses are complete (progress = 100) """)

    return mo.md("### Responses Complete: \n\n⚠️ There are incomplete responses (progress < 100) ⚠️")
|
||||
|
||||
|
||||
def duration_validation(data):
    """Flag responses whose duration is more than 3 standard deviations from the mean.

    Args:
        data: Polars LazyFrame of survey responses containing a numeric
            'duration' column (the report text presents it as seconds).

    Returns:
        A marimo markdown element. When no row falls outside
        mean +/- 3 * std, a short "no outliers" message; otherwise a summary
        of the statistics and thresholds plus a table of the flagged rows.
        The input data is never modified; outliers are only reported.
    """
    # Aggregate first so only a single row of statistics is collected.
    duration_stats = data.select(
        pl.col('duration').mean().alias('mean_duration'),
        pl.col('duration').std().alias('std_duration'),
    ).collect()
    mean_duration = duration_stats['mean_duration'][0]
    std_duration = duration_stats['std_duration'][0]

    # The sample standard deviation is null with fewer than two non-null
    # durations (and the mean is null with none). Without this guard the
    # threshold arithmetic below would raise TypeError on None; with no
    # measurable spread there is nothing to flag.
    if mean_duration is None or std_duration is None:
        return mo.md("### Duration Outliers: \n\n✅ No duration outliers detected")

    upper_outlier_threshold = mean_duration + 3 * std_duration
    lower_outlier_threshold = mean_duration - 3 * std_duration

    # Keep the flag as a real column so it is visible in the outlier table.
    flagged = data.with_columns(
        (
            (pl.col('duration') > upper_outlier_threshold)
            | (pl.col('duration') < lower_outlier_threshold)
        ).alias('outlier_duration')
    )

    # The flag is already boolean, so it can be used directly as the filter.
    outlier_data = flagged.filter(pl.col('outlier_duration')).collect()

    if outlier_data.shape[0] == 0:
        return mo.md("### Duration Outliers: \n\n✅ No duration outliers detected")

    return mo.md(f"""
    ### Duration Outliers:

    **⚠️ Potential outliers detected based on response duration ⚠️**

    - Mean Duration: {mean_duration:.2f} seconds (approximately {mean_duration/60:.2f} minutes)
    - Standard Deviation of Duration: {std_duration:.2f} seconds
    - Upper Outlier Threshold (Mean + 3*Std): {upper_outlier_threshold:.2f} seconds
    - Lower Outlier Threshold (Mean - 3*Std): {lower_outlier_threshold:.2f} seconds
    - Number of Outlier Responses: {outlier_data.shape[0]}

    Outliers:

    {mo.ui.table(outlier_data)}


    **⚠️ NOTE: These have not been removed from the dataset ⚠️**

    """)
|
||||
|
||||
Reference in New Issue
Block a user