straight line fn dev
This commit is contained in:
109
validation.py
109
validation.py
@@ -36,11 +36,13 @@ def duration_validation(data):
|
||||
|
||||
**⚠️ Potential outliers detected based on response duration ⚠️**
|
||||
|
||||
- Mean Duration: {mean_duration:.2f} seconds (approximately {mean_duration/60:.2f} minutes)
|
||||
- Standard Deviation of Duration: {std_duration:.2f} seconds
|
||||
- Upper Outlier Threshold (Mean + 3*Std): {upper_outlier_threshold:.2f} seconds
|
||||
- Lower Outlier Threshold (Mean - 3*Std): {lower_outlier_threshold:.2f} seconds
|
||||
- Number of Outlier Responses: {outlier_data.shape[0]}
|
||||
| Metric | Value |
|
||||
|--------|-------|
|
||||
| Mean Duration | {mean_duration:.2f} seconds (approximately {mean_duration/60:.2f} minutes) |
|
||||
| Standard Deviation of Duration | {std_duration:.2f} seconds |
|
||||
| Upper Outlier Threshold (Mean + 3*Std) | {upper_outlier_threshold:.2f} seconds |
|
||||
| Lower Outlier Threshold (Mean - 3*Std) | {lower_outlier_threshold:.2f} seconds |
|
||||
| Number of Outlier Responses | {outlier_data.shape[0]} |
|
||||
|
||||
Outliers:
|
||||
|
||||
@@ -50,4 +52,101 @@ def duration_validation(data):
|
||||
**⚠️ NOTE: These have not been removed from the dataset ⚠️**
|
||||
|
||||
"""
|
||||
|
||||
|
||||
def check_straight_liners(data, max_score=3) -> str:
    """
    Check for straight-lining behavior (selecting same value for all attributes).

    Columns are grouped by the part of their name that precedes
    ``__Choice_<n>`` (i.e. everything up to and including ``__V<n>``).
    Within each group that has more than one column, a row is flagged when
    all of its non-null answers are identical and that shared value is
    ``<= max_score``.

    Args:
        data: Polars LazyFrame. It must expose a ``_recordId`` column, which
            is used to identify flagged respondents in the report.
        max_score: The maximum score that is flagged if straight-lined
            (e.g., if 4, then 5s are allowed).

    Returns:
        A markdown string: an informational message when no multi-attribute
        groups exist, a success message when nothing is flagged, or a warning
        that embeds a ``mo.ui.table`` listing Record ID, Question Group and
        the repeated Value for every flagged respondent/group pair.

    Raises:
        Whatever error Polars raises for an unknown column if ``_recordId``
        is absent from ``data`` (raised when the query is collected).
    """
    import re

    # Detect column groups based on the pattern SS_...__Vxx__Choice_y.
    # NOTE(review): ``search`` is unanchored, so a column with a suffix after
    # ``__Choice_<n>`` (e.g. a free-text companion column) would also be
    # grouped -- confirm no such columns exist in the schema.
    choice_pattern = re.compile(r"(.*__V\d+)__Choice_\d+")

    groups = {}
    for col in data.collect_schema().names():
        match = choice_pattern.search(col)
        if match:
            groups.setdefault(match.group(1), []).append(col)

    # Only groups with several attributes/choices can be straight-lined.
    multi_attribute_groups = {k: v for k, v in groups.items() if len(v) > 1}

    if not multi_attribute_groups:
        return "### Straight-lining Checks: \n\nℹ️ No multi-attribute question groups found."

    # Build one flag expression and one value expression per group.
    expressions = []
    for key, cols in multi_attribute_groups.items():
        answers = pl.concat_list(cols).list.drop_nulls()

        # Use .list.min() instead of .list.get(0) to avoid "index out of
        # bounds" on empty lists. When n_unique == 1, min() is the single
        # repeated value; on an empty list it is null, which is safe.
        shared_value = answers.list.min()

        # NOTE(review): a respondent with exactly one non-null answer in the
        # group also satisfies these conditions and is flagged -- confirm
        # that this is intended.
        is_straight = (
            (answers.list.len() > 0)
            & (answers.list.n_unique() == 1)
            & (shared_value <= max_score)
        ).alias(f"__is_straight__{key}")

        expressions.extend([is_straight, shared_value.alias(f"__val__{key}")])

    # Only _recordId and the check columns are needed, so project down to
    # them before collecting instead of materialising every survey column.
    checked_data = data.select(pl.col("_recordId"), *expressions).collect()

    # Flatten the per-group flags into one record per flagged respondent.
    outliers = []
    for key in multi_attribute_groups:
        flagged = checked_data.filter(pl.col(f"__is_straight__{key}"))
        outliers.extend(
            {"Record ID": record_id, "Question Group": key, "Value": value}
            for record_id, value in flagged.select(
                ["_recordId", f"__val__{key}"]
            ).iter_rows()
        )

    if not outliers:
        return f"### Straight-lining Checks: \n\n✅ No straight-liners detected (value <= {max_score})"

    outlier_df = pl.DataFrame(outliers)

    return f"""### Straight-lining Checks:

**⚠️ Potential straight-liners detected ⚠️**

Respondents selected the same value (<= {max_score}) for all attributes in the following groups:

{mo.ui.table(outlier_df)}
"""
|
||||
|
||||
|
||||
Reference in New Issue
Block a user