# JPMC-quant/validation.py
#
# NOTE: the repository viewer flagged this file as containing invisible or
# confusable Unicode characters (presumably the emoji / variation selectors
# used in the status messages below -- confirm before normalising them).
import marimo as mo
import polars as pl
def check_progress(data):
    """Check if all responses are complete based on the 'progress' column.

    A response counts as complete only when its 'progress' value equals 100.
    Null values, and values that cannot be parsed as numbers, count as
    incomplete.

    Args:
        data: Polars LazyFrame containing a 'progress' column.

    Returns:
        A Markdown string: the success message when there is at least one
        response and every response has progress == 100, otherwise the
        warning message (an empty dataset also yields the warning).
    """
    # Cast leniently so a string-typed column still compares numerically;
    # unparseable values become null and are treated as incomplete below.
    progress = pl.col('progress').cast(pl.Float64, strict=False)

    # Aggregate lazily: only two scalars are materialised, not the whole frame.
    summary = data.select(
        pl.len().alias('n_total'),
        (progress == 100).fill_null(False).sum().alias('n_complete'),
    ).collect()
    n_total = summary['n_total'][0]
    n_complete = summary['n_complete'][0]

    # Checking for a single unique value is not enough: every response could
    # share the same progress value below 100.
    if n_total > 0 and n_complete == n_total:
        return """### Responses Complete: \n\n✅ All responses are complete (progress = 100) """
    return "### Responses Complete: \n\n⚠️ There are incomplete responses (progress < 100) ⚠️"
def duration_validation(data):
    """Validate response durations to identify outliers.

    A response is flagged when its 'duration' lies more than three standard
    deviations above or below the mean duration. Flagged rows are reported
    only; nothing is removed from the dataset.

    Args:
        data: Polars LazyFrame containing a 'duration' column (the report
            labels the values as seconds).

    Returns:
        A Markdown string. When outliers exist it contains a summary table
        and an embedded marimo table of the flagged rows; otherwise a short
        status message.
    """
    duration_stats = data.select(
        pl.col('duration').mean().alias('mean_duration'),
        pl.col('duration').std().alias('std_duration'),
    ).collect()
    mean_duration = duration_stats['mean_duration'][0]
    std_duration = duration_stats['std_duration'][0]

    # With no rows (or all-null durations) the mean is null, and with a single
    # row the standard deviation is null; the thresholds cannot be computed.
    if mean_duration is None or std_duration is None:
        return "### Duration Outliers: \n\n⚠️ Not enough duration data to check for outliers ⚠️"

    upper_outlier_threshold = mean_duration + 3 * std_duration
    lower_outlier_threshold = mean_duration - 3 * std_duration

    is_outlier = (
        (pl.col('duration') > upper_outlier_threshold)
        | (pl.col('duration') < lower_outlier_threshold)
    )
    # Keep the flag column in the result so the rendered table shows it.
    outlier_data = (
        data.with_columns(is_outlier.alias('outlier_duration'))
        .filter(pl.col('outlier_duration'))
        .collect()
    )

    if outlier_data.shape[0] == 0:
        return "### Duration Outliers: \n\n✅ No duration outliers detected"
    return f"""### Duration Outliers:
**⚠️ Potential outliers detected based on response duration ⚠️**
| Metric | Value |
|--------|-------|
| Mean Duration | {mean_duration:.2f} seconds (approximately {mean_duration/60:.2f} minutes) |
| Standard Deviation of Duration | {std_duration:.2f} seconds |
| Upper Outlier Threshold (Mean + 3*Std) | {upper_outlier_threshold:.2f} seconds |
| Lower Outlier Threshold (Mean - 3*Std) | {lower_outlier_threshold:.2f} seconds |
| Number of Outlier Responses | {outlier_data.shape[0]} |
Outliers:
{mo.ui.table(outlier_data)}
**⚠️ NOTE: These have not been removed from the dataset ⚠️**
"""
def check_straight_liners(data, max_score=3):
    """
    Check for straight-lining behavior (selecting same value for all attributes).

    Columns are grouped by the prefix before ``__Choice_<n>`` (pattern
    ``...__V<digits>__Choice_<digits>``). Within each group that has more than
    one column, a respondent is flagged when all of their non-null answers are
    identical and that value is <= ``max_score``.

    Args:
        data: Polars LazyFrame. Must contain a ``_recordId`` column, which is
            read when building the result table.
        max_score: The maximum score that is flagged if straight-lined (e.g., if 4, then 5s are allowed).

    Returns:
        * ``str`` - a Markdown status message when no multi-attribute groups
          exist or when no straight-liners are found.
        * ``(str, pl.DataFrame)`` - the Markdown report and a DataFrame with
          columns "Record ID", "Question Group", "Value" and "Responses" when
          straight-liners are found.

        NOTE: the return type differs between these cases; callers must handle
        both.
    """
    import re

    # Capture the choice number as well, so ordering never has to re-parse the
    # column name. The pattern is unanchored, so names with trailing text after
    # the number (e.g. "..._Choice_1_TEXT") still match; parsing the tail with
    # int() would raise ValueError for those.
    pattern = re.compile(r"(.*__V\d+)__Choice_(\d+)")

    groups = {}
    choice_number = {}
    for col in data.collect_schema().names():
        match = pattern.search(col)
        if match:
            groups.setdefault(match.group(1), []).append(col)
            choice_number[col] = int(match.group(2))

    # Only groups with several attributes/choices can be straight-lined.
    multi_attribute_groups = {k: v for k, v in groups.items() if len(v) > 1}
    if not multi_attribute_groups:
        return "### Straight-lining Checks: \n\n No multi-attribute question groups found."

    # Cast all involved columns to Int64 (strict=False) so string columns do
    # not trigger "cannot compare string with numeric type" errors;
    # unparseable values become null.
    all_group_cols = [col for cols in multi_attribute_groups.values() for col in cols]
    data = data.with_columns([
        pl.col(col).cast(pl.Int64, strict=False) for col in all_group_cols
    ])

    expressions = []
    for key, cols in multi_attribute_groups.items():
        answers = pl.concat_list(cols).list.drop_nulls()
        # min() rather than get(0): it is null (not an error) on an empty
        # list, and equals the single shared value when n_unique == 1.
        shared_value = answers.list.min()
        # NOTE(review): nulls are dropped first, so a respondent with a single
        # non-null answer in the group is also flagged - confirm this is intended.
        is_straight = (
            (answers.list.len() > 0)
            & (answers.list.n_unique() == 1)
            & (shared_value <= max_score)
        ).alias(f"__is_straight__{key}")
        expressions.extend([is_straight, shared_value.alias(f"__val__{key}")])

    checked_data = data.with_columns(expressions).collect()

    # Flatten the flagged rows of every group into one list of records.
    outliers = []
    for key, group_cols in multi_attribute_groups.items():
        flag_col = f"__is_straight__{key}"
        val_col = f"__val__{key}"
        filtered = checked_data.filter(pl.col(flag_col))
        if filtered.height > 0:
            # Present answers in choice order (Choice_1, Choice_2, ...).
            sorted_group_cols = sorted(group_cols, key=choice_number.__getitem__)
            subset = filtered.select(["_recordId", val_col] + sorted_group_cols)
            for row in subset.iter_rows(named=True):
                # Missing answers are shown as 'NaN' in the rendered list.
                resp_list = [row[c] if row[c] is not None else 'NaN' for c in sorted_group_cols]
                outliers.append({
                    "Record ID": row["_recordId"],
                    "Question Group": key,
                    "Value": row[val_col],
                    "Responses": str(resp_list)
                })

    if not outliers:
        return f"### Straight-lining Checks: \n\n✅ No straight-liners detected (value <= {max_score})"

    outlier_df = pl.DataFrame(outliers)
    return f"""### Straight-lining Checks:
**⚠️ Potential straight-liners detected ⚠️**
Respondents selected the same value (<= {max_score}) for all attributes in the following groups:
{mo.ui.table(outlier_df)}
""", outlier_df
if __name__ == "__main__":
    # Manual smoke run: load one survey export and print the straight-lining
    # report for the two attribute sets.
    from utils import JPMCSurvey

    RESULTS_FILE = "data/exports/OneDrive_2026-01-28/1-28-26 Afternoon/JPMC_Chase Brand Personality_Quant Round 1_January 28, 2026_Afternoon_Labels.csv"
    QSF_FILE = "data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf"

    survey = JPMCSurvey(RESULTS_FILE, QSF_FILE)
    responses = survey.load_data()

    # Each getter's first element is what gets passed to the check.
    print("Checking Green Blue:")
    green_blue = survey.get_ss_green_blue(responses)[0]
    print(check_straight_liners(green_blue))

    print("Checking Orange Red:")
    orange_red = survey.get_ss_orange_red(responses)[0]
    print(check_straight_liners(orange_red))