SL validation complete
This commit is contained in:
@@ -6,9 +6,9 @@ from theme import ColorPalette
|
||||
def check_progress(data):
    """Report whether every response has reached 100% progress.

    Args:
        data: A lazy polars frame (it must support ``.select(...).collect()``)
            containing a ``progress`` column.

    Returns:
        A markdown string: the "complete" message when the only distinct
        ``progress`` value is 100, otherwise the "incomplete" warning.
    """
    # Project down to the distinct progress values *before* collecting so the
    # whole frame is never materialised just for this check.
    # The non-strict Float64 cast mirrors the straight-liner check in this
    # module: numeric strings become numbers, anything unparseable becomes
    # null (and is therefore treated as "not complete").
    # NOTE(review): assumes `progress` is on a 0-100 scale — confirm.
    distinct_progress = (
        data.select(pl.col('progress').cast(pl.Float64, strict=False).unique())
        .collect()
        .to_series()
        .to_list()
    )

    # A single distinct value is not enough on its own: every row sitting at
    # e.g. 50 would also yield one unique value. It has to actually be 100.
    if len(distinct_progress) == 1 and distinct_progress[0] == 100:
        return """## Responses Complete: \n\n✅ All responses are complete (progress = 100) """

    return "## Responses Complete: \n\n⚠️ There are incomplete responses (progress < 100) ⚠️"
|
||||
|
||||
|
||||
def duration_validation(data):
|
||||
@@ -31,9 +31,9 @@ def duration_validation(data):
|
||||
outlier_data = _d.filter(pl.col('outlier_duration') == True).collect()
|
||||
|
||||
if outlier_data.shape[0] == 0:
|
||||
return "### Duration Outliers: \n\n✅ No duration outliers detected"
|
||||
return "## Duration Outliers: \n\n✅ No duration outliers detected"
|
||||
|
||||
return f"""### Duration Outliers:
|
||||
return f"""## Duration Outliers:
|
||||
|
||||
**⚠️ Potential outliers detected based on response duration ⚠️**
|
||||
|
||||
@@ -69,13 +69,25 @@ def check_straight_liners(data, max_score=3):
|
||||
schema_names = data.collect_schema().names()
|
||||
|
||||
# regex groupings
|
||||
pattern = re.compile(r"(.*__V\d+)__Choice_\d+")
|
||||
pattern_choice = re.compile(r"(.*__V\d+)__Choice_\d+")
|
||||
pattern_scale = re.compile(r"Voice_Scale_1_10__V\d+")
|
||||
|
||||
groups = {}
|
||||
|
||||
for col in schema_names:
|
||||
match = pattern.search(col)
|
||||
if match:
|
||||
group_key = match.group(1)
|
||||
# Check for Choice pattern (SS_...__Vxx__Choice_y)
|
||||
match_choice = pattern_choice.search(col)
|
||||
if match_choice:
|
||||
group_key = match_choice.group(1)
|
||||
if group_key not in groups:
|
||||
groups[group_key] = []
|
||||
groups[group_key].append(col)
|
||||
continue
|
||||
|
||||
# Check for Voice Scale pattern (Voice_Scale_1_10__Vxx)
|
||||
# All of these form a single group "Voice_Scale_1_10"
|
||||
if pattern_scale.search(col):
|
||||
group_key = "Voice_Scale_1_10"
|
||||
if group_key not in groups:
|
||||
groups[group_key] = []
|
||||
groups[group_key].append(col)
|
||||
@@ -86,11 +98,11 @@ def check_straight_liners(data, max_score=3):
|
||||
if not multi_attribute_groups:
|
||||
return "### Straight-lining Checks: \n\nℹ️ No multi-attribute question groups found."
|
||||
|
||||
# Cast all involved columns to Int64 (strict=False) to handle potential string columns
|
||||
# This prevents "cannot compare string with numeric type" errors
|
||||
# Cast all involved columns to Float64 (strict=False) to handle potential string columns
|
||||
# and 1-10 scale floats (e.g. 5.5). Float64 covers integers as well.
|
||||
all_group_cols = [col for cols in multi_attribute_groups.values() for col in cols]
|
||||
data = data.with_columns([
|
||||
pl.col(col).cast(pl.Int64, strict=False) for col in all_group_cols
|
||||
pl.col(col).cast(pl.Float64, strict=False) for col in all_group_cols
|
||||
])
|
||||
|
||||
# Build expressions
|
||||
@@ -136,9 +148,18 @@ def check_straight_liners(data, max_score=3):
|
||||
filtered = checked_data.filter(pl.col(flag_col))
|
||||
|
||||
if filtered.height > 0:
|
||||
# Sort group_cols by choice number to ensure order (Choice_1, Choice_2, etc.)
|
||||
# Assuming format ends with __Choice_X
|
||||
sorted_group_cols = sorted(group_cols, key=lambda c: int(c.split('__Choice_')[-1]))
|
||||
# Sort group_cols logic
|
||||
# If Choice columns, sort by choice number.
|
||||
# If Voice Scale columns (no Choice_), sort by Voice ID (Vxx)
|
||||
if all("__Choice_" in c for c in group_cols):
|
||||
key_func = lambda c: int(c.split('__Choice_')[-1])
|
||||
else:
|
||||
# Extract digits from Vxx
|
||||
def key_func(c):
|
||||
m = re.search(r"__V(\d+)", c)
|
||||
return int(m.group(1)) if m else 0
|
||||
|
||||
sorted_group_cols = sorted(group_cols, key=key_func)
|
||||
|
||||
# Select relevant columns: Record ID, Value, and the sorted group columns
|
||||
subset = filtered.select(["_recordId", val_col] + sorted_group_cols)
|
||||
@@ -155,7 +176,7 @@ def check_straight_liners(data, max_score=3):
|
||||
})
|
||||
|
||||
if not outliers:
|
||||
return f"### Straight-lining Checks: \n\n✅ No straight-liners detected (value <= {max_score})"
|
||||
return f"### Straight-lining Checks: \n\n✅ No straight-liners detected (value <= {max_score})", None
|
||||
|
||||
outlier_df = pl.DataFrame(outliers)
|
||||
|
||||
@@ -291,13 +312,12 @@ def check_straight_liners(data, max_score=3):
|
||||
|
||||
"""
|
||||
|
||||
return mo.vstack([
|
||||
mo.md(f"### Straight-lining Checks:\n\n**⚠️ Potential straight-liners detected ⚠️**\n\n"),
|
||||
return (mo.vstack([
|
||||
mo.md(f"**⚠️ Potential straight-liners detected ⚠️**\n\n"),
|
||||
mo.ui.table(outlier_df),
|
||||
mo.md(analysis_md),
|
||||
mo.md("#### Speaking Style Question Groups"),
|
||||
alt.vconcat(chart_pct, chart_dist).resolve_legend(color="independent")
|
||||
])
|
||||
]), outlier_df)
|
||||
|
||||
|
||||
|
||||
@@ -311,7 +331,10 @@ if __name__ == "__main__":
|
||||
S = JPMCSurvey(RESULTS_FILE, QSF_FILE)
|
||||
data = S.load_data()
|
||||
|
||||
print("Checking Green Blue:")
|
||||
print(check_straight_liners(S.get_ss_green_blue(data)[0]))
|
||||
print("Checking Orange Red:")
|
||||
print(check_straight_liners(S.get_ss_orange_red(data)[0]))
|
||||
# print("Checking Green Blue:")
|
||||
# print(check_straight_liners(S.get_ss_green_blue(data)[0]))
|
||||
# print("Checking Orange Red:")
|
||||
# print(check_straight_liners(S.get_ss_orange_red(data)[0]))
|
||||
|
||||
print("Checking Voice Scale 1-10:")
|
||||
print(check_straight_liners(S.get_voice_scale_1_10(data)[0]))
|
||||
Reference in New Issue
Block a user