straightliner validation

This commit is contained in:
2026-01-29 18:40:18 +01:00
parent 5f9e67a312
commit 2958fed780
2 changed files with 59 additions and 17 deletions

View File

@@ -85,6 +85,13 @@ def check_straight_liners(data, max_score=3):
if not multi_attribute_groups:
return "### Straight-lining Checks: \n\n No multi-attribute question groups found."
# Cast all involved columns to Int64 (strict=False) to handle potential string columns
# This prevents "cannot compare string with numeric type" errors
all_group_cols = [col for cols in multi_attribute_groups.values() for col in cols]
data = data.with_columns([
pl.col(col).cast(pl.Int64, strict=False) for col in all_group_cols
])
# Build expressions
expressions = []
@@ -120,19 +127,29 @@ def check_straight_liners(data, max_score=3):
# Process results into a nice table
outliers = []
for key in multi_attribute_groups.keys():
for key, group_cols in multi_attribute_groups.items():
flag_col = f"__is_straight__{key}"
val_col = f"__val__{key}"
filtered = checked_data.filter(pl.col(flag_col))
if filtered.height > 0:
rows = filtered.select(["_recordId", val_col]).rows()
for row in rows:
# Sort group_cols by choice number to ensure order (Choice_1, Choice_2, etc.)
# Assuming format ends with __Choice_X
sorted_group_cols = sorted(group_cols, key=lambda c: int(c.split('__Choice_')[-1]))
# Select relevant columns: Record ID, Value, and the sorted group columns
subset = filtered.select(["_recordId", val_col] + sorted_group_cols)
for row in subset.iter_rows(named=True):
# Create ordered list of values, using 'NaN' for missing data
resp_list = [row[c] if row[c] is not None else 'NaN' for c in sorted_group_cols]
outliers.append({
"Record ID": row[0],
"Record ID": row["_recordId"],
"Question Group": key,
"Value": row[1]
"Value": row[val_col],
"Responses": str(resp_list)
})
if not outliers:
@@ -147,6 +164,21 @@ def check_straight_liners(data, max_score=3):
Respondents selected the same value (<= {max_score}) for all attributes in the following groups:
{mo.ui.table(outlier_df)}
"""
""", outlier_df
if __name__ == "__main__":
    # Ad-hoc manual run: executed only when this file is launched as a script.
    # Loads one survey export and prints the straight-lining report for two
    # subsets of the data.
    from utils import JPMCSurvey

    RESULTS_FILE = "data/exports/OneDrive_2026-01-28/1-28-26 Afternoon/JPMC_Chase Brand Personality_Quant Round 1_January 28, 2026_Afternoon_Labels.csv"
    QSF_FILE = "data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf"

    S = JPMCSurvey(RESULTS_FILE, QSF_FILE)
    data = S.load_data()

    # (heading printed before the check, name of the survey method that
    # selects the subset to check)
    checks = (
        ("Checking Green Blue:", "get_ss_green_blue"),
        ("Checking Orange Red:", "get_ss_orange_red"),
    )
    for heading, selector_name in checks:
        print(heading)
        # Resolve and call the selector only after the heading is printed, so
        # a failure is reported under the right heading.
        # NOTE(review): only element [0] of the selector's result is passed
        # on; presumably that is the data frame -- confirm against utils.
        selected = getattr(S, selector_name)(data)
        print(check_straight_liners(selected[0]))