straightliner validation

This commit is contained in:
2026-01-29 18:40:18 +01:00
parent 5f9e67a312
commit 2958fed780
2 changed files with 59 additions and 17 deletions

View File

@@ -21,6 +21,7 @@ def _():
SPEAKING_STYLES, SPEAKING_STYLES,
calculate_weighted_ranking_scores, calculate_weighted_ranking_scores,
check_progress, check_progress,
check_straight_liners,
duration_validation, duration_validation,
mo, mo,
pl, pl,
@@ -58,7 +59,7 @@ def _(JPMCSurvey, QSF_FILE, RESULTS_FILE, mo):
@app.cell @app.cell
def _(Path, RESULTS_FILE, mo): def _(Path, RESULTS_FILE, data_all, mo):
mo.md(f""" mo.md(f"""
--- ---
@@ -66,13 +67,29 @@ def _(Path, RESULTS_FILE, mo):
**Dataset:** `{Path(RESULTS_FILE).name}` **Dataset:** `{Path(RESULTS_FILE).name}`
**Responses**: `{data_all.collect().shape[0]}`
""") """)
return return
@app.cell @app.cell
def _(check_progress, data_all, duration_validation, mo): def _():
return
@app.cell
def _(
S,
check_progress,
check_straight_liners,
data_all,
duration_validation,
mo,
):
sl_content, sl_df = check_straight_liners(S.get_ss_green_blue(data_all)[0], max_score=5)
mo.md(f""" mo.md(f"""
## Data Validation ## Data Validation
@@ -83,19 +100,12 @@ def _(check_progress, data_all, duration_validation, mo):
{duration_validation(data_all)} {duration_validation(data_all)}
{sl_content}
""") """)
return return
@app.cell
def _(mo):
mo.md(r"""
### ⚠️ ToDo: "straight-liner" detection and removal
""")
return
@app.cell @app.cell
def _(mo): def _(mo):
mo.md(r""" mo.md(r"""

View File

@@ -85,6 +85,13 @@ def check_straight_liners(data, max_score=3):
if not multi_attribute_groups: if not multi_attribute_groups:
return "### Straight-lining Checks: \n\n No multi-attribute question groups found." return "### Straight-lining Checks: \n\n No multi-attribute question groups found."
# Cast all involved columns to Int64 (strict=False) to handle potential string columns
# This prevents "cannot compare string with numeric type" errors
all_group_cols = [col for cols in multi_attribute_groups.values() for col in cols]
data = data.with_columns([
pl.col(col).cast(pl.Int64, strict=False) for col in all_group_cols
])
# Build expressions # Build expressions
expressions = [] expressions = []
@@ -120,19 +127,29 @@ def check_straight_liners(data, max_score=3):
# Process results into a nice table # Process results into a nice table
outliers = [] outliers = []
for key in multi_attribute_groups.keys(): for key, group_cols in multi_attribute_groups.items():
flag_col = f"__is_straight__{key}" flag_col = f"__is_straight__{key}"
val_col = f"__val__{key}" val_col = f"__val__{key}"
filtered = checked_data.filter(pl.col(flag_col)) filtered = checked_data.filter(pl.col(flag_col))
if filtered.height > 0: if filtered.height > 0:
rows = filtered.select(["_recordId", val_col]).rows() # Sort group_cols by choice number to ensure order (Choice_1, Choice_2, etc.)
for row in rows: # Assuming format ends with __Choice_X
sorted_group_cols = sorted(group_cols, key=lambda c: int(c.split('__Choice_')[-1]))
# Select relevant columns: Record ID, Value, and the sorted group columns
subset = filtered.select(["_recordId", val_col] + sorted_group_cols)
for row in subset.iter_rows(named=True):
# Create ordered list of values, using 'NaN' for missing data
resp_list = [row[c] if row[c] is not None else 'NaN' for c in sorted_group_cols]
outliers.append({ outliers.append({
"Record ID": row[0], "Record ID": row["_recordId"],
"Question Group": key, "Question Group": key,
"Value": row[1] "Value": row[val_col],
"Responses": str(resp_list)
}) })
if not outliers: if not outliers:
@@ -147,6 +164,21 @@ def check_straight_liners(data, max_score=3):
Respondents selected the same value (<= {max_score}) for all attributes in the following groups: Respondents selected the same value (<= {max_score}) for all attributes in the following groups:
{mo.ui.table(outlier_df)} {mo.ui.table(outlier_df)}
""" """, outlier_df
if __name__ == "__main__":
    # Manual smoke run: load one survey export and print the straight-lining
    # check result for two question groups. Paths are hard-coded to a local
    # data export, so this only works from a checkout that has those files.
    from utils import JPMCSurvey

    RESULTS_FILE = "data/exports/OneDrive_2026-01-28/1-28-26 Afternoon/JPMC_Chase Brand Personality_Quant Round 1_January 28, 2026_Afternoon_Labels.csv"
    QSF_FILE = "data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf"

    S = JPMCSurvey(RESULTS_FILE, QSF_FILE)
    data = S.load_data()

    # Each getter's result is indexed with [0]; only that first element is
    # handed to the straight-liner check.
    print("Checking Green Blue:")
    green_blue = S.get_ss_green_blue(data)[0]
    print(check_straight_liners(green_blue))

    print("Checking Orange Red:")
    orange_red = S.get_ss_orange_red(data)[0]
    print(check_straight_liners(orange_red))