straightliner validation
This commit is contained in:
@@ -21,6 +21,7 @@ def _():
|
|||||||
SPEAKING_STYLES,
|
SPEAKING_STYLES,
|
||||||
calculate_weighted_ranking_scores,
|
calculate_weighted_ranking_scores,
|
||||||
check_progress,
|
check_progress,
|
||||||
|
check_straight_liners,
|
||||||
duration_validation,
|
duration_validation,
|
||||||
mo,
|
mo,
|
||||||
pl,
|
pl,
|
||||||
@@ -58,7 +59,7 @@ def _(JPMCSurvey, QSF_FILE, RESULTS_FILE, mo):
|
|||||||
|
|
||||||
|
|
||||||
@app.cell
|
@app.cell
|
||||||
def _(Path, RESULTS_FILE, mo):
|
def _(Path, RESULTS_FILE, data_all, mo):
|
||||||
mo.md(f"""
|
mo.md(f"""
|
||||||
|
|
||||||
---
|
---
|
||||||
@@ -66,13 +67,29 @@ def _(Path, RESULTS_FILE, mo):
|
|||||||
|
|
||||||
**Dataset:** `{Path(RESULTS_FILE).name}`
|
**Dataset:** `{Path(RESULTS_FILE).name}`
|
||||||
|
|
||||||
|
**Responses**: `{data_all.collect().shape[0]}`
|
||||||
|
|
||||||
|
|
||||||
""")
|
""")
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
@app.cell
|
@app.cell
|
||||||
def _(check_progress, data_all, duration_validation, mo):
|
def _():
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
|
||||||
|
def _(
|
||||||
|
S,
|
||||||
|
check_progress,
|
||||||
|
check_straight_liners,
|
||||||
|
data_all,
|
||||||
|
duration_validation,
|
||||||
|
mo,
|
||||||
|
):
|
||||||
|
sl_content, sl_df = check_straight_liners(S.get_ss_green_blue(data_all)[0], max_score=5)
|
||||||
|
|
||||||
mo.md(f"""
|
mo.md(f"""
|
||||||
## Data Validation
|
## Data Validation
|
||||||
|
|
||||||
@@ -83,19 +100,12 @@ def _(check_progress, data_all, duration_validation, mo):
|
|||||||
{duration_validation(data_all)}
|
{duration_validation(data_all)}
|
||||||
|
|
||||||
|
|
||||||
|
{sl_content}
|
||||||
|
|
||||||
""")
|
""")
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
@app.cell
|
|
||||||
def _(mo):
|
|
||||||
mo.md(r"""
|
|
||||||
### ⚠️ ToDo: "straight-liner" detection and removal
|
|
||||||
""")
|
|
||||||
return
|
|
||||||
|
|
||||||
|
|
||||||
@app.cell
|
@app.cell
|
||||||
def _(mo):
|
def _(mo):
|
||||||
mo.md(r"""
|
mo.md(r"""
|
||||||
|
|||||||
@@ -85,6 +85,13 @@ def check_straight_liners(data, max_score=3):
|
|||||||
if not multi_attribute_groups:
|
if not multi_attribute_groups:
|
||||||
return "### Straight-lining Checks: \n\nℹ️ No multi-attribute question groups found."
|
return "### Straight-lining Checks: \n\nℹ️ No multi-attribute question groups found."
|
||||||
|
|
||||||
|
# Cast all involved columns to Int64 (strict=False) to handle potential string columns
|
||||||
|
# This prevents "cannot compare string with numeric type" errors
|
||||||
|
all_group_cols = [col for cols in multi_attribute_groups.values() for col in cols]
|
||||||
|
data = data.with_columns([
|
||||||
|
pl.col(col).cast(pl.Int64, strict=False) for col in all_group_cols
|
||||||
|
])
|
||||||
|
|
||||||
# Build expressions
|
# Build expressions
|
||||||
expressions = []
|
expressions = []
|
||||||
|
|
||||||
@@ -120,19 +127,29 @@ def check_straight_liners(data, max_score=3):
|
|||||||
# Process results into a nice table
|
# Process results into a nice table
|
||||||
outliers = []
|
outliers = []
|
||||||
|
|
||||||
for key in multi_attribute_groups.keys():
|
for key, group_cols in multi_attribute_groups.items():
|
||||||
flag_col = f"__is_straight__{key}"
|
flag_col = f"__is_straight__{key}"
|
||||||
val_col = f"__val__{key}"
|
val_col = f"__val__{key}"
|
||||||
|
|
||||||
filtered = checked_data.filter(pl.col(flag_col))
|
filtered = checked_data.filter(pl.col(flag_col))
|
||||||
|
|
||||||
if filtered.height > 0:
|
if filtered.height > 0:
|
||||||
rows = filtered.select(["_recordId", val_col]).rows()
|
# Sort group_cols by choice number to ensure order (Choice_1, Choice_2, etc.)
|
||||||
for row in rows:
|
# Assuming format ends with __Choice_X
|
||||||
|
sorted_group_cols = sorted(group_cols, key=lambda c: int(c.split('__Choice_')[-1]))
|
||||||
|
|
||||||
|
# Select relevant columns: Record ID, Value, and the sorted group columns
|
||||||
|
subset = filtered.select(["_recordId", val_col] + sorted_group_cols)
|
||||||
|
|
||||||
|
for row in subset.iter_rows(named=True):
|
||||||
|
# Create ordered list of values, using 'NaN' for missing data
|
||||||
|
resp_list = [row[c] if row[c] is not None else 'NaN' for c in sorted_group_cols]
|
||||||
|
|
||||||
outliers.append({
|
outliers.append({
|
||||||
"Record ID": row[0],
|
"Record ID": row["_recordId"],
|
||||||
"Question Group": key,
|
"Question Group": key,
|
||||||
"Value": row[1]
|
"Value": row[val_col],
|
||||||
|
"Responses": str(resp_list)
|
||||||
})
|
})
|
||||||
|
|
||||||
if not outliers:
|
if not outliers:
|
||||||
@@ -147,6 +164,21 @@ def check_straight_liners(data, max_score=3):
|
|||||||
Respondents selected the same value (<= {max_score}) for all attributes in the following groups:
|
Respondents selected the same value (<= {max_score}) for all attributes in the following groups:
|
||||||
|
|
||||||
{mo.ui.table(outlier_df)}
|
{mo.ui.table(outlier_df)}
|
||||||
"""
|
""", outlier_df
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
|
||||||
|
from utils import JPMCSurvey
|
||||||
|
|
||||||
|
RESULTS_FILE = "data/exports/OneDrive_2026-01-28/1-28-26 Afternoon/JPMC_Chase Brand Personality_Quant Round 1_January 28, 2026_Afternoon_Labels.csv"
|
||||||
|
QSF_FILE = "data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf"
|
||||||
|
|
||||||
|
S = JPMCSurvey(RESULTS_FILE, QSF_FILE)
|
||||||
|
data = S.load_data()
|
||||||
|
|
||||||
|
print("Checking Green Blue:")
|
||||||
|
print(check_straight_liners(S.get_ss_green_blue(data)[0]))
|
||||||
|
print("Checking Orange Red:")
|
||||||
|
print(check_straight_liners(S.get_ss_orange_red(data)[0]))
|
||||||
Reference in New Issue
Block a user