straightliner verification for SS questions
This commit is contained in:
@@ -74,11 +74,6 @@ def _(Path, RESULTS_FILE, data_all, mo):
|
|||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
@app.cell
|
|
||||||
def _():
|
|
||||||
return
|
|
||||||
|
|
||||||
|
|
||||||
@app.cell
|
@app.cell
|
||||||
def _(
|
def _(
|
||||||
S,
|
S,
|
||||||
@@ -88,7 +83,8 @@ def _(
|
|||||||
duration_validation,
|
duration_validation,
|
||||||
mo,
|
mo,
|
||||||
):
|
):
|
||||||
sl_content, sl_df = check_straight_liners(S.get_ss_green_blue(data_all)[0], max_score=5)
|
_ss_all = S.get_ss_green_blue(data_all)[0].join(S.get_ss_orange_red(data_all)[0], on='_recordId')
|
||||||
|
sl_content = check_straight_liners(_ss_all, max_score=5)
|
||||||
|
|
||||||
mo.md(f"""
|
mo.md(f"""
|
||||||
## Data Validation
|
## Data Validation
|
||||||
@@ -106,18 +102,6 @@ def _(
|
|||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
@app.cell
|
|
||||||
def _(mo):
|
|
||||||
mo.md(r"""
|
|
||||||
---
|
|
||||||
|
|
||||||
# Data Filter
|
|
||||||
|
|
||||||
Use to select a subset of the data for the following analysis
|
|
||||||
""")
|
|
||||||
return
|
|
||||||
|
|
||||||
|
|
||||||
@app.cell(hide_code=True)
|
@app.cell(hide_code=True)
|
||||||
def _(S, mo):
|
def _(S, mo):
|
||||||
filter_form = mo.md('''
|
filter_form = mo.md('''
|
||||||
|
|||||||
147
validation.py
147
validation.py
@@ -1,6 +1,7 @@
|
|||||||
import marimo as mo
|
import marimo as mo
|
||||||
import polars as pl
|
import polars as pl
|
||||||
|
import altair as alt
|
||||||
|
from theme import ColorPalette
|
||||||
|
|
||||||
def check_progress(data):
|
def check_progress(data):
|
||||||
"""Check if all responses are complete based on 'progress' column."""
|
"""Check if all responses are complete based on 'progress' column."""
|
||||||
@@ -115,8 +116,9 @@ def check_straight_liners(data, max_score=3):
|
|||||||
).alias(f"__is_straight__{key}")
|
).alias(f"__is_straight__{key}")
|
||||||
|
|
||||||
value_expr = safe_val.alias(f"__val__{key}")
|
value_expr = safe_val.alias(f"__val__{key}")
|
||||||
|
has_data = (list_expr.list.len() > 0).alias(f"__has_data__{key}")
|
||||||
|
|
||||||
expressions.extend([is_straight, value_expr])
|
expressions.extend([is_straight, value_expr, has_data])
|
||||||
|
|
||||||
# collect data with checks
|
# collect data with checks
|
||||||
# We only need _recordId and the check columns
|
# We only need _recordId and the check columns
|
||||||
@@ -156,15 +158,146 @@ def check_straight_liners(data, max_score=3):
|
|||||||
return f"### Straight-lining Checks: \n\n✅ No straight-liners detected (value <= {max_score})"
|
return f"### Straight-lining Checks: \n\n✅ No straight-liners detected (value <= {max_score})"
|
||||||
|
|
||||||
outlier_df = pl.DataFrame(outliers)
|
outlier_df = pl.DataFrame(outliers)
|
||||||
|
|
||||||
|
# --- Analysis & Visualization ---
|
||||||
|
|
||||||
return f"""### Straight-lining Checks:
|
total_respondents = checked_data.height
|
||||||
|
|
||||||
**⚠️ Potential straight-liners detected ⚠️**
|
# 1. & 3. Percentage Calculation
|
||||||
|
group_stats = []
|
||||||
|
value_dist_data = []
|
||||||
|
|
||||||
|
# Calculate Straight-Liners for ALL groups found in Data
|
||||||
|
# Condition: Respondent straight-lined ALL questions that they actually answered (ignoring empty/skipped questions)
|
||||||
|
# Logic: For every group G: if G has data (len > 0), then G must be straight.
|
||||||
|
# Also, the respondent must have answered at least one question group.
|
||||||
|
|
||||||
Respondents selected the same value (<= {max_score}) for all attributes in the following groups:
|
conditions = []
|
||||||
|
has_any_data_exprs = []
|
||||||
|
|
||||||
{mo.ui.table(outlier_df)}
|
for key in multi_attribute_groups.keys():
|
||||||
""", outlier_df
|
flag_col = f"__is_straight__{key}"
|
||||||
|
data_col = f"__has_data__{key}"
|
||||||
|
|
||||||
|
# If has_data is True, is_straight MUST be True for it to count as valid straight-lining behavior for that user.
|
||||||
|
# Equivalent: (not has_data) OR is_straight
|
||||||
|
cond = (~pl.col(data_col)) | pl.col(flag_col)
|
||||||
|
conditions.append(cond)
|
||||||
|
has_any_data_exprs.append(pl.col(data_col))
|
||||||
|
|
||||||
|
all_straight_count = checked_data.filter(
|
||||||
|
pl.all_horizontal(conditions) & pl.any_horizontal(has_any_data_exprs)
|
||||||
|
).height
|
||||||
|
all_straight_pct = (all_straight_count / total_respondents) * 100
|
||||||
|
|
||||||
|
for key in multi_attribute_groups.keys():
|
||||||
|
flag_col = f"__is_straight__{key}"
|
||||||
|
val_col = f"__val__{key}"
|
||||||
|
|
||||||
|
# Filter for straight-liners in this specific group
|
||||||
|
sl_sub = checked_data.filter(pl.col(flag_col))
|
||||||
|
count = sl_sub.height
|
||||||
|
pct = (count / total_respondents) * 100
|
||||||
|
|
||||||
|
group_stats.append({
|
||||||
|
"Question Group": key,
|
||||||
|
"Straight-Liner %": pct,
|
||||||
|
"Count": count
|
||||||
|
})
|
||||||
|
|
||||||
|
# Get Value Distribution for this group's straight-liners
|
||||||
|
if count > 0:
|
||||||
|
# Group by the Value they straight-lined
|
||||||
|
dist = sl_sub.group_by(val_col).agg(pl.len().alias("count"))
|
||||||
|
for row in dist.iter_rows(named=True):
|
||||||
|
value_dist_data.append({
|
||||||
|
"Question Group": key,
|
||||||
|
"Value": row[val_col],
|
||||||
|
"Count": row["count"]
|
||||||
|
})
|
||||||
|
|
||||||
|
stats_df = pl.DataFrame(group_stats)
|
||||||
|
dist_df = pl.DataFrame(value_dist_data)
|
||||||
|
|
||||||
|
# Plot 1: % of Responses with Straight-Liners per Question
|
||||||
|
# Vertical bars with Count label on top
|
||||||
|
base_pct = alt.Chart(stats_df).encode(
|
||||||
|
x=alt.X("Question Group", sort=alt.EncodingSortField(field="Straight-Liner %", order="descending"))
|
||||||
|
)
|
||||||
|
|
||||||
|
bars_pct = base_pct.mark_bar(color=ColorPalette.PRIMARY).encode(
|
||||||
|
y=alt.Y("Straight-Liner %:Q", axis=alt.Axis(format=".1f", title="Share of all responses [%]")),
|
||||||
|
tooltip=["Question Group", alt.Tooltip("Straight-Liner %:Q", format=".1f"), "Count"]
|
||||||
|
)
|
||||||
|
|
||||||
|
text_pct = base_pct.mark_text(dy=-10).encode(
|
||||||
|
y=alt.Y("Straight-Liner %:Q"),
|
||||||
|
text=alt.Text("Count")
|
||||||
|
)
|
||||||
|
|
||||||
|
chart_pct = (bars_pct + text_pct).properties(
|
||||||
|
title="Share of Responses with Straight-Liners per Question",
|
||||||
|
width=800,
|
||||||
|
height=300
|
||||||
|
)
|
||||||
|
|
||||||
|
# Plot 2: Value Distribution (Horizontal Stacked Normalized Bar)
|
||||||
|
# Question Groups sorted by Total Count
|
||||||
|
# Values stacked 1 (left) -> 5 (right)
|
||||||
|
# Legend on top
|
||||||
|
# Total count at bar end
|
||||||
|
|
||||||
|
# Sort order for Y axis (Question Group) based on total Count (descending)
|
||||||
|
# Explicitly calculate sort order from stats_df to ensure consistency across layers
|
||||||
|
# High counts at the top
|
||||||
|
sorted_groups = stats_df.sort("Count", descending=True)["Question Group"].to_list()
|
||||||
|
|
||||||
|
# Base chart for Bars
|
||||||
|
# Use JPMC-aligned colors (blues) instead of default categorical rainbow
|
||||||
|
# Remove legend title as per plots.py style
|
||||||
|
bars_dist = alt.Chart(dist_df).mark_bar().encode(
|
||||||
|
y=alt.Y("Question Group", sort=sorted_groups),
|
||||||
|
x=alt.X("Count", stack="normalize", axis=alt.Axis(format="%"), title="Share of SL Responses"),
|
||||||
|
color=alt.Color("Value:O",
|
||||||
|
title=None, # explicit removal of title like in plots.py
|
||||||
|
scale=alt.Scale(scheme="blues"), # Professional blue scale
|
||||||
|
legend=alt.Legend(orient="top", direction="horizontal")
|
||||||
|
),
|
||||||
|
order=alt.Order("Value", sort="ascending"), # Ensures 1 is Left, 5 is Right
|
||||||
|
tooltip=["Question Group", "Value", "Count"]
|
||||||
|
)
|
||||||
|
|
||||||
|
# Text layer for Total Count (using stats_df which already has totals)
|
||||||
|
# using same sort for Y
|
||||||
|
text_dist = alt.Chart(stats_df).mark_text(align='left', dx=5).encode(
|
||||||
|
y=alt.Y("Question Group", sort=sorted_groups),
|
||||||
|
x=alt.datum(1.0), # Position at 100%
|
||||||
|
text=alt.Text("Count")
|
||||||
|
)
|
||||||
|
|
||||||
|
chart_dist = (bars_dist + text_dist).properties(
|
||||||
|
title="Distribution of Straight-Lined Values",
|
||||||
|
width=800,
|
||||||
|
height=500
|
||||||
|
)
|
||||||
|
|
||||||
|
analysis_md = f"""
|
||||||
|
### Straight-Lining Analysis
|
||||||
|
|
||||||
|
*"Straight-lining" is defined here as selecting the same response value for all attributes within a multi-attribute question group.*
|
||||||
|
|
||||||
|
* **Total Respondents**: {total_respondents}
|
||||||
|
* **Respondents straight-lining ALL questions presented to them**: {all_straight_pct:.2f}% ({all_straight_count} respondents)
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
return mo.vstack([
|
||||||
|
mo.md(f"### Straight-lining Checks:\n\n**⚠️ Potential straight-liners detected ⚠️**\n\n"),
|
||||||
|
mo.ui.table(outlier_df),
|
||||||
|
mo.md(analysis_md),
|
||||||
|
mo.md("#### Speaking Style Question Groups"),
|
||||||
|
alt.vconcat(chart_pct, chart_dist).resolve_legend(color="independent")
|
||||||
|
])
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user