Files
JPMC-quant/validation.py

340 lines
13 KiB
Python
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import marimo as mo
import polars as pl
import altair as alt
from theme import ColorPalette
def check_progress(data):
    """Check if all responses are complete based on 'progress' column.

    A response counts as complete only when progress == 100. The previous
    check only asserted that there was exactly ONE unique progress value,
    so a dataset where every row had e.g. progress = 50 was wrongly
    reported as complete; we now also verify the single value is 100.

    Args:
        data: Polars LazyFrame with a 'progress' column (numeric, or
            numeric-looking strings — cast with strict=False to match the
            casting convention used elsewhere in this file).

    Returns:
        Markdown status string.
    """
    progress_vals = (
        data.select(pl.col('progress').cast(pl.Float64, strict=False).unique())
        .collect()
        .to_series()
    )
    # Exactly one unique value AND that value is 100 -> fully complete.
    # A failed cast yields null, which compares unequal to 100.0 and is
    # therefore (correctly) treated as incomplete.
    if progress_vals.len() == 1 and progress_vals[0] == 100.0:
        return """## Responses Complete: \n\n✅ All responses are complete (progress = 100) """
    return "## Responses Complete: \n\n⚠️ There are incomplete responses (progress < 100) ⚠️"
def duration_validation(data):
    """Validate response durations to identify outliers.

    Flags rows whose 'duration' lies outside mean +/- 3 standard
    deviations. Outliers are reported only — they are NOT removed from
    the dataset.

    Args:
        data: Polars LazyFrame with a numeric 'duration' column (seconds).

    Returns:
        Markdown string (the outlier report embeds a marimo table).
    """
    # Identify any outliers in duration
    duration_stats = data.select(
        pl.col('duration').mean().alias('mean_duration'),
        pl.col('duration').std().alias('std_duration')
    ).collect()
    mean_duration = duration_stats['mean_duration'][0]
    std_duration = duration_stats['std_duration'][0]
    # mean()/std() return null on empty input, and std() is null with
    # fewer than 2 non-null durations. Without this guard the threshold
    # arithmetic and the ":.2f" formatting below would raise on None.
    if mean_duration is None or std_duration is None:
        return "## Duration Outliers: \n\n✅ No duration outliers detected"
    upper_outlier_threshold = mean_duration + 3 * std_duration
    lower_outlier_threshold = mean_duration - 3 * std_duration
    _d = data.with_columns(
        ((pl.col('duration') > upper_outlier_threshold) | (pl.col('duration') < lower_outlier_threshold)).alias('outlier_duration')
    )
    # Show durations where the outlier flag is set (filter on the boolean
    # column directly rather than comparing '== True').
    outlier_data = _d.filter(pl.col('outlier_duration')).collect()
    if outlier_data.shape[0] == 0:
        return "## Duration Outliers: \n\n✅ No duration outliers detected"
    return f"""## Duration Outliers:
**⚠️ Potential outliers detected based on response duration ⚠️**
| Metric | Value |
|--------|-------|
| Mean Duration | {mean_duration:.2f} seconds (approximately {mean_duration/60:.2f} minutes) |
| Standard Deviation of Duration | {std_duration:.2f} seconds |
| Upper Outlier Threshold (Mean + 3*Std) | {upper_outlier_threshold:.2f} seconds |
| Lower Outlier Threshold (Mean - 3*Std) | {lower_outlier_threshold:.2f} seconds |
| Number of Outlier Responses | {outlier_data.shape[0]} |
Outliers:
{mo.ui.table(outlier_data)}
**⚠️ NOTE: These have not been removed from the dataset ⚠️**
"""
def check_straight_liners(data, max_score=3):
    """
    Check for straight-lining behavior (selecting same value for all attributes).

    Args:
        data: Polars LazyFrame
        max_score: The maximum score that is flagged if straight-lined (e.g., if 4, then 5s are allowed).

    Returns:
        A 2-tuple ``(report, outlier_df)`` on every path:
        - ``report``: a markdown string, or a marimo vstack element when
          straight-liners were found.
        - ``outlier_df``: a Polars DataFrame of flagged rows, or None when
          nothing was flagged (or no question groups were found).
    """
    import re
    # detect columns groups based on pattern SS_...__Vxx__Choice_y
    schema_names = data.collect_schema().names()
    # regex groupings
    pattern_choice = re.compile(r"(.*__V\d+)__Choice_\d+")
    pattern_scale = re.compile(r"Voice_Scale_1_10__V\d+")
    groups = {}
    for col in schema_names:
        # Check for Choice pattern (SS_...__Vxx__Choice_y)
        match_choice = pattern_choice.search(col)
        if match_choice:
            group_key = match_choice.group(1)
            if group_key not in groups:
                groups[group_key] = []
            groups[group_key].append(col)
            continue
        # Check for Voice Scale pattern (Voice_Scale_1_10__Vxx)
        # All of these form a single group "Voice_Scale_1_10"
        if pattern_scale.search(col):
            group_key = "Voice_Scale_1_10"
            if group_key not in groups:
                groups[group_key] = []
            groups[group_key].append(col)
    # Filter for groups with multiple attributes/choices
    multi_attribute_groups = {k: v for k, v in groups.items() if len(v) > 1}
    if not multi_attribute_groups:
        # BUGFIX: return a (message, None) tuple like every other exit path;
        # previously this returned a bare string, so callers unpacking the
        # usual (report, outlier_df) pair crashed on this branch.
        return "### Straight-lining Checks: \n\n No multi-attribute question groups found.", None
    # Cast all involved columns to Float64 (strict=False) to handle potential string columns
    # and 1-10 scale floats (e.g. 5.5). Float64 covers integers as well.
    all_group_cols = [col for cols in multi_attribute_groups.values() for col in cols]
    data = data.with_columns([
        pl.col(col).cast(pl.Float64, strict=False) for col in all_group_cols
    ])
    # Build expressions
    expressions = []
    for key, cols in multi_attribute_groups.items():
        # Logic:
        # 1. Create list of values
        # 2. Drop nulls
        # 3. Check if all remaining are equal (n_unique == 1) AND value <= max_score
        list_expr = pl.concat_list(cols).list.drop_nulls()
        # Use .list.min() instead of .list.get(0) to avoid "index out of bounds" on empty lists
        # If n_unique == 1, min() is the same as the single value.
        # If list is empty, min() is null, which is safe.
        safe_val = list_expr.list.min()
        is_straight = (
            (list_expr.list.len() > 0) &
            (list_expr.list.n_unique() == 1) &
            (safe_val <= max_score)
        ).alias(f"__is_straight__{key}")
        value_expr = safe_val.alias(f"__val__{key}")
        has_data = (list_expr.list.len() > 0).alias(f"__has_data__{key}")
        expressions.extend([is_straight, value_expr, has_data])
    # collect data with checks
    # We only need _recordId and the check columns
    # We do with_columns then select implicitly/explicitly via filter/select later.
    checked_data = data.with_columns(expressions).collect()
    # Process results into a nice table
    outliers = []
    for key, group_cols in multi_attribute_groups.items():
        flag_col = f"__is_straight__{key}"
        val_col = f"__val__{key}"
        filtered = checked_data.filter(pl.col(flag_col))
        if filtered.height > 0:
            # Sort group_cols logic
            # If Choice columns, sort by choice number.
            # If Voice Scale columns (no Choice_), sort by Voice ID (Vxx)
            if all("__Choice_" in c for c in group_cols):
                key_func = lambda c: int(c.split('__Choice_')[-1])
            else:
                # Extract digits from Vxx
                def key_func(c):
                    m = re.search(r"__V(\d+)", c)
                    return int(m.group(1)) if m else 0
            sorted_group_cols = sorted(group_cols, key=key_func)
            # Select relevant columns: Record ID, Value, and the sorted group columns
            subset = filtered.select(["_recordId", val_col] + sorted_group_cols)
            for row in subset.iter_rows(named=True):
                # Create ordered list of values, using 'NaN' for missing data
                resp_list = [row[c] if row[c] is not None else 'NaN' for c in sorted_group_cols]
                outliers.append({
                    "Record ID": row["_recordId"],
                    "Question Group": key,
                    "Value": row[val_col],
                    "Responses": str(resp_list)
                })
    if not outliers:
        return f"### Straight-lining Checks: \n\n✅ No straight-liners detected (value <= {max_score})", None
    outlier_df = pl.DataFrame(outliers)
    # --- Analysis & Visualization ---
    total_respondents = checked_data.height
    # 1. & 3. Percentage Calculation
    group_stats = []
    value_dist_data = []
    # Calculate Straight-Liners for ALL groups found in Data
    # Condition: Respondent straight-lined ALL questions that they actually answered (ignoring empty/skipped questions)
    # Logic: For every group G: if G has data (len > 0), then G must be straight.
    # Also, the respondent must have answered at least one question group.
    conditions = []
    has_any_data_exprs = []
    for key in multi_attribute_groups.keys():
        flag_col = f"__is_straight__{key}"
        data_col = f"__has_data__{key}"
        # If has_data is True, is_straight MUST be True for it to count as valid straight-lining behavior for that user.
        # Equivalent: (not has_data) OR is_straight
        cond = (~pl.col(data_col)) | pl.col(flag_col)
        conditions.append(cond)
        has_any_data_exprs.append(pl.col(data_col))
    all_straight_count = checked_data.filter(
        pl.all_horizontal(conditions) & pl.any_horizontal(has_any_data_exprs)
    ).height
    all_straight_pct = (all_straight_count / total_respondents) * 100
    for key in multi_attribute_groups.keys():
        flag_col = f"__is_straight__{key}"
        val_col = f"__val__{key}"
        # Filter for straight-liners in this specific group
        sl_sub = checked_data.filter(pl.col(flag_col))
        count = sl_sub.height
        pct = (count / total_respondents) * 100
        group_stats.append({
            "Question Group": key,
            "Straight-Liner %": pct,
            "Count": count
        })
        # Get Value Distribution for this group's straight-liners
        if count > 0:
            # Group by the Value they straight-lined
            dist = sl_sub.group_by(val_col).agg(pl.len().alias("count"))
            for row in dist.iter_rows(named=True):
                value_dist_data.append({
                    "Question Group": key,
                    "Value": row[val_col],
                    "Count": row["count"]
                })
    stats_df = pl.DataFrame(group_stats)
    dist_df = pl.DataFrame(value_dist_data)
    # Plot 1: % of Responses with Straight-Liners per Question
    # Vertical bars with Count label on top
    base_pct = alt.Chart(stats_df).encode(
        x=alt.X("Question Group", sort=alt.EncodingSortField(field="Straight-Liner %", order="descending"))
    )
    bars_pct = base_pct.mark_bar(color=ColorPalette.PRIMARY).encode(
        y=alt.Y("Straight-Liner %:Q", axis=alt.Axis(format=".1f", title="Share of all responses [%]")),
        tooltip=["Question Group", alt.Tooltip("Straight-Liner %:Q", format=".1f"), "Count"]
    )
    text_pct = base_pct.mark_text(dy=-10).encode(
        y=alt.Y("Straight-Liner %:Q"),
        text=alt.Text("Count")
    )
    chart_pct = (bars_pct + text_pct).properties(
        title="Share of Responses with Straight-Liners per Question",
        width=800,
        height=300
    )
    # Plot 2: Value Distribution (Horizontal Stacked Normalized Bar)
    # Question Groups sorted by Total Count
    # Values stacked 1 (left) -> 5 (right)
    # Legend on top
    # Total count at bar end
    # Sort order for Y axis (Question Group) based on total Count (descending)
    # Explicitly calculate sort order from stats_df to ensure consistency across layers
    # High counts at the top
    sorted_groups = stats_df.sort("Count", descending=True)["Question Group"].to_list()
    # Base chart for Bars
    # Use JPMC-aligned colors (blues) instead of default categorical rainbow
    # Remove legend title as per plots.py style
    bars_dist = alt.Chart(dist_df).mark_bar().encode(
        y=alt.Y("Question Group", sort=sorted_groups),
        x=alt.X("Count", stack="normalize", axis=alt.Axis(format="%"), title="Share of SL Responses"),
        color=alt.Color("Value:O",
            title=None,  # explicit removal of title like in plots.py
            scale=alt.Scale(scheme="blues"),  # Professional blue scale
            legend=alt.Legend(orient="top", direction="horizontal")
        ),
        order=alt.Order("Value", sort="ascending"),  # Ensures 1 is Left, 5 is Right
        tooltip=["Question Group", "Value", "Count"]
    )
    # Text layer for Total Count (using stats_df which already has totals)
    # using same sort for Y
    text_dist = alt.Chart(stats_df).mark_text(align='left', dx=5).encode(
        y=alt.Y("Question Group", sort=sorted_groups),
        x=alt.datum(1.0),  # Position at 100%
        text=alt.Text("Count")
    )
    chart_dist = (bars_dist + text_dist).properties(
        title="Distribution of Straight-Lined Values",
        width=800,
        height=500
    )
    analysis_md = f"""
### Straight-Lining Analysis
*"Straight-lining" is defined here as selecting the same response value for all attributes within a multi-attribute question group.*
* **Total Respondents**: {total_respondents}
* **Respondents straight-lining ALL questions presented to them**: {all_straight_pct:.2f}% ({all_straight_count} respondents)
"""
    return (mo.vstack([
        mo.md(f"**⚠️ Potential straight-liners detected ⚠️**\n\n"),
        mo.ui.table(outlier_df),
        mo.md(analysis_md),
        alt.vconcat(chart_pct, chart_dist).resolve_legend(color="independent")
    ]), outlier_df)
if __name__ == "__main__":
    # Ad-hoc driver: load the latest labelled export and run the
    # straight-lining check on the Voice 1-10 scale subset.
    from utils import QualtricsSurvey

    RESULTS_FILE = "data/exports/OneDrive_2026-01-28/1-28-26 Afternoon/JPMC_Chase Brand Personality_Quant Round 1_January 28, 2026_Afternoon_Labels.csv"
    QSF_FILE = "data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf"

    survey = QualtricsSurvey(RESULTS_FILE, QSF_FILE)
    survey_data = survey.load_data()
    # NOTE: the green/blue and orange/red semantic-scale subsets can be
    # checked the same way via survey.get_ss_green_blue(survey_data)[0]
    # and survey.get_ss_orange_red(survey_data)[0].
    print("Checking Voice Scale 1-10:")
    print(check_straight_liners(survey.get_voice_scale_1_10(survey_data)[0]))