340 lines
13 KiB
Python
340 lines
13 KiB
Python
import marimo as mo
|
||
import polars as pl
|
||
import altair as alt
|
||
from theme import ColorPalette
|
||
|
||
def check_progress(data):
    """Check if all responses are complete based on 'progress' column.

    Args:
        data: Polars LazyFrame with a 'progress' column (numeric or
            string-typed percentage, 0-100).

    Returns:
        Markdown string stating whether every response reached 100%.
    """
    unique_vals = data.collect().select(pl.col('progress').unique()).to_series().to_list()
    # FIX: the original only checked that a single unique value exists,
    # which would report "complete" even if every row were stuck at e.g. 50.
    # Require that the single shared value is actually 100.
    # float() tolerates both numeric and string-typed progress columns.
    try:
        all_complete = len(unique_vals) == 1 and float(unique_vals[0]) == 100.0
    except (TypeError, ValueError):
        all_complete = False

    if all_complete:
        return """## Responses Complete: \n\n✅ All responses are complete (progress = 100) """

    return "## Responses Complete: \n\n⚠️ There are incomplete responses (progress < 100) ⚠️"
|
||
|
||
|
||
def duration_validation(data):
    """Validate response durations to identify outliers.

    An outlier is any response whose 'duration' lies more than three
    standard deviations away from the mean (mean ± 3*std).

    Args:
        data: Polars LazyFrame with a numeric 'duration' column
            (presumably seconds — the report formats it as such).

    Returns:
        Markdown string summarising the check. Outlier rows are reported
        via a table but NOT removed from the dataset.
    """
    duration_stats = data.select(
        pl.col('duration').mean().alias('mean_duration'),
        pl.col('duration').std().alias('std_duration')
    ).collect()
    mean_duration = duration_stats['mean_duration'][0]
    std_duration = duration_stats['std_duration'][0]

    # FIX: with an empty frame (or a single row) mean/std come back null,
    # and the threshold arithmetic below would fail. No meaningful outlier
    # threshold exists in that case, so report no outliers.
    if mean_duration is None or std_duration is None:
        return "## Duration Outliers: \n\n✅ No duration outliers detected"

    upper_outlier_threshold = mean_duration + 3 * std_duration
    lower_outlier_threshold = mean_duration - 3 * std_duration

    flagged = data.with_columns(
        ((pl.col('duration') > upper_outlier_threshold) | (pl.col('duration') < lower_outlier_threshold)).alias('outlier_duration')
    )

    # Keep only rows flagged as outliers (idiomatic boolean filter,
    # was `pl.col('outlier_duration') == True`).
    outlier_data = flagged.filter(pl.col('outlier_duration')).collect()

    if outlier_data.shape[0] == 0:
        return "## Duration Outliers: \n\n✅ No duration outliers detected"

    return f"""## Duration Outliers:

**⚠️ Potential outliers detected based on response duration ⚠️**

| Metric | Value |
|--------|-------|
| Mean Duration | {mean_duration:.2f} seconds (approximately {mean_duration/60:.2f} minutes) |
| Standard Deviation of Duration | {std_duration:.2f} seconds |
| Upper Outlier Threshold (Mean + 3*Std) | {upper_outlier_threshold:.2f} seconds |
| Lower Outlier Threshold (Mean - 3*Std) | {lower_outlier_threshold:.2f} seconds |
| Number of Outlier Responses | {outlier_data.shape[0]} |

Outliers:

{mo.ui.table(outlier_data)}


**⚠️ NOTE: These have not been removed from the dataset ⚠️**

"""
|
||
|
||
|
||
def check_straight_liners(data, max_score=3):
    """
    Check for straight-lining behavior (selecting same value for all attributes).

    A respondent "straight-lines" a question group when every answered
    attribute in that group carries the same value and that value is at
    most ``max_score``. Question groups are detected from column names:

    * ``SS_...__Vxx__Choice_y`` columns group by the ``SS_...__Vxx`` prefix.
    * ``Voice_Scale_1_10__Vxx`` columns all form one "Voice_Scale_1_10" group.

    Args:
        data: Polars LazyFrame
        max_score: The maximum score that is flagged if straight-lined (e.g., if 4, then 5s are allowed).

    Returns:
        Tuple ``(content, outlier_df)`` where ``content`` is a marimo element
        (or a markdown string) and ``outlier_df`` is a DataFrame of flagged
        responses, or ``None`` when nothing was flagged.
    """
    import re

    # Detect column groups based on pattern SS_...__Vxx__Choice_y
    schema_names = data.collect_schema().names()

    # regex groupings
    pattern_choice = re.compile(r"(.*__V\d+)__Choice_\d+")
    pattern_scale = re.compile(r"Voice_Scale_1_10__V\d+")

    groups = {}
    for col in schema_names:
        # Choice pattern (SS_...__Vxx__Choice_y): group by the shared prefix.
        match_choice = pattern_choice.search(col)
        if match_choice:
            groups.setdefault(match_choice.group(1), []).append(col)
            continue

        # Voice Scale pattern (Voice_Scale_1_10__Vxx):
        # all of these form a single group "Voice_Scale_1_10".
        if pattern_scale.search(col):
            groups.setdefault("Voice_Scale_1_10", []).append(col)

    # Only groups with multiple attributes/choices can be straight-lined.
    multi_attribute_groups = {k: v for k, v in groups.items() if len(v) > 1}

    if not multi_attribute_groups:
        # FIX: return a (message, None) tuple like every other exit path,
        # so callers unpacking two values never crash on this branch.
        return "### Straight-lining Checks: \n\nℹ️ No multi-attribute question groups found.", None

    # Cast all involved columns to Float64 (strict=False) to handle potential
    # string columns and 1-10 scale floats (e.g. 5.5). Float64 covers ints too.
    all_group_cols = [col for cols in multi_attribute_groups.values() for col in cols]
    data = data.with_columns([
        pl.col(col).cast(pl.Float64, strict=False) for col in all_group_cols
    ])

    # Build per-group expressions:
    # 1. Create a list of the group's values
    # 2. Drop nulls
    # 3. Straight-lined iff all remaining are equal (n_unique == 1) AND value <= max_score
    expressions = []
    for key, cols in multi_attribute_groups.items():
        list_expr = pl.concat_list(cols).list.drop_nulls()

        # Use .list.min() instead of .list.get(0) to avoid "index out of bounds"
        # on empty lists. If n_unique == 1, min() equals the single value;
        # if the list is empty, min() is null, which is safe.
        safe_val = list_expr.list.min()

        is_straight = (
            (list_expr.list.len() > 0) &
            (list_expr.list.n_unique() == 1) &
            (safe_val <= max_score)
        ).alias(f"__is_straight__{key}")

        value_expr = safe_val.alias(f"__val__{key}")
        has_data = (list_expr.list.len() > 0).alias(f"__has_data__{key}")

        expressions.extend([is_straight, value_expr, has_data])

    # Materialize once with all check columns attached.
    checked_data = data.with_columns(expressions).collect()

    # Process results into a nice table.
    outliers = []
    for key, group_cols in multi_attribute_groups.items():
        flag_col = f"__is_straight__{key}"
        val_col = f"__val__{key}"

        filtered = checked_data.filter(pl.col(flag_col))
        if filtered.height == 0:
            continue

        # Sort group columns for display:
        # Choice columns sort by choice number; Voice Scale columns
        # (no Choice_) sort by the Vxx id.
        if all("__Choice_" in c for c in group_cols):
            def key_func(c):
                return int(c.split('__Choice_')[-1])
        else:
            def key_func(c):
                m = re.search(r"__V(\d+)", c)
                return int(m.group(1)) if m else 0

        sorted_group_cols = sorted(group_cols, key=key_func)

        # Select relevant columns: Record ID, Value, and the sorted group columns.
        subset = filtered.select(["_recordId", val_col] + sorted_group_cols)

        for row in subset.iter_rows(named=True):
            # Ordered list of values, using 'NaN' for missing data.
            resp_list = [row[c] if row[c] is not None else 'NaN' for c in sorted_group_cols]
            outliers.append({
                "Record ID": row["_recordId"],
                "Question Group": key,
                "Value": row[val_col],
                "Responses": str(resp_list)
            })

    if not outliers:
        return f"### Straight-lining Checks: \n\n✅ No straight-liners detected (value <= {max_score})", None

    outlier_df = pl.DataFrame(outliers)

    # --- Analysis & Visualization ---

    total_respondents = checked_data.height

    # A respondent counts as "straight-lining ALL questions" when every group
    # they actually answered is straight-lined (skipped/empty groups are
    # ignored) AND they answered at least one group.
    conditions = []
    has_any_data_exprs = []
    for key in multi_attribute_groups.keys():
        flag_col = f"__is_straight__{key}"
        data_col = f"__has_data__{key}"
        # (not has_data) OR is_straight: every answered group must be straight.
        conditions.append((~pl.col(data_col)) | pl.col(flag_col))
        has_any_data_exprs.append(pl.col(data_col))

    all_straight_count = checked_data.filter(
        pl.all_horizontal(conditions) & pl.any_horizontal(has_any_data_exprs)
    ).height
    all_straight_pct = (all_straight_count / total_respondents) * 100

    # Per-group straight-liner share plus the distribution of the values
    # that were straight-lined.
    group_stats = []
    value_dist_data = []
    for key in multi_attribute_groups.keys():
        flag_col = f"__is_straight__{key}"
        val_col = f"__val__{key}"

        # Filter for straight-liners in this specific group.
        sl_sub = checked_data.filter(pl.col(flag_col))
        count = sl_sub.height
        pct = (count / total_respondents) * 100

        group_stats.append({
            "Question Group": key,
            "Straight-Liner %": pct,
            "Count": count
        })

        if count > 0:
            # Group by the value they straight-lined.
            dist = sl_sub.group_by(val_col).agg(pl.len().alias("count"))
            for row in dist.iter_rows(named=True):
                value_dist_data.append({
                    "Question Group": key,
                    "Value": row[val_col],
                    "Count": row["count"]
                })

    stats_df = pl.DataFrame(group_stats)
    dist_df = pl.DataFrame(value_dist_data)

    # Plot 1: % of responses with straight-liners per question group.
    # Vertical bars with the absolute count labelled on top.
    base_pct = alt.Chart(stats_df).encode(
        x=alt.X("Question Group", sort=alt.EncodingSortField(field="Straight-Liner %", order="descending"))
    )

    bars_pct = base_pct.mark_bar(color=ColorPalette.PRIMARY).encode(
        y=alt.Y("Straight-Liner %:Q", axis=alt.Axis(format=".1f", title="Share of all responses [%]")),
        tooltip=["Question Group", alt.Tooltip("Straight-Liner %:Q", format=".1f"), "Count"]
    )

    text_pct = base_pct.mark_text(dy=-10).encode(
        y=alt.Y("Straight-Liner %:Q"),
        text=alt.Text("Count")
    )

    chart_pct = (bars_pct + text_pct).properties(
        title="Share of Responses with Straight-Liners per Question",
        width=800,
        height=300
    )

    # Plot 2: value distribution as a horizontal normalized stacked bar.
    # Question groups sorted by total count (high counts at the top);
    # values stacked low (left) -> high (right); legend on top;
    # total count rendered at the bar end.
    # Sort order is computed explicitly from stats_df so both layers agree.
    sorted_groups = stats_df.sort("Count", descending=True)["Question Group"].to_list()

    # Use JPMC-aligned colors (blues) instead of default categorical rainbow.
    bars_dist = alt.Chart(dist_df).mark_bar().encode(
        y=alt.Y("Question Group", sort=sorted_groups),
        x=alt.X("Count", stack="normalize", axis=alt.Axis(format="%"), title="Share of SL Responses"),
        color=alt.Color("Value:O",
                        title=None,  # explicit removal of title like in plots.py
                        scale=alt.Scale(scheme="blues"),  # Professional blue scale
                        legend=alt.Legend(orient="top", direction="horizontal")
                        ),
        order=alt.Order("Value", sort="ascending"),  # Ensures 1 is Left, 5 is Right
        tooltip=["Question Group", "Value", "Count"]
    )

    # Text layer for the total count (stats_df already has totals);
    # uses the same Y sort so the layers align.
    text_dist = alt.Chart(stats_df).mark_text(align='left', dx=5).encode(
        y=alt.Y("Question Group", sort=sorted_groups),
        x=alt.datum(1.0),  # Position at 100%
        text=alt.Text("Count")
    )

    chart_dist = (bars_dist + text_dist).properties(
        title="Distribution of Straight-Lined Values",
        width=800,
        height=500
    )

    analysis_md = f"""
### Straight-Lining Analysis

*"Straight-lining" is defined here as selecting the same response value for all attributes within a multi-attribute question group.*

* **Total Respondents**: {total_respondents}
* **Respondents straight-lining ALL questions presented to them**: {all_straight_pct:.2f}% ({all_straight_count} respondents)

"""

    return (mo.vstack([
        mo.md(f"**⚠️ Potential straight-liners detected ⚠️**\n\n"),
        mo.ui.table(outlier_df),
        mo.md(analysis_md),
        alt.vconcat(chart_pct, chart_dist).resolve_legend(color="independent")
    ]), outlier_df)
|
||
|
||
|
||
|
||
if __name__ == "__main__":
    from utils import QualtricsSurvey

    # Input artifacts: labeled CSV export plus the matching survey
    # definition file (QSF).
    RESULTS_FILE = "data/exports/OneDrive_2026-01-28/1-28-26 Afternoon/JPMC_Chase Brand Personality_Quant Round 1_January 28, 2026_Afternoon_Labels.csv"
    QSF_FILE = "data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf"

    survey = QualtricsSurvey(RESULTS_FILE, QSF_FILE)
    data = survey.load_data()

    # Run the straight-lining check on the 1-10 voice-scale question group.
    print("Checking Voice Scale 1-10:")
    print(check_straight_liners(survey.get_voice_scale_1_10(data)[0]))