import re

import altair as alt
import marimo as mo
import polars as pl

from theme import ColorPalette

# Matches choice columns of the form ``<prefix>__V<nn>__Choice_<k>``.
# Group 1 is the question-group key, group 2 is the choice number.
_CHOICE_COLUMN_RE = re.compile(r"(.*__V\d+)__Choice_(\d+)")


def check_progress(data: pl.LazyFrame) -> str:
    """Report whether every response has ``progress`` equal to 100.

    Args:
        data: Polars LazyFrame containing a ``progress`` column.

    Returns:
        A markdown string: a success message when there is at least one row
        and every row has ``progress == 100``, otherwise a warning message.
    """
    # Aggregate lazily so only the 'progress' column is materialised.
    # Values are cast non-strictly so string-typed columns still compare
    # numerically; anything that fails to cast (or is null) counts as
    # incomplete rather than being silently ignored by ``all()``.
    summary = data.select(
        pl.len().alias("n_rows"),
        (pl.col("progress").cast(pl.Float64, strict=False) == 100)
        .fill_null(False)
        .all()
        .alias("all_complete"),
    ).collect()
    n_rows, all_complete = summary.row(0)

    # An empty frame is reported as incomplete rather than vacuously complete.
    if n_rows > 0 and all_complete:
        return "### Responses Complete: \n\n✅ All responses are complete (progress = 100)\n"
    return "### Responses Complete: \n\n⚠️ There are incomplete responses (progress < 100) ⚠️"


def duration_validation(data: pl.LazyFrame) -> str:
    """Flag responses whose ``duration`` lies more than 3 std from the mean.

    Args:
        data: Polars LazyFrame containing a ``duration`` column.

    Returns:
        A markdown string. When outliers exist it contains a summary table of
        the thresholds and an embedded ``mo.ui.table`` of the outlier rows.
        The outliers are only reported; the input data is not modified.
    """
    stats = data.select(
        pl.col("duration").mean().alias("mean_duration"),
        pl.col("duration").std().alias("std_duration"),
    ).collect()
    mean_duration = stats["mean_duration"][0]
    std_duration = stats["std_duration"][0]

    # mean is null for empty/all-null data and std is null with fewer than
    # two values; without this guard the threshold arithmetic below would
    # raise a TypeError on None.
    if mean_duration is None or std_duration is None:
        return "### Duration Outliers: \n\nℹ️ Not enough duration data to check for outliers"

    upper_outlier_threshold = mean_duration + 3 * std_duration
    lower_outlier_threshold = mean_duration - 3 * std_duration

    is_outlier = (pl.col("duration") > upper_outlier_threshold) | (
        pl.col("duration") < lower_outlier_threshold
    )
    outlier_data = (
        data.with_columns(is_outlier.alias("outlier_duration"))
        .filter(pl.col("outlier_duration"))
        .collect()
    )

    if outlier_data.height == 0:
        return "### Duration Outliers: \n\n✅ No duration outliers detected"

    # Built from explicit lines so no source indentation leaks into the
    # markdown (indented lines would otherwise render as code blocks).
    return (
        "### Duration Outliers: \n\n"
        "**⚠️ Potential outliers detected based on response duration ⚠️**\n\n"
        "| Metric | Value |\n"
        "|--------|-------|\n"
        f"| Mean Duration | {mean_duration:.2f} seconds (approximately {mean_duration/60:.2f} minutes) |\n"
        f"| Standard Deviation of Duration | {std_duration:.2f} seconds |\n"
        f"| Upper Outlier Threshold (Mean + 3*Std) | {upper_outlier_threshold:.2f} seconds |\n"
        f"| Lower Outlier Threshold (Mean - 3*Std) | {lower_outlier_threshold:.2f} seconds |\n"
        f"| Number of Outlier Responses | {outlier_data.shape[0]} |\n\n"
        "Outliers:\n\n"
        f"{mo.ui.table(outlier_data)}\n\n"
        "**⚠️ NOTE: These have not been removed from the dataset ⚠️**\n"
    )


def _group_choice_columns(column_names):
    """Group choice columns by question key, keeping multi-column groups only.

    Returns a dict mapping each group key to its column names ordered by
    choice number. Groups keep the order in which they first appear.
    """
    members_by_group = {}
    for name in column_names:
        match = _CHOICE_COLUMN_RE.search(name)
        if match:
            # Take the choice number from the regex instead of re-splitting the
            # name: the pattern is not anchored at the end, so a suffixed
            # column (e.g. "...__Choice_2_TEXT") would break int() on the tail.
            members_by_group.setdefault(match.group(1), []).append(
                (int(match.group(2)), name)
            )

    return {
        key: [name for _, name in sorted(members, key=lambda member: member[0])]
        for key, members in members_by_group.items()
        if len(members) > 1
    }


def _straight_line_expressions(groups, max_score):
    """Build the per-group flag, value and has-data expressions."""
    expressions = []
    for key, cols in groups.items():
        answered = pl.concat_list(cols).list.drop_nulls()
        has_answers = answered.list.len() > 0
        # min() rather than get(0): it is null on an empty list instead of
        # failing, and equals the single value when n_unique == 1.
        common_value = answered.list.min()
        is_straight = (
            has_answers
            & (answered.list.n_unique() == 1)
            & (common_value <= max_score)
        )
        expressions.extend(
            [
                is_straight.alias(f"__is_straight__{key}"),
                common_value.alias(f"__val__{key}"),
                has_answers.alias(f"__has_data__{key}"),
            ]
        )
    return expressions


def _build_straight_liner_charts(stats_df, dist_df):
    """Build the stacked share-per-group and value-distribution charts."""
    # Plot 1: vertical bars of the straight-liner share, count label on top.
    base_pct = alt.Chart(stats_df).encode(
        x=alt.X(
            "Question Group",
            sort=alt.EncodingSortField(field="Straight-Liner %", order="descending"),
        )
    )
    bars_pct = base_pct.mark_bar(color=ColorPalette.PRIMARY).encode(
        y=alt.Y(
            "Straight-Liner %:Q",
            axis=alt.Axis(format=".1f", title="Share of all responses [%]"),
        ),
        tooltip=[
            "Question Group",
            alt.Tooltip("Straight-Liner %:Q", format=".1f"),
            "Count",
        ],
    )
    text_pct = base_pct.mark_text(dy=-10).encode(
        y=alt.Y("Straight-Liner %:Q"),
        text=alt.Text("Count"),
    )
    chart_pct = (bars_pct + text_pct).properties(
        title="Share of Responses with Straight-Liners per Question",
        width=800,
        height=300,
    )

    # Plot 2: horizontal normalized stacked bars of the straight-lined values.
    # The Y order is computed once from stats_df so both layers agree
    # (highest counts first).
    sorted_groups = stats_df.sort("Count", descending=True)["Question Group"].to_list()

    bars_dist = alt.Chart(dist_df).mark_bar().encode(
        y=alt.Y("Question Group", sort=sorted_groups),
        x=alt.X(
            "Count",
            stack="normalize",
            axis=alt.Axis(format="%"),
            title="Share of SL Responses",
        ),
        color=alt.Color(
            "Value:O",
            title=None,  # no legend title
            scale=alt.Scale(scheme="blues"),
            legend=alt.Legend(orient="top", direction="horizontal"),
        ),
        # Ascending stack order: lower values on the left.
        order=alt.Order("Value", sort="ascending"),
        tooltip=["Question Group", "Value", "Count"],
    )
    # Total count label placed at the 100% mark of each bar.
    text_dist = alt.Chart(stats_df).mark_text(align="left", dx=5).encode(
        y=alt.Y("Question Group", sort=sorted_groups),
        x=alt.datum(1.0),
        text=alt.Text("Count"),
    )
    chart_dist = (bars_dist + text_dist).properties(
        title="Distribution of Straight-Lined Values",
        width=800,
        height=500,
    )

    return alt.vconcat(chart_pct, chart_dist).resolve_legend(color="independent")


def check_straight_liners(data: pl.LazyFrame, max_score: int = 3):
    """Check for straight-lining (the same value for all attributes of a question).

    Question groups are detected from column names matching
    ``<prefix>__V<nn>__Choice_<k>``; only groups with more than one choice
    column are checked. Null answers are ignored, and a respondent is flagged
    for a group when all remaining answers are identical and that value is
    ``<= max_score``.

    Args:
        data: Polars LazyFrame. Must contain a ``_recordId`` column when any
            straight-liners are found.
        max_score: The maximum score that is flagged if straight-lined
            (e.g., if 4, then 5s are allowed).

    Returns:
        A markdown string when no multi-attribute groups exist or nothing is
        flagged; otherwise a ``mo.vstack`` with the table of flagged
        responses, a summary, and two charts.
    """
    groups = _group_choice_columns(data.collect_schema().names())
    if not groups:
        return "### Straight-lining Checks: \n\nℹ️ No multi-attribute question groups found."

    # Cast non-strictly to Int64 so string-typed columns can be compared with
    # max_score; values that do not parse become null and are dropped.
    all_group_cols = [col for cols in groups.values() for col in cols]
    data = data.with_columns(
        [pl.col(col).cast(pl.Int64, strict=False) for col in all_group_cols]
    )
    checked_data = data.with_columns(
        _straight_line_expressions(groups, max_score)
    ).collect()

    # Filter once per group; reused for both the table and the statistics.
    straight_by_group = {
        key: checked_data.filter(pl.col(f"__is_straight__{key}")) for key in groups
    }

    flagged_rows = []
    for key, cols in groups.items():
        straight = straight_by_group[key]
        if straight.height == 0:
            continue
        val_col = f"__val__{key}"
        subset = straight.select(["_recordId", val_col] + cols)
        for row in subset.iter_rows(named=True):
            # Responses in choice order, with 'NaN' marking missing answers.
            responses = [row[c] if row[c] is not None else "NaN" for c in cols]
            flagged_rows.append(
                {
                    "Record ID": row["_recordId"],
                    "Question Group": key,
                    "Value": row[val_col],
                    "Responses": str(responses),
                }
            )

    if not flagged_rows:
        return f"### Straight-lining Checks: \n\n✅ No straight-liners detected (value <= {max_score})"

    outlier_df = pl.DataFrame(flagged_rows)

    # --- Analysis & Visualization ---
    # At least one row was flagged, so total_respondents > 0 below.
    total_respondents = checked_data.height

    # Respondents who straight-lined every group they answered: for each
    # group, (no data) OR (straight), and at least one group has data.
    straight_or_skipped = [
        (~pl.col(f"__has_data__{key}")) | pl.col(f"__is_straight__{key}")
        for key in groups
    ]
    answered_any = [pl.col(f"__has_data__{key}") for key in groups]
    all_straight_count = checked_data.filter(
        pl.all_horizontal(straight_or_skipped) & pl.any_horizontal(answered_any)
    ).height
    all_straight_pct = (all_straight_count / total_respondents) * 100

    group_stats = []
    value_dist_data = []
    for key in groups:
        val_col = f"__val__{key}"
        straight = straight_by_group[key]
        count = straight.height
        group_stats.append(
            {
                "Question Group": key,
                "Straight-Liner %": (count / total_respondents) * 100,
                "Count": count,
            }
        )
        if count > 0:
            # Sorted so the row order is deterministic (group_by does not
            # guarantee output order).
            dist = straight.group_by(val_col).agg(pl.len().alias("count")).sort(val_col)
            for row in dist.iter_rows(named=True):
                value_dist_data.append(
                    {
                        "Question Group": key,
                        "Value": row[val_col],
                        "Count": row["count"],
                    }
                )

    stats_df = pl.DataFrame(group_stats)
    dist_df = pl.DataFrame(value_dist_data)

    analysis_md = (
        "\n### Straight-Lining Analysis\n\n"
        '*"Straight-lining" is defined here as selecting the same response value '
        "for all attributes within a multi-attribute question group.*\n\n"
        f"* **Total Respondents**: {total_respondents}\n"
        "* **Respondents straight-lining ALL questions presented to them**: "
        f"{all_straight_pct:.2f}% ({all_straight_count} respondents)\n"
    )

    return mo.vstack(
        [
            mo.md("### Straight-lining Checks:\n\n**⚠️ Potential straight-liners detected ⚠️**\n\n"),
            mo.ui.table(outlier_df),
            mo.md(analysis_md),
            mo.md("#### Speaking Style Question Groups"),
            _build_straight_liner_charts(stats_df, dist_df),
        ]
    )


if __name__ == "__main__":
    from utils import JPMCSurvey

    RESULTS_FILE = "data/exports/OneDrive_2026-01-28/1-28-26 Afternoon/JPMC_Chase Brand Personality_Quant Round 1_January 28, 2026_Afternoon_Labels.csv"
    QSF_FILE = "data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf"

    S = JPMCSurvey(RESULTS_FILE, QSF_FILE)
    data = S.load_data()

    print("Checking Green Blue:")
    print(check_straight_liners(S.get_ss_green_blue(data)[0]))
    print("Checking Orange Red:")
    print(check_straight_liners(S.get_ss_orange_red(data)[0]))