diff --git a/02_quant_analysis.py b/02_quant_analysis.py index 63d4690..1df26eb 100644 --- a/02_quant_analysis.py +++ b/02_quant_analysis.py @@ -74,11 +74,6 @@ def _(Path, RESULTS_FILE, data_all, mo): return -@app.cell -def _(): - return - - @app.cell def _( S, @@ -88,7 +83,8 @@ def _( duration_validation, mo, ): - sl_content, sl_df = check_straight_liners(S.get_ss_green_blue(data_all)[0], max_score=5) + _ss_all = S.get_ss_green_blue(data_all)[0].join(S.get_ss_orange_red(data_all)[0], on='_recordId') + sl_content = check_straight_liners(_ss_all, max_score=5) mo.md(f""" ## Data Validation @@ -106,18 +102,6 @@ def _( return -@app.cell -def _(mo): - mo.md(r""" - --- - - # Data Filter - - Use to select a subset of the data for the following analysis - """) - return - - @app.cell(hide_code=True) def _(S, mo): filter_form = mo.md(''' diff --git a/validation.py b/validation.py index 6caef5e..e159f60 100644 --- a/validation.py +++ b/validation.py @@ -1,6 +1,7 @@ import marimo as mo import polars as pl - +import altair as alt +from theme import ColorPalette def check_progress(data): """Check if all responses are complete based on 'progress' column.""" @@ -115,8 +116,9 @@ def check_straight_liners(data, max_score=3): ).alias(f"__is_straight__{key}") value_expr = safe_val.alias(f"__val__{key}") + has_data = (list_expr.list.len() > 0).alias(f"__has_data__{key}") - expressions.extend([is_straight, value_expr]) + expressions.extend([is_straight, value_expr, has_data]) # collect data with checks # We only need _recordId and the check columns @@ -156,15 +158,146 @@ def check_straight_liners(data, max_score=3): return f"### Straight-lining Checks: \n\n✅ No straight-liners detected (value <= {max_score})" outlier_df = pl.DataFrame(outliers) + + # --- Analysis & Visualization --- - return f"""### Straight-lining Checks: + total_respondents = checked_data.height - **⚠️ Potential straight-liners detected ⚠️** + # 1. & 3. Percentage Calculation + group_stats = [] + value_dist_data = [] + + # Calculate Straight-Liners for ALL groups found in Data + # Condition: Respondent straight-lined ALL questions that they actually answered (ignoring empty/skipped questions) + # Logic: For every group G: if G has data (len > 0), then G must be straight. + # Also, the respondent must have answered at least one question group. - Respondents selected the same value (<= {max_score}) for all attributes in the following groups: + conditions = [] + has_any_data_exprs = [] - {mo.ui.table(outlier_df)} - """, outlier_df + for key in multi_attribute_groups.keys(): + flag_col = f"__is_straight__{key}" + data_col = f"__has_data__{key}" + + # If has_data is True, is_straight MUST be True for it to count as valid straight-lining behavior for that user. + # Equivalent: (not has_data) OR is_straight + cond = (~pl.col(data_col)) | pl.col(flag_col) + conditions.append(cond) + has_any_data_exprs.append(pl.col(data_col)) + + all_straight_count = checked_data.filter( + pl.all_horizontal(conditions) & pl.any_horizontal(has_any_data_exprs) + ).height + all_straight_pct = (all_straight_count / total_respondents) * 100 + + for key in multi_attribute_groups.keys(): + flag_col = f"__is_straight__{key}" + val_col = f"__val__{key}" + + # Filter for straight-liners in this specific group + sl_sub = checked_data.filter(pl.col(flag_col)) + count = sl_sub.height + pct = (count / total_respondents) * 100 + + group_stats.append({ + "Question Group": key, + "Straight-Liner %": pct, + "Count": count + }) + + # Get Value Distribution for this group's straight-liners + if count > 0: + # Group by the Value they straight-lined + dist = sl_sub.group_by(val_col).agg(pl.len().alias("count")) + for row in dist.iter_rows(named=True): + value_dist_data.append({ + "Question Group": key, + "Value": row[val_col], + "Count": row["count"] + }) + + stats_df = pl.DataFrame(group_stats) + dist_df = pl.DataFrame(value_dist_data) + + # Plot 1: % of Responses with Straight-Liners per Question + # Vertical bars with Count label on top + base_pct = alt.Chart(stats_df).encode( + x=alt.X("Question Group", sort=alt.EncodingSortField(field="Straight-Liner %", order="descending")) + ) + + bars_pct = base_pct.mark_bar(color=ColorPalette.PRIMARY).encode( + y=alt.Y("Straight-Liner %:Q", axis=alt.Axis(format=".1f", title="Share of all responses [%]")), + tooltip=["Question Group", alt.Tooltip("Straight-Liner %:Q", format=".1f"), "Count"] + ) + + text_pct = base_pct.mark_text(dy=-10).encode( + y=alt.Y("Straight-Liner %:Q"), + text=alt.Text("Count") + ) + + chart_pct = (bars_pct + text_pct).properties( + title="Share of Responses with Straight-Liners per Question", + width=800, + height=300 + ) + + # Plot 2: Value Distribution (Horizontal Stacked Normalized Bar) + # Question Groups sorted by Total Count + # Values stacked 1 (left) -> 5 (right) + # Legend on top + # Total count at bar end + + # Sort order for Y axis (Question Group) based on total Count (descending) + # Explicitly calculate sort order from stats_df to ensure consistency across layers + # High counts at the top + sorted_groups = stats_df.sort("Count", descending=True)["Question Group"].to_list() + + # Base chart for Bars + # Use JPMC-aligned colors (blues) instead of default categorical rainbow + # Remove legend title as per plots.py style + bars_dist = alt.Chart(dist_df).mark_bar().encode( + y=alt.Y("Question Group", sort=sorted_groups), + x=alt.X("Count", stack="normalize", axis=alt.Axis(format="%"), title="Share of SL Responses"), + color=alt.Color("Value:O", + title=None, # explicit removal of title like in plots.py + scale=alt.Scale(scheme="blues"), # Professional blue scale + legend=alt.Legend(orient="top", direction="horizontal") + ), + order=alt.Order("Value", sort="ascending"), # Ensures 1 is Left, 5 is Right + tooltip=["Question Group", "Value", "Count"] + ) + + # Text layer for Total Count (using stats_df which already has totals) + # using same sort for Y + text_dist = alt.Chart(stats_df).mark_text(align='left', dx=5).encode( + y=alt.Y("Question Group", sort=sorted_groups), + x=alt.datum(1.0), # Position at 100% + text=alt.Text("Count") + ) + + chart_dist = (bars_dist + text_dist).properties( + title="Distribution of Straight-Lined Values", + width=800, + height=500 + ) + + analysis_md = f""" + ### Straight-Lining Analysis + + *"Straight-lining" is defined here as selecting the same response value for all attributes within a multi-attribute question group.* + + * **Total Respondents**: {total_respondents} + * **Respondents straight-lining ALL questions presented to them**: {all_straight_pct:.2f}% ({all_straight_count} respondents) + + """ + + return mo.vstack([ + mo.md(f"### Straight-lining Checks:\n\n**⚠️ Potential straight-liners detected ⚠️**\n\n"), + mo.ui.table(outlier_df), + mo.md(analysis_md), + mo.md("#### Speaking Style Question Groups"), + alt.vconcat(chart_pct, chart_dist).resolve_legend(color="independent") + ])