diff --git a/02_quant_analysis.py b/02_quant_analysis.py index 1df26eb..2c33ce8 100644 --- a/02_quant_analysis.py +++ b/02_quant_analysis.py @@ -74,6 +74,13 @@ def _(Path, RESULTS_FILE, data_all, mo): return +@app.cell +def _(): + sl_ss_max_score = 5 + sl_v1_10_max_score = 10 + return sl_ss_max_score, sl_v1_10_max_score + + @app.cell def _( S, @@ -82,12 +89,20 @@ def _( data_all, duration_validation, mo, + sl_ss_max_score, + sl_v1_10_max_score, ): _ss_all = S.get_ss_green_blue(data_all)[0].join(S.get_ss_orange_red(data_all)[0], on='_recordId') - sl_content = check_straight_liners(_ss_all, max_score=5) + _sl_ss_c, sl_ss_df = check_straight_liners(_ss_all, max_score=sl_ss_max_score) + + _sl_v1_10_c, sl_v1_10_df = check_straight_liners( + S.get_voice_scale_1_10(data_all)[0], + max_score=sl_v1_10_max_score + ) + mo.md(f""" - ## Data Validation + # Data Validation {check_progress(data_all)} @@ -96,12 +111,30 @@ def _( {duration_validation(data_all)} - {sl_content} + ## Speaking Style - Straight Liners + {_sl_ss_c} + + ## Voice Score Scale 1-10 - Straight Liners + {_sl_v1_10_c} """) return +@app.cell +def _(data_all): + # # Drop any Voice Scale 1-10 responses with straight-lining, using sl_v1_10_df _responseId values + # records_to_drop = sl_v1_10_df.select('Record ID').to_series().to_list() + + # data_validated = data_all.filter(~pl.col('_recordId').is_in(records_to_drop)) + + # mo.md(f""" + # Dropped `{len(records_to_drop)}` responses with straight-lining in Voice Scale 1-10 evaluation. 
+ # """) + data_validated = data_all + return (data_validated,) + + @app.cell(hide_code=True) def _(S, mo): filter_form = mo.md(''' @@ -138,9 +171,9 @@ def _(S, mo): @app.cell -def _(S, data_all, filter_form, mo): +def _(S, data_validated, filter_form, mo): mo.stop(filter_form.value is None, mo.md("**Please submit filter above to proceed**")) - _d = S.filter_data(data_all, age=filter_form.value['age'], gender=filter_form.value['gender'], income=filter_form.value['income'], ethnicity=filter_form.value['ethnicity'], consumer=filter_form.value['consumer']) + _d = S.filter_data(data_validated, age=filter_form.value['age'], gender=filter_form.value['gender'], income=filter_form.value['income'], ethnicity=filter_form.value['ethnicity'], consumer=filter_form.value['consumer']) # Stop execution and prevent other cells from running if no data is selected mo.stop(len(_d.collect()) == 0, mo.md("**No Data available for current filter combination**")) @@ -363,8 +396,16 @@ def _(S, mo, vscales): return -@app.cell(hide_code=True) -def _(): +@app.cell +def _(vscales): + target_cols=[c for c in vscales.columns if c not in ['_recordId']] + target_cols + return (target_cols,) + + +@app.cell +def _(target_cols, utils, vscales): + vscales_row_norm = utils.normalize_row_values(vscales.collect(), target_cols=target_cols) return diff --git a/99_example_ingest_qualtrics_export.py b/99_example_ingest_qualtrics_export.py index e43e744..094a5a6 100644 --- a/99_example_ingest_qualtrics_export.py +++ b/99_example_ingest_qualtrics_export.py @@ -205,7 +205,7 @@ def _(mo): @app.cell def _(data, survey): vscales = survey.get_voice_scale_1_10(data)[0].collect() - vscales + print(vscales.head()) return (vscales,) diff --git a/utils.py b/utils.py index 532f133..28d3a2c 100644 --- a/utils.py +++ b/utils.py @@ -349,6 +349,66 @@ def calculate_weighted_ranking_scores(df: pl.LazyFrame) -> pl.DataFrame: return pl.DataFrame(scores).sort('Weighted Score', descending=True) +def normalize_row_values(df: 
pl.DataFrame, target_cols: list[str]) -> pl.DataFrame:
+    """
+    Normalizes values in the specified columns row-wise (Standardization: (x - mean) / std).
+    Ignores null values. Rows with fewer than 2 non-null values are nulled out (sample std undefined).
+    """
+
+    # Using list evaluation for row-wise stats
+    # We create a temporary list column containing values from all target columns
+    df_norm = df.with_columns(
+        pl.concat_list(target_cols)
+        .list.eval(
+            # Apply standardization: (x - mean) / std
+            # std(ddof=1) is the sample standard deviation
+            (pl.element() - pl.element().mean()) / pl.element().std(ddof=1)
+        )
+        .alias("_normalized_values")
+    )
+
+    # Unpack the list back to original columns
+    # list.get(i) retrieves the i-th element which corresponds to target_cols[i]
+    return df_norm.with_columns([
+        pl.col("_normalized_values").list.get(i).alias(target_cols[i])
+        for i in range(len(target_cols))
+    ]).drop("_normalized_values")
+
+
+def normalize_global_values(df: pl.DataFrame, target_cols: list[str]) -> pl.DataFrame:
+    """
+    Normalizes values in the specified columns globally (Standardization: (x - global_mean) / global_std).
+    Computes a single mean and standard deviation across ALL values in the target_cols and applies it.
+    Ignores null values.
+ """ + # Ensure eager for scalar extraction + was_lazy = isinstance(df, pl.LazyFrame) + if was_lazy: + df = df.collect() + + if len(target_cols) == 0: + return df.lazy() if was_lazy else df + + # Calculate global stats efficiently by stacking all columns + stats = df.select(target_cols).melt().select([ + pl.col("value").mean().alias("mean"), + pl.col("value").std().alias("std") + ]) + + global_mean = stats["mean"][0] + global_std = stats["std"][0] + + if global_std is None or global_std == 0: + return df.lazy() if was_lazy else df + + res = df.with_columns([ + ((pl.col(col) - global_mean) / global_std).alias(col) + for col in target_cols + ]) + + return res.lazy() if was_lazy else res + + class JPMCSurvey(JPMCPlotsMixin): """Class to handle JPMorgan Chase survey data.""" diff --git a/validation.py b/validation.py index e159f60..2efc416 100644 --- a/validation.py +++ b/validation.py @@ -6,9 +6,9 @@ from theme import ColorPalette def check_progress(data): """Check if all responses are complete based on 'progress' column.""" if data.collect().select(pl.col('progress').unique()).shape[0] == 1: - return """### Responses Complete: \n\n✅ All responses are complete (progress = 100) """ + return """## Responses Complete: \n\n✅ All responses are complete (progress = 100) """ - return "### Responses Complete: \n\n⚠️ There are incomplete responses (progress < 100) ⚠️" + return "## Responses Complete: \n\n⚠️ There are incomplete responses (progress < 100) ⚠️" def duration_validation(data): @@ -31,9 +31,9 @@ def duration_validation(data): outlier_data = _d.filter(pl.col('outlier_duration') == True).collect() if outlier_data.shape[0] == 0: - return "### Duration Outliers: \n\n✅ No duration outliers detected" + return "## Duration Outliers: \n\n✅ No duration outliers detected" - return f"""### Duration Outliers: + return f"""## Duration Outliers: **⚠️ Potential outliers detected based on response duration ⚠️** @@ -69,13 +69,25 @@ def check_straight_liners(data, max_score=3): 
     schema_names = data.collect_schema().names()
 
     # regex groupings
-    pattern = re.compile(r"(.*__V\d+)__Choice_\d+")
+    pattern_choice = re.compile(r"(.*__V\d+)__Choice_\d+")
+    pattern_scale = re.compile(r"Voice_Scale_1_10__V\d+")
+
     groups = {}
     for col in schema_names:
-        match = pattern.search(col)
-        if match:
-            group_key = match.group(1)
+        # Check for Choice pattern (SS_...__Vxx__Choice_y)
+        match_choice = pattern_choice.search(col)
+        if match_choice:
+            group_key = match_choice.group(1)
+            if group_key not in groups:
+                groups[group_key] = []
+            groups[group_key].append(col)
+            continue
+
+        # Check for Voice Scale pattern (Voice_Scale_1_10__Vxx)
+        # All of these form a single group "Voice_Scale_1_10"
+        if pattern_scale.search(col):
+            group_key = "Voice_Scale_1_10"
             if group_key not in groups:
                 groups[group_key] = []
             groups[group_key].append(col)
 
@@ -86,11 +98,11 @@ def check_straight_liners(data, max_score=3):
     if not multi_attribute_groups:
-        return "### Straight-lining Checks: \n\nℹ️ No multi-attribute question groups found."
+        return "### Straight-lining Checks: \n\nℹ️ No multi-attribute question groups found.", None
 
-    # Cast all involved columns to Int64 (strict=False) to handle potential string columns
-    # This prevents "cannot compare string with numeric type" errors
+    # Cast all involved columns to Float64 (strict=False) to handle potential string columns
+    # and 1-10 scale floats (e.g. 5.5). Float64 covers integers as well.
    all_group_cols = [col for cols in multi_attribute_groups.values() for col in cols]
    data = data.with_columns([
-        pl.col(col).cast(pl.Int64, strict=False) for col in all_group_cols
+        pl.col(col).cast(pl.Float64, strict=False) for col in all_group_cols
    ])
 
    # Build expressions
@@ -136,9 +148,18 @@ def check_straight_liners(data, max_score=3):
         filtered = checked_data.filter(pl.col(flag_col))
 
         if filtered.height > 0:
-            # Sort group_cols by choice number to ensure order (Choice_1, Choice_2, etc.) 
- # Assuming format ends with __Choice_X - sorted_group_cols = sorted(group_cols, key=lambda c: int(c.split('__Choice_')[-1])) + # Sort group_cols logic + # If Choice columns, sort by choice number. + # If Voice Scale columns (no Choice_), sort by Voice ID (Vxx) + if all("__Choice_" in c for c in group_cols): + key_func = lambda c: int(c.split('__Choice_')[-1]) + else: + # Extract digits from Vxx + def key_func(c): + m = re.search(r"__V(\d+)", c) + return int(m.group(1)) if m else 0 + + sorted_group_cols = sorted(group_cols, key=key_func) # Select relevant columns: Record ID, Value, and the sorted group columns subset = filtered.select(["_recordId", val_col] + sorted_group_cols) @@ -155,7 +176,7 @@ def check_straight_liners(data, max_score=3): }) if not outliers: - return f"### Straight-lining Checks: \n\n✅ No straight-liners detected (value <= {max_score})" + return f"### Straight-lining Checks: \n\n✅ No straight-liners detected (value <= {max_score})", None outlier_df = pl.DataFrame(outliers) @@ -291,13 +312,12 @@ def check_straight_liners(data, max_score=3): """ - return mo.vstack([ - mo.md(f"### Straight-lining Checks:\n\n**⚠️ Potential straight-liners detected ⚠️**\n\n"), + return (mo.vstack([ + mo.md(f"**⚠️ Potential straight-liners detected ⚠️**\n\n"), mo.ui.table(outlier_df), mo.md(analysis_md), - mo.md("#### Speaking Style Question Groups"), alt.vconcat(chart_pct, chart_dist).resolve_legend(color="independent") - ]) + ]), outlier_df) @@ -311,7 +331,10 @@ if __name__ == "__main__": S = JPMCSurvey(RESULTS_FILE, QSF_FILE) data = S.load_data() - print("Checking Green Blue:") - print(check_straight_liners(S.get_ss_green_blue(data)[0])) - print("Checking Orange Red:") - print(check_straight_liners(S.get_ss_orange_red(data)[0])) \ No newline at end of file + # print("Checking Green Blue:") + # print(check_straight_liners(S.get_ss_green_blue(data)[0])) + # print("Checking Orange Red:") + # print(check_straight_liners(S.get_ss_orange_red(data)[0])) + + 
print("Checking Voice Scale 1-10:")
+    print(check_straight_liners(S.get_voice_scale_1_10(data)[0], max_score=10))
\ No newline at end of file