diff --git a/02_quant_analysis.py b/02_quant_analysis.py
index edc787e..bc8b8b7 100644
--- a/02_quant_analysis.py
+++ b/02_quant_analysis.py
@@ -388,38 +388,48 @@ def _(S, data, mo):
     return (vscales,)
 
 
+@app.cell
+def _(pl, vscales):
+    # Count non-null answers per row, leaving the record-ID column out of the tally
+    nn_vscale = vscales.with_columns(
+        non_null_count = pl.sum_horizontal(pl.all().exclude("_recordId").is_not_null())
+    )
+    nn_vscale.collect()['non_null_count'].describe()
+    return
+
+
 @app.cell(hide_code=True)
 def _(S, mo, vscales):
     mo.md(f"""
     ### How does each voice score on a scale from 1-10?
 
-    {mo.ui.altair_chart(S.plot_average_scores_with_counts(vscales, x_label='Voice', width=1000, domain=[1,10]))}
+    {mo.ui.altair_chart(S.plot_average_scores_with_counts(vscales, x_label='Voice', width=1000, domain=[1,10], title="Voice General Impression (Scale 1-10)"))}
     """)
     return
 
 
 @app.cell
-def _(utils, vscales):
+def _(S, mo, utils, vscales):
     _target_cols=[c for c in vscales.collect().columns if c not in ['_recordId']]
     vscales_row_norm = utils.normalize_row_values(vscales.collect(), target_cols=_target_cols)
-    vscales_row_norm
-    return (vscales_row_norm,)
-
 
-@app.cell
-def _(S, mo, vscales_row_norm):
     mo.md(f"""
     ### Voice scale 1-10 normalized per respondent?
 
-    {mo.ui.altair_chart(S.plot_average_scores_with_counts(vscales_row_norm, x_label='Voice', width=1000))}
+    {mo.ui.altair_chart(S.plot_average_scores_with_counts(vscales_row_norm, x_label='Voice', width=1000, domain=[1,10], title="Voice General Impression (Scale 1-10) - Normalized per Respondent"))}
     """)
     return
 
 
 @app.cell
-def _(mo):
-    mo.md(r"""
-
+def _(S, mo, utils, vscales):
+    _target_cols=[c for c in vscales.collect().columns if c not in ['_recordId']]
+    vscales_global_norm = utils.normalize_global_values(vscales.collect(), target_cols=_target_cols)
+
+    mo.md(f"""
+    ### Voice scale 1-10 normalized across all respondents?
+
+    {mo.ui.altair_chart(S.plot_average_scores_with_counts(vscales_global_norm, x_label='Voice', width=1000, domain=[1,10], title="Voice General Impression (Scale 1-10) - Normalized Across All Respondents"))}
     """)
     return
 
diff --git a/utils.py b/utils.py
index 3b6fc0a..ebb309f 100644
--- a/utils.py
+++ b/utils.py
@@ -352,31 +352,44 @@ def calculate_weighted_ranking_scores(df: pl.LazyFrame) -> pl.DataFrame:
 def normalize_row_values(df: pl.DataFrame, target_cols: list[str]) -> pl.DataFrame:
     """
     Normalizes values in the specified columns row-wise to 0-10 scale (Min-Max normalization).
-    Formula: ((x - min) / (max - min)) * 10
-    Ignores null values (NaNs).
+    Formula: ((x - row_min) / (row_max - row_min)) * 10
+
+    Nulls are preserved as nulls. If all non-null values in a row are equal (max == min),
+    those values become 5.0 (midpoint of the scale).
+
+    Parameters
+    ----------
+    df : pl.DataFrame
+        Input dataframe.
+    target_cols : list[str]
+        List of column names to normalize.
+
+    Returns
+    -------
+    pl.DataFrame
+        DataFrame with target columns normalized row-wise.
     """
+    # Calculate row min and max across target columns (ignoring nulls)
+    row_min = pl.min_horizontal([pl.col(c).cast(pl.Float64) for c in target_cols])
+    row_max = pl.max_horizontal([pl.col(c).cast(pl.Float64) for c in target_cols])
+    row_range = row_max - row_min
 
-    # Using list evaluation for row-wise stats
-    # We create a temporary list column containing values from all target columns
-    # Ensure columns are cast to Float64 to avoid type errors with mixed/string data
-    df_norm = df.with_columns(
-        pl.concat_list([pl.col(c).cast(pl.Float64) for c in target_cols])
-        .list.eval(
-            # Apply Min-Max scaling to 0-10
-            (
-                (pl.element() - pl.element().min()) /
-                (pl.element().max() - pl.element().min())
-            ) * 10
+    # Build normalized column expressions
+    norm_exprs = []
+    for col in target_cols:
+        norm_exprs.append(
+            pl.when(row_range == 0)
+            .then(
+                # If range is 0 (all values equal), return 5.0 for non-null, null for null
+                pl.when(pl.col(col).is_null()).then(None).otherwise(5.0)
+            )
+            .otherwise(
+                ((pl.col(col).cast(pl.Float64) - row_min) / row_range) * 10
+            )
+            .alias(col)
         )
-        .alias("_normalized_values")
-    )
 
-    # Unpack the list back to original columns
-    # list.get(i) retrieves the i-th element which corresponds to target_cols[i]
-    return df_norm.with_columns([
-        pl.col("_normalized_values").list.get(i).alias(target_cols[i])
-        for i in range(len(target_cols))
-    ]).drop("_normalized_values")
+    return df.with_columns(norm_exprs)
 
 
 def normalize_global_values(df: pl.DataFrame, target_cols: list[str]) -> pl.DataFrame: