fixed normalization functions

2026-01-29 21:53:58 +01:00
parent becc435d3c
commit 036dd911df
2 changed files with 55 additions and 32 deletions
--- a/02_quant_analysis.py
+++ b/02_quant_analysis.py
@@ -388,38 +388,48 @@ def _(S, data, mo):
    return (vscales,)


+@app.cell
+def _(pl, vscales):
+    # Count non-null values per row
+    nn_vscale = vscales.with_columns(
+        non_null_count = pl.sum_horizontal(pl.all().exclude("_recordId").is_not_null())
+    )
+    nn_vscale.collect()['non_null_count'].describe()
+    return
+
+
@app.cell(hide_code=True)
 def _(S, mo, vscales):
    mo.md(f"""
    ### How does each voice score on a scale from 1-10?

-    {mo.ui.altair_chart(S.plot_average_scores_with_counts(vscales, x_label='Voice', width=1000, domain=[1,10]))}
+    {mo.ui.altair_chart(S.plot_average_scores_with_counts(vscales, x_label='Voice', width=1000, domain=[1,10], title="Voice General Impression (Scale 1-10)"))}
    """)
    return


@app.cell
-def _(utils, vscales):
+def _(S, mo, utils, vscales):
    _target_cols=[c for c in vscales.collect().columns if c not in ['_recordId']]
    vscales_row_norm = utils.normalize_row_values(vscales.collect(), target_cols=_target_cols)
-    vscales_row_norm
-    return (vscales_row_norm,)

-
-@app.cell
-def _(S, mo, vscales_row_norm):
    mo.md(f"""
    ### Voice scale 1-10 normalized per respondent?

-    {mo.ui.altair_chart(S.plot_average_scores_with_counts(vscales_row_norm, x_label='Voice', width=1000))}
+    {mo.ui.altair_chart(S.plot_average_scores_with_counts(vscales_row_norm, x_label='Voice', width=1000, domain=[1,10], title="Voice General Impression (Scale 1-10) - Normalized per Respondent"))}
    """)
    return


@app.cell
-def _(mo):
-    mo.md(r"""
+def _(S, mo, utils, vscales):
+    _target_cols=[c for c in vscales.collect().columns if c not in ['_recordId']]
+    vscales_global_norm = utils.normalize_global_values(vscales.collect(), target_cols=_target_cols)

+    mo.md(f"""
+    ### Voice scale 1-10 normalized across all respondents?
+
+    {mo.ui.altair_chart(S.plot_average_scores_with_counts(vscales_global_norm, x_label='Voice', width=1000, domain=[1,10], title="Voice General Impression (Scale 1-10) - Normalized Across All Respondents"))}
    """)
    return

--- a/utils.py
+++ b/utils.py
@@ -352,31 +352,44 @@ def calculate_weighted_ranking_scores(df: pl.LazyFrame) -> pl.DataFrame:
 def normalize_row_values(df: pl.DataFrame, target_cols: list[str]) -> pl.DataFrame:
    """
    Normalizes values in the specified columns row-wise to 0-10 scale (Min-Max normalization).
-    Formula: ((x - min) / (max - min)) * 10
-    Ignores null values (NaNs).
+    Formula: ((x - row_min) / (row_max - row_min)) * 10
+    
+    Nulls are preserved as nulls. If all non-null values in a row are equal (max == min),
+    those values become 5.0 (midpoint of the scale).
+    
+    Parameters
+    ----------
+    df : pl.DataFrame
+        Input dataframe.
+    target_cols : list[str]
+        List of column names to normalize.
+        
+    Returns
+    -------
+    pl.DataFrame
+        DataFrame with target columns normalized row-wise.
    """
+    # Calculate row min and max across target columns (ignoring nulls)
+    row_min = pl.min_horizontal([pl.col(c).cast(pl.Float64) for c in target_cols])
+    row_max = pl.max_horizontal([pl.col(c).cast(pl.Float64) for c in target_cols])
+    row_range = row_max - row_min
    
-    # Using list evaluation for row-wise stats
-    # We create a temporary list column containing values from all target columns
-    # Ensure columns are cast to Float64 to avoid type errors with mixed/string data
-    df_norm = df.with_columns(
-        pl.concat_list([pl.col(c).cast(pl.Float64) for c in target_cols])
-        .list.eval(
-            # Apply Min-Max scaling to 0-10
-            (
-                (pl.element() - pl.element().min()) / 
-                (pl.element().max() - pl.element().min())
-            ) * 10
+    # Build normalized column expressions
+    norm_exprs = []
+    for col in target_cols:
+        norm_exprs.append(
+            pl.when(row_range == 0)
+            .then(
+                # If range is 0 (all values equal), return 5.0 for non-null, null for null
+                pl.when(pl.col(col).is_null()).then(None).otherwise(5.0)
+            )
+            .otherwise(
+                ((pl.col(col).cast(pl.Float64) - row_min) / row_range) * 10
+            )
+            .alias(col)
        )
-        .alias("_normalized_values")
-    )
    
-    # Unpack the list back to original columns
-    # list.get(i) retrieves the i-th element which corresponds to target_cols[i]
-    return df_norm.with_columns([
-        pl.col("_normalized_values").list.get(i).alias(target_cols[i])
-        for i in range(len(target_cols))
-    ]).drop("_normalized_values")
+    return df.with_columns(norm_exprs)


 def normalize_global_values(df: pl.DataFrame, target_cols: list[str]) -> pl.DataFrame: