SL validation complete

2026-01-29 20:39:16 +01:00
parent c1729d4896
commit 8aee09f968
4 changed files with 155 additions and 31 deletions
--- a/utils.py
+++ b/utils.py
@@ -349,6 +349,66 @@ def calculate_weighted_ranking_scores(df: pl.LazyFrame) -> pl.DataFrame:
    return pl.DataFrame(scores).sort('Weighted Score', descending=True)


+def normalize_row_values(df: pl.DataFrame, target_cols: list[str]) -> pl.DataFrame:
+    """
+    Standardize the values of ``target_cols`` within each row: (x - row_mean) / row_std.
+
+    The row mean and the sample standard deviation (ddof=1) are computed over the
+    non-null values of ``target_cols`` in that row. Standardization is only applied
+    when the row's standard deviation is defined and positive, i.e. the row has at
+    least 2 non-null values that are not all identical. Otherwise the row's values
+    are kept as they are instead of being replaced by null (undefined std) or NaN
+    (0 / 0 division).
+
+    Args:
+        df: Frame containing the columns to normalize.
+        target_cols: Names of the numeric columns that together form each row's sample.
+
+    Returns:
+        A frame with the same columns as ``df`` in which ``target_cols`` hold the
+        row-wise standardized values. Null entries stay null.
+    """
+    # Nothing to normalize; also avoids calling concat_list with no columns.
+    if not target_cols:
+        return df
+
+    row_std = pl.element().std(ddof=1)
+    standardized = (pl.element() - pl.element().mean()) / row_std
+
+    # Pack the target columns into one list per row so the statistics are row-wise.
+    df_norm = df.with_columns(
+        pl.concat_list(target_cols)
+        .list.eval(
+            # A null std makes the condition null and a zero std makes it false;
+            # both fall through to `otherwise`, which keeps the raw values.
+            pl.when(row_std > 0).then(standardized).otherwise(pl.element())
+        )
+        .alias("_normalized_values")
+    )
+
+    # Unpack the list back into the original columns:
+    # list position i corresponds to target_cols[i].
+    return df_norm.with_columns([
+        pl.col("_normalized_values").list.get(i).alias(col)
+        for i, col in enumerate(target_cols)
+    ]).drop("_normalized_values")
+
+
+def normalize_global_values(df: pl.DataFrame, target_cols: list[str]) -> pl.DataFrame:
+    """
+    Standardize ``target_cols`` with one shared mean and std: (x - global_mean) / global_std.
+
+    A single mean and sample standard deviation (ddof=1) are computed over ALL
+    non-null values of ``target_cols`` taken together, then applied to every one
+    of those columns.
+
+    Args:
+        df: Frame containing the columns to normalize. A ``pl.LazyFrame`` is also
+            accepted; it is collected to compute the statistics and the result is
+            returned lazy again.
+        target_cols: Names of the numeric columns to normalize.
+
+    Returns:
+        A frame of the same kind (eager or lazy) as the input. The data is returned
+        unchanged when ``target_cols`` is empty or when the global standard
+        deviation is undefined or zero.
+    """
+    # Nothing to normalize: hand the input back without collecting a LazyFrame.
+    if not target_cols:
+        return df
+
+    # Scalar statistics require eager data.
+    was_lazy = isinstance(df, pl.LazyFrame)
+    if was_lazy:
+        df = df.collect()
+
+    # Stack every target column into one Series to get the global statistics.
+    # This avoids DataFrame.melt, which newer polars releases deprecate in favour
+    # of unpivot. The Float64 cast gives the columns a common dtype for the concat.
+    stacked = pl.concat([df.get_column(col).cast(pl.Float64) for col in target_cols])
+    global_mean = stacked.mean()
+    global_std = stacked.std(ddof=1)
+
+    # An undefined or zero std cannot be divided by; leave the data as it is.
+    if global_std is None or global_std == 0:
+        return df.lazy() if was_lazy else df
+
+    res = df.with_columns([
+        ((pl.col(col) - global_mean) / global_std).alias(col)
+        for col in target_cols
+    ])
+
+    return res.lazy() if was_lazy else res
+
+
 class JPMCSurvey(JPMCPlotsMixin):
    """Class to handle JPMorgan Chase survey data."""