fixed normalization functions

This commit is contained in:
2026-01-29 21:53:58 +01:00
parent becc435d3c
commit 036dd911df
2 changed files with 55 additions and 32 deletions

View File

@@ -352,31 +352,44 @@ def calculate_weighted_ranking_scores(df: pl.LazyFrame) -> pl.DataFrame:
def normalize_row_values(df: pl.DataFrame, target_cols: list[str]) -> pl.DataFrame:
    """
    Normalize values in the specified columns row-wise to a 0-10 scale
    (min-max normalization).

    Formula: ((x - row_min) / (row_max - row_min)) * 10

    Nulls are preserved as nulls. If all non-null values in a row are equal
    (row_max == row_min), those values become 5.0 (midpoint of the scale)
    instead of dividing by zero.

    Parameters
    ----------
    df : pl.DataFrame
        Input dataframe.
    target_cols : list[str]
        List of column names to normalize. Each column is cast to Float64
        before the row statistics are computed. If empty, `df` is returned
        unchanged.

    Returns
    -------
    pl.DataFrame
        DataFrame with target columns normalized row-wise; all other columns
        are left as they were.
    """
    # Nothing to normalize; also avoids building a horizontal min/max over
    # zero expressions, which has no meaningful result.
    if not target_cols:
        return df

    # Cast once and reuse the same expressions for the row statistics and the
    # per-column scaling, so mixed integer/float inputs are handled uniformly.
    float_cols = [pl.col(name).cast(pl.Float64) for name in target_cols]

    # Row-wise min and max across the target columns (ignoring nulls).
    row_min = pl.min_horizontal(float_cols)
    row_max = pl.max_horizontal(float_cols)
    row_range = row_max - row_min

    norm_exprs = [
        pl.when(row_range == 0)
        .then(
            # Range is 0 (all non-null values equal): 5.0 for non-null
            # cells, null stays null.
            pl.when(value.is_null()).then(None).otherwise(5.0)
        )
        .otherwise(((value - row_min) / row_range) * 10)
        .alias(name)
        for name, value in zip(target_cols, float_cols)
    ]

    return df.with_columns(norm_exprs)
def normalize_global_values(df: pl.DataFrame, target_cols: list[str]) -> pl.DataFrame: