SL validation complete
This commit is contained in:
60
utils.py
60
utils.py
@@ -349,6 +349,66 @@ def calculate_weighted_ranking_scores(df: pl.LazyFrame) -> pl.DataFrame:
|
||||
return pl.DataFrame(scores).sort('Weighted Score', descending=True)
|
||||
|
||||
|
||||
def normalize_row_values(df: pl.DataFrame, target_cols: list[str]) -> pl.DataFrame:
    """
    Standardize the values of the given columns row by row: (x - row_mean) / row_std.

    For every row, the values of ``target_cols`` are gathered into a temporary
    list column, standardized inside that list, and written back to the
    columns they came from. The row std is the sample standard deviation
    (``ddof=1``).

    Args:
        df: Frame containing every column named in ``target_cols``.
        target_cols: Names of the columns to normalize. When empty, ``df`` is
            returned unchanged.

    Returns:
        ``df`` with each column in ``target_cols`` replaced by its row-wise
        standardized value; all other columns are left as they are.

    Notes:
        Normalization is only meaningful for rows with at least 2 non-null
        values: with fewer, the sample std is undefined and the row's target
        values are expected to come back as null (TODO confirm against the
        polars version in use).
        The division by the row std is not guarded, so a row whose non-null
        values are all identical (std == 0) presumably yields non-finite
        results rather than zeros - verify if such rows can occur.
        The temporary column is named ``_normalized_values``; an existing
        column with that name would be overwritten and then dropped.
    """
    # Nothing to normalize. Returning early also avoids asking polars to
    # build a list column out of zero input columns.
    if not target_cols:
        return df

    # Per-row standardization, evaluated inside each row's list of values.
    # std(ddof=1) is the sample standard deviation.
    row_zscore = (pl.element() - pl.element().mean()) / pl.element().std(ddof=1)

    # Temporary list column holding the standardized values of all target
    # columns, in the same order as target_cols.
    df_norm = df.with_columns(
        pl.concat_list(target_cols)
        .list.eval(row_zscore)
        .alias("_normalized_values")
    )

    # Unpack the list back into the original columns: element idx of the
    # list corresponds to target_cols[idx].
    return df_norm.with_columns([
        pl.col("_normalized_values").list.get(idx).alias(name)
        for idx, name in enumerate(target_cols)
    ]).drop("_normalized_values")
|
||||
|
||||
|
||||
def normalize_global_values(df: pl.DataFrame, target_cols: list[str]) -> pl.DataFrame:
    """
    Standardize the given columns with one shared mean and standard deviation.

    All values of ``target_cols`` are stacked into a single column, one mean
    and one std are computed over that stack, and every target column is
    rewritten as (x - global_mean) / global_std.

    Args:
        df: Frame to normalize. A ``pl.LazyFrame`` is also accepted; it is
            collected for the computation and the result is returned lazy.
        target_cols: Names of the columns to normalize.

    Returns:
        The frame with the target columns standardized. The input is returned
        without changes to its values when ``target_cols`` is empty or when
        the global std is missing or zero.

    Notes:
        Nulls are presumably skipped by the mean/std aggregations; whether
        float NaNs are skipped as well depends on polars - TODO confirm.
    """
    # Scalars are pulled out of the stats frame below, which needs eager data.
    input_was_lazy = isinstance(df, pl.LazyFrame)
    frame = df.collect() if input_was_lazy else df

    def _restore(result):
        # Give back the same flavour (lazy / eager) the caller passed in.
        return result.lazy() if input_was_lazy else result

    if len(target_cols) == 0:
        return _restore(frame)

    # Stack every target column into one long "value" column so a single
    # mean / std covers all of them at once.
    stacked = frame.select(target_cols).melt()
    summary = stacked.select([
        pl.col("value").mean().alias("mean"),
        pl.col("value").std().alias("std"),
    ])
    center = summary["mean"][0]
    spread = summary["std"][0]

    # No usable spread (missing or constant data): leave the values untouched
    # instead of dividing by zero / None.
    if spread is None or spread == 0:
        return _restore(frame)

    rescaled = [
        ((pl.col(name) - center) / spread).alias(name)
        for name in target_cols
    ]
    return _restore(frame.with_columns(rescaled))
|
||||
|
||||
|
||||
class JPMCSurvey(JPMCPlotsMixin):
|
||||
"""Class to handle JPMorgan Chase survey data."""
|
||||
|
||||
|
||||
Reference in New Issue
Block a user