Drop Voice 46 from the 1-10 voice scales. Fix line break in plot titles.
This commit is contained in:
48
utils.py
48
utils.py
@@ -351,18 +351,22 @@ def calculate_weighted_ranking_scores(df: pl.LazyFrame) -> pl.DataFrame:
|
||||
|
||||
def normalize_row_values(df: pl.DataFrame, target_cols: list[str]) -> pl.DataFrame:
|
||||
"""
|
||||
Normalizes values in the specified columns row-wise (Standardization: (x - mean) / std).
|
||||
Ignores null values (NaNs). Only applied if there are at least 2 non-null values in the row.
|
||||
Normalizes values in the specified columns row-wise to 0-10 scale (Min-Max normalization).
|
||||
Formula: ((x - min) / (max - min)) * 10
|
||||
Ignores null values (NaNs).
|
||||
"""
|
||||
|
||||
# Using list evaluation for row-wise stats
|
||||
# We create a temporary list column containing values from all target columns
|
||||
# Ensure columns are cast to Float64 to avoid type errors with mixed/string data
|
||||
df_norm = df.with_columns(
|
||||
pl.concat_list(target_cols)
|
||||
pl.concat_list([pl.col(c).cast(pl.Float64) for c in target_cols])
|
||||
.list.eval(
|
||||
# Apply standardization: (x - mean) / std
|
||||
# std(ddof=1) is the sample standard deviation
|
||||
(pl.element() - pl.element().mean()) / pl.element().std(ddof=1)
|
||||
# Apply Min-Max scaling to 0-10
|
||||
(
|
||||
(pl.element() - pl.element().min()) /
|
||||
(pl.element().max() - pl.element().min())
|
||||
) * 10
|
||||
)
|
||||
.alias("_normalized_values")
|
||||
)
|
||||
@@ -377,8 +381,8 @@ def normalize_row_values(df: pl.DataFrame, target_cols: list[str]) -> pl.DataFra
|
||||
|
||||
def normalize_global_values(df: pl.DataFrame, target_cols: list[str]) -> pl.DataFrame:
|
||||
"""
|
||||
Normalizes values in the specified columns globally (Standardization: (x - global_mean) / global_std).
|
||||
Computes a single mean and standard deviation across ALL values in the target_cols and applies it.
|
||||
Normalizes values in the specified columns globally to 0-10 scale.
|
||||
Formula: ((x - global_min) / (global_max - global_min)) * 10
|
||||
Ignores null values (NaNs).
|
||||
"""
|
||||
# Ensure eager for scalar extraction
|
||||
@@ -390,19 +394,23 @@ def normalize_global_values(df: pl.DataFrame, target_cols: list[str]) -> pl.Data
|
||||
return df.lazy() if was_lazy else df
|
||||
|
||||
# Calculate global stats efficiently by stacking all columns
|
||||
stats = df.select(target_cols).melt().select([
|
||||
pl.col("value").mean().alias("mean"),
|
||||
pl.col("value").std().alias("std")
|
||||
# Cast to Float64 to ensure numeric calculations
|
||||
stats = df.select([pl.col(c).cast(pl.Float64) for c in target_cols]).melt().select([
|
||||
pl.col("value").min().alias("min"),
|
||||
pl.col("value").max().alias("max")
|
||||
])
|
||||
|
||||
global_mean = stats["mean"][0]
|
||||
global_std = stats["std"][0]
|
||||
global_min = stats["min"][0]
|
||||
global_max = stats["max"][0]
|
||||
|
||||
if global_std is None or global_std == 0:
|
||||
# Handle edge case where all values are same or none exist
|
||||
if global_min is None or global_max is None or global_max == global_min:
|
||||
return df.lazy() if was_lazy else df
|
||||
|
||||
global_range = global_max - global_min
|
||||
|
||||
res = df.with_columns([
|
||||
((pl.col(col) - global_mean) / global_std).alias(col)
|
||||
(((pl.col(col).cast(pl.Float64) - global_min) / global_range) * 10).alias(col)
|
||||
for col in target_cols
|
||||
])
|
||||
|
||||
@@ -649,10 +657,12 @@ class JPMCSurvey(JPMCPlotsMixin):
|
||||
return subset, None
|
||||
|
||||
|
||||
def get_voice_scale_1_10(self, q: pl.LazyFrame) -> Union[pl.LazyFrame, None]:
|
||||
def get_voice_scale_1_10(self, q: pl.LazyFrame, drop_cols=['Voice_Scale_1_10__V46']) -> Union[pl.LazyFrame, None]:
|
||||
"""Extract columns containing the Voice Scale 1-10 ratings for the Chase virtual assistant.
|
||||
|
||||
Returns subquery that can be chained with other polars queries.
|
||||
|
||||
Drops scores for V46 as it was improperly configured in the survey and thus did not show up for respondents.
|
||||
"""
|
||||
|
||||
QIDs_map = {}
|
||||
@@ -662,6 +672,12 @@ class JPMCSurvey(JPMCPlotsMixin):
|
||||
# Convert "Voice 16 Scale 1-10_1" to "Voice_Scale_1_10__V16"
|
||||
QIDs_map[qid] = f"Voice_Scale_1_10__V{val['QName'].split()[1]}"
|
||||
|
||||
for col in drop_cols:
|
||||
if col in QIDs_map.values():
|
||||
# remove from QIDs_map
|
||||
qid_to_remove = [k for k,v in QIDs_map.items() if v == col][0]
|
||||
del QIDs_map[qid_to_remove]
|
||||
|
||||
return self._get_subset(q, list(QIDs_map.keys()), rename_cols=False).rename(QIDs_map), None
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user