Drop Voice 46 from Scales 1-10; fix line breaks in plot titles

This commit is contained in:
2026-01-29 21:10:56 +01:00
parent 8aee09f968
commit becc435d3c
3 changed files with 75 additions and 50 deletions

View File

@@ -351,18 +351,22 @@ def calculate_weighted_ranking_scores(df: pl.LazyFrame) -> pl.DataFrame:
def normalize_row_values(df: pl.DataFrame, target_cols: list[str]) -> pl.DataFrame:
"""
Normalizes values in the specified columns row-wise (Standardization: (x - mean) / std).
Ignores null values (NaNs). Only applied if there are at least 2 non-null values in the row.
Normalizes values in the specified columns row-wise to 0-10 scale (Min-Max normalization).
Formula: ((x - min) / (max - min)) * 10
Ignores null values (NaNs).
"""
# Using list evaluation for row-wise stats
# We create a temporary list column containing values from all target columns
# Ensure columns are cast to Float64 to avoid type errors with mixed/string data
df_norm = df.with_columns(
pl.concat_list(target_cols)
pl.concat_list([pl.col(c).cast(pl.Float64) for c in target_cols])
.list.eval(
# Apply standardization: (x - mean) / std
# std(ddof=1) is the sample standard deviation
(pl.element() - pl.element().mean()) / pl.element().std(ddof=1)
# Apply Min-Max scaling to 0-10
(
(pl.element() - pl.element().min()) /
(pl.element().max() - pl.element().min())
) * 10
)
.alias("_normalized_values")
)
@@ -377,8 +381,8 @@ def normalize_row_values(df: pl.DataFrame, target_cols: list[str]) -> pl.DataFra
def normalize_global_values(df: pl.DataFrame, target_cols: list[str]) -> pl.DataFrame:
"""
Normalizes values in the specified columns globally (Standardization: (x - global_mean) / global_std).
Computes a single mean and standard deviation across ALL values in the target_cols and applies it.
Normalizes values in the specified columns globally to 0-10 scale.
Formula: ((x - global_min) / (global_max - global_min)) * 10
Ignores null values (NaNs).
"""
# Ensure eager for scalar extraction
@@ -390,19 +394,23 @@ def normalize_global_values(df: pl.DataFrame, target_cols: list[str]) -> pl.Data
return df.lazy() if was_lazy else df
# Calculate global stats efficiently by stacking all columns
stats = df.select(target_cols).melt().select([
pl.col("value").mean().alias("mean"),
pl.col("value").std().alias("std")
# Cast to Float64 to ensure numeric calculations
stats = df.select([pl.col(c).cast(pl.Float64) for c in target_cols]).melt().select([
pl.col("value").min().alias("min"),
pl.col("value").max().alias("max")
])
global_mean = stats["mean"][0]
global_std = stats["std"][0]
global_min = stats["min"][0]
global_max = stats["max"][0]
if global_std is None or global_std == 0:
# Handle edge case where all values are same or none exist
if global_min is None or global_max is None or global_max == global_min:
return df.lazy() if was_lazy else df
global_range = global_max - global_min
res = df.with_columns([
((pl.col(col) - global_mean) / global_std).alias(col)
(((pl.col(col).cast(pl.Float64) - global_min) / global_range) * 10).alias(col)
for col in target_cols
])
@@ -649,10 +657,12 @@ class JPMCSurvey(JPMCPlotsMixin):
return subset, None
def get_voice_scale_1_10(self, q: pl.LazyFrame) -> Union[pl.LazyFrame, None]:
def get_voice_scale_1_10(self, q: pl.LazyFrame, drop_cols=['Voice_Scale_1_10__V46']) -> Union[pl.LazyFrame, None]:
"""Extract columns containing the Voice Scale 1-10 ratings for the Chase virtual assistant.
Returns subquery that can be chained with other polars queries.
Drops scores for V46 as it was improperly configured in the survey and thus did not show up for respondents.
"""
QIDs_map = {}
@@ -662,6 +672,12 @@ class JPMCSurvey(JPMCPlotsMixin):
# Convert "Voice 16 Scale 1-10_1" to "Scale_1_10__Voice_16"
QIDs_map[qid] = f"Voice_Scale_1_10__V{val['QName'].split()[1]}"
for col in drop_cols:
if col in QIDs_map.values():
# remove from QIDs_map
qid_to_remove = [k for k,v in QIDs_map.items() if v == col][0]
del QIDs_map[qid_to_remove]
return self._get_subset(q, list(QIDs_map.keys()), rename_cols=False).rename(QIDs_map), None