fixed normalization functions
This commit is contained in:
@@ -388,38 +388,48 @@ def _(S, data, mo):
|
|||||||
return (vscales,)
|
return (vscales,)
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
|
||||||
|
def _(pl, vscales):
|
||||||
|
# Count non-null values per row
|
||||||
|
nn_vscale = vscales.with_columns(
|
||||||
|
non_null_count = pl.sum_horizontal(pl.all().exclude("_recordID").is_not_null())
|
||||||
|
)
|
||||||
|
nn_vscale.collect()['non_null_count'].describe()
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
@app.cell(hide_code=True)
|
@app.cell(hide_code=True)
|
||||||
def _(S, mo, vscales):
|
def _(S, mo, vscales):
|
||||||
mo.md(f"""
|
mo.md(f"""
|
||||||
### How does each voice score on a scale from 1-10?
|
### How does each voice score on a scale from 1-10?
|
||||||
|
|
||||||
{mo.ui.altair_chart(S.plot_average_scores_with_counts(vscales, x_label='Voice', width=1000, domain=[1,10]))}
|
{mo.ui.altair_chart(S.plot_average_scores_with_counts(vscales, x_label='Voice', width=1000, domain=[1,10], title="Voice General Impression (Scale 1-10)"))}
|
||||||
""")
|
""")
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
@app.cell
|
@app.cell
|
||||||
def _(utils, vscales):
|
def _(S, mo, utils, vscales):
|
||||||
_target_cols=[c for c in vscales.collect().columns if c not in ['_recordId']]
|
_target_cols=[c for c in vscales.collect().columns if c not in ['_recordId']]
|
||||||
vscales_row_norm = utils.normalize_row_values(vscales.collect(), target_cols=_target_cols)
|
vscales_row_norm = utils.normalize_row_values(vscales.collect(), target_cols=_target_cols)
|
||||||
vscales_row_norm
|
|
||||||
return (vscales_row_norm,)
|
|
||||||
|
|
||||||
|
|
||||||
@app.cell
|
|
||||||
def _(S, mo, vscales_row_norm):
|
|
||||||
mo.md(f"""
|
mo.md(f"""
|
||||||
### Voice scale 1-10 normalized per respondent?
|
### Voice scale 1-10 normalized per respondent?
|
||||||
|
|
||||||
{mo.ui.altair_chart(S.plot_average_scores_with_counts(vscales_row_norm, x_label='Voice', width=1000))}
|
{mo.ui.altair_chart(S.plot_average_scores_with_counts(vscales_row_norm, x_label='Voice', width=1000, domain=[1,10], title="Voice General Impression (Scale 1-10) - Normalized per Respondent"))}
|
||||||
""")
|
""")
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
@app.cell
|
@app.cell
|
||||||
def _(mo):
|
def _(S, mo, utils, vscales):
|
||||||
mo.md(r"""
|
_target_cols=[c for c in vscales.collect().columns if c not in ['_recordId']]
|
||||||
|
vscales_global_norm = utils.normalize_global_values(vscales.collect(), target_cols=_target_cols)
|
||||||
|
|
||||||
|
mo.md(f"""
|
||||||
|
### Voice scale 1-10 normalized across all respondents?
|
||||||
|
|
||||||
|
{mo.ui.altair_chart(S.plot_average_scores_with_counts(vscales_global_norm, x_label='Voice', width=1000, domain=[1,10], title="Voice General Impression (Scale 1-10) - Normalized Across All Respondents"))}
|
||||||
""")
|
""")
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|||||||
53
utils.py
53
utils.py
@@ -352,31 +352,44 @@ def calculate_weighted_ranking_scores(df: pl.LazyFrame) -> pl.DataFrame:
|
|||||||
def normalize_row_values(df: pl.DataFrame, target_cols: list[str]) -> pl.DataFrame:
|
def normalize_row_values(df: pl.DataFrame, target_cols: list[str]) -> pl.DataFrame:
|
||||||
"""
|
"""
|
||||||
Normalizes values in the specified columns row-wise to 0-10 scale (Min-Max normalization).
|
Normalizes values in the specified columns row-wise to 0-10 scale (Min-Max normalization).
|
||||||
Formula: ((x - min) / (max - min)) * 10
|
Formula: ((x - row_min) / (row_max - row_min)) * 10
|
||||||
Ignores null values (NaNs).
|
|
||||||
|
Nulls are preserved as nulls. If all non-null values in a row are equal (max == min),
|
||||||
|
those values become 5.0 (midpoint of the scale).
|
||||||
|
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
df : pl.DataFrame
|
||||||
|
Input dataframe.
|
||||||
|
target_cols : list[str]
|
||||||
|
List of column names to normalize.
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
pl.DataFrame
|
||||||
|
DataFrame with target columns normalized row-wise.
|
||||||
"""
|
"""
|
||||||
|
# Calculate row min and max across target columns (ignoring nulls)
|
||||||
|
row_min = pl.min_horizontal([pl.col(c).cast(pl.Float64) for c in target_cols])
|
||||||
|
row_max = pl.max_horizontal([pl.col(c).cast(pl.Float64) for c in target_cols])
|
||||||
|
row_range = row_max - row_min
|
||||||
|
|
||||||
# Using list evaluation for row-wise stats
|
# Build normalized column expressions
|
||||||
# We create a temporary list column containing values from all target columns
|
norm_exprs = []
|
||||||
# Ensure columns are cast to Float64 to avoid type errors with mixed/string data
|
for col in target_cols:
|
||||||
df_norm = df.with_columns(
|
norm_exprs.append(
|
||||||
pl.concat_list([pl.col(c).cast(pl.Float64) for c in target_cols])
|
pl.when(row_range == 0)
|
||||||
.list.eval(
|
.then(
|
||||||
# Apply Min-Max scaling to 0-10
|
# If range is 0 (all values equal), return 5.0 for non-null, null for null
|
||||||
(
|
pl.when(pl.col(col).is_null()).then(None).otherwise(5.0)
|
||||||
(pl.element() - pl.element().min()) /
|
|
||||||
(pl.element().max() - pl.element().min())
|
|
||||||
) * 10
|
|
||||||
)
|
)
|
||||||
.alias("_normalized_values")
|
.otherwise(
|
||||||
|
((pl.col(col).cast(pl.Float64) - row_min) / row_range) * 10
|
||||||
|
)
|
||||||
|
.alias(col)
|
||||||
)
|
)
|
||||||
|
|
||||||
# Unpack the list back to original columns
|
return df.with_columns(norm_exprs)
|
||||||
# list.get(i) retrieves the i-th element which corresponds to target_cols[i]
|
|
||||||
return df_norm.with_columns([
|
|
||||||
pl.col("_normalized_values").list.get(i).alias(target_cols[i])
|
|
||||||
for i in range(len(target_cols))
|
|
||||||
]).drop("_normalized_values")
|
|
||||||
|
|
||||||
|
|
||||||
def normalize_global_values(df: pl.DataFrame, target_cols: list[str]) -> pl.DataFrame:
|
def normalize_global_values(df: pl.DataFrame, target_cols: list[str]) -> pl.DataFrame:
|
||||||
|
|||||||
Reference in New Issue
Block a user