SL validation complete
This commit is contained in:
@@ -74,6 +74,13 @@ def _(Path, RESULTS_FILE, data_all, mo):
|
|||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
|
||||||
|
def _():
|
||||||
|
sl_ss_max_score = 5
|
||||||
|
sl_v1_10_max_score = 10
|
||||||
|
return sl_ss_max_score, sl_v1_10_max_score
|
||||||
|
|
||||||
|
|
||||||
@app.cell
|
@app.cell
|
||||||
def _(
|
def _(
|
||||||
S,
|
S,
|
||||||
@@ -82,12 +89,20 @@ def _(
|
|||||||
data_all,
|
data_all,
|
||||||
duration_validation,
|
duration_validation,
|
||||||
mo,
|
mo,
|
||||||
|
sl_ss_max_score,
|
||||||
|
sl_v1_10_max_score,
|
||||||
):
|
):
|
||||||
_ss_all = S.get_ss_green_blue(data_all)[0].join(S.get_ss_orange_red(data_all)[0], on='_recordId')
|
_ss_all = S.get_ss_green_blue(data_all)[0].join(S.get_ss_orange_red(data_all)[0], on='_recordId')
|
||||||
sl_content = check_straight_liners(_ss_all, max_score=5)
|
_sl_ss_c, sl_ss_df = check_straight_liners(_ss_all, max_score=sl_ss_max_score)
|
||||||
|
|
||||||
|
_sl_v1_10_c, sl_v1_10_df = check_straight_liners(
|
||||||
|
S.get_voice_scale_1_10(data_all)[0],
|
||||||
|
max_score=sl_v1_10_max_score
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
mo.md(f"""
|
mo.md(f"""
|
||||||
## Data Validation
|
# Data Validation
|
||||||
|
|
||||||
{check_progress(data_all)}
|
{check_progress(data_all)}
|
||||||
|
|
||||||
@@ -96,12 +111,30 @@ def _(
|
|||||||
{duration_validation(data_all)}
|
{duration_validation(data_all)}
|
||||||
|
|
||||||
|
|
||||||
{sl_content}
|
## Speaking Style - Straight Liners
|
||||||
|
{_sl_ss_c}
|
||||||
|
|
||||||
|
|
||||||
|
## Voice Score Scale 1-10 - Straight Liners
|
||||||
|
{_sl_v1_10_c}
|
||||||
""")
|
""")
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
|
||||||
|
def _(data_all):
|
||||||
|
# # Drop any Voice Scale 1-10 responses with straight-lining, using sl_v1_10_df _responseId values
|
||||||
|
# records_to_drop = sl_v1_10_df.select('Record ID').to_series().to_list()
|
||||||
|
|
||||||
|
# data_validated = data_all.filter(~pl.col('_recordId').is_in(records_to_drop))
|
||||||
|
|
||||||
|
# mo.md(f"""
|
||||||
|
# Dropped `{len(records_to_drop)}` responses with straight-lining in Voice Scale 1-10 evaluation.
|
||||||
|
# """)
|
||||||
|
data_validated = data_all
|
||||||
|
return (data_validated,)
|
||||||
|
|
||||||
|
|
||||||
@app.cell(hide_code=True)
|
@app.cell(hide_code=True)
|
||||||
def _(S, mo):
|
def _(S, mo):
|
||||||
filter_form = mo.md('''
|
filter_form = mo.md('''
|
||||||
@@ -138,9 +171,9 @@ def _(S, mo):
|
|||||||
|
|
||||||
|
|
||||||
@app.cell
|
@app.cell
|
||||||
def _(S, data_all, filter_form, mo):
|
def _(S, data_validated, filter_form, mo):
|
||||||
mo.stop(filter_form.value is None, mo.md("**Please submit filter above to proceed**"))
|
mo.stop(filter_form.value is None, mo.md("**Please submit filter above to proceed**"))
|
||||||
_d = S.filter_data(data_all, age=filter_form.value['age'], gender=filter_form.value['gender'], income=filter_form.value['income'], ethnicity=filter_form.value['ethnicity'], consumer=filter_form.value['consumer'])
|
_d = S.filter_data(data_validated, age=filter_form.value['age'], gender=filter_form.value['gender'], income=filter_form.value['income'], ethnicity=filter_form.value['ethnicity'], consumer=filter_form.value['consumer'])
|
||||||
|
|
||||||
# Stop execution and prevent other cells from running if no data is selected
|
# Stop execution and prevent other cells from running if no data is selected
|
||||||
mo.stop(len(_d.collect()) == 0, mo.md("**No Data available for current filter combination**"))
|
mo.stop(len(_d.collect()) == 0, mo.md("**No Data available for current filter combination**"))
|
||||||
@@ -363,8 +396,16 @@ def _(S, mo, vscales):
|
|||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
@app.cell(hide_code=True)
|
@app.cell
|
||||||
def _():
|
def _(vscales):
|
||||||
|
target_cols=[c for c in vscales.columns if c not in ['_recordId']]
|
||||||
|
target_cols
|
||||||
|
return (target_cols,)
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
|
||||||
|
def _(target_cols, utils, vscales):
|
||||||
|
vscales_row_norm = utils.normalize_row_values(vscales.collect(), target_cols=target_cols)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -205,7 +205,7 @@ def _(mo):
|
|||||||
@app.cell
|
@app.cell
|
||||||
def _(data, survey):
|
def _(data, survey):
|
||||||
vscales = survey.get_voice_scale_1_10(data)[0].collect()
|
vscales = survey.get_voice_scale_1_10(data)[0].collect()
|
||||||
vscales
|
print(vscales.head())
|
||||||
return (vscales,)
|
return (vscales,)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
60
utils.py
60
utils.py
@@ -349,6 +349,66 @@ def calculate_weighted_ranking_scores(df: pl.LazyFrame) -> pl.DataFrame:
|
|||||||
return pl.DataFrame(scores).sort('Weighted Score', descending=True)
|
return pl.DataFrame(scores).sort('Weighted Score', descending=True)
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_row_values(df: pl.DataFrame, target_cols: list[str]) -> pl.DataFrame:
    """
    Standardize the values of ``target_cols`` within each row: (x - row_mean) / row_std.

    The row mean and the sample standard deviation (ddof=1) are computed over
    the non-null values of that row only; a null cell stays null.

    A z-score is undefined when a row has fewer than two non-null values (the
    sample standard deviation does not exist) or when all of its values are
    identical (standard deviation of zero, e.g. a straight-lining respondent).
    Every target column of such a row is returned as null instead of NaN.

    Args:
        df: Frame holding the columns to normalize.
        target_cols: Names of the numeric columns that form one row-wise group.

    Returns:
        The frame with each target column replaced by its row-wise z-score.
        All other columns are left untouched. The input is returned unchanged
        when ``target_cols`` is empty.

    Note:
        A temporary column named ``_normalized_values`` is created and dropped
        again, so an existing column with that name would be lost.
    """
    # Same guard as normalize_global_values: with no columns there is nothing
    # to normalize and no list column to build.
    if not target_cols:
        return df

    element = pl.element()
    # 0 / 0 for constant rows produces NaN; map it to null so that "no valid
    # z-score" is represented the same way as for rows with too few values.
    z_score = ((element - element.mean()) / element.std(ddof=1)).fill_nan(None)

    # Gather the target columns of each row into one list and standardize
    # inside that list, which yields per-row statistics.
    with_scores = df.with_columns(
        pl.concat_list(target_cols).list.eval(z_score).alias("_normalized_values")
    )

    # The list keeps the order of target_cols, so position i maps back to
    # target_cols[i].
    return with_scores.with_columns(
        [
            pl.col("_normalized_values").list.get(position).alias(name)
            for position, name in enumerate(target_cols)
        ]
    ).drop("_normalized_values")
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_global_values(df: pl.DataFrame, target_cols: list[str]) -> pl.DataFrame:
    """
    Standardize ``target_cols`` with one shared mean and standard deviation.

    All values of all target columns are pooled, a single mean and standard
    deviation are computed over that pool, and every target column is replaced
    by (x - global_mean) / global_std. Null cells are skipped when computing
    the statistics and stay null in the result.

    Args:
        df: Frame holding the columns to normalize. A ``pl.LazyFrame`` is also
            accepted; it is collected to read the statistics and the result is
            handed back as a LazyFrame.
        target_cols: Names of the numeric columns to normalize together.

    Returns:
        The frame with the target columns standardized. The data is returned
        unchanged when ``target_cols`` is empty or when no usable standard
        deviation exists (no values, a single value, zero spread, or NaN).
    """
    # Scalar statistics can only be read from a materialized frame.
    was_lazy = isinstance(df, pl.LazyFrame)
    if was_lazy:
        df = df.collect()

    def _restore(frame):
        # Hand the result back in the same eager/lazy flavour it came in.
        return frame.lazy() if was_lazy else frame

    if not target_cols:
        return _restore(df)

    # Stack every target column into one long "value" column so that a single
    # mean/std covers all of them. unpivot() replaces melt(), which is
    # deprecated since polars 1.0.
    stats = df.select(target_cols).unpivot().select([
        pl.col("value").mean().alias("mean"),
        pl.col("value").std().alias("std"),
    ])

    global_mean = stats["mean"][0]
    global_std = stats["std"][0]

    # `not (std > 0)` rejects zero and NaN alike; dividing by either would
    # only fill the target columns with NaN/inf.
    if global_std is None or not global_std > 0:
        return _restore(df)

    res = df.with_columns([
        ((pl.col(col) - global_mean) / global_std).alias(col)
        for col in target_cols
    ])

    return _restore(res)
|
||||||
|
|
||||||
|
|
||||||
class JPMCSurvey(JPMCPlotsMixin):
|
class JPMCSurvey(JPMCPlotsMixin):
|
||||||
"""Class to handle JPMorgan Chase survey data."""
|
"""Class to handle JPMorgan Chase survey data."""
|
||||||
|
|
||||||
|
|||||||
@@ -6,9 +6,9 @@ from theme import ColorPalette
|
|||||||
def check_progress(data):
|
def check_progress(data):
|
||||||
"""Check if all responses are complete based on 'progress' column."""
|
"""Check if all responses are complete based on 'progress' column."""
|
||||||
if data.collect().select(pl.col('progress').unique()).shape[0] == 1:
|
if data.collect().select(pl.col('progress').unique()).shape[0] == 1:
|
||||||
return """### Responses Complete: \n\n✅ All responses are complete (progress = 100) """
|
return """## Responses Complete: \n\n✅ All responses are complete (progress = 100) """
|
||||||
|
|
||||||
return "### Responses Complete: \n\n⚠️ There are incomplete responses (progress < 100) ⚠️"
|
return "## Responses Complete: \n\n⚠️ There are incomplete responses (progress < 100) ⚠️"
|
||||||
|
|
||||||
|
|
||||||
def duration_validation(data):
|
def duration_validation(data):
|
||||||
@@ -31,9 +31,9 @@ def duration_validation(data):
|
|||||||
outlier_data = _d.filter(pl.col('outlier_duration') == True).collect()
|
outlier_data = _d.filter(pl.col('outlier_duration') == True).collect()
|
||||||
|
|
||||||
if outlier_data.shape[0] == 0:
|
if outlier_data.shape[0] == 0:
|
||||||
return "### Duration Outliers: \n\n✅ No duration outliers detected"
|
return "## Duration Outliers: \n\n✅ No duration outliers detected"
|
||||||
|
|
||||||
return f"""### Duration Outliers:
|
return f"""## Duration Outliers:
|
||||||
|
|
||||||
**⚠️ Potential outliers detected based on response duration ⚠️**
|
**⚠️ Potential outliers detected based on response duration ⚠️**
|
||||||
|
|
||||||
@@ -69,13 +69,25 @@ def check_straight_liners(data, max_score=3):
|
|||||||
schema_names = data.collect_schema().names()
|
schema_names = data.collect_schema().names()
|
||||||
|
|
||||||
# regex groupings
|
# regex groupings
|
||||||
pattern = re.compile(r"(.*__V\d+)__Choice_\d+")
|
pattern_choice = re.compile(r"(.*__V\d+)__Choice_\d+")
|
||||||
|
pattern_scale = re.compile(r"Voice_Scale_1_10__V\d+")
|
||||||
|
|
||||||
groups = {}
|
groups = {}
|
||||||
|
|
||||||
for col in schema_names:
|
for col in schema_names:
|
||||||
match = pattern.search(col)
|
# Check for Choice pattern (SS_...__Vxx__Choice_y)
|
||||||
if match:
|
match_choice = pattern_choice.search(col)
|
||||||
group_key = match.group(1)
|
if match_choice:
|
||||||
|
group_key = match_choice.group(1)
|
||||||
|
if group_key not in groups:
|
||||||
|
groups[group_key] = []
|
||||||
|
groups[group_key].append(col)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Check for Voice Scale pattern (Voice_Scale_1_10__Vxx)
|
||||||
|
# All of these form a single group "Voice_Scale_1_10"
|
||||||
|
if pattern_scale.search(col):
|
||||||
|
group_key = "Voice_Scale_1_10"
|
||||||
if group_key not in groups:
|
if group_key not in groups:
|
||||||
groups[group_key] = []
|
groups[group_key] = []
|
||||||
groups[group_key].append(col)
|
groups[group_key].append(col)
|
||||||
@@ -86,11 +98,11 @@ def check_straight_liners(data, max_score=3):
|
|||||||
if not multi_attribute_groups:
|
if not multi_attribute_groups:
|
||||||
return "### Straight-lining Checks: \n\nℹ️ No multi-attribute question groups found."
|
return "### Straight-lining Checks: \n\nℹ️ No multi-attribute question groups found."
|
||||||
|
|
||||||
# Cast all involved columns to Int64 (strict=False) to handle potential string columns
|
# Cast all involved columns to Float64 (strict=False) to handle potential string columns
|
||||||
# This prevents "cannot compare string with numeric type" errors
|
# and 1-10 scale floats (e.g. 5.5). Float64 covers integers as well.
|
||||||
all_group_cols = [col for cols in multi_attribute_groups.values() for col in cols]
|
all_group_cols = [col for cols in multi_attribute_groups.values() for col in cols]
|
||||||
data = data.with_columns([
|
data = data.with_columns([
|
||||||
pl.col(col).cast(pl.Int64, strict=False) for col in all_group_cols
|
pl.col(col).cast(pl.Float64, strict=False) for col in all_group_cols
|
||||||
])
|
])
|
||||||
|
|
||||||
# Build expressions
|
# Build expressions
|
||||||
@@ -136,9 +148,18 @@ def check_straight_liners(data, max_score=3):
|
|||||||
filtered = checked_data.filter(pl.col(flag_col))
|
filtered = checked_data.filter(pl.col(flag_col))
|
||||||
|
|
||||||
if filtered.height > 0:
|
if filtered.height > 0:
|
||||||
# Sort group_cols by choice number to ensure order (Choice_1, Choice_2, etc.)
|
# Sort group_cols logic
|
||||||
# Assuming format ends with __Choice_X
|
# If Choice columns, sort by choice number.
|
||||||
sorted_group_cols = sorted(group_cols, key=lambda c: int(c.split('__Choice_')[-1]))
|
# If Voice Scale columns (no Choice_), sort by Voice ID (Vxx)
|
||||||
|
if all("__Choice_" in c for c in group_cols):
|
||||||
|
key_func = lambda c: int(c.split('__Choice_')[-1])
|
||||||
|
else:
|
||||||
|
# Extract digits from Vxx
|
||||||
|
def key_func(c):
|
||||||
|
m = re.search(r"__V(\d+)", c)
|
||||||
|
return int(m.group(1)) if m else 0
|
||||||
|
|
||||||
|
sorted_group_cols = sorted(group_cols, key=key_func)
|
||||||
|
|
||||||
# Select relevant columns: Record ID, Value, and the sorted group columns
|
# Select relevant columns: Record ID, Value, and the sorted group columns
|
||||||
subset = filtered.select(["_recordId", val_col] + sorted_group_cols)
|
subset = filtered.select(["_recordId", val_col] + sorted_group_cols)
|
||||||
@@ -155,7 +176,7 @@ def check_straight_liners(data, max_score=3):
|
|||||||
})
|
})
|
||||||
|
|
||||||
if not outliers:
|
if not outliers:
|
||||||
return f"### Straight-lining Checks: \n\n✅ No straight-liners detected (value <= {max_score})"
|
return f"### Straight-lining Checks: \n\n✅ No straight-liners detected (value <= {max_score})", None
|
||||||
|
|
||||||
outlier_df = pl.DataFrame(outliers)
|
outlier_df = pl.DataFrame(outliers)
|
||||||
|
|
||||||
@@ -291,13 +312,12 @@ def check_straight_liners(data, max_score=3):
|
|||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
return mo.vstack([
|
return (mo.vstack([
|
||||||
mo.md(f"### Straight-lining Checks:\n\n**⚠️ Potential straight-liners detected ⚠️**\n\n"),
|
mo.md(f"**⚠️ Potential straight-liners detected ⚠️**\n\n"),
|
||||||
mo.ui.table(outlier_df),
|
mo.ui.table(outlier_df),
|
||||||
mo.md(analysis_md),
|
mo.md(analysis_md),
|
||||||
mo.md("#### Speaking Style Question Groups"),
|
|
||||||
alt.vconcat(chart_pct, chart_dist).resolve_legend(color="independent")
|
alt.vconcat(chart_pct, chart_dist).resolve_legend(color="independent")
|
||||||
])
|
]), outlier_df)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@@ -311,7 +331,10 @@ if __name__ == "__main__":
|
|||||||
S = JPMCSurvey(RESULTS_FILE, QSF_FILE)
|
S = JPMCSurvey(RESULTS_FILE, QSF_FILE)
|
||||||
data = S.load_data()
|
data = S.load_data()
|
||||||
|
|
||||||
print("Checking Green Blue:")
|
# print("Checking Green Blue:")
|
||||||
print(check_straight_liners(S.get_ss_green_blue(data)[0]))
|
# print(check_straight_liners(S.get_ss_green_blue(data)[0]))
|
||||||
print("Checking Orange Red:")
|
# print("Checking Orange Red:")
|
||||||
print(check_straight_liners(S.get_ss_orange_red(data)[0]))
|
# print(check_straight_liners(S.get_ss_orange_red(data)[0]))
|
||||||
|
|
||||||
|
print("Checking Voice Scale 1-10:")
|
||||||
|
print(check_straight_liners(S.get_voice_scale_1_10(data)[0]))
|
||||||
Reference in New Issue
Block a user