SL validation complete

This commit is contained in:
2026-01-29 20:39:16 +01:00
parent c1729d4896
commit 8aee09f968
4 changed files with 155 additions and 31 deletions

View File

@@ -74,6 +74,13 @@ def _(Path, RESULTS_FILE, data_all, mo):
return
@app.cell
def _():
sl_ss_max_score = 5
sl_v1_10_max_score = 10
return sl_ss_max_score, sl_v1_10_max_score
@app.cell
def _(
S,
@@ -82,12 +89,20 @@ def _(
data_all,
duration_validation,
mo,
sl_ss_max_score,
sl_v1_10_max_score,
):
_ss_all = S.get_ss_green_blue(data_all)[0].join(S.get_ss_orange_red(data_all)[0], on='_recordId')
sl_content = check_straight_liners(_ss_all, max_score=5)
_sl_ss_c, sl_ss_df = check_straight_liners(_ss_all, max_score=sl_ss_max_score)
_sl_v1_10_c, sl_v1_10_df = check_straight_liners(
S.get_voice_scale_1_10(data_all)[0],
max_score=sl_v1_10_max_score
)
mo.md(f"""
## Data Validation
# Data Validation
{check_progress(data_all)}
@@ -96,12 +111,30 @@ def _(
{duration_validation(data_all)}
{sl_content}
## Speaking Style - Straight Liners
{_sl_ss_c}
## Voice Score Scale 1-10 - Straight Liners
{_sl_v1_10_c}
""")
return
@app.cell
def _(data_all):
# # Drop any Voice Scale 1-10 responses with straight-lining, using sl_v1_10_df _responseId values
# records_to_drop = sl_v1_10_df.select('Record ID').to_series().to_list()
# data_validated = data_all.filter(~pl.col('_recordId').is_in(records_to_drop))
# mo.md(f"""
# Dropped `{len(records_to_drop)}` responses with straight-lining in Voice Scale 1-10 evaluation.
# """)
data_validated = data_all
return (data_validated,)
@app.cell(hide_code=True)
def _(S, mo):
filter_form = mo.md('''
@@ -138,9 +171,9 @@ def _(S, mo):
@app.cell
def _(S, data_all, filter_form, mo):
def _(S, data_validated, filter_form, mo):
mo.stop(filter_form.value is None, mo.md("**Please submit filter above to proceed**"))
_d = S.filter_data(data_all, age=filter_form.value['age'], gender=filter_form.value['gender'], income=filter_form.value['income'], ethnicity=filter_form.value['ethnicity'], consumer=filter_form.value['consumer'])
_d = S.filter_data(data_validated, age=filter_form.value['age'], gender=filter_form.value['gender'], income=filter_form.value['income'], ethnicity=filter_form.value['ethnicity'], consumer=filter_form.value['consumer'])
# Stop execution and prevent other cells from running if no data is selected
mo.stop(len(_d.collect()) == 0, mo.md("**No Data available for current filter combination**"))
@@ -363,8 +396,16 @@ def _(S, mo, vscales):
return
@app.cell(hide_code=True)
def _():
@app.cell
def _(vscales):
target_cols=[c for c in vscales.columns if c not in ['_recordId']]
target_cols
return (target_cols,)
@app.cell
def _(target_cols, utils, vscales):
vscales_row_norm = utils.normalize_row_values(vscales.collect(), target_cols=target_cols)
return

View File

@@ -205,7 +205,7 @@ def _(mo):
@app.cell
def _(data, survey):
vscales = survey.get_voice_scale_1_10(data)[0].collect()
vscales
print(vscales.head())
return (vscales,)

View File

@@ -349,6 +349,66 @@ def calculate_weighted_ranking_scores(df: pl.LazyFrame) -> pl.DataFrame:
return pl.DataFrame(scores).sort('Weighted Score', descending=True)
def normalize_row_values(df: pl.DataFrame, target_cols: list[str]) -> pl.DataFrame:
    """
    Standardize the given columns within each row: (x - row_mean) / row_std.

    Null values are ignored by the row statistics; a row with fewer than two
    non-null entries comes back as nulls, since the sample standard deviation
    (ddof=1) is undefined for it.
    """
    tmp_col = "_normalized_values"
    # Gather the target columns into one list column and standardize every
    # row's values in a single list.eval pass (std(ddof=1) = sample std).
    standardized = (
        pl.concat_list(target_cols)
        .list.eval((pl.element() - pl.element().mean()) / pl.element().std(ddof=1))
        .alias(tmp_col)
    )
    out = df.with_columns(standardized)
    # Scatter the list elements back onto their source columns: element idx
    # of the temporary list corresponds to target_cols[idx].
    restored = [
        pl.col(tmp_col).list.get(idx).alias(name)
        for idx, name in enumerate(target_cols)
    ]
    return out.with_columns(restored).drop(tmp_col)
def normalize_global_values(df: pl.DataFrame, target_cols: list[str]) -> pl.DataFrame:
    """
    Standardize the given columns with one global mean/std: (x - mean) / std.

    A single mean and standard deviation are computed across ALL values in
    target_cols (nulls ignored) and applied to every column. If target_cols
    is empty, or the global std is null/zero (all-null or constant data),
    the input is returned unchanged. A LazyFrame input is collected for the
    scalar extraction and handed back lazy again.
    """
    # Work eagerly so the scalar stats can be pulled out below.
    is_lazy_input = isinstance(df, pl.LazyFrame)
    if is_lazy_input:
        df = df.collect()

    if not target_cols:
        return df.lazy() if is_lazy_input else df

    # Stack every target column into a single "value" column, then reduce
    # it once for the global statistics.
    stats = df.select(target_cols).melt().select([
        pl.col("value").mean().alias("mean"),
        pl.col("value").std().alias("std"),
    ])
    mean_all = stats["mean"][0]
    std_all = stats["std"][0]

    # Degenerate spread: nothing meaningful to normalize against.
    if std_all is None or std_all == 0:
        return df.lazy() if is_lazy_input else df

    normalized = df.with_columns([
        ((pl.col(name) - mean_all) / std_all).alias(name)
        for name in target_cols
    ])
    return normalized.lazy() if is_lazy_input else normalized
class JPMCSurvey(JPMCPlotsMixin):
"""Class to handle JPMorgan Chase survey data."""

View File

@@ -6,9 +6,9 @@ from theme import ColorPalette
def check_progress(data):
"""Check if all responses are complete based on 'progress' column."""
if data.collect().select(pl.col('progress').unique()).shape[0] == 1:
return """### Responses Complete: \n\n✅ All responses are complete (progress = 100) """
return """## Responses Complete: \n\n✅ All responses are complete (progress = 100) """
return "### Responses Complete: \n\n⚠️ There are incomplete responses (progress < 100) ⚠️"
return "## Responses Complete: \n\n⚠️ There are incomplete responses (progress < 100) ⚠️"
def duration_validation(data):
@@ -31,9 +31,9 @@ def duration_validation(data):
outlier_data = _d.filter(pl.col('outlier_duration') == True).collect()
if outlier_data.shape[0] == 0:
return "### Duration Outliers: \n\n✅ No duration outliers detected"
return "## Duration Outliers: \n\n✅ No duration outliers detected"
return f"""### Duration Outliers:
return f"""## Duration Outliers:
**⚠️ Potential outliers detected based on response duration ⚠️**
@@ -69,13 +69,25 @@ def check_straight_liners(data, max_score=3):
schema_names = data.collect_schema().names()
# regex groupings
pattern = re.compile(r"(.*__V\d+)__Choice_\d+")
pattern_choice = re.compile(r"(.*__V\d+)__Choice_\d+")
pattern_scale = re.compile(r"Voice_Scale_1_10__V\d+")
groups = {}
for col in schema_names:
match = pattern.search(col)
if match:
group_key = match.group(1)
# Check for Choice pattern (SS_...__Vxx__Choice_y)
match_choice = pattern_choice.search(col)
if match_choice:
group_key = match_choice.group(1)
if group_key not in groups:
groups[group_key] = []
groups[group_key].append(col)
continue
# Check for Voice Scale pattern (Voice_Scale_1_10__Vxx)
# All of these form a single group "Voice_Scale_1_10"
if pattern_scale.search(col):
group_key = "Voice_Scale_1_10"
if group_key not in groups:
groups[group_key] = []
groups[group_key].append(col)
@@ -86,11 +98,11 @@ def check_straight_liners(data, max_score=3):
if not multi_attribute_groups:
return "### Straight-lining Checks: \n\n No multi-attribute question groups found."
# Cast all involved columns to Int64 (strict=False) to handle potential string columns
# This prevents "cannot compare string with numeric type" errors
# Cast all involved columns to Float64 (strict=False) to handle potential string columns
# and 1-10 scale floats (e.g. 5.5). Float64 covers integers as well.
all_group_cols = [col for cols in multi_attribute_groups.values() for col in cols]
data = data.with_columns([
pl.col(col).cast(pl.Int64, strict=False) for col in all_group_cols
pl.col(col).cast(pl.Float64, strict=False) for col in all_group_cols
])
# Build expressions
@@ -136,9 +148,18 @@ def check_straight_liners(data, max_score=3):
filtered = checked_data.filter(pl.col(flag_col))
if filtered.height > 0:
# Sort group_cols by choice number to ensure order (Choice_1, Choice_2, etc.)
# Assuming format ends with __Choice_X
sorted_group_cols = sorted(group_cols, key=lambda c: int(c.split('__Choice_')[-1]))
# Sort group_cols logic
# If Choice columns, sort by choice number.
# If Voice Scale columns (no Choice_), sort by Voice ID (Vxx)
if all("__Choice_" in c for c in group_cols):
key_func = lambda c: int(c.split('__Choice_')[-1])
else:
# Extract digits from Vxx
def key_func(c):
m = re.search(r"__V(\d+)", c)
return int(m.group(1)) if m else 0
sorted_group_cols = sorted(group_cols, key=key_func)
# Select relevant columns: Record ID, Value, and the sorted group columns
subset = filtered.select(["_recordId", val_col] + sorted_group_cols)
@@ -155,7 +176,7 @@ def check_straight_liners(data, max_score=3):
})
if not outliers:
return f"### Straight-lining Checks: \n\n✅ No straight-liners detected (value <= {max_score})"
return f"### Straight-lining Checks: \n\n✅ No straight-liners detected (value <= {max_score})", None
outlier_df = pl.DataFrame(outliers)
@@ -291,13 +312,12 @@ def check_straight_liners(data, max_score=3):
"""
return mo.vstack([
mo.md(f"### Straight-lining Checks:\n\n**⚠️ Potential straight-liners detected ⚠️**\n\n"),
return (mo.vstack([
mo.md(f"**⚠️ Potential straight-liners detected ⚠️**\n\n"),
mo.ui.table(outlier_df),
mo.md(analysis_md),
mo.md("#### Speaking Style Question Groups"),
alt.vconcat(chart_pct, chart_dist).resolve_legend(color="independent")
])
]), outlier_df)
@@ -311,7 +331,10 @@ if __name__ == "__main__":
S = JPMCSurvey(RESULTS_FILE, QSF_FILE)
data = S.load_data()
print("Checking Green Blue:")
print(check_straight_liners(S.get_ss_green_blue(data)[0]))
print("Checking Orange Red:")
print(check_straight_liners(S.get_ss_orange_red(data)[0]))
# print("Checking Green Blue:")
# print(check_straight_liners(S.get_ss_green_blue(data)[0]))
# print("Checking Orange Red:")
# print(check_straight_liners(S.get_ss_orange_red(data)[0]))
print("Checking Voice Scale 1-10:")
print(check_straight_liners(S.get_voice_scale_1_10(data)[0]))