straight line fn dev

This commit is contained in:
2026-01-29 13:20:32 +01:00
parent 70719702ec
commit bc12df28a5
4 changed files with 160 additions and 44 deletions

View File

@@ -11,15 +11,13 @@ def _():
from pathlib import Path from pathlib import Path
from utils import JPMCSurvey, combine_exclusive_columns from utils import JPMCSurvey, combine_exclusive_columns
from plots import plot_average_scores_with_counts, plot_top3_ranking_distribution return JPMCSurvey, combine_exclusive_columns, mo, pl
return (
JPMCSurvey,
combine_exclusive_columns, @app.cell
mo, def _(mo):
pl, mo.outline()
plot_average_scores_with_counts, return
plot_top3_ranking_distribution,
)
@app.cell @app.cell
@@ -70,7 +68,6 @@ def _(data, mo, pl):
return mo.md("## ⚠️ There are incomplete responses (progress < 100) ⚠️") return mo.md("## ⚠️ There are incomplete responses (progress < 100) ⚠️")
check_progress(data) check_progress(data)
return return
@@ -229,10 +226,18 @@ def _(mo):
@app.cell @app.cell
def _(data, survey): def _(data, survey):
_lf, _choice_map = survey.get_ss_green_blue(data) _lf, _choice_map = survey.get_ss_green_blue(data)
# _lf.collect()
print(_lf.collect().head()) print(_lf.collect().head())
return return
@app.cell
def _(df):
df
return
@app.cell @app.cell
def _(mo): def _(mo):
mo.md(r""" mo.md(r"""
@@ -297,7 +302,6 @@ def _(data, survey):
traits_refined = survey.get_character_refine(data)[0] traits_refined = survey.get_character_refine(data)[0]
traits_refined.collect() traits_refined.collect()
return (traits_refined,) return (traits_refined,)

View File

@@ -10,7 +10,7 @@ def _():
import polars as pl import polars as pl
from pathlib import Path from pathlib import Path
from validation import check_progress, duration_validation from validation import check_progress, duration_validation, check_straight_liners
from utils import JPMCSurvey, combine_exclusive_columns, calculate_weighted_ranking_scores from utils import JPMCSurvey, combine_exclusive_columns, calculate_weighted_ranking_scores
import utils import utils
@@ -28,6 +28,18 @@ def _():
) )
@app.cell(hide_code=True)
def _(mo):
mo.outline(label="Table of Contents")
return
@app.cell
def _():
# Select Dataset
return
@app.cell @app.cell
def _(mo): def _(mo):
file_browser = mo.ui.file_browser( file_browser = mo.ui.file_browser(
@@ -63,21 +75,27 @@ def _(JPMCSurvey, QSF_FILE, RESULTS_FILE, mo):
return S, data_all return S, data_all
@app.cell(hide_code=True) @app.cell
def _(Path, RESULTS_FILE, data_all, mo): def _():
mo.md(f""" # check_straight_liners(S.get_ss_green_blue(data_all)[0])
---
# Load Data
**Dataset:** `{Path(RESULTS_FILE).name}`
{mo.ui.table(data_all.collect())}
""")
return return
@app.cell(hide_code=True) @app.cell(hide_code=True)
def _(Path, RESULTS_FILE, mo):
mo.md(f"""
---
# Load Data
**Dataset:** `{Path(RESULTS_FILE).name}`
""")
return
@app.cell
def _(check_progress, data_all, duration_validation, mo): def _(check_progress, data_all, duration_validation, mo):
mo.md(f""" mo.md(f"""
## Data Validation ## Data Validation
@@ -87,14 +105,9 @@ def _(check_progress, data_all, duration_validation, mo):
{duration_validation(data_all)} {duration_validation(data_all)}
""")
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
### ⚠️ ToDo: "straight-liner" detection and removal
""") """)
return return

View File

@@ -1,5 +1,5 @@
Running on Ct-105 for shared access: Running on Ct-105 for shared access:
``` ```
uv run marimo edit --headless --port 8080 --host ct-105.tail44fa00.ts.net uv run marimo run 02_quant_analysis.py --headless --port 8080
``` ```

View File

@@ -36,11 +36,13 @@ def duration_validation(data):
**⚠️ Potential outliers detected based on response duration ⚠️** **⚠️ Potential outliers detected based on response duration ⚠️**
- Mean Duration: {mean_duration:.2f} seconds (approximately {mean_duration/60:.2f} minutes) | Metric | Value |
- Standard Deviation of Duration: {std_duration:.2f} seconds |--------|-------|
- Upper Outlier Threshold (Mean + 3*Std): {upper_outlier_threshold:.2f} seconds | Mean Duration | {mean_duration:.2f} seconds (approximately {mean_duration/60:.2f} minutes) |
- Lower Outlier Threshold (Mean - 3*Std): {lower_outlier_threshold:.2f} seconds | Standard Deviation of Duration | {std_duration:.2f} seconds |
- Number of Outlier Responses: {outlier_data.shape[0]} | Upper Outlier Threshold (Mean + 3*Std) | {upper_outlier_threshold:.2f} seconds |
| Lower Outlier Threshold (Mean - 3*Std) | {lower_outlier_threshold:.2f} seconds |
| Number of Outlier Responses | {outlier_data.shape[0]} |
Outliers: Outliers:
@@ -51,3 +53,100 @@ def duration_validation(data):
""" """
def check_straight_liners(data, max_score=3):
    """
    Check for straight-lining behavior (selecting same value for all attributes).

    Columns are grouped by the naming pattern ``<prefix>__V<n>__Choice_<m>``;
    the ``<prefix>__V<n>`` part is the group key. Only groups with more than
    one column are checked. A respondent is flagged for a group when, after
    dropping nulls, at least one answer remains, all remaining answers are
    identical, and that shared value is ``<= max_score``.

    Args:
        data: Polars LazyFrame. Flagged respondents are reported by their
            ``_recordId`` column.
        max_score: The maximum score that is flagged if straight-lined
            (e.g., if 4, then 5s are allowed).

    Returns:
        A Markdown string: a notice that no multi-attribute groups exist, a
        notice that no straight-liners were detected, or a warning that embeds
        a table with one row per (Record ID, Question Group, Value).
    """
    import re

    schema_names = data.collect_schema().names()

    # Group columns by everything up to and including the "__V<n>" segment.
    pattern = re.compile(r"(.*__V\d+)__Choice_\d+")
    groups = {}
    for col in schema_names:
        match = pattern.search(col)
        if match:
            groups.setdefault(match.group(1), []).append(col)

    # A single-column group cannot be straight-lined.
    multi_attribute_groups = {k: v for k, v in groups.items() if len(v) > 1}
    if not multi_attribute_groups:
        return "### Straight-lining Checks: \n\n No multi-attribute question groups found."

    expressions = []
    for key, cols in multi_attribute_groups.items():
        answered = pl.concat_list(cols).list.drop_nulls()
        # min() rather than get(0): on an empty list it yields null instead of
        # an out-of-bounds error, and when n_unique == 1 it is the shared value.
        shared_value = answered.list.min()
        # NOTE(review): a respondent with exactly one non-null answer in the
        # group also satisfies these conditions - confirm that is intended.
        is_straight = (
            (answered.list.len() > 0)
            & (answered.list.n_unique() == 1)
            & (shared_value <= max_score)
        ).alias(f"__is_straight__{key}")
        expressions.extend([is_straight, shared_value.alias(f"__val__{key}")])

    # Project only the id and the computed check columns before collecting, so
    # the rest of the (potentially wide) survey frame is never materialised.
    # The id column is included only if present, so a frame without it still
    # fails only when flagged rows actually have to be reported.
    id_columns = [name for name in ("_recordId",) if name in schema_names]
    checked_data = data.select([*id_columns, *expressions]).collect()

    # Flatten the per-group flags into one row per flagged (respondent, group).
    outliers = []
    for key in multi_attribute_groups:
        flagged = checked_data.filter(pl.col(f"__is_straight__{key}"))
        if flagged.height == 0:
            continue
        outliers.extend(
            {"Record ID": record_id, "Question Group": key, "Value": value}
            for record_id, value in flagged.select(["_recordId", f"__val__{key}"]).rows()
        )

    if not outliers:
        return f"### Straight-lining Checks: \n\n✅ No straight-liners detected (value <= {max_score})"

    outlier_df = pl.DataFrame(outliers)
    return f"""### Straight-lining Checks:
**⚠️ Potential straight-liners detected ⚠️**
Respondents selected the same value (<= {max_score}) for all attributes in the following groups:
{mo.ui.table(outlier_df)}
"""