straight line fn dev
This commit is contained in:
@@ -11,15 +11,13 @@ def _():
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from utils import JPMCSurvey, combine_exclusive_columns
|
from utils import JPMCSurvey, combine_exclusive_columns
|
||||||
from plots import plot_average_scores_with_counts, plot_top3_ranking_distribution
|
return JPMCSurvey, combine_exclusive_columns, mo, pl
|
||||||
return (
|
|
||||||
JPMCSurvey,
|
|
||||||
combine_exclusive_columns,
|
@app.cell
|
||||||
mo,
|
def _(mo):
|
||||||
pl,
|
mo.outline()
|
||||||
plot_average_scores_with_counts,
|
return
|
||||||
plot_top3_ranking_distribution,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@app.cell
|
@app.cell
|
||||||
@@ -66,11 +64,10 @@ def _(data, mo, pl):
|
|||||||
def check_progress(data):
|
def check_progress(data):
|
||||||
if data.collect().select(pl.col('progress').unique()).shape[0] == 1:
|
if data.collect().select(pl.col('progress').unique()).shape[0] == 1:
|
||||||
return mo.md("""## ✅ All responses are complete (progress = 100) """)
|
return mo.md("""## ✅ All responses are complete (progress = 100) """)
|
||||||
|
|
||||||
return mo.md("## ⚠️ There are incomplete responses (progress < 100) ⚠️")
|
return mo.md("## ⚠️ There are incomplete responses (progress < 100) ⚠️")
|
||||||
|
|
||||||
check_progress(data)
|
check_progress(data)
|
||||||
|
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
@@ -87,11 +84,11 @@ def _(data, mo, pl):
|
|||||||
std_duration = duration_stats['std_duration'][0]
|
std_duration = duration_stats['std_duration'][0]
|
||||||
upper_outlier_threshold = mean_duration + 3 * std_duration
|
upper_outlier_threshold = mean_duration + 3 * std_duration
|
||||||
lower_outlier_threshold = mean_duration - 3 * std_duration
|
lower_outlier_threshold = mean_duration - 3 * std_duration
|
||||||
|
|
||||||
_d = data.with_columns(
|
_d = data.with_columns(
|
||||||
((pl.col('duration') > upper_outlier_threshold) | (pl.col('duration') < lower_outlier_threshold)).alias('outlier_duration')
|
((pl.col('duration') > upper_outlier_threshold) | (pl.col('duration') < lower_outlier_threshold)).alias('outlier_duration')
|
||||||
)
|
)
|
||||||
|
|
||||||
# Show durations with outlier flag is true
|
# Show durations with outlier flag is true
|
||||||
outlier_data = _d.filter(pl.col('outlier_duration') == True).collect()
|
outlier_data = _d.filter(pl.col('outlier_duration') == True).collect()
|
||||||
|
|
||||||
@@ -105,16 +102,16 @@ def _(data, mo, pl):
|
|||||||
- Upper Outlier Threshold (Mean + 3*Std): {upper_outlier_threshold:.2f} seconds
|
- Upper Outlier Threshold (Mean + 3*Std): {upper_outlier_threshold:.2f} seconds
|
||||||
- Lower Outlier Threshold (Mean - 3*Std): {lower_outlier_threshold:.2f} seconds
|
- Lower Outlier Threshold (Mean - 3*Std): {lower_outlier_threshold:.2f} seconds
|
||||||
- Number of Outlier Responses: {outlier_data.shape[0]}
|
- Number of Outlier Responses: {outlier_data.shape[0]}
|
||||||
|
|
||||||
Outliers:
|
Outliers:
|
||||||
|
|
||||||
{mo.ui.table(outlier_data)}
|
{mo.ui.table(outlier_data)}
|
||||||
|
|
||||||
|
|
||||||
**⚠️ NOTE: These have not been removed from the dataset ⚠️**
|
**⚠️ NOTE: These have not been removed from the dataset ⚠️**
|
||||||
|
|
||||||
""")
|
""")
|
||||||
|
|
||||||
duration_validation(data)
|
duration_validation(data)
|
||||||
return
|
return
|
||||||
|
|
||||||
@@ -229,10 +226,18 @@ def _(mo):
|
|||||||
@app.cell
|
@app.cell
|
||||||
def _(data, survey):
|
def _(data, survey):
|
||||||
_lf, _choice_map = survey.get_ss_green_blue(data)
|
_lf, _choice_map = survey.get_ss_green_blue(data)
|
||||||
|
# _lf.collect()
|
||||||
print(_lf.collect().head())
|
print(_lf.collect().head())
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
|
||||||
|
def _(df):
|
||||||
|
|
||||||
|
df
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
@app.cell
|
@app.cell
|
||||||
def _(mo):
|
def _(mo):
|
||||||
mo.md(r"""
|
mo.md(r"""
|
||||||
@@ -297,7 +302,6 @@ def _(data, survey):
|
|||||||
traits_refined = survey.get_character_refine(data)[0]
|
traits_refined = survey.get_character_refine(data)[0]
|
||||||
|
|
||||||
traits_refined.collect()
|
traits_refined.collect()
|
||||||
|
|
||||||
return (traits_refined,)
|
return (traits_refined,)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ def _():
|
|||||||
import polars as pl
|
import polars as pl
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from validation import check_progress, duration_validation
|
from validation import check_progress, duration_validation, check_straight_liners
|
||||||
from utils import JPMCSurvey, combine_exclusive_columns, calculate_weighted_ranking_scores
|
from utils import JPMCSurvey, combine_exclusive_columns, calculate_weighted_ranking_scores
|
||||||
import utils
|
import utils
|
||||||
|
|
||||||
@@ -28,6 +28,18 @@ def _():
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell(hide_code=True)
|
||||||
|
def _(mo):
|
||||||
|
mo.outline(label="Table of Contents")
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
|
||||||
|
def _():
|
||||||
|
# Select Dataset
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
@app.cell
|
@app.cell
|
||||||
def _(mo):
|
def _(mo):
|
||||||
file_browser = mo.ui.file_browser(
|
file_browser = mo.ui.file_browser(
|
||||||
@@ -63,21 +75,27 @@ def _(JPMCSurvey, QSF_FILE, RESULTS_FILE, mo):
|
|||||||
return S, data_all
|
return S, data_all
|
||||||
|
|
||||||
|
|
||||||
@app.cell(hide_code=True)
|
@app.cell
|
||||||
def _(Path, RESULTS_FILE, data_all, mo):
|
def _():
|
||||||
mo.md(f"""
|
# check_straight_liners(S.get_ss_green_blue(data_all)[0])
|
||||||
---
|
|
||||||
|
|
||||||
# Load Data
|
|
||||||
|
|
||||||
**Dataset:** `{Path(RESULTS_FILE).name}`
|
|
||||||
|
|
||||||
{mo.ui.table(data_all.collect())}
|
|
||||||
""")
|
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
@app.cell(hide_code=True)
|
@app.cell(hide_code=True)
|
||||||
|
def _(Path, RESULTS_FILE, mo):
|
||||||
|
mo.md(f"""
|
||||||
|
|
||||||
|
---
|
||||||
|
# Load Data
|
||||||
|
|
||||||
|
**Dataset:** `{Path(RESULTS_FILE).name}`
|
||||||
|
|
||||||
|
|
||||||
|
""")
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
|
||||||
def _(check_progress, data_all, duration_validation, mo):
|
def _(check_progress, data_all, duration_validation, mo):
|
||||||
mo.md(f"""
|
mo.md(f"""
|
||||||
## Data Validation
|
## Data Validation
|
||||||
@@ -87,14 +105,9 @@ def _(check_progress, data_all, duration_validation, mo):
|
|||||||
|
|
||||||
|
|
||||||
{duration_validation(data_all)}
|
{duration_validation(data_all)}
|
||||||
""")
|
|
||||||
return
|
|
||||||
|
|
||||||
|
|
||||||
@app.cell(hide_code=True)
|
|
||||||
def _(mo):
|
|
||||||
mo.md(r"""
|
|
||||||
### ⚠️ ToDo: "straight-liner" detection and removal
|
|
||||||
""")
|
""")
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
Running on Ct-105 for shared access:
|
Running on Ct-105 for shared access:
|
||||||
|
|
||||||
```
|
```
|
||||||
uv run marimo edit --headless --port 8080 --host ct-105.tail44fa00.ts.net
|
uv run marimo run 02_quant_analysis.py --headless --port 8080
|
||||||
```
|
```
|
||||||
109
validation.py
109
validation.py
@@ -36,11 +36,13 @@ def duration_validation(data):
|
|||||||
|
|
||||||
**⚠️ Potential outliers detected based on response duration ⚠️**
|
**⚠️ Potential outliers detected based on response duration ⚠️**
|
||||||
|
|
||||||
- Mean Duration: {mean_duration:.2f} seconds (approximately {mean_duration/60:.2f} minutes)
|
| Metric | Value |
|
||||||
- Standard Deviation of Duration: {std_duration:.2f} seconds
|
|--------|-------|
|
||||||
- Upper Outlier Threshold (Mean + 3*Std): {upper_outlier_threshold:.2f} seconds
|
| Mean Duration | {mean_duration:.2f} seconds (approximately {mean_duration/60:.2f} minutes) |
|
||||||
- Lower Outlier Threshold (Mean - 3*Std): {lower_outlier_threshold:.2f} seconds
|
| Standard Deviation of Duration | {std_duration:.2f} seconds |
|
||||||
- Number of Outlier Responses: {outlier_data.shape[0]}
|
| Upper Outlier Threshold (Mean + 3*Std) | {upper_outlier_threshold:.2f} seconds |
|
||||||
|
| Lower Outlier Threshold (Mean - 3*Std) | {lower_outlier_threshold:.2f} seconds |
|
||||||
|
| Number of Outlier Responses | {outlier_data.shape[0]} |
|
||||||
|
|
||||||
Outliers:
|
Outliers:
|
||||||
|
|
||||||
@@ -50,4 +52,101 @@ def duration_validation(data):
|
|||||||
**⚠️ NOTE: These have not been removed from the dataset ⚠️**
|
**⚠️ NOTE: These have not been removed from the dataset ⚠️**
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def check_straight_liners(data, max_score=3):
    """
    Check for straight-lining behavior (selecting same value for all attributes).

    Columns are grouped by the naming pattern ``<prefix>__V<digits>__Choice_<digits>``;
    every column sharing the same ``<prefix>__V<digits>`` part forms one question
    group. A respondent is flagged for a group when all of their non-null answers
    in that group are identical and that value is ``<= max_score``.

    Args:
        data: Polars LazyFrame. Must expose a ``_recordId`` column, which is
            used to identify the flagged respondents.
        max_score: The maximum score that is flagged if straight-lined
            (e.g., if 4, then 5s are allowed).

    Returns:
        A markdown string: either an informational / success message, or a
        warning section embedding a table of the flagged
        (record, question group, value) rows.
    """
    import re

    # Detect column groups based on pattern SS_...__Vxx__Choice_y
    pattern = re.compile(r"(.*__V\d+)__Choice_\d+")
    groups = {}
    for col in data.collect_schema().names():
        match = pattern.search(col)
        if match:
            groups.setdefault(match.group(1), []).append(col)

    # A single-column group cannot be "straight-lined"; keep multi-attribute groups only.
    multi_attribute_groups = {k: v for k, v in groups.items() if len(v) > 1}

    if not multi_attribute_groups:
        return "### Straight-lining Checks: \n\nℹ️ No multi-attribute question groups found."

    # Build one flag expression and one value expression per group.
    expressions = []
    for key, cols in multi_attribute_groups.items():
        answered = pl.concat_list(cols).list.drop_nulls()

        # Use .list.min() instead of .list.get(0) to avoid "index out of bounds"
        # on empty lists. When n_unique == 1, min() is that single value; for an
        # empty list min() is null, which is safe.
        safe_val = answered.list.min()

        is_straight = (
            (answered.list.len() > 0)
            & (answered.list.n_unique() == 1)
            & (safe_val <= max_score)
        ).alias(f"__is_straight__{key}")

        expressions.extend([is_straight, safe_val.alias(f"__val__{key}")])

    # Only _recordId and the derived check columns are read below, so project
    # them before collecting instead of materialising every survey column.
    checked_data = data.select([pl.col("_recordId"), *expressions]).collect()

    # Flatten the per-group flags into one row per (respondent, group).
    outliers = []
    for key in multi_attribute_groups:
        flagged = checked_data.filter(pl.col(f"__is_straight__{key}"))
        outliers.extend(
            {"Record ID": record_id, "Question Group": key, "Value": value}
            for record_id, value in flagged.select(["_recordId", f"__val__{key}"]).rows()
        )

    if not outliers:
        return f"### Straight-lining Checks: \n\n✅ No straight-liners detected (value <= {max_score})"

    outlier_df = pl.DataFrame(outliers)

    return f"""### Straight-lining Checks:

**⚠️ Potential straight-liners detected ⚠️**

Respondents selected the same value (<= {max_score}) for all attributes in the following groups:

{mo.ui.table(outlier_df)}
"""
|
||||||
|
|
||||||
|
|
||||||
Reference in New Issue
Block a user