# File: JPMC-quant/99_example_ingest_qualtrics_export.py
# Exported 2026-01-29 20:39:16 +01:00 — 325 lines, 6.2 KiB, Python.
# (Converted the pasted file-listing header into a comment so the module parses.)
import marimo

# Version of marimo that generated this notebook file.
__generated_with = "0.19.2"
# Reactive notebook app; "medium" fixes the rendered cell width.
app = marimo.App(width="medium")
@app.cell
def _():
    # Shared imports for the notebook; the returned names feed marimo's
    # reactive dependency graph and are available to every downstream cell.
    # `from pathlib import Path` was removed: it was never used in this cell
    # and was not returned, so no other cell could have referenced it.
    import marimo as mo
    import polars as pl
    from utils import JPMCSurvey, combine_exclusive_columns
    return JPMCSurvey, combine_exclusive_columns, mo, pl
@app.cell
def _(mo):
    # Table of contents built from the markdown headers in the cells below.
    _toc = mo.outline()
    _toc
    return
@app.cell
def _():
    # Qualtrics soft-launch export (the "Labels" CSV variant) and the matching
    # survey-definition (.qsf) file. Paths are relative to the working
    # directory the notebook is launched from.
    RESULTS_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase Brand Personality_Quant Round 1_January 21, 2026_Soft Launch_Labels.csv'
    QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'
    # Earlier test-data export, kept for quick switching during development:
    # RESULTS_FILE = 'data/exports/OneDrive_1_1-16-2026/JPMC_Chase Brand Personality_Quant Round 1_TestData_Labels.csv'
    return QSF_FILE, RESULTS_FILE
@app.cell
def _(JPMCSurvey, QSF_FILE, RESULTS_FILE):
    # Build the survey wrapper from the results CSV plus its QSF definition.
    # load_data() evidently returns a lazy frame (it is .collect()-ed both
    # here and in later cells); the collect here only renders a preview.
    survey = JPMCSurvey(RESULTS_FILE, QSF_FILE)
    data = survey.load_data()
    data.collect()
    return data, survey
@app.cell
def _(survey):
    # Display the survey's question-id -> description mapping.
    _qid_map = survey.qid_descr_map
    _qid_map
    return
@app.cell
def _(data):
    # Replaces a stale unparsable scratch cell whose source was the dangling
    # expression `data.` (a syntax error recorded via app._unparsable_cell).
    # Preview the loaded dataset instead.
    data
    return
@app.cell
def _(mo):
    # Section header: data validation checks.
    _header = mo.md(r"""
# Data Validation
""")
    _header
    return
@app.cell
def _(data, mo, pl):
    def check_progress(data):
        """Report whether every response is complete (progress == 100).

        Bug fix: the previous version only checked that `progress` had a
        single unique value, which would also report "all complete" if every
        response were stuck at, say, 50.
        """
        uniques = data.collect().select(pl.col('progress').unique()).to_series().to_list()
        # Compare as strings: the "Labels" CSV export may load `progress` as
        # either an integer or a string column — TODO confirm actual dtype.
        if len(uniques) == 1 and str(uniques[0]) == '100':
            return mo.md("""## ✅ All responses are complete (progress = 100) """)
        return mo.md("## ⚠️ There are incomplete responses (progress < 100) ⚠️")

    check_progress(data)
    return
@app.cell
def _(data, mo, pl):
    def duration_validation(data):
        """Flag responses whose duration is more than 3 std devs from the mean.

        Outliers are only reported, never removed from the dataset. Note: with
        a single response, std() is null and no outliers can be flagged.
        """
        duration_stats = data.select(
            pl.col('duration').mean().alias('mean_duration'),
            pl.col('duration').std().alias('std_duration'),
        ).collect()
        mean_duration = duration_stats['mean_duration'][0]
        std_duration = duration_stats['std_duration'][0]
        upper_outlier_threshold = mean_duration + 3 * std_duration
        lower_outlier_threshold = mean_duration - 3 * std_duration
        # Filter on the boolean expression directly instead of materialising a
        # throwaway `outlier_duration` flag column and comparing it `== True`.
        outlier_data = data.filter(
            (pl.col('duration') > upper_outlier_threshold)
            | (pl.col('duration') < lower_outlier_threshold)
        ).collect()
        if outlier_data.shape[0] == 0:
            return mo.md("## ✅ No duration outliers detected")
        return mo.md(f"""
## ⚠️ Duration Outliers Detected ⚠️
- Mean Duration: {mean_duration:.2f} seconds (approximately {mean_duration/60:.2f} minutes)
- Standard Deviation of Duration: {std_duration:.2f} seconds
- Upper Outlier Threshold (Mean + 3*Std): {upper_outlier_threshold:.2f} seconds
- Lower Outlier Threshold (Mean - 3*Std): {lower_outlier_threshold:.2f} seconds
- Number of Outlier Responses: {outlier_data.shape[0]}
Outliers:
{mo.ui.table(outlier_data)}
**⚠️ NOTE: These have not been removed from the dataset ⚠️**
""")

    duration_validation(data)
    return
@app.cell
def _(mo):
    # Section header: analysis results.
    _header = mo.md(r"""
# Data Analysis
""")
    _header
    return
@app.cell
def _(mo):
    # Subsection header.
    _header = mo.md(r"""
## Demographics
""")
    _header
    return
@app.cell
def _(data, survey):
    # First element of get_demographics() is the lazy frame; collect to show it.
    _demographics_lf = survey.get_demographics(data)[0]
    _demographics_lf.collect()
    return
@app.cell
def _(mo):
    # Subsection header.
    _header = mo.md(r"""
## Top 8 traits
""")
    _header
    return
@app.cell
def _(data, survey):
    # Collect and display the top-8 traits table.
    _top8_lf = survey.get_top_8_traits(data)[0]
    _top8_lf.collect()
    return
@app.cell
def _(mo):
    # Subsection header.
    _header = mo.md(r"""
## Top 3 traits
""")
    _header
    return
@app.cell
def _(data, survey):
    # Collect and display the top-3 traits table.
    _top3_lf = survey.get_top_3_traits(data)[0]
    _top3_lf.collect()
    return
@app.cell
def _(mo):
    # Subsection header.
    _header = mo.md(r"""
## Character Ranking
""")
    _header
    return
@app.cell
def _(data, survey):
    # Collect and display the character-ranking table.
    _ranking_lf = survey.get_character_ranking(data)[0]
    _ranking_lf.collect()
    return
@app.cell
def _(mo):
    # Subsection header.
    _header = mo.md(r"""
## Voices 18 -> 8 -> 3
""")
    _header
    return
@app.cell
def _(data, survey):
    # Collect and display the 18 -> 8 -> 3 voices funnel table.
    _funnel_lf = survey.get_18_8_3(data)[0]
    _funnel_lf.collect()
    return
@app.cell
def _(mo):
    # Subsection header.
    _header = mo.md(r"""
## Voice Scales 1-10
""")
    _header
    return
@app.cell
def _(data, survey):
    # Collected voice-scale (1-10) responses; previewed here and consumed by
    # the plotting cell below via the returned `vscales` name.
    vscales = survey.get_voice_scale_1_10(data)[0].collect()
    print(vscales.head())
    return (vscales,)
@app.cell
def _(plot_average_scores_with_counts, vscales):
    # Render the average-scores-with-counts chart for the voice scales
    # (plot helper is defined elsewhere in the project).
    _chart = plot_average_scores_with_counts(vscales, x_label='Voice', width=1000)
    _chart
    return
@app.cell
def _(mo):
    # Subsection header.
    _header = mo.md(r"""
## SS Green Blue
""")
    _header
    return
@app.cell
def _(data, survey):
    # Preview the green/blue side-by-side results. The choice map returned
    # alongside the lazy frame is unused here, so it is discarded; the
    # commented-out duplicate `_lf.collect()` line was removed.
    _lf, _ = survey.get_ss_green_blue(data)
    print(_lf.collect().head())
    return
@app.cell
def _(df):
    # NOTE(review): `df` is not defined in any cell visible in this notebook;
    # this looks like a stale scratch cell that would error at runtime —
    # confirm whether `df` exists elsewhere and delete this cell if not.
    df
    return
@app.cell
def _(mo):
    # Subsection header.
    _header = mo.md(r"""
## Top 3 Voices
""")
    _header
    return
@app.cell
def _(data, survey):
    # Collect the top-3 voices table, display it, and export it for the
    # distribution plot below.
    _top3_voices_lf = survey.get_top_3_voices(data)[0]
    top3_voices = _top3_voices_lf.collect()
    top3_voices
    return (top3_voices,)
@app.cell
def _():
    # Intentionally empty scratch cell (a commented-out debug print was
    # removed).
    return
@app.cell
def _(plot_top3_ranking_distribution, top3_voices):
    # Render the ranking-distribution chart for the top-3 voices
    # (plot helper is defined elsewhere in the project).
    _chart = plot_top3_ranking_distribution(top3_voices, x_label='Voice', width=1000)
    _chart
    return
@app.cell
def _(mo):
    # Subsection header.
    _header = mo.md(r"""
## SS Orange / Red
""")
    _header
    return
@app.cell
def _(data, survey):
    # Preview the orange/red side-by-side results. The unused choice map is
    # discarded rather than bound to the non-underscore name `choice_map`,
    # which needlessly occupied marimo's notebook-global namespace.
    _lf, _ = survey.get_ss_orange_red(data)
    _lf.collect()
    return
@app.cell(hide_code=True)
def _(mo):
    # Subsection header (code hidden in the rendered notebook).
    _header = mo.md(r"""
## Character Refine
""")
    _header
    return
@app.cell
def _(data, survey):
    # Keep the pre-refinement top-8 traits (lazy) for the comparison below.
    _result = survey.get_top_8_traits(data)
    traits_original = _result[0]
    traits_original.collect()
    return (traits_original,)
@app.cell
def _(data, survey):
    # Refined traits (lazy); collected here only for display.
    _result = survey.get_character_refine(data)
    traits_refined = _result[0]
    traits_refined.collect()
    return (traits_refined,)
@app.cell
def _(combine_exclusive_columns, traits_refined):
    # Collapse the mutually-exclusive refined-trait columns into one column.
    _refined_df = traits_refined.collect()
    traits_refined_comb = combine_exclusive_columns(
        _refined_df, target_col_name='Top_8_Traits_Refined'
    )
    traits_refined_comb
    return (traits_refined_comb,)
@app.cell
def _(traits_original, traits_refined_comb):
    # Join original and refined traits side by side on the respondent record
    # id for comparison.
    traits_comparison = traits_original.join(
        traits_refined_comb.lazy(),
        on='_recordId',
    )
    print(traits_comparison.collect().head())
    return
# Allow running the notebook as a plain script (`python 99_example_...py`)
# in addition to `marimo edit` / `marimo run`.
if __name__ == "__main__":
    app.run()