rename example notebooks and finish ppt pipeline functions
This commit is contained in:
324
99_example_ingest_qualtrics_export.py
Normal file
324
99_example_ingest_qualtrics_export.py
Normal file
@@ -0,0 +1,324 @@
|
||||
import marimo
|
||||
|
||||
# Generator version stamp (presumably written by marimo when it saves the file).
__generated_with = "0.19.2"
# Notebook application object; each cell below registers on it via @app.cell.
app = marimo.App(width="medium")
|
||||
|
||||
|
||||
@app.cell
def _():
    # Shared imports. Only the names in the return statement are handed to
    # other cells as parameters.
    import marimo as mo
    import polars as pl
    from pathlib import Path  # NOTE(review): not used or returned by this cell

    from utils import JPMCSurvey, combine_exclusive_columns
    # NOTE(review): later cells request `plot_average_scores_with_counts` and
    # `plot_top3_ranking_distribution`, which no visible cell provides —
    # confirm whether they should be imported and returned here.
    return JPMCSurvey, combine_exclusive_columns, mo, pl
|
||||
|
||||
|
||||
@app.cell
def _(mo):
    # Render marimo's outline element as this cell's output.
    mo.outline()
    return
|
||||
|
||||
|
||||
@app.cell
def _():
    # Input locations: the CSV results export and the matching .qsf file,
    # both from the same "Soft Launch Data" export folder.
    RESULTS_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase Brand Personality_Quant Round 1_January 21, 2026_Soft Launch_Labels.csv'
    QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'
    # Alternative results export (currently disabled):
    # RESULTS_FILE = 'data/exports/OneDrive_1_1-16-2026/JPMC_Chase Brand Personality_Quant Round 1_TestData_Labels.csv'
    return QSF_FILE, RESULTS_FILE
|
||||
|
||||
|
||||
@app.cell
def _(JPMCSurvey, QSF_FILE, RESULTS_FILE):
    # Build the survey helper from the two input files and load the responses.
    survey = JPMCSurvey(RESULTS_FILE, QSF_FILE)
    data = survey.load_data()
    # `data` is collected here only for display; the uncollected object is
    # what gets returned to the other cells.
    data.collect()
    return data, survey
|
||||
|
||||
|
||||
@app.cell
def _(survey):
    # Display the survey's `qid_descr_map` attribute for reference.
    survey.qid_descr_map
    return
|
||||
|
||||
|
||||
# NOTE(review): this cell is registered as unparsable; its source is the
# incomplete expression `data.` — complete it or delete the cell.
app._unparsable_cell(
    r"""
    data.
    """,
    name="_"
)
|
||||
|
||||
|
||||
@app.cell
def _(mo):
    # Section heading for the validation checks.
    mo.md(r"""
    # Data Validation
    """)
    return
|
||||
|
||||
|
||||
@app.cell
def _(data, mo, pl):
    def check_progress(data):
        """Report whether every response has a progress value of 100.

        Args:
            data: Lazy frame of responses containing a ``progress`` column.

        Returns:
            A ``mo.md`` element: a success note when the only distinct
            progress value is 100, otherwise a warning.
        """
        # Reduce to the distinct values before collecting so the whole frame
        # is never materialised. The non-strict cast turns anything
        # non-numeric into null, which then counts as "not complete".
        distinct_progress = (
            data.select(
                pl.col('progress').cast(pl.Float64, strict=False).unique()
            )
            .collect()['progress']
            .to_list()
        )

        # A single distinct value is not enough on its own: it must be 100.
        if distinct_progress == [100]:
            return mo.md("""## ✅ All responses are complete (progress = 100) """)

        return mo.md("## ⚠️ There are incomplete responses (progress < 100) ⚠️")

    check_progress(data)
    return
|
||||
|
||||
|
||||
@app.cell
def _(data, mo, pl):
    def duration_validation(data):
        """Flag responses whose duration is more than 3 standard deviations from the mean.

        Args:
            data: Lazy frame of responses with a numeric ``duration`` column
                (reported as seconds in the text below).

        Returns:
            A ``mo.md`` element: a success note when nothing is flagged,
            otherwise summary statistics plus a table of the flagged rows.
            Flagged rows are reported only; the dataset is not modified.
        """
        duration_stats = data.select(
            pl.col('duration').mean().alias('mean_duration'),
            pl.col('duration').std().alias('std_duration'),
        ).collect()
        mean_duration = duration_stats['mean_duration'][0]
        std_duration = duration_stats['std_duration'][0]

        # With fewer than two durations the mean and/or standard deviation is
        # null. The threshold arithmetic below would raise TypeError, and no
        # outlier can be defined, so report a clean result instead.
        if mean_duration is None or std_duration is None:
            return mo.md("## ✅ No duration outliers detected")

        upper_outlier_threshold = mean_duration + 3 * std_duration
        lower_outlier_threshold = mean_duration - 3 * std_duration

        is_outlier = (
            (pl.col('duration') > upper_outlier_threshold)
            | (pl.col('duration') < lower_outlier_threshold)
        )

        # Keep the flag column so it appears in the displayed table.
        outlier_data = (
            data.with_columns(is_outlier.alias('outlier_duration'))
            .filter(pl.col('outlier_duration'))
            .collect()
        )

        if outlier_data.height == 0:
            return mo.md("## ✅ No duration outliers detected")

        return mo.md(f"""
        ## ⚠️ Duration Outliers Detected ⚠️
        - Mean Duration: {mean_duration:.2f} seconds (approximately {mean_duration/60:.2f} minutes)
        - Standard Deviation of Duration: {std_duration:.2f} seconds
        - Upper Outlier Threshold (Mean + 3*Std): {upper_outlier_threshold:.2f} seconds
        - Lower Outlier Threshold (Mean - 3*Std): {lower_outlier_threshold:.2f} seconds
        - Number of Outlier Responses: {outlier_data.shape[0]}

        Outliers:

        {mo.ui.table(outlier_data)}


        **⚠️ NOTE: These have not been removed from the dataset ⚠️**

        """)

    duration_validation(data)
    return
|
||||
|
||||
|
||||
@app.cell
def _(mo):
    # Section heading for the analysis part of the notebook.
    mo.md(r"""
    # Data Analysis
    """)
    return
|
||||
|
||||
|
||||
@app.cell
def _(mo):
    # Sub-heading for the demographics table that follows.
    mo.md(r"""
    ## Demographics
    """)
    return
|
||||
|
||||
|
||||
@app.cell
def _(data, survey):
    # Show the first element of the demographics result, collected for display.
    _demographics_lf = survey.get_demographics(data)[0]
    _demographics_lf.collect()
    return
|
||||
|
||||
|
||||
@app.cell
def _(mo):
    # Sub-heading for the top-8 traits table that follows.
    mo.md(r"""
    ## Top 8 traits
    """)
    return
|
||||
|
||||
|
||||
@app.cell
def _(data, survey):
    # Show the first element of the top-8 traits result, collected for display.
    _top8_lf = survey.get_top_8_traits(data)[0]
    _top8_lf.collect()
    return
|
||||
|
||||
|
||||
@app.cell
def _(mo):
    # Sub-heading for the top-3 traits table that follows.
    mo.md(r"""
    ## Top 3 traits
    """)
    return
|
||||
|
||||
|
||||
@app.cell
def _(data, survey):
    # Show the first element of the top-3 traits result, collected for display.
    _top3_lf = survey.get_top_3_traits(data)[0]
    _top3_lf.collect()
    return
|
||||
|
||||
|
||||
@app.cell
def _(mo):
    # Sub-heading for the character ranking table that follows.
    mo.md(r"""
    ## Character Ranking
    """)
    return
|
||||
|
||||
|
||||
@app.cell
def _(data, survey):
    # Show the first element of the character ranking result, collected for display.
    _ranking_lf = survey.get_character_ranking(data)[0]
    _ranking_lf.collect()
    return
|
||||
|
||||
|
||||
@app.cell
def _(mo):
    # Sub-heading for the 18 -> 8 -> 3 voices table that follows.
    mo.md(r"""
    ## Voices 18 -> 8 -> 3
    """)
    return
|
||||
|
||||
|
||||
@app.cell
def _(data, survey):
    # Show the first element of the 18/8/3 result, collected for display.
    _voices_lf = survey.get_18_8_3(data)[0]
    _voices_lf.collect()
    return
|
||||
|
||||
|
||||
@app.cell
def _(mo):
    # Sub-heading for the voice scale table and chart that follow.
    mo.md(r"""
    ## Voice Scales 1-10
    """)
    return
|
||||
|
||||
|
||||
@app.cell
def _(data, survey):
    # Collect the voice scale table once; it is both displayed here and
    # exported for the plotting cell.
    _vscales_lf = survey.get_voice_scale_1_10(data)[0]
    vscales = _vscales_lf.collect()
    vscales
    return (vscales,)
|
||||
|
||||
|
||||
@app.cell
def _(plot_average_scores_with_counts, vscales):
    # Chart the collected `vscales` table.
    # NOTE(review): no visible cell defines or returns
    # `plot_average_scores_with_counts`, so this parameter cannot be supplied —
    # confirm where it should be imported.
    plot_average_scores_with_counts(vscales, x_label='Voice', width=1000)
    return
|
||||
|
||||
|
||||
@app.cell
def _(mo):
    # Sub-heading for the SS green/blue preview that follows.
    mo.md(r"""
    ## SS Green Blue
    """)
    return
|
||||
|
||||
|
||||
@app.cell
def _(data, survey):
    # Print the first rows of the green/blue frame; the choice map is
    # unpacked but not used in this cell.
    _lf, _choice_map = survey.get_ss_green_blue(data)
    # _lf.collect()
    _preview = _lf.collect().head()
    print(_preview)
    return
|
||||
|
||||
|
||||
@app.cell
def _(df):
    # NOTE(review): `df` is requested as a cell input, but no visible cell
    # defines it — looks like a leftover; confirm and remove or wire it up.
    df
    return
|
||||
|
||||
|
||||
@app.cell
def _(mo):
    # Sub-heading for the top-3 voices table and chart that follow.
    mo.md(r"""
    ## Top 3 Voices
    """)
    return
|
||||
|
||||
|
||||
@app.cell
def _(data, survey):
    # Collect the top-3 voices table once; it is both displayed here and
    # exported for the plotting cell.
    _top3_voices_lf = survey.get_top_3_voices(data)[0]
    top3_voices = _top3_voices_lf.collect()
    top3_voices
    return (top3_voices,)
|
||||
|
||||
|
||||
@app.cell
def _():
    # Placeholder cell: only a disabled debug print remains.
    # print(top3_voices.head())
    return
|
||||
|
||||
|
||||
@app.cell
def _(plot_top3_ranking_distribution, top3_voices):
    # Chart the collected `top3_voices` table.
    # NOTE(review): no visible cell defines or returns
    # `plot_top3_ranking_distribution`, so this parameter cannot be supplied —
    # confirm where it should be imported.
    plot_top3_ranking_distribution(top3_voices, x_label='Voice', width=1000)
    return
|
||||
|
||||
|
||||
@app.cell
def _(mo):
    # Sub-heading for the SS orange/red table that follows.
    mo.md(r"""
    ## SS Orange / Red
    """)
    return
|
||||
|
||||
|
||||
@app.cell
def _(data, survey):
    # Display the collected orange/red frame.
    # The choice map is unused here, so it is underscore-prefixed to keep it
    # cell-local, matching the green/blue cell. Without the prefix it would
    # become a notebook-wide definition that other cells could clash with.
    _lf, _choice_map = survey.get_ss_orange_red(data)
    _d = _lf.collect()
    _d
    return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
def _(mo):
    # Sub-heading for the original-vs-refined trait comparison that follows.
    mo.md(r"""
    ## Character Refine
    """)
    return
|
||||
|
||||
|
||||
@app.cell
def _(data, survey):
    # Export the uncollected top-8 traits frame for the comparison join and
    # show a collected copy as this cell's output.
    traits_original = survey.get_top_8_traits(data)[0]
    _original_preview = traits_original.collect()
    _original_preview
    return (traits_original,)
|
||||
|
||||
|
||||
@app.cell
def _(data, survey):
    # Export the uncollected refined-traits frame and show a collected copy
    # as this cell's output.
    traits_refined = survey.get_character_refine(data)[0]
    _refined_preview = traits_refined.collect()
    _refined_preview
    return (traits_refined,)
|
||||
|
||||
|
||||
@app.cell
def _(combine_exclusive_columns, traits_refined):
    # Collect the refined traits, then pass them to the combining helper with
    # 'Top_8_Traits_Refined' as the target column name.
    _refined_df = traits_refined.collect()
    traits_refined_comb = combine_exclusive_columns(
        _refined_df,
        target_col_name='Top_8_Traits_Refined',
    )
    traits_refined_comb
    return (traits_refined_comb,)
|
||||
|
||||
|
||||
@app.cell
def _(traits_original, traits_refined_comb):
    # Join original and refined traits on the record id so they can be
    # compared side by side, then print the first rows.
    _refined_lf = traits_refined_comb.lazy()
    traits_comparison = traits_original.join(_refined_lf, on='_recordId')
    _comparison_head = traits_comparison.collect().head()
    print(_comparison_head)
    return
|
||||
|
||||
|
||||
# Run the notebook app when this file is executed directly as a script.
if __name__ == "__main__":
    app.run()
|
||||
Reference in New Issue
Block a user