"""Marimo notebook: validation and analysis of a Qualtrics soft-launch export.

Loads the survey results CSV + QSF definition via `utils.QualtricsSurvey`,
runs basic data-quality checks (completeness, duration outliers), then walks
through each analysis section (demographics, trait rankings, voice scales,
semantic-scale questions, character refinement).
"""

import marimo

__generated_with = "0.19.2"
app = marimo.App(width="medium")


@app.cell
def _():
    import marimo as mo
    import polars as pl
    from pathlib import Path

    from utils import QualtricsSurvey, combine_exclusive_columns
    return QualtricsSurvey, combine_exclusive_columns, mo, pl


@app.cell
def _(mo):
    # Table of contents for the notebook's markdown headers.
    mo.outline()
    return


@app.cell
def _():
    # Soft-launch export paths: the labelled results CSV and the QSF survey
    # definition it was generated from.
    RESULTS_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase Brand Personality_Quant Round 1_January 21, 2026_Soft Launch_Labels.csv'
    QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'
    # RESULTS_FILE = 'data/exports/OneDrive_1_1-16-2026/JPMC_Chase Brand Personality_Quant Round 1_TestData_Labels.csv'
    return QSF_FILE, RESULTS_FILE


@app.cell
def _(QualtricsSurvey, QSF_FILE, RESULTS_FILE):
    # `load_data()` returns a polars LazyFrame; collect once here so the
    # cell output displays the materialized table.
    survey = QualtricsSurvey(RESULTS_FILE, QSF_FILE)
    data = survey.load_data()
    data.collect()
    return data, survey


@app.cell
def _(survey):
    # Question-ID -> description mapping parsed from the QSF.
    survey.qid_descr_map
    return


@app.cell
def _(mo):
    mo.md(r"""
    # Data Validation
    """)
    return


@app.cell
def _(data, mo, pl):
    def check_progress(data):
        """Report whether every response is complete (progress == 100).

        FIX: the original implementation only checked that `progress` had a
        single unique value, which would also "pass" if every response were
        stuck at e.g. 50. We now also verify that the single value is 100.
        """
        progress_values = data.collect().select(pl.col('progress').unique())
        # NOTE(review): compared via str() because the CSV column dtype
        # (int vs str) is not visible here — confirm against the export.
        if progress_values.shape[0] == 1 and str(progress_values.item()) == '100':
            return mo.md("""## ✅ All responses are complete (progress = 100) """)
        return mo.md("## ⚠️ There are incomplete responses (progress < 100) ⚠️")

    check_progress(data)
    return


@app.cell
def _(data, mo, pl):
    def duration_validation(data):
        """Flag responses whose duration lies more than 3 standard
        deviations from the mean (simple z-score outlier screen).

        Returns a markdown report; outliers are flagged but NOT removed.
        """
        duration_stats = data.select(
            pl.col('duration').mean().alias('mean_duration'),
            pl.col('duration').std().alias('std_duration'),
        ).collect()
        mean_duration = duration_stats['mean_duration'][0]
        std_duration = duration_stats['std_duration'][0]
        upper_outlier_threshold = mean_duration + 3 * std_duration
        lower_outlier_threshold = mean_duration - 3 * std_duration

        flagged = data.with_columns(
            (
                (pl.col('duration') > upper_outlier_threshold)
                | (pl.col('duration') < lower_outlier_threshold)
            ).alias('outlier_duration')
        )
        # Idiom fix: filter on the boolean column directly (was `== True`).
        outlier_data = flagged.filter(pl.col('outlier_duration')).collect()

        if outlier_data.shape[0] == 0:
            return mo.md("## ✅ No duration outliers detected")
        return mo.md(f"""
    ## ⚠️ Duration Outliers Detected ⚠️

    - Mean Duration: {mean_duration:.2f} seconds (approximately {mean_duration/60:.2f} minutes)
    - Standard Deviation of Duration: {std_duration:.2f} seconds
    - Upper Outlier Threshold (Mean + 3*Std): {upper_outlier_threshold:.2f} seconds
    - Lower Outlier Threshold (Mean - 3*Std): {lower_outlier_threshold:.2f} seconds
    - Number of Outlier Responses: {outlier_data.shape[0]}

    Outliers:

    {mo.ui.table(outlier_data)}

    **⚠️ NOTE: These have not been removed from the dataset ⚠️**
    """)

    duration_validation(data)
    return


@app.cell
def _(mo):
    mo.md(r"""
    # Data Analysis
    """)
    return


@app.cell
def _(mo):
    mo.md(r"""
    ## Demographics
    """)
    return


@app.cell
def _(data, survey):
    survey.get_demographics(data)[0].collect()
    return


@app.cell
def _(mo):
    mo.md(r"""
    ## Top 8 traits
    """)
    return


@app.cell
def _(data, survey):
    survey.get_top_8_traits(data)[0].collect()
    return


@app.cell
def _(mo):
    mo.md(r"""
    ## Top 3 traits
    """)
    return


@app.cell
def _(data, survey):
    survey.get_top_3_traits(data)[0].collect()
    return


@app.cell
def _(mo):
    mo.md(r"""
    ## Character Ranking
    """)
    return


@app.cell
def _(data, survey):
    survey.get_character_ranking(data)[0].collect()
    return


@app.cell
def _(mo):
    mo.md(r"""
    ## Voices 18 -> 8 -> 3
    """)
    return


@app.cell
def _(data, survey):
    survey.get_18_8_3(data)[0].collect()
    return


@app.cell
def _(mo):
    mo.md(r"""
    ## Voice Scales 1-10
    """)
    return


@app.cell
def _(data, survey):
    vscales = survey.get_voice_scale_1_10(data)[0].collect()
    print(vscales.head())
    return (vscales,)


@app.cell
def _(plot_average_scores_with_counts, vscales):
    # NOTE(review): `plot_average_scores_with_counts` is not defined in any
    # visible cell of this notebook — confirm it is defined elsewhere (e.g.
    # a cell not shown here or meant to be imported from `utils`).
    plot_average_scores_with_counts(vscales, x_label='Voice', width=1000)
    return


@app.cell
def _(mo):
    mo.md(r"""
    ## SS Green Blue
    """)
    return


@app.cell
def _(data, survey):
    # Underscore-prefixed names stay local to this cell in marimo.
    _lf, _choice_map = survey.get_ss_green_blue(data)
    # _lf.collect()
    print(_lf.collect().head())
    return


@app.cell
def _(df):
    # NOTE(review): `df` is not defined by any visible cell in this
    # notebook, so marimo will flag this cell as an error — confirm
    # whether it is leftover scratch work that should be deleted.
    df
    return


@app.cell
def _(mo):
    mo.md(r"""
    ## Top 3 Voices
    """)
    return


@app.cell
def _(data, survey):
    top3_voices = survey.get_top_3_voices(data)[0].collect()
    top3_voices
    return (top3_voices,)


@app.cell
def _():
    # print(top3_voices.head())
    return


@app.cell
def _(plot_top3_ranking_distribution, top3_voices):
    # NOTE(review): `plot_top3_ranking_distribution` is not defined in any
    # visible cell of this notebook — confirm where it comes from.
    plot_top3_ranking_distribution(top3_voices, x_label='Voice', width=1000)
    return


@app.cell
def _(mo):
    mo.md(r"""
    ## SS Orange / Red
    """)
    return


@app.cell
def _(data, survey):
    _lf, choice_map = survey.get_ss_orange_red(data)
    _d = _lf.collect()
    _d
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    ## Character Refine
    """)
    return


@app.cell
def _(data, survey):
    traits_original = survey.get_top_8_traits(data)[0]
    traits_original.collect()
    return (traits_original,)


@app.cell
def _(data, survey):
    traits_refined = survey.get_character_refine(data)[0]
    traits_refined.collect()
    return (traits_refined,)


@app.cell
def _(combine_exclusive_columns, traits_refined):
    # Collapse the mutually-exclusive refined-trait columns into one column.
    traits_refined_comb = combine_exclusive_columns(
        traits_refined.collect(), target_col_name='Top_8_Traits_Refined'
    )
    traits_refined_comb
    return (traits_refined_comb,)


@app.cell
def _(traits_original, traits_refined_comb):
    # Merge the two dataframes side by side for comparison.
    traits_comparison = traits_original.join(traits_refined_comb.lazy(), on='_recordId')
    print(traits_comparison.collect().head())
    return


if __name__ == "__main__":
    app.run()