rename example notebooks and finish ppt pipeline functions

2026-01-29 16:07:55 +01:00
parent 3ee25f9e33
commit 5f9e67a312
4 changed files with 241 additions and 64 deletions
--- a/99_example_ingest_qualtrics_export.py
+++ b/99_example_ingest_qualtrics_export.py
@@ -0,0 +1,324 @@
+import marimo
+
+__generated_with = "0.19.2"
+app = marimo.App(width="medium")
+
+
+@app.cell
+def _():
+    # Shared imports for the notebook; every name in the return tuple is made
+    # available to the other cells.
+    # NOTE(review): later cells take plot_average_scores_with_counts and
+    # plot_top3_ranking_distribution as inputs, but no cell in this notebook
+    # defines or imports them — confirm their module and import them here.
+    import marimo as mo
+    import polars as pl
+
+    from utils import JPMCSurvey, combine_exclusive_columns
+    return JPMCSurvey, combine_exclusive_columns, mo, pl
+
+
+@app.cell
+def _(mo):
+    # Render marimo's outline element as this cell's output.
+    mo.outline()
+    return
+
+
+@app.cell
+def _():
+    # Input locations: the results export (CSV) and the survey definition
+    # (QSF). Both are handed to JPMCSurvey in the next cell.
+    RESULTS_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase Brand Personality_Quant Round 1_January 21, 2026_Soft Launch_Labels.csv'
+    QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'
+    # Alternative results file (earlier test-data export), left disabled:
+    # RESULTS_FILE = 'data/exports/OneDrive_1_1-16-2026/JPMC_Chase Brand Personality_Quant Round 1_TestData_Labels.csv'
+    return QSF_FILE, RESULTS_FILE
+
+
+@app.cell
+def _(JPMCSurvey, QSF_FILE, RESULTS_FILE):
+    # Build the survey wrapper from the two input files and load the responses.
+    survey = JPMCSurvey(RESULTS_FILE, QSF_FILE)
+    data = survey.load_data()
+    # Collect only so the cell displays the frame; `data` itself is passed on
+    # uncollected (later cells call .collect() on it again).
+    data.collect()
+    return data, survey
+
+
+@app.cell
+def _(survey):
+    # Display the survey's qid_descr_map attribute.
+    # NOTE(review): presumably maps question IDs to descriptions — confirm in utils.
+    survey.qid_descr_map
+    return
+
+
+@app.cell
+def _():
+    # Placeholder for an abandoned scratch cell: its previous content was the
+    # incomplete expression `data.`, which is a syntax error and had therefore
+    # been stored as an unparsable cell.
+    return
+
+
+@app.cell
+def _(mo):
+    # Markdown heading that opens the data-validation section.
+    mo.md(r"""
+    # Data Validation
+    """)
+    return
+
+
+@app.cell
+def _(data, mo, pl):
+    # all progress is 100
+    def check_progress(data):
+        """Report whether every response has a progress value of exactly 100.
+
+        Args:
+            data: Lazy Polars frame of responses containing a 'progress' column.
+
+        Returns:
+            A Markdown element: the success banner when all responses are
+            complete, the warning banner otherwise.
+        """
+        # Select the single column before collecting instead of materialising
+        # the whole frame. The non-strict cast keeps the comparison valid
+        # whether progress arrives as a number or as text; values that cannot
+        # be parsed become null.
+        progress = (
+            data.select(pl.col('progress').cast(pl.Float64, strict=False))
+            .collect()
+            .get_column('progress')
+        )
+
+        # One distinct value is not sufficient evidence of completeness: the
+        # value must actually be 100, and missing values or an empty dataset
+        # must not be reported as complete.
+        all_complete = (
+            progress.len() > 0
+            and progress.null_count() == 0
+            and bool((progress == 100).all())
+        )
+        if all_complete:
+            return mo.md("""## ✅ All responses are complete (progress = 100) """)
+
+        return mo.md("## ⚠️ There are incomplete responses (progress < 100) ⚠️")
+
+    check_progress(data)
+    return
+
+
+@app.cell
+def _(data, mo, pl):
+
+    def duration_validation(data):
+        """Flag responses whose duration lies more than 3 standard deviations from the mean.
+
+        Args:
+            data: Lazy Polars frame of responses containing a 'duration' column.
+
+        Returns:
+            A Markdown element: a success banner when no outliers exist, a
+            warning when the statistics cannot be computed, or a report with
+            the thresholds and a table of the outlier rows. The data itself is
+            never modified.
+        """
+        # Identify any outliers in duration
+        duration_stats = data.select(
+            pl.col('duration').mean().alias('mean_duration'),
+            pl.col('duration').std().alias('std_duration')
+        ).collect()
+        mean_duration = duration_stats['mean_duration'][0]
+        std_duration = duration_stats['std_duration'][0]
+
+        # Mean/std come back as None when there are too few non-null durations
+        # (the sample standard deviation needs at least two); the threshold
+        # arithmetic below would then raise a TypeError.
+        if mean_duration is None or std_duration is None:
+            return mo.md("## ⚠️ Not enough duration data to check for outliers ⚠️")
+
+        upper_outlier_threshold = mean_duration + 3 * std_duration
+        lower_outlier_threshold = mean_duration - 3 * std_duration
+
+        _d = data.with_columns(
+            ((pl.col('duration') > upper_outlier_threshold) | (pl.col('duration') < lower_outlier_threshold)).alias('outlier_duration')
+        )
+
+        # Keep only the rows flagged as outliers (the flag is already boolean)
+        outlier_data = _d.filter(pl.col('outlier_duration')).collect()
+
+        if outlier_data.is_empty():
+            return mo.md("## ✅ No duration outliers detected")
+
+        return mo.md(f"""
+        ## ⚠️ Duration Outliers Detected ⚠️
+        - Mean Duration: {mean_duration:.2f} seconds (approximately {mean_duration/60:.2f} minutes)
+        - Standard Deviation of Duration: {std_duration:.2f} seconds
+        - Upper Outlier Threshold (Mean + 3*Std): {upper_outlier_threshold:.2f} seconds
+        - Lower Outlier Threshold (Mean - 3*Std): {lower_outlier_threshold:.2f} seconds
+        - Number of Outlier Responses: {outlier_data.shape[0]}
+
+        Outliers:
+
+        {mo.ui.table(outlier_data)}
+
+
+        **⚠️ NOTE: These have not been removed from the dataset ⚠️**
+
+        """)
+
+    duration_validation(data)
+    return
+
+
+@app.cell
+def _(mo):
+    # Markdown heading that opens the data-analysis section.
+    mo.md(r"""
+    # Data Analysis
+    """)
+    return
+
+
+@app.cell
+def _(mo):
+    # Markdown sub-heading for the demographics output below.
+    mo.md(r"""
+    ## Demographics
+    """)
+    return
+
+
+@app.cell
+def _(data, survey):
+    # Display the first element returned by get_demographics, collected.
+    survey.get_demographics(data)[0].collect()
+    return
+
+
+@app.cell
+def _(mo):
+    # Markdown sub-heading for the top-8 traits output below.
+    mo.md(r"""
+    ## Top 8 traits
+    """)
+    return
+
+
+@app.cell
+def _(data, survey):
+    # Display the first element returned by get_top_8_traits, collected.
+    survey.get_top_8_traits(data)[0].collect()
+    return
+
+
+@app.cell
+def _(mo):
+    # Markdown sub-heading for the top-3 traits output below.
+    mo.md(r"""
+    ## Top 3 traits
+    """)
+    return
+
+
+@app.cell
+def _(data, survey):
+    # Display the first element returned by get_top_3_traits, collected.
+    survey.get_top_3_traits(data)[0].collect()
+    return
+
+
+@app.cell
+def _(mo):
+    # Markdown sub-heading for the character-ranking output below.
+    mo.md(r"""
+    ## Character Ranking
+    """)
+    return
+
+
+@app.cell
+def _(data, survey):
+    # Display the first element returned by get_character_ranking, collected.
+    survey.get_character_ranking(data)[0].collect()
+    return
+
+
+@app.cell
+def _(mo):
+    # Markdown sub-heading for the voices 18 -> 8 -> 3 output below.
+    mo.md(r"""
+    ## Voices 18 -> 8 -> 3
+    """)
+    return
+
+
+@app.cell
+def _(data, survey):
+    # Display the first element returned by get_18_8_3, collected.
+    survey.get_18_8_3(data)[0].collect()
+    return
+
+
+@app.cell
+def _(mo):
+    # Markdown sub-heading for the voice-scale output below.
+    mo.md(r"""
+    ## Voice Scales 1-10
+    """)
+    return
+
+
+@app.cell
+def _(data, survey):
+    # Collect the first element returned by get_voice_scale_1_10, display it,
+    # and expose it as `vscales` for the plotting cell.
+    vscales = survey.get_voice_scale_1_10(data)[0].collect()
+    vscales
+    return (vscales,)
+
+
+@app.cell
+def _(plot_average_scores_with_counts, vscales):
+    # Plot the voice-scale frame.
+    # NOTE(review): plot_average_scores_with_counts is a cell input, but no cell
+    # in this notebook defines or imports it — confirm where it comes from.
+    plot_average_scores_with_counts(vscales, x_label='Voice', width=1000)
+    return
+
+
+@app.cell
+def _(mo):
+    # Markdown sub-heading for the SS Green Blue output below.
+    mo.md(r"""
+    ## SS Green Blue
+    """)
+    return
+
+
+@app.cell
+def _(data, survey):
+    # get_ss_green_blue returns a pair; only the frame is used here, and both
+    # names are underscore-prefixed so they stay local to this cell.
+    _lf, _choice_map = survey.get_ss_green_blue(data)
+    # _lf.collect()
+    # Print the first rows to the console instead of rendering the full frame.
+    print(_lf.collect().head())
+    return
+
+
+@app.cell
+def _(df):
+
+    # NOTE(review): `df` is a cell input, but no cell in this notebook defines
+    # it — this looks like leftover scratch; confirm and remove or wire it up.
+    df
+    return
+
+
+@app.cell
+def _(mo):
+    # Markdown sub-heading for the top-3 voices output below.
+    mo.md(r"""
+    ## Top 3 Voices
+    """)
+    return
+
+
+@app.cell
+def _(data, survey):
+    # Collect the first element returned by get_top_3_voices, display it, and
+    # expose it as `top3_voices` for the plotting cell.
+    top3_voices = survey.get_top_3_voices(data)[0].collect()
+    top3_voices
+    return (top3_voices,)
+
+
+@app.cell
+def _():
+
+    # Scratch cell: the debug print below is disabled, so the cell does nothing.
+    # print(top3_voices.head())
+    return
+
+
+@app.cell
+def _(plot_top3_ranking_distribution, top3_voices):
+    # Plot the top-3 voices frame.
+    # NOTE(review): plot_top3_ranking_distribution is a cell input, but no cell
+    # in this notebook defines or imports it — confirm where it comes from.
+    plot_top3_ranking_distribution(top3_voices, x_label='Voice', width=1000)
+    return
+
+
+@app.cell
+def _(mo):
+    # Markdown sub-heading for the SS Orange / Red output below.
+    mo.md(r"""
+    ## SS Orange / Red
+    """)
+    return
+
+
+@app.cell
+def _(data, survey):
+    # Only the frame is displayed. The choice map is unused, so it is bound to
+    # an underscore-prefixed (cell-local) name, matching the SS Green Blue
+    # cell, instead of becoming a notebook-level definition that this cell
+    # does not return.
+    _lf, _choice_map = survey.get_ss_orange_red(data)
+    _d = _lf.collect()
+    _d
+    return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+    # Markdown sub-heading for the character-refine comparison below.
+    mo.md(r"""
+    ## Character Refine
+    """)
+    return
+
+
+@app.cell
+def _(data, survey):
+    # Keep the uncollected top-8 traits frame for the later join; collect here
+    # only to display it.
+    traits_original = survey.get_top_8_traits(data)[0]
+    traits_original.collect()
+    return (traits_original,)
+
+
+@app.cell
+def _(data, survey):
+    # Keep the uncollected refined-traits frame for the next cell; collect here
+    # only to display it.
+    traits_refined = survey.get_character_refine(data)[0]
+
+    traits_refined.collect()
+    return (traits_refined,)
+
+
+@app.cell
+def _(combine_exclusive_columns, traits_refined):
+    # Pass the collected refined traits to combine_exclusive_columns with the
+    # target column name 'Top_8_Traits_Refined', then display the result.
+    # NOTE(review): presumably merges mutually exclusive columns into one — confirm in utils.
+    traits_refined_comb = combine_exclusive_columns(traits_refined.collect(), target_col_name='Top_8_Traits_Refined')
+    traits_refined_comb
+    return (traits_refined_comb,)
+
+
+@app.cell
+def _(traits_original, traits_refined_comb):
+    # merge the two dataframes side by side for comparison
+    # The combined frame is converted back to lazy so it can be joined with the
+    # uncollected original on '_recordId'; only the first rows are printed.
+    traits_comparison = traits_original.join(traits_refined_comb.lazy(), on='_recordId')
+    print(traits_comparison.collect().head())
+    return
+
+
+# Run the notebook's cells when the file is executed as a script.
+if __name__ == "__main__":
+    app.run()