# JPMC-quant/99_example_ingest_qualtrics_export.py

import marimo

# Version of marimo that last wrote this notebook file.
__generated_with = "0.19.2"
# Notebook application object; each cell below registers itself on it via @app.cell.
app = marimo.App(width="medium")


@app.cell
def _():
    # Shared imports for the notebook; the names in the return tuple are the
    # ones other cells receive as parameters.
    import marimo as mo
    import polars as pl
    # NOTE(review): Path is not referenced by any cell in this file.
    from pathlib import Path

    # Project-local helpers used by later cells.
    from utils import JPMCSurvey, combine_exclusive_columns
    return JPMCSurvey, combine_exclusive_columns, mo, pl


@app.cell
def _(mo):
    # Show marimo's outline element for this notebook.
    mo.outline()
    return


@app.cell
def _():
    # Input files for this run: the survey results CSV and the survey
    # definition (.qsf). Both paths are relative, so they depend on the
    # working directory the notebook is started from.
    RESULTS_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase Brand Personality_Quant Round 1_January 21, 2026_Soft Launch_Labels.csv'
    QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'
    # Alternative results file (test-data export), kept for switching inputs by hand.
    # RESULTS_FILE = 'data/exports/OneDrive_1_1-16-2026/JPMC_Chase Brand Personality_Quant Round 1_TestData_Labels.csv'
    return QSF_FILE, RESULTS_FILE


@app.cell
def _(JPMCSurvey, QSF_FILE, RESULTS_FILE):
    # Build the survey helper from the results CSV and the QSF definition.
    survey = JPMCSurvey(RESULTS_FILE, QSF_FILE)
    # `data` stays lazy: later cells call .collect() on it themselves.
    data = survey.load_data()
    # Collected here only so the loaded responses are shown as this cell's output.
    data.collect()
    return data, survey


@app.cell
def _(survey):
    # Display the survey's qid_descr_map attribute
    # (presumably a question-ID -> description lookup; verify in utils).
    survey.qid_descr_map
    return


# NOTE(review): marimo saved this cell as raw text because its body (`data.`)
# is not valid Python; it looks like an unfinished attribute access.
# Complete the expression or delete the cell.
app._unparsable_cell(
    r"""
    data.
    """,
    name="_"
)


@app.cell
def _(mo):
    # Section heading: the progress and duration checks come next.
    mo.md(r"""
    # Data Validation
    """)
    return


@app.cell
def _(data, mo, pl):
    def check_progress(data):
        """Report whether every response has progress equal to 100.

        Args:
            data: Lazy frame of responses with a ``progress`` column.

        Returns:
            A markdown element: a success banner when every row has
            ``progress == 100``, otherwise a warning banner. A frame with no
            rows is reported as complete (nothing is incomplete).
        """
        # Compare against 100 explicitly. Checking only that the column holds a
        # single distinct value would also pass when every row is, say, 50.
        # The non-strict cast keeps the comparison valid if the column was read
        # as text; values that cannot be parsed become null.
        progress = pl.col('progress').cast(pl.Float64, strict=False)

        # Nulls count as incomplete. Only this one boolean is materialised
        # instead of collecting the whole frame.
        all_complete = (
            data.select((progress == 100).fill_null(False).all())
            .collect()
            .item()
        )

        if all_complete:
            return mo.md("""## ✅ All responses are complete (progress = 100) """)

        return mo.md("## ⚠️ There are incomplete responses (progress < 100) ⚠️")

    check_progress(data)
    return


@app.cell
def _(data, mo, pl):

    def duration_validation(data):
        """Flag responses whose duration is more than 3 standard deviations from the mean.

        Args:
            data: Lazy frame of responses with a numeric ``duration`` column.

        Returns:
            A markdown element: a success banner when no outliers exist, a
            warning when there are too few durations to compute the
            statistics, or a report with the thresholds and a table of the
            outlier rows. Outliers are only reported, never removed.
        """
        duration_stats = data.select(
            pl.col('duration').mean().alias('mean_duration'),
            pl.col('duration').std().alias('std_duration'),
        ).collect()
        mean_duration = duration_stats['mean_duration'][0]
        std_duration = duration_stats['std_duration'][0]

        # polars returns null for the mean of an empty column and for the
        # sample standard deviation of fewer than two values. Without this
        # guard the threshold arithmetic below raises TypeError on None.
        if mean_duration is None or std_duration is None:
            return mo.md("## ⚠️ Not enough responses to check for duration outliers ⚠️")

        upper_outlier_threshold = mean_duration + 3 * std_duration
        lower_outlier_threshold = mean_duration - 3 * std_duration

        is_outlier = (
            (pl.col('duration') > upper_outlier_threshold)
            | (pl.col('duration') < lower_outlier_threshold)
        )

        # Keep the flag column so the table shows why each row was selected.
        # Rows with a null duration yield a null flag and are dropped by filter.
        outlier_data = (
            data.with_columns(is_outlier.alias('outlier_duration'))
            .filter(pl.col('outlier_duration'))
            .collect()
        )

        if outlier_data.is_empty():
            return mo.md("## ✅ No duration outliers detected")

        return mo.md(f"""
        ## ⚠️ Duration Outliers Detected ⚠️
        - Mean Duration: {mean_duration:.2f} seconds (approximately {mean_duration/60:.2f} minutes)
        - Standard Deviation of Duration: {std_duration:.2f} seconds
        - Upper Outlier Threshold (Mean + 3*Std): {upper_outlier_threshold:.2f} seconds
        - Lower Outlier Threshold (Mean - 3*Std): {lower_outlier_threshold:.2f} seconds
        - Number of Outlier Responses: {outlier_data.shape[0]}

        Outliers:

        {mo.ui.table(outlier_data)}


        **⚠️ NOTE: These have not been removed from the dataset ⚠️**

        """)

    duration_validation(data)
    return


@app.cell
def _(mo):
    # Section heading: the per-question result cells come next.
    mo.md(r"""
    # Data Analysis
    """)
    return


@app.cell
def _(mo):
    # Subsection heading for the demographics table.
    mo.md(r"""
    ## Demographics
    """)
    return


@app.cell
def _(data, survey):
    # Take the first element of the demographics result and materialise it
    # so it is shown as this cell's output.
    _demographics = survey.get_demographics(data)[0]
    _demographics.collect()
    return


@app.cell
def _(mo):
    # Subsection heading for the top-8 traits table.
    mo.md(r"""
    ## Top 8 traits
    """)
    return


@app.cell
def _(data, survey):
    # Take the first element of the top-8 traits result and materialise it
    # so it is shown as this cell's output.
    _top8_traits = survey.get_top_8_traits(data)[0]
    _top8_traits.collect()
    return


@app.cell
def _(mo):
    # Subsection heading for the top-3 traits table.
    mo.md(r"""
    ## Top 3 traits
    """)
    return


@app.cell
def _(data, survey):
    # Take the first element of the top-3 traits result and materialise it
    # so it is shown as this cell's output.
    _top3_traits = survey.get_top_3_traits(data)[0]
    _top3_traits.collect()
    return


@app.cell
def _(mo):
    # Subsection heading for the character ranking table.
    mo.md(r"""
    ## Character Ranking
    """)
    return


@app.cell
def _(data, survey):
    # Take the first element of the character ranking result and materialise
    # it so it is shown as this cell's output.
    _character_ranking = survey.get_character_ranking(data)[0]
    _character_ranking.collect()
    return


@app.cell
def _(mo):
    # Subsection heading for the 18 -> 8 -> 3 voice selection table.
    mo.md(r"""
    ## Voices 18 -> 8 -> 3
    """)
    return


@app.cell
def _(data, survey):
    # Take the first element of the 18/8/3 result and materialise it so it is
    # shown as this cell's output.
    _voices_18_8_3 = survey.get_18_8_3(data)[0]
    _voices_18_8_3.collect()
    return


@app.cell
def _(mo):
    # Subsection heading for the 1-10 voice scale results.
    mo.md(r"""
    ## Voice Scales 1-10
    """)
    return


@app.cell
def _(data, survey):
    # Materialise the voice-scale frame once: it is displayed here and
    # exported as `vscales` for the plotting cell.
    _vscales_lazy = survey.get_voice_scale_1_10(data)[0]
    vscales = _vscales_lazy.collect()
    vscales
    return (vscales,)


@app.cell
def _(plot_average_scores_with_counts, vscales):
    # Chart the collected voice-scale frame (`vscales`).
    # NOTE(review): no cell in this notebook defines or imports
    # plot_average_scores_with_counts; confirm where it should come from and
    # add the import, otherwise this cell cannot run.
    plot_average_scores_with_counts(vscales, x_label='Voice', width=1000)
    return


@app.cell
def _(mo):
    # Subsection heading for the SS green/blue results.
    mo.md(r"""
    ## SS Green Blue
    """)
    return


@app.cell
def _(data, survey):
    # The accessor returns a pair; only the frame is used here, the choice
    # map is unpacked and ignored.
    _lf, _choice_map = survey.get_ss_green_blue(data)
    # Print a short preview to stdout rather than displaying the full frame.
    _preview = _lf.collect().head()
    print(_preview)
    return


@app.cell
def _(df):

    # NOTE(review): `df` is not defined by any cell in this notebook, so this
    # cell cannot run as written. Presumably a leftover; rebind it to the
    # intended frame or delete the cell.
    df
    return


@app.cell
def _(mo):
    # Subsection heading for the top-3 voices results.
    mo.md(r"""
    ## Top 3 Voices
    """)
    return


@app.cell
def _(data, survey):
    # Materialise the top-3 voices frame once: it is displayed here and
    # exported as `top3_voices` for the plotting cell.
    _top3_voices_lazy = survey.get_top_3_voices(data)[0]
    top3_voices = _top3_voices_lazy.collect()
    top3_voices
    return (top3_voices,)


@app.cell
def _():

    # Scratch cell: only a disabled debug print remains, so it does nothing.
    # print(top3_voices.head())
    return


@app.cell
def _(plot_top3_ranking_distribution, top3_voices):
    # Chart the collected top-3 voices frame (`top3_voices`).
    # NOTE(review): no cell in this notebook defines or imports
    # plot_top3_ranking_distribution; confirm where it should come from and
    # add the import, otherwise this cell cannot run.
    plot_top3_ranking_distribution(top3_voices, x_label='Voice', width=1000)
    return


@app.cell
def _(mo):
    # Subsection heading for the SS orange/red results.
    mo.md(r"""
    ## SS Orange / Red
    """)
    return


@app.cell
def _(data, survey):
    # The accessor returns a pair; only the frame is used here. The choice map
    # is bound to an underscore-prefixed name so it stays private to this cell
    # (same as the green/blue cell) instead of becoming an unused
    # notebook-wide variable.
    _lf, _choice_map = survey.get_ss_orange_red(data)
    # Materialise the frame so it is shown as this cell's output.
    _lf.collect()
    return


@app.cell(hide_code=True)
def _(mo):
    # Subsection heading for the original-vs-refined trait comparison cells.
    mo.md(r"""
    ## Character Refine
    """)
    return


@app.cell
def _(data, survey):
    # Keep the original top-8 traits frame lazy: the comparison cell joins on
    # it later. It is collected here only for display.
    _top8_result = survey.get_top_8_traits(data)
    traits_original = _top8_result[0]
    traits_original.collect()
    return (traits_original,)


@app.cell
def _(data, survey):
    # Keep the refined traits frame lazy for the next cell. It is collected
    # here only for display.
    _refine_result = survey.get_character_refine(data)
    traits_refined = _refine_result[0]
    traits_refined.collect()
    return (traits_refined,)


@app.cell
def _(combine_exclusive_columns, traits_refined):
    # Materialise the refined traits, then pass them to the project helper,
    # which is asked to produce a 'Top_8_Traits_Refined' column.
    _refined_df = traits_refined.collect()
    traits_refined_comb = combine_exclusive_columns(
        _refined_df,
        target_col_name='Top_8_Traits_Refined',
    )
    traits_refined_comb
    return (traits_refined_comb,)


@app.cell
def _(traits_original, traits_refined_comb):
    # Put original and refined traits side by side, matched on the record id.
    # The combined frame is converted back to lazy so it can join the lazy
    # original frame.
    _refined_lazy = traits_refined_comb.lazy()
    traits_comparison = traits_original.join(_refined_lazy, on='_recordId')
    # Print a short preview to stdout.
    _preview = traits_comparison.collect().head()
    print(_preview)
    return


# Run the notebook as a script when this file is executed directly.
if __name__ == "__main__":
    app.run()