657 lines
14 KiB
Python
657 lines
14 KiB
Python
import marimo

# Version of marimo that generated this notebook file.
__generated_with = "0.19.7"

# Full-width layout so wide polars frames and charts render without clipping.
app = marimo.App(width="full")


# Setup cell: everything imported here is in scope for every cell below.
with app.setup:
    import marimo as mo
    import polars as pl
    from pathlib import Path

    # Project-local helpers: response validation checks...
    from validation import check_progress, duration_validation, check_straight_liners
    # ...survey loading/plotting wrapper and ranking-score helpers...
    from utils import QualtricsSurvey, combine_exclusive_columns, calculate_weighted_ranking_scores
    import utils

    # ...and the predefined speaking-style definitions.
    from speaking_styles import SPEAKING_STYLES
|
|
@app.cell
def _():
    # File picker for the exported Qualtrics "Labels" CSV. Navigation is
    # locked to the exports directory and limited to one .csv selection.
    file_browser = mo.ui.file_browser(
        label="Select 'Labels' File",
        initial_path="./data/exports",
        filetypes=[".csv"],
        multiple=False,
        restrict_navigation=True,
    )
    file_browser
    return (file_browser,)
|
|
|
|
|
|
@app.cell
def _(file_browser):
    # Resolve the selected results file; halt this cell (and everything
    # downstream) with a warning until the user has picked one.
    _selection = file_browser.path(index=0)  # query the widget once, not twice
    mo.stop(_selection is None, mo.md("**⚠️ Please select a `_Labels.csv` file above to proceed**"))
    RESULTS_FILE = Path(_selection)
    # Survey definition (.qsf) that pairs with the exported results file.
    QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'
    return QSF_FILE, RESULTS_FILE
|
|
|
|
|
|
@app.cell
def _(QSF_FILE, RESULTS_FILE):
    # Build the survey wrapper and load the exported responses (lazily).
    S = QualtricsSurvey(RESULTS_FILE, QSF_FILE)
    try:
        data_all = S.load_data()
    except NotImplementedError as e:
        # Surface unsupported-export errors as a notebook message instead of
        # a traceback; mo.stop(True, ...) halts this cell and its dependents.
        mo.stop(True, mo.md(f"**⚠️ {str(e)}**"))
    return S, data_all


@app.cell(hide_code=True)
def _(RESULTS_FILE, data_all):
    # Summary header: which file is loaded and how many responses it holds.
    # NOTE: data_all.collect() materializes the lazy frame just for the count.
    mo.md(rf"""
    ---
    # Load Data

    **Dataset:** {Path(RESULTS_FILE).name}

    **Responses**: {data_all.collect().shape[0]}
    """)
    return
|
|
|
|
|
|
@app.cell
def _(S, data_all):
    # Straight-liner detection: flag respondents whose answers never vary
    # within a battery. max_score is the scale maximum for each battery.
    sl_ss_max_score = 5
    sl_v1_10_max_score = 10

    # The speaking-style items live in two batteries; join them per respondent.
    _ss_all = S.get_ss_green_blue(data_all)[0].join(S.get_ss_orange_red(data_all)[0], on='_recordId')
    _sl_ss_c, sl_ss_df = check_straight_liners(_ss_all, max_score=sl_ss_max_score)

    _sl_v1_10_c, sl_v1_10_df = check_straight_liners(
        S.get_voice_scale_1_10(data_all)[0],
        max_score=sl_v1_10_max_score
    )

    # Render all validation summaries: completion progress, response
    # duration, and the two straight-liner reports.
    mo.md(f"""

    {check_progress(data_all)}


    {duration_validation(data_all)}


    ## Speaking Style - Straight Liners
    {_sl_ss_c}


    ## Voice Score Scale 1-10 - Straight Liners
    {_sl_v1_10_c}
    """)
    return
|
|
|
|
|
|
@app.cell
def _(data_all):
    # Straight-liner exclusion is currently DISABLED: the commented code below
    # would drop flagged Voice Scale 1-10 respondents. For now every response
    # passes through unchanged.
    # # Drop any Voice Scale 1-10 responses with straight-lining, using sl_v1_10_df _responseId values
    # records_to_drop = sl_v1_10_df.select('Record ID').to_series().to_list()

    # data_validated = data_all.filter(~pl.col('_recordId').is_in(records_to_drop))

    # mo.md(f"""
    # Dropped `{len(records_to_drop)}` responses with straight-lining in Voice Scale 1-10 evaluation.
    # """)
    data_validated = data_all
    return (data_validated,)


@app.cell
def _():
    # Placeholder cell (intentionally empty).
    return


@app.cell(hide_code=True)
def _():
    # Placeholder cell (intentionally empty).
    #
    return


@app.cell
def _():
    # Section header for the missing-'Consumer' investigation below.
    mo.md(r"""
    ## Lucia confirmation missing 'Consumer' data
    """)
    return
|
|
|
|
|
|
@app.cell
def _(S, data_validated):
    # Materialize the demographics lazy frame; the bare name on the last
    # line makes marimo display the table as this cell's output.
    _demo_lazy = S.get_demographics(data_validated)[0]
    demographics = _demo_lazy.collect()
    demographics
    return (demographics,)
|
|
|
|
|
|
@app.cell(hide_code=True)
def _(demographics):
    # Record IDs of respondents whose 'Consumer' field is missing.
    _missing_consumer = demographics.filter(pl.col('Consumer').is_null())
    demographics_no_consumer = _missing_consumer['_recordId'].to_list()
    demographics_no_consumer
    return (demographics_no_consumer,)
|
|
|
|
|
|
@app.cell
def _(data_all, demographics_no_consumer):
    # check if the responses with missing 'Consumer type' in demographics are all business owners as Lucia mentioned
    # NOTE(review): assumes QID4 == 'Yes' means "is a business owner" — confirm against the QSF.
    # The assert is a deliberate notebook sanity check (it would be stripped under `python -O`).
    assert all(data_all.filter(pl.col('_recordId').is_in(demographics_no_consumer)).collect()['QID4'] == 'Yes'), "Not all respondents with missing 'Consumer' are business owners."
    return
|
|
|
|
|
|
@app.cell
def _():
    mo.md(r"""
    # Filter Data (Global corrections)
    """)
    return


@app.cell
def _(S):
    # Demographic filter form: one multiselect per dimension, each with every
    # option pre-selected so the default is "no filtering". Wrapping in
    # .form() means values only publish to .value on explicit submit.
    filter_form = mo.md('''

    {age}

    {gender}

    {ethnicity}

    {income}

    {consumer}
    '''
    ).batch(
        age=mo.ui.multiselect(options=S.options_age, value=S.options_age, label="Select Age Group(s):"),
        gender=mo.ui.multiselect(options=S.options_gender, value=S.options_gender, label="Select Gender(s):"),
        ethnicity=mo.ui.multiselect(options=S.options_ethnicity, value=S.options_ethnicity, label="Select Ethnicities:"),
        income=mo.ui.multiselect(options=S.options_income, value=S.options_income, label="Select Income Group(s):"),
        consumer=mo.ui.multiselect(options=S.options_consumer, value=S.options_consumer, label="Select Consumer Groups:")
    ).form()
    mo.md(f'''
    ---

    # Data Filter

    {filter_form}
    ''')
    return
|
|
|
|
|
|
@app.cell
def _(data_validated):
    # Global filtering is currently DISABLED: the commented code would apply
    # the submitted filter_form values. For now `data` is the validated set.
    # mo.stop(filter_form.value is None, mo.md("**Please submit filter above to proceed**"))
    # _d = S.filter_data(data_validated, age=filter_form.value['age'], gender=filter_form.value['gender'], income=filter_form.value['income'], ethnicity=filter_form.value['ethnicity'], consumer=filter_form.value['consumer'])

    # # Stop execution and prevent other cells from running if no data is selected
    # mo.stop(len(_d.collect()) == 0, mo.md("**No Data available for current filter combination**"))
    # data = _d

    data = data_validated
    # Display the materialized frame as this cell's output.
    data.collect()
    return (data,)


@app.cell
def _():
    # Placeholder cell (intentionally empty).
    return


@app.cell
def _():
    # Check if all business owners are missing a 'Consumer type' in demographics
    # assert all([a is None for a in data_all.filter(pl.col('QID4') == 'Yes').collect()['Consumer'].unique()]) , "Not all business owners are missing 'Consumer type' in demographics."
    return


@app.cell
def _():
    mo.md(r"""
    # Demographic Distributions
    """)
    return
|
|
|
|
|
|
@app.cell
def _():
    # Demographic columns to plot below.
    # NOTE: 'Bussiness_Owner' is spelled exactly as the dataset's column name;
    # the chart title corrects the spelling at plot time.
    demo_plot_cols = [
        'Age',
        'Gender',
        # 'Race/Ethnicity',
        'Bussiness_Owner',
        'Consumer'
    ]
    return (demo_plot_cols,)
|
|
|
|
|
|
@app.cell
def _(S, data, demo_plot_cols):
    # Render one demographic-distribution chart per column in demo_plot_cols.
    # Hoisted: the demographics frame is loop-invariant (the original rebuilt
    # it on every iteration via S.get_demographics(data)[0]).
    _demo = S.get_demographics(data)[0]

    # `_c` is underscore-prefixed so it stays local to this cell; the original
    # bare `c` leaked into the notebook's global namespace (marimo treats
    # non-underscored cell names as globals, which can collide across cells).
    _parts = ["\n\n"]
    for _c in demo_plot_cols:
        _fig = S.plot_demographic_distribution(
            data=_demo,
            column=_c,
            # Fix the dataset's 'Bussiness' spelling for display only.
            title=f"{_c.replace('Bussiness', 'Business').replace('_', ' ')} Distribution of Survey Respondents"
        )
        _parts.append(f"""{mo.ui.altair_chart(_fig)}\n\n""")

    # Join once instead of quadratic string += in the loop.
    mo.md("".join(_parts))
    return
|
|
|
|
|
|
@app.cell
def _():
    mo.md(r"""
    ---

    # Brand Character Results
    """)
    return


@app.cell(disabled=True)
def _():
    mo.md(r"""
    ## Best performing: Original vs Refined frankenstein
    """)
    return


@app.cell(disabled=True)
def _(S, data):
    # Disabled: quick peek at the refined-character ranking frame.
    char_refine_rank = S.get_character_refine(data)[0]
    # print(char_rank.collect().head())
    print(char_refine_rank.collect().head())
    return


@app.cell(disabled=True)
def _():
    mo.md(r"""
    ## Character ranking points
    """)
    return
|
|
|
|
|
|
@app.cell
def _(S, char_rank):
    # Weighted popularity (1st=3pts, 2nd=2pts, 3rd=1pt). `char_rank` is
    # produced by a later cell; marimo resolves cells by dataflow, not order.
    char_rank_weighted = calculate_weighted_ranking_scores(char_rank)
    # NOTE(review): x_label='Voice' on a character chart — possibly should be
    # 'Character'; confirm intended label.
    S.plot_weighted_ranking_score(char_rank_weighted, title="Most Popular Character - Weighted Popularity Score<br>(1st=3pts, 2nd=2pts, 3rd=1pt)", x_label='Voice')
    return
|
|
|
|
|
|
@app.cell
def _():
    mo.md(r"""
    ## Character ranking 1-2-3
    """)
    return


@app.cell
def _(S, data):
    # Character-ranking frame consumed by several plotting cells.
    char_rank = S.get_character_ranking(data)[0]
    return (char_rank,)


@app.cell
def _(S, char_rank):
    # Distribution of 1st/2nd/3rd placements per character.
    S.plot_top3_ranking_distribution(char_rank, x_label='Character Personality', title='Character Personality: Rankings Top 3')
    return
|
|
|
|
|
|
@app.cell
def _():
    mo.md(r"""
    ### Statistical Significance Character Ranking
    """)
    return


@app.cell(disabled=True)
def _(S, char_rank):
    # Disabled: pairwise significance tests on the character ranking,
    # rendered as a heatmap plus a summary chart.
    _pairwise_df, _meta = S.compute_ranking_significance(char_rank)

    # print(_pairwise_df.columns)

    mo.md(f"""

    {mo.ui.altair_chart(S.plot_significance_heatmap(_pairwise_df, metadata=_meta))}

    {mo.ui.altair_chart(S.plot_significance_summary(_pairwise_df, metadata=_meta))}
    """)
    return


@app.cell(disabled=True)
def _():
    mo.md(r"""
    ## Character Ranking: times 1st place
    """)
    return
|
|
|
|
|
|
@app.cell
def _(S, char_rank):
    # Count of 1st-place votes per character.
    S.plot_most_ranked_1(char_rank, title="Most Popular Character<br>(Number of Times Ranked 1st)", x_label='Character Personality')
    return


@app.cell
def _():
    mo.md(r"""
    ## Prominent predefined personality traits wordcloud
    """)
    return


@app.cell
def _(S, data):
    # Word cloud over the eight most-selected predefined traits.
    top8_traits = S.get_top_8_traits(data)[0]
    S.plot_traits_wordcloud(
        data=top8_traits,
        column='Top_8_Traits',
        title="Most Prominent Personality Traits",
    )
    return


@app.cell
def _():
    mo.md(r"""
    ## Trait frequency per brand character
    """)
    return


@app.cell
def _(S, data):
    # Refined-character frame consumed by the per-character trait plots below.
    char_df = S.get_character_refine(data)[0]
    return (char_df,)
|
|
|
|
|
|
@app.cell
def _(S, char_df):
    # Trait-frequency bar charts, one per brand character, sharing a single
    # trait sort order so the bars line up across all four charts.
    #
    # All working names are underscore-prefixed: the original bare names
    # (characters, char, freq_df, ...) leaked into the notebook's global
    # namespace, where marimo forbids redefinition by any other cell.
    from collections import Counter

    from theme import ColorPalette

    _characters = ['Bank Teller', 'Familiar Friend', 'The Coach', 'Personal Assistant']
    _character_colors = {
        'Bank Teller': (ColorPalette.CHARACTER_BANK_TELLER, ColorPalette.CHARACTER_BANK_TELLER_HIGHLIGHT),
        'Familiar Friend': (ColorPalette.CHARACTER_FAMILIAR_FRIEND, ColorPalette.CHARACTER_FAMILIAR_FRIEND_HIGHLIGHT),
        'The Coach': (ColorPalette.CHARACTER_COACH, ColorPalette.CHARACTER_COACH_HIGHLIGHT),
        'Personal Assistant': (ColorPalette.CHARACTER_PERSONAL_ASSISTANT, ColorPalette.CHARACTER_PERSONAL_ASSISTANT_HIGHLIGHT),
    }

    # Compute each character's trait-frequency frame ONCE; the original called
    # S.transform_character_trait_frequency twice per character.
    _freq_by_char = {
        _char: S.transform_character_trait_frequency(char_df, _char)[0]
        for _char in _characters
    }

    # Consistent sort order: traits ranked by total count across characters.
    # Counter replaces the manual dict.get(..., 0) accumulation; ties keep
    # first-encountered order, matching the original stable sort.
    _totals = Counter()
    for _freq_df in _freq_by_char.values():
        for _row in _freq_df.iter_rows(named=True):
            _totals[_row['trait']] += _row['count']
    _consistent_sort_order = [_trait for _trait, _count in _totals.most_common()]

    # Generate 4 plots (one per character) and join into one markdown output.
    _parts = []
    for _char in _characters:
        _main_color, _highlight_color = _character_colors[_char]
        _chart = S.plot_single_character_trait_frequency(
            data=_freq_by_char[_char],
            character_name=_char,
            bar_color=_main_color,
            highlight_color=_highlight_color,
            trait_sort_order=_consistent_sort_order,
        )
        _parts.append(f"""
    {mo.ui.altair_chart(_chart)}


    """)

    mo.md("".join(_parts))
    return
|
|
|
|
|
|
@app.cell(disabled=True)
def _():
    # Disabled note (quoted chat is in Dutch): discusses how to present
    # pairwise significance among the top-ranked characters, e.g. "if #1 and
    # #2 don't differ significantly but both differ from #3, that's worth
    # showing too". Quoted text is runtime output and is kept verbatim.
    mo.md(r"""
    ## Statistical significance best characters

    zie chat
    > voorbeeld: als de nr 1 en 2 niet significant verschillen maar wel van de nr 3 bijvoorbeeld is dat ook top. Beetje meedenkend over hoe ik het kan presenteren weetje wat ik bedoel?:)
    >
    """)
    return


@app.cell(disabled=True)
def _():
    # Placeholder cell (intentionally empty).
    return


@app.cell
def _():
    # Placeholder cell (intentionally empty).
    return


@app.cell
def _():
    mo.md(r"""
    ---

    # Spoken Voice Results
    """)
    return
|
|
|
|
|
|
@app.cell
def _():
    mo.md(r"""
    ## Top 8 Most Chosen out of 18
    """)
    return


@app.cell
def _(S, data):
    # Voice selection funnel frame (18 -> 8 -> 3), used by two plots below.
    v_18_8_3 = S.get_18_8_3(data)[0]
    return (v_18_8_3,)


@app.cell
def _(S, v_18_8_3):
    # How often each of the 18 voices made a respondent's top 8.
    S.plot_voice_selection_counts(v_18_8_3, title="Top 8 Voice Selection from 18 Voices", x_label='Voice')
    return
|
|
|
|
|
|
@app.cell
def _():
    mo.md(r"""
    ## Top 3 most chosen out of 8
    """)
    return


@app.cell
def _(S, v_18_8_3):
    # How often each of the 8 shortlisted voices made a respondent's top 3.
    S.plot_top3_selection_counts(v_18_8_3, title="Top 3 Voice Selection Counts from 8 Voices", x_label='Voice')
    return


@app.cell(hide_code=True)
def _():
    mo.md(r"""
    ## Which voice is ranked best in the ranking question for top 3?

    (not best 3 out of 8 question)
    """)
    return
|
|
|
|
|
|
@app.cell
def _(S, data):
    # Top-3 voice ranking frame plus its weighted (3/2/1-point) score.
    _ranking_result = S.get_top_3_voices(data)
    top3_voices = _ranking_result[0]
    top3_voices_weighted = calculate_weighted_ranking_scores(top3_voices)
    return top3_voices, top3_voices_weighted
|
|
|
|
|
|
@app.cell
def _(S, top3_voices):
    # 1st/2nd/3rd placement distribution per voice.
    S.plot_ranking_distribution(top3_voices, x_label='Voice', title="Distribution of Voice Rankings (1st, 2nd, 3rd)")
    return


@app.cell
def _():
    mo.md(r"""
    ### Statistical significance for voice ranking
    """)
    return


@app.cell
def _():
    # Debug scratch cell (disabled print kept for reference).
    # print(top3_voices.collect().head())
    return
|
|
|
|
|
|
@app.cell
def _():
    # Disabled draft: pairwise significance for the voice ranking (heatmap
    # plus summary chart). Kept commented out pending review.
    # _pairwise_df, _metadata = S.compute_ranking_significance(
    #     top3_voices,alpha=0.05,correction="none")

    # # View significant pairs
    # # print(pairwise_df.filter(pl.col('significant') == True))

    # # Create heatmap visualization
    # _heatmap = S.plot_significance_heatmap(
    #     _pairwise_df,
    #     metadata=_metadata,
    #     title="Weighted Voice Ranking Significance<br>(Pairwise Comparisons)"
    # )

    # # Create summary bar chart
    # _summary = S.plot_significance_summary(
    #     _pairwise_df,
    #     metadata=_metadata
    # )

    # mo.md(f"""
    # {mo.ui.altair_chart(_heatmap)}

    # {mo.ui.altair_chart(_summary)}
    # """)
    return
|
|
|
|
|
|
@app.cell(hide_code=True)
def _():
    mo.md(r"""
    ## Weighted Popularity Scores
    """)
    return
|
|
|
|
|
|
@app.cell
def _(S, top3_voices_weighted):
    # Weighted popularity bar chart for the voices, embedded in markdown.
    _weighted_chart = S.plot_weighted_ranking_score(top3_voices_weighted, title="Most Popular Voice - Weighted Popularity Score<br>(1st = 3pts, 2nd = 2pts, 3rd = 1pt)")
    _embedded = mo.ui.altair_chart(_weighted_chart)

    mo.md(f"""
    {_embedded}
    """)
    return
|
|
|
|
|
|
@app.cell
def _():
    ## Voice Ranked 1st the most
    # NOTE(review): the heading above is a plain comment, so this cell renders
    # nothing — it was likely intended to be a mo.md(...) section header.
    return
|
|
|
|
|
|
@app.cell
def _(S, top3_voices):
    # Count of 1st-place votes per voice.
    S.plot_most_ranked_1(top3_voices, title="Most Popular Voice<br>(Number of Times Ranked 1st)", x_label='Voice')
    return


@app.cell
def _():
    mo.md(r"""
    ## Voice Scale 1-10
    """)
    return
|
|
|
|
|
|
@app.cell
def _(S, data):
    # Voice impression scores on the 1-10 scale; take the frame via the same
    # [0]-indexing pattern the sibling data-access cells use.
    voice_1_10 = S.get_voice_scale_1_10(data)[0]
    S.plot_average_scores_with_counts(voice_1_10, x_label='Voice', domain=[1,10], title="Voice General Impression (Scale 1-10)")
    return (voice_1_10,)
|
|
|
|
|
|
@app.cell(disabled=True)
def _():
    mo.md(r"""
    ### Statistical Significance (Scale 1-10)
    """)
    return


@app.cell(disabled=True)
def _(S, voice_1_10):
    # Disabled: pairwise significance tests across the 1-10 voice scores.
    # Compute pairwise significance tests
    pairwise_df, metadata = S.compute_pairwise_significance(
        voice_1_10,
        test_type="mannwhitney",  # or "ttest", "chi2", "auto"
        alpha=0.05,
        correction="bonferroni"  # or "holm", "none"
    )

    # View significant pairs
    # print(pairwise_df.filter(pl.col('significant') == True))

    # Create heatmap visualization
    _heatmap = S.plot_significance_heatmap(
        pairwise_df,
        metadata=metadata,
        title="Voice Rating Significance<br>(Pairwise Comparisons)"
    )

    # Create summary bar chart
    _summary = S.plot_significance_summary(
        pairwise_df,
        metadata=metadata
    )

    mo.md(f"""
    {mo.ui.altair_chart(_heatmap)}

    {mo.ui.altair_chart(_summary)}
    """)
    return
|
|
|
|
|
|
@app.cell
def _():
    # Placeholder cell (intentionally empty).
    return


if __name__ == "__main__":
    # Standard marimo entry point: serves/executes the notebook when the
    # file is run directly as a script.
    app.run()
|