# JPMC-quant/03_quant_report.script.py


__generated_with = "0.19.7"

# %%
import marimo as mo
import polars as pl
from pathlib import Path
import argparse
import json

from validation import check_progress, duration_validation, check_straight_liners
from utils import QualtricsSurvey, combine_exclusive_columns, calculate_weighted_ranking_scores
import utils

from speaking_styles import SPEAKING_STYLES

# %%
# CLI argument parsing for batch automation
# When run as script: python 03_quant_report.script.py --age '["18 to 21 years"]' --consumer '["Starter"]'
# When run in Jupyter: args will use defaults (all filters = None = all options selected)

# Central filter configuration - add new filters here only
# Format: 'cli_arg_name': 'QualtricsSurvey.options_* attribute name'
FILTER_CONFIG = {
    'age': 'options_age',
    'gender': 'options_gender',
    'ethnicity': 'options_ethnicity',
    'income': 'options_income',
    'consumer': 'options_consumer',
    'business_owner': 'options_business_owner',
    'ai_user': 'options_ai_user',
    'investable_assets': 'options_investable_assets',
    'industry': 'options_industry',
}

def parse_cli_args():
    """Parse the report filters from the command line.

    One ``--<name>`` option is registered per key of ``FILTER_CONFIG``; each
    takes a JSON list of values (e.g. ``--age '["18 to 21 years"]'``).
    ``--filter-name`` names the filter combination.

    Returns:
        argparse.Namespace: one attribute per ``FILTER_CONFIG`` key, holding
        the decoded list or ``None`` when the option was omitted, empty, or
        JSON ``null``; plus ``filter_name`` (str or ``None``). In an
        interactive IPython/Jupyter session every attribute is ``None`` and
        ``sys.argv`` is not read.

    Raises:
        SystemExit: via ``parser.error`` when a filter value is not valid JSON
            or does not decode to a list.
    """
    # ``get_ipython`` only exists as a global inside IPython/Jupyter; a
    # NameError therefore means we are running as a plain script.
    try:
        get_ipython()  # noqa: F821
    except NameError:
        pass
    else:
        return argparse.Namespace(**{f: None for f in FILTER_CONFIG}, filter_name=None)

    parser = argparse.ArgumentParser(description='Generate quant report with optional filters')

    # Dynamically add filter arguments from config
    for filter_name in FILTER_CONFIG:
        parser.add_argument(f'--{filter_name}', type=str, default=None, help=f'JSON list of {filter_name} values')

    parser.add_argument('--filter-name', type=str, default=None, help='Name for this filter combination (used for .txt description file)')

    args = parser.parse_args()

    # Decode the JSON strings to lists, rejecting anything that is not a list
    # so downstream code never iterates over the characters of a bare string.
    for filter_name in FILTER_CONFIG:
        raw_value = getattr(args, filter_name)
        if not raw_value:
            setattr(args, filter_name, None)
            continue
        try:
            decoded = json.loads(raw_value)
        except json.JSONDecodeError as exc:
            parser.error(f"--{filter_name} expects a JSON list such as '[\"value\"]': {exc}")
        if decoded is not None and not isinstance(decoded, list):
            parser.error(
                f"--{filter_name} expects a JSON list such as '[\"value\"]', "
                f"got {type(decoded).__name__}"
            )
        setattr(args, filter_name, decoded)
    return args

cli_args = parse_cli_args()

# %%

# file_browser = mo.ui.file_browser(
#     initial_path="./data/exports", multiple=False, restrict_navigation=True, filetypes=[".csv"], label="Select 'Labels' File"
# )
# file_browser

# # %%
# mo.stop(file_browser.path(index=0) is None, mo.md("**⚠️ Please select a `_Labels.csv` file above to proceed**"))
# RESULTS_FILE = Path(file_browser.path(index=0))

# Hard-coded input paths: the survey results export and the .qsf file passed
# to QualtricsSurvey below.
RESULTS_FILE = 'data/exports/2-3-26_Copy-2-2-26/JPMC_Chase Brand Personality_Quant Round 1_February 2, 2026_Labels.csv'
QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'

# %%
S = QualtricsSurvey(RESULTS_FILE, QSF_FILE)
try:
    data_all = S.load_data()
except NotImplementedError as load_error:
    # Halt here and show the loader's message instead of continuing.
    mo.stop(True, mo.md(f"**⚠️ {load_error}**"))

# %%
BEST_CHOSEN_CHARACTER = "the_coach"

# # %%
# filter_form = mo.md('''


# {age}

# {gender}

# {ethnicity}

# {income}

# {consumer}
# '''
# ).batch(
#     age=mo.ui.multiselect(options=S.options_age, value=S.options_age, label="Select Age Group(s):"),
#     gender=mo.ui.multiselect(options=S.options_gender, value=S.options_gender, label="Select Gender(s):"),
#     ethnicity=mo.ui.multiselect(options=S.options_ethnicity, value=S.options_ethnicity, label="Select Ethnicities:"),
#     income=mo.ui.multiselect(options=S.options_income, value=S.options_income, label="Select Income Group(s):"),
#     consumer=mo.ui.multiselect(options=S.options_consumer, value=S.options_consumer, label="Select Consumer Groups:")
# ).form()
# mo.md(f'''
# ---

# # Data Filter

# {filter_form}
# ''')

# %%
# mo.stop(filter_form.value is None, mo.md("**Please submit filter above to proceed**"))
# CLI args: None means "all options selected" (use S.options_* defaults)
# Build filter values dict dynamically from FILTER_CONFIG
# Resolve every configured filter: an explicit CLI list wins, otherwise fall
# back to the full option list exposed by the survey object.
_active_filters = {}
for filter_name, options_attr in FILTER_CONFIG.items():
    cli_value = getattr(cli_args, filter_name)
    all_options = getattr(S, options_attr)
    if cli_value is None:
        _active_filters[filter_name] = all_options
    else:
        _active_filters[filter_name] = cli_value

_d = S.filter_data(data_all, **_active_filters)

# Write filter description file if filter-name is provided
if cli_args.filter_name and S.fig_save_dir:
    # Get the filter slug (e.g., "All_Respondents", "Cons-Starter", etc.)
    _filter_slug = S._get_filter_slug()
    _filter_slug_dir = S.fig_save_dir / _filter_slug
    _filter_slug_dir.mkdir(parents=True, exist_ok=True)

    # Build filter description: one line per configured filter, "All" when the
    # active values equal the survey's full option list.
    _filter_desc_lines = [
        f"Filter: {cli_args.filter_name}",
        "",
        "Applied Filters:",
    ]
    _short_desc_parts = []
    for filter_name, options_attr in FILTER_CONFIG.items():
        all_options = getattr(S, options_attr)
        values = _active_filters[filter_name]
        display_name = filter_name.replace('_', ' ').title()
        if values != all_options:
            # str() each value so a non-string entry from the JSON list
            # (e.g. a number) cannot make the join raise TypeError.
            _joined_values = ', '.join(str(v) for v in values)
            _short_desc_parts.append(f"{display_name}: {_joined_values}")
            _filter_desc_lines.append(f"  {display_name}: {_joined_values}")
        else:
            _filter_desc_lines.append(f"  {display_name}: All")

    # Write detailed description INSIDE the filter-slug directory
    _filter_file = _filter_slug_dir / f"{cli_args.filter_name}.txt"
    _filter_file.write_text('\n'.join(_filter_desc_lines))

    # Append to summary index file at figures/<export_date>/filter_index.txt
    _summary_file = S.fig_save_dir / "filter_index.txt"
    _short_desc = "; ".join(_short_desc_parts) if _short_desc_parts else "All Respondents"
    _summary_line = f"{_filter_slug}  |  {cli_args.filter_name}  |  {_short_desc}\n"

    _column_separator = "  |  "
    _index_columns = "Directory  |  Filter Name  |  Description"

    # Append or create the summary file
    if _summary_file.exists():
        # Avoid duplicate entries for the same slug. Compare against the first
        # column of each entry rather than searching the whole file text:
        # a substring search would treat "Cons-Starter" as already present as
        # soon as "Cons-Starter_Age-..." (or a filter name/description
        # containing it) had been written.
        _indexed_slugs = {
            _line.partition(_column_separator)[0].strip()
            for _line in _summary_file.read_text().splitlines()
            if _column_separator in _line and _line != _index_columns
        }
        if _filter_slug not in _indexed_slugs:
            with _summary_file.open('a') as f:
                f.write(_summary_line)
    else:
        _header = "Filter Index\n" + "=" * 80 + "\n\n"
        _header += _index_columns + "\n"
        _header += "-" * 80 + "\n"
        _summary_file.write_text(_header + _summary_line)

# Stop execution and prevent other cells from running if no data is selected
# mo.stop(len(_d.collect()) == 0, mo.md("**No Data available for current filter combination**"))
# `data` is the filtered frame returned by S.filter_data above; the report
# sections below pass it to the S.get_* accessors.
data = _d

# data = data_validated
# The collected result is not assigned, so as a script this line only
# materialises the frame and discards it.
# NOTE(review): presumably kept for notebook display — confirm before removing.
data.collect()

# %%


# %%
# Check if all business owners are missing a 'Consumer type' in demographics
# assert all([a is None for a in data_all.filter(pl.col('QID4') == 'Yes').collect()['Consumer'].unique()]) , "Not all business owners are missing 'Consumer type' in demographics."

# %%
mo.md(r"""
# Demographic Distributions
""")

# %%
# Demographic columns to chart, in display order.
# 'Bussiness_Owner' is spelled as passed to the plotting call; the typo is
# only corrected in the chart title below.
demo_plot_cols = [
    'Age',
    'Gender',
    # 'Race/Ethnicity',
    'Bussiness_Owner',
    'Consumer'
]

# %%
# The demographics frame does not depend on the column being plotted, so it is
# fetched once instead of once per chart.
_demographics = S.get_demographics(data)[0]

_content = """

"""
for c in demo_plot_cols:
    _fig = S.plot_demographic_distribution(
        data=_demographics,
        column=c,
        title=f"{c.replace('Bussiness', 'Business').replace('_', ' ')} Distribution of Survey Respondents"
    )
    _content += f"""{mo.ui.altair_chart(_fig)}\n\n"""

mo.md(_content)

# %%
mo.md(r"""
---

# Brand Character Results
""")

# %%
mo.md(r"""
## Best performing: Original vs Refined frankenstein
""")

# %%
# Keep only the first element of what S.get_character_refine returns.
char_refine_rank = S.get_character_refine(data)[0]
# print(char_rank.collect().head())
# Preview the first rows on stdout.
print(char_refine_rank.collect().head())

# %%
mo.md(r"""
## Character ranking points
""")

# %%
mo.md(r"""
## Character ranking 1-2-3
""")

# %%
# Character ranking frame (first element of the accessor's return value).
char_rank = S.get_character_ranking(data)[0]

# %%
# Weighted popularity score per character. The x-axis shows characters, so it
# is labelled like the other character charts rather than 'Voice'.
char_rank_weighted = calculate_weighted_ranking_scores(char_rank)
S.plot_weighted_ranking_score(char_rank_weighted, title="Most Popular Character - Weighted Popularity Score<br>(1st=3pts, 2nd=2pts, 3rd=1pt)", x_label='Character Personality')

# %%
S.plot_top3_ranking_distribution(char_rank, x_label='Character Personality', title='Character Personality: Rankings Top 3')

# %%
mo.md(r"""
### Statistical Significance Character Ranking
""")

# %%
# _pairwise_df, _meta = S.compute_ranking_significance(char_rank)

# # print(_pairwise_df.columns)

# mo.md(f"""


# {mo.ui.altair_chart(S.plot_significance_heatmap(_pairwise_df, metadata=_meta))}

# {mo.ui.altair_chart(S.plot_significance_summary(_pairwise_df, metadata=_meta))}
# """)

# %%
mo.md(r"""
## Character Ranking: times 1st place
""")

# %%
S.plot_most_ranked_1(char_rank, title="Most Popular Character<br>(Number of Times Ranked 1st)", x_label='Character Personality')

# %%
mo.md(r"""
## Prominent predefined personality traits wordcloud
""")

# %%
top8_traits = S.get_top_8_traits(data)[0]
S.plot_traits_wordcloud(
    data=top8_traits,
    column='Top_8_Traits',
    title="Most Prominent Personality Traits",
)

# %%
mo.md(r"""
## Trait frequency per brand character
""")

# %%
char_df = S.get_character_refine(data)[0]

# %%
from theme import ColorPalette

# Characters to chart and their (bar colour, highlight colour) pairs.
characters = ['Bank Teller', 'Familiar Friend', 'The Coach', 'Personal Assistant']
character_colors = {
    'Bank Teller': (ColorPalette.CHARACTER_BANK_TELLER, ColorPalette.CHARACTER_BANK_TELLER_HIGHLIGHT),
    'Familiar Friend': (ColorPalette.CHARACTER_FAMILIAR_FRIEND, ColorPalette.CHARACTER_FAMILIAR_FRIEND_HIGHLIGHT),
    'The Coach': (ColorPalette.CHARACTER_COACH, ColorPalette.CHARACTER_COACH_HIGHLIGHT),
    'Personal Assistant': (ColorPalette.CHARACTER_PERSONAL_ASSISTANT, ColorPalette.CHARACTER_PERSONAL_ASSISTANT_HIGHLIGHT),
}

# Transform each character's trait frequencies once; the same frame feeds both
# the cross-character totals and that character's chart.
character_freqs = {}
all_trait_counts = {}
for char in characters:
    freq_df, _ = S.transform_character_trait_frequency(char_df, char)
    character_freqs[char] = freq_df
    for row in freq_df.iter_rows(named=True):
        all_trait_counts[row['trait']] = all_trait_counts.get(row['trait'], 0) + row['count']

# Build consistent sort order (by total frequency across all characters).
# sorted() is stable with reverse=True, so ties keep first-seen order.
consistent_sort_order = sorted(all_trait_counts, key=all_trait_counts.get, reverse=True)

_content = """"""
# Generate 4 plots (one per character)
for char in characters:
    freq_df = character_freqs[char]
    main_color, highlight_color = character_colors[char]
    chart = S.plot_single_character_trait_frequency(
        data=freq_df,
        character_name=char,
        bar_color=main_color,
        highlight_color=highlight_color,
        trait_sort_order=consistent_sort_order,
    )
    _content += f"""
    {mo.ui.altair_chart(chart)}


"""

mo.md(_content)

# %%
mo.md(r"""
## Statistical significance best characters

zie chat
> voorbeeld: als de nr 1 en 2 niet significant verschillen maar wel van de nr 3 bijvoorbeeld is dat ook top. Beetje meedenkend over hoe ik het kan presenteren weetje wat ik bedoel?:)
>
""")

# %%


# %%


# %%
mo.md(r"""
---

# Spoken Voice Results
""")

# %%
COLOR_GENDER = True

# %%
mo.md(r"""
## Top 8 Most Chosen out of 18
""")

# %%
v_18_8_3 = S.get_18_8_3(data)[0]

# %%
S.plot_voice_selection_counts(v_18_8_3, title="Top 8 Voice Selection from 18 Voices", x_label='Voice', color_gender=COLOR_GENDER)

# %%
mo.md(r"""
## Top 3 most chosen out of 8
""")

# %%
S.plot_top3_selection_counts(v_18_8_3, title="Top 3 Voice Selection Counts from 8 Voices", x_label='Voice', color_gender=COLOR_GENDER)

# %%
mo.md(r"""
## Voice Ranking Weighted Score
""")

# %%
top3_voices = S.get_top_3_voices(data)[0]
top3_voices_weighted = calculate_weighted_ranking_scores(top3_voices)

# %%
S.plot_weighted_ranking_score(top3_voices_weighted, title="Most Popular Voice - Weighted Popularity Score<br>(1st = 3pts, 2nd = 2pts, 3rd = 1pt)", color_gender=COLOR_GENDER)

# %%
mo.md(r"""
## Which voice is ranked best in the ranking question for top 3?

(not best 3 out of 8 question)
""")

# %%
S.plot_ranking_distribution(top3_voices, x_label='Voice', title="Distribution of Top 3 Voice Rankings (1st, 2nd, 3rd)", color_gender=COLOR_GENDER)

# %%
mo.md(r"""
### Statistical significance for voice ranking
""")

# %%
# print(top3_voices.collect().head())

# %%

# _pairwise_df, _metadata = S.compute_ranking_significance(
#     top3_voices,alpha=0.05,correction="none")

# # View significant pairs
# # print(pairwise_df.filter(pl.col('significant') == True))

# # Create heatmap visualization
# _heatmap = S.plot_significance_heatmap(
#     _pairwise_df,
#     metadata=_metadata,
#     title="Weighted Voice Ranking Significance<br>(Pairwise Comparisons)"
# )

# # Create summary bar chart
# _summary = S.plot_significance_summary(
#     _pairwise_df,
#     metadata=_metadata
# )

# mo.md(f"""
# {mo.ui.altair_chart(_heatmap)}

# {mo.ui.altair_chart(_summary)}
# """)

# %%
# Section heading: voice ranked 1st most often

# %%
S.plot_most_ranked_1(top3_voices, title="Most Popular Voice<br>(Number of Times Ranked 1st)", x_label='Voice', color_gender=COLOR_GENDER)

# %%
mo.md(r"""
## Voice Scale 1-10
""")

# %%
# Get your voice scale data (from notebook)
voice_1_10, _ = S.get_voice_scale_1_10(data)
S.plot_average_scores_with_counts(voice_1_10, x_label='Voice', domain=[1,10], title="Voice General Impression (Scale 1-10)", color_gender=COLOR_GENDER)

# %%
mo.md(r"""
### Statistical Significance (Scale 1-10)
""")

# %%
# Compute pairwise significance tests
# pairwise_df, metadata = S.compute_pairwise_significance(
#     voice_1_10,
#     test_type="mannwhitney",  # or "ttest", "chi2", "auto"
#     alpha=0.05,
#     correction="bonferroni"   # or "holm", "none"
# )

# # View significant pairs
# # print(pairwise_df.filter(pl.col('significant') == True))

# # Create heatmap visualization
# _heatmap = S.plot_significance_heatmap(
#     pairwise_df,
#     metadata=metadata,
#     title="Voice Rating Significance<br>(Pairwise Comparisons)"
# )

# # Create summary bar chart
# _summary = S.plot_significance_summary(
#     pairwise_df,
#     metadata=metadata
# )

# mo.md(f"""
# {mo.ui.altair_chart(_heatmap)}

# {mo.ui.altair_chart(_summary)}
# """)

# %%


# %%
mo.md(r"""
## Ranking points for Voice per Chosen Brand Character

**missing mapping**
""")

# %%
mo.md(r"""
## Correlation Speaking Styles
""")

# %%
# Speaking-style answers come from two accessors; each returns a frame and a
# choice map.
ss_or, choice_map_or = S.get_ss_orange_red(data)
ss_gb, choice_map_gb = S.get_ss_green_blue(data)

# Combine the data
ss_all = ss_or.join(ss_gb, on='_recordId')

choice_map = {**choice_map_or, **choice_map_gb}
# print(ss_all.collect().head())
# print(choice_map)

# Process the speaking-style data once. `ss_long` was previously computed by
# an identical second call; it is kept as an alias so the name stays defined.
df_style = utils.process_speaking_style_data(ss_all, choice_map)
ss_long = df_style

vscales = S.get_voice_scale_1_10(data)[0]
df_scale_long = utils.process_voice_scale_data(vscales)

# Pair each style row with the scale score for the same respondent and voice.
joined_scale = df_style.join(df_scale_long, on=["_recordId", "Voice"], how="inner")

# Same pairing against the ranking data.
df_ranking = utils.process_voice_ranking_data(top3_voices)
joined_ranking = df_style.join(df_ranking, on=['_recordId', 'Voice'], how='inner')

# %%
joined_ranking.head()

# %%
mo.md(r"""
### Colors vs Scale 1-10
""")

# %%
# Transform to get one row per color with average correlation
color_corr_scale, _ = utils.transform_speaking_style_color_correlation(joined_scale, SPEAKING_STYLES)
S.plot_speaking_style_color_correlation(
    data=color_corr_scale,
    title="Correlation: Speaking Style Colors and Voice Scale 1-10"
)

# %%
mo.md(r"""
### Colors vs Ranking Points
""")

# %%
color_corr_ranking, _ = utils.transform_speaking_style_color_correlation(
    joined_ranking,
    SPEAKING_STYLES,
    target_column="Ranking_Points"
)
S.plot_speaking_style_color_correlation(
    data=color_corr_ranking,
    title="Correlation: Speaking Style Colors and Voice Ranking Points"
)

# %%
mo.md(r"""
### Individual Traits vs Scale 1-10
""")

# %%
# One correlation chart per speaking style, each under its own sub-heading.
_sections = []

for _style, _traits in SPEAKING_STYLES.items():
    _fig = S.plot_speaking_style_correlation(
        data=joined_scale,
        style_color=_style,
        style_traits=_traits,
        title=f"Correlation: Speaking Style {_style} and Voice Scale 1-10",
    )
    _sections.append(f"""
#### Speaking Style **{_style}**:

{mo.ui.altair_chart(_fig)}

""")

_content = "".join(_sections)
mo.md(_content)

# %%
mo.md(r"""
### Individual Traits vs Ranking Points
""")

# %%
# One ranking-correlation chart per speaking style, each under its own
# sub-heading.
_sections = []

for _style, _traits in SPEAKING_STYLES.items():
    _fig = S.plot_speaking_style_ranking_correlation(
        data=joined_ranking,
        style_color=_style,
        style_traits=_traits,
        title=f"Correlation: Speaking Style {_style} and Voice Ranking Points",
    )
    _sections.append(f"""
#### Speaking Style **{_style}**:

{mo.ui.altair_chart(_fig)}

""")

_content = "".join(_sections)
mo.md(_content)

# %%
mo.md(r"""
## Correlations when "Best Brand Character" is chosen

Select only the traits that fit with that character
""")

# %%
from reference import ORIGINAL_CHARACTER_TRAITS
chosen_bc_traits = ORIGINAL_CHARACTER_TRAITS[BEST_CHOSEN_CHARACTER]

# %%
STYLES_SUBSET = utils.filter_speaking_styles(SPEAKING_STYLES, chosen_bc_traits)

# %%
mo.md(r"""
### Individual Traits vs Ranking Points
""")

# %%
# Ranking-correlation charts restricted to STYLES_SUBSET; the chart title
# names the chosen brand character.
_character_label = BEST_CHOSEN_CHARACTER.replace('_', ' ').title()
_sections = []
for _style, _traits in STYLES_SUBSET.items():
    _fig = S.plot_speaking_style_ranking_correlation(
        data=joined_ranking,
        style_color=_style,
        style_traits=_traits,
        title=f"""Brand Character "{_character_label}" - Correlation: Speaking Style {_style} and Voice Ranking Points"""
    )
    _sections.append(f"""
{mo.ui.altair_chart(_fig)}

""")
_content = "".join(_sections)
mo.md(_content)

# %%
mo.md(r"""
### Individual Traits vs Scale 1-10
""")

# %%
# Scale-correlation charts restricted to STYLES_SUBSET; the chart title names
# the chosen brand character.
_character_label = BEST_CHOSEN_CHARACTER.replace('_', ' ').title()
_sections = []

for _style, _traits in STYLES_SUBSET.items():
    _fig = S.plot_speaking_style_correlation(
        data=joined_scale,
        style_color=_style,
        style_traits=_traits,
        title=f"""Brand Character "{_character_label}" - Correlation: Speaking Style {_style} and Voice Scale 1-10""",
    )
    _sections.append(f"""
{mo.ui.altair_chart(_fig)}

""")

_content = "".join(_sections)
mo.md(_content)

# %%
mo.md(r"""
### Colors vs Scale 1-10 (Best Character)
""")

# %%
# Transform to get one row per color with average correlation
# Colour-level correlation with the 1-10 scale, limited to STYLES_SUBSET.
_color_corr_scale, _ = utils.transform_speaking_style_color_correlation(joined_scale, STYLES_SUBSET)
_character_label = BEST_CHOSEN_CHARACTER.replace('_', ' ').title()
S.plot_speaking_style_color_correlation(
    data=_color_corr_scale,
    title=f"""Brand Character "{_character_label}" - Correlation: Speaking Style Colors and Voice Scale 1-10""",
)

# %%
mo.md(r"""
### Colors vs Ranking Points (Best Character)
""")

# %%
# Colour-level correlation with ranking points, limited to STYLES_SUBSET.
_color_corr_ranking, _ = utils.transform_speaking_style_color_correlation(
    joined_ranking, STYLES_SUBSET, target_column="Ranking_Points"
)
_character_label = BEST_CHOSEN_CHARACTER.replace('_', ' ').title()
S.plot_speaking_style_color_correlation(
    data=_color_corr_ranking,
    title=f"""Brand Character "{_character_label}" - Correlation: Speaking Style Colors and Voice Ranking Points""",
)