# Repository-viewer header captured with the source:
# JPMC-quant/03_quant_report.py - 747 lines, 16 KiB, Python

import marimo
__generated_with = "0.19.7"
app = marimo.App(width="full")
with app.setup:
import marimo as mo
import polars as pl
from pathlib import Path
from validation import check_progress, duration_validation, check_straight_liners
from utils import QualtricsSurvey, combine_exclusive_columns, calculate_weighted_ranking_scores
import utils
from speaking_styles import SPEAKING_STYLES
@app.cell
def _():
    # Single-selection file picker limited to .csv files under ./data/exports.
    file_browser = mo.ui.file_browser(
        label="Select 'Labels' File",
        initial_path="./data/exports",
        filetypes=[".csv"],
        restrict_navigation=True,
        multiple=False,
    )
    # Last expression: the widget itself.
    file_browser
    return (file_browser,)
@app.cell
def _(file_browser):
    # Stop here with a prompt while no file has been picked in the browser.
    _selected_path = file_browser.path(index=0)
    mo.stop(_selected_path is None, mo.md("**⚠️ Please select a `_Labels.csv` file above to proceed**"))

    # NOTE(review): the QSF survey definition path is hard-coded to one export folder.
    QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'
    RESULTS_FILE = Path(_selected_path)
    return QSF_FILE, RESULTS_FILE
@app.cell
def _(QSF_FILE, RESULTS_FILE):
    # Build the survey helper and load the responses. A NotImplementedError from
    # load_data() is shown as a markdown warning instead of a traceback.
    S = QualtricsSurvey(RESULTS_FILE, QSF_FILE)
    try:
        data_all = S.load_data()
    except NotImplementedError as _load_error:
        _warning = mo.md(f"**⚠️ {_load_error!s}**")
        mo.stop(True, _warning)
    return S, data_all
# Summary header: shows the selected results file name and the row count of the collected data.
@app.cell(hide_code=True)
def _(RESULTS_FILE, data_all):
    mo.md(rf"""
---
# Load Data
**Dataset:** {Path(RESULTS_FILE).name}
**Responses**: {data_all.collect().shape[0]}
""")
    return
@app.cell
def _(S, data_all):
    # Validation report: progress, duration and straight-liner checks.
    sl_ss_max_score = 5
    sl_v1_10_max_score = 10

    # Speaking-style scores: the green/blue and orange/red frames joined per record.
    _green_blue = S.get_ss_green_blue(data_all)[0]
    _orange_red = S.get_ss_orange_red(data_all)[0]
    _speaking_style_scores = _green_blue.join(_orange_red, on='_recordId')
    _ss_report, sl_ss_df = check_straight_liners(_speaking_style_scores, max_score=sl_ss_max_score)

    # Voice scale 1-10 scores.
    _voice_scores = S.get_voice_scale_1_10(data_all)[0]
    _v1_10_report, sl_v1_10_df = check_straight_liners(_voice_scores, max_score=sl_v1_10_max_score)

    _progress_report = check_progress(data_all)
    _duration_report = duration_validation(data_all)
    mo.md(f"""
# Data Validation
{_progress_report}
{_duration_report}
## Speaking Style - Straight Liners
{_ss_report}
## Voice Score Scale 1-10 - Straight Liners
{_v1_10_report}
""")
    return
@app.cell
def _(data_all):
    # Straight-liner removal is currently disabled: the filter below is commented
    # out and every response is passed through unchanged as `data_validated`.
    # NOTE(review): the disabled code reads `sl_v1_10_df`, which this cell does not
    # take as an argument - confirm it is exported upstream before re-enabling.
    # # Drop any Voice Scale 1-10 responses with straight-lining, using sl_v1_10_df 'Record ID' values
    # records_to_drop = sl_v1_10_df.select('Record ID').to_series().to_list()
    # data_validated = data_all.filter(~pl.col('_recordId').is_in(records_to_drop))
    # mo.md(f"""
    # Dropped `{len(records_to_drop)}` responses with straight-lining in Voice Scale 1-10 evaluation.
    # """)
    data_validated = data_all
    return (data_validated,)
# Empty placeholder cell (no code, no exported names).
@app.cell(hide_code=True)
def _():
    return
# Placeholder cell containing only an empty comment.
@app.cell
def _():
    #
    return
# Static markdown: section heading for the respondent demographics part of the report.
@app.cell(hide_code=True)
def _():
    mo.md(r"""
---
# Introduction (Respondent Demographics)
""")
    return
@app.cell
def _(S, data_validated):
    # Take the first element of the demographics result and collect it eagerly.
    _demographics_query = S.get_demographics(data_validated)[0]
    demographics = _demographics_query.collect()
    # Last expression: the collected demographics table.
    demographics
    return (demographics,)
# Static markdown: heading for the check on respondents with no 'Consumer' value.
@app.cell(hide_code=True)
def _():
    mo.md(r"""
## Lucia confirmation missing 'Consumer' data
""")
    return
@app.cell
def _(demographics):
    # Record IDs of respondents whose 'Consumer' value is null.
    _without_consumer = demographics.filter(pl.col('Consumer').is_null())
    demographics_no_consumer = _without_consumer['_recordId'].to_list()
    # Last expression: the list of record IDs.
    demographics_no_consumer
    return (demographics_no_consumer,)
@app.cell
def _(data_all, demographics_no_consumer):
    # Sanity check on Lucia's remark: every response lacking a 'Consumer' value
    # must have answered 'Yes' on QID4 (presumably the business-owner question).
    _rows_without_consumer = data_all.filter(
        pl.col('_recordId').is_in(demographics_no_consumer)
    ).collect()
    _answered_yes = _rows_without_consumer['QID4'] == 'Yes'
    assert all(_answered_yes), "Not all respondents with missing 'Consumer' are business owners."
    return
# Static markdown: heading for the global data-filtering step.
@app.cell
def _():
    mo.md(r"""
# Filter Data (Global corrections)
""")
    return
@app.cell
def _(data_validated):
    # Global filter step. Dropping rows with a null 'Consumer' is currently
    # disabled (commented out below), so `data` is the validated data unchanged.
    # data = data_validated.filter(pl.col('Consumer').is_not_null())
    data = data_validated
    # Last expression of the cell: the collected frame.
    data.collect()
    return (data,)
# Empty placeholder cell (no code, no exported names).
@app.cell
def _():
    return
@app.cell
def _():
    # Disabled check, kept for reference: it would assert that every respondent with
    # QID4 == 'Yes' has a null 'Consumer' value. The cell currently does nothing.
    # Check if all business owners are missing a 'Consumer type' in demographics
    # assert all([a is None for a in data_all.filter(pl.col('QID4') == 'Yes').collect()['Consumer'].unique()]) , "Not all business owners are missing 'Consumer type' in demographics."
    return
# Static markdown: heading for the demographic distribution charts.
@app.cell
def _():
    mo.md(r"""
## Demographic Distributions
""")
    return
@app.cell
def _():
    # Demographic columns to chart, in display order. 'Race/Ethnicity' is excluded for now.
    # 'Bussiness_Owner' is kept with this spelling; the plotting cell rewrites it
    # to 'Business' when building chart titles.
    demo_plot_cols = ['Age', 'Gender', 'Bussiness_Owner', 'Consumer']
    return (demo_plot_cols,)
@app.cell
def _(S, data, demo_plot_cols):
    # Render one distribution chart per demographic column into a single markdown block.
    # The demographics frame does not depend on the column, so fetch it once
    # instead of once per chart.
    _demographics = S.get_demographics(data)[0]

    _sections = ["""
## Demographic Distributions
"""]
    # Underscore-prefixed loop names stay local to this cell rather than
    # becoming notebook-level definitions.
    for _column in demo_plot_cols:
        _pretty_name = _column.replace('Bussiness', 'Business').replace('_', ' ')
        _fig = S.plot_demographic_distribution(
            data=_demographics,
            column=_column,
            title=f"{_pretty_name} Distribution of Survey Respondents"
        )
        _sections.append(f"""{mo.ui.altair_chart(_fig)}\n\n""")
    mo.md("".join(_sections))
    return
# Static markdown: top-level heading for the brand character results.
@app.cell
def _():
    mo.md(r"""
---
# Brand Character Results
""")
    return
# Static markdown: heading for the original-vs-refined character comparison.
@app.cell
def _():
    mo.md(r"""
## Best performing: Original vs Refined frankenstein
""")
    return
@app.cell
def _(S, data):
    # Debug preview: print the first rows of the character-refine result.
    char_refine_rank = S.get_character_refine(data)[0]
    _preview = char_refine_rank.collect().head()
    print(_preview)
    return
# Static markdown: heading for the weighted character ranking chart.
@app.cell
def _():
    mo.md(r"""
## Character ranking points
""")
    return
@app.cell
def _(S, char_rank):
    # Weighted popularity chart for the character ranking (weights as stated in the title).
    char_rank_weighted = calculate_weighted_ranking_scores(char_rank)
    S.plot_weighted_ranking_score(
        char_rank_weighted,
        title="Most Popular Character - Weighted Popularity Score<br>(1st=3pts, 2nd=2pts, 3rd=1pt)",
        # The x-axis lists characters, not voices: use the same axis label as the
        # other character-ranking charts in this notebook.
        x_label='Character Personality',
    )
    return
# Static markdown: heading for the top-3 character ranking distribution.
@app.cell
def _():
    mo.md(r"""
## Character ranking 1-2-3
""")
    return
@app.cell
def _(S, data):
    # Keep only the first element of the ranking result; it is exported as `char_rank`.
    _ranking_result = S.get_character_ranking(data)
    char_rank = _ranking_result[0]
    return (char_rank,)
@app.cell
def _(S, char_rank):
    # Top-3 ranking distribution chart for the characters.
    _chart_options = {
        'x_label': 'Character Personality',
        'title': 'Character Personality: Rankings Top 3',
    }
    S.plot_top3_ranking_distribution(char_rank, **_chart_options)
    return
# Static markdown: heading for the character ranking significance charts.
@app.cell
def _():
    mo.md(r"""
### Statistical Significance Character Ranking
""")
    return
@app.cell
def _(S, char_rank):
    # Pairwise ranking significance, shown as a heatmap followed by a summary chart.
    _pairwise_df, _meta = S.compute_ranking_significance(char_rank)
    _heatmap = mo.ui.altair_chart(S.plot_significance_heatmap(_pairwise_df, metadata=_meta))
    _summary = mo.ui.altair_chart(S.plot_significance_summary(_pairwise_df, metadata=_meta))
    mo.md(f"""
{_heatmap}
{_summary}
""")
    return
# Static markdown: heading for the "times ranked 1st" character chart.
@app.cell
def _():
    mo.md(r"""
## Character Ranking: times 1st place
""")
    return
@app.cell
def _(S, char_rank):
    # Chart of how often each character was ranked first.
    _title = "Most Popular Character<br>(Number of Times Ranked 1st)"
    S.plot_most_ranked_1(char_rank, x_label='Character Personality', title=_title)
    return
# Static markdown: heading for the personality traits word cloud.
@app.cell
def _():
    mo.md(r"""
## Prominent predefined personality traits wordcloud
""")
    return
@app.cell
def _(S, data):
    # Word cloud built from the 'Top_8_Traits' column of the top-8-traits result.
    top8_traits = S.get_top_8_traits(data)[0]
    _wordcloud_options = {
        'data': top8_traits,
        'column': 'Top_8_Traits',
        'title': "Most Prominent Personality Traits",
    }
    S.plot_traits_wordcloud(**_wordcloud_options)
    return
# Static markdown: heading for the per-character trait frequency charts.
@app.cell
def _():
    mo.md(r"""
## Trait frequency per brand character
""")
    return
@app.cell
def _(S, data):
    # First element of the character-refine result, exported as `char_df`.
    _refine_result = S.get_character_refine(data)
    char_df = _refine_result[0]
    return (char_df,)
@app.cell
def _(S, char_df):
    from theme import ColorPalette

    # Characters to chart, each with its (bar colour, highlight colour) pair.
    characters = ['Bank Teller', 'Familiar Friend', 'The Coach', 'Personal Assistant']
    character_colors = {
        'Bank Teller': (ColorPalette.CHARACTER_BANK_TELLER, ColorPalette.CHARACTER_BANK_TELLER_HIGHLIGHT),
        'Familiar Friend': (ColorPalette.CHARACTER_FAMILIAR_FRIEND, ColorPalette.CHARACTER_FAMILIAR_FRIEND_HIGHLIGHT),
        'The Coach': (ColorPalette.CHARACTER_COACH, ColorPalette.CHARACTER_COACH_HIGHLIGHT),
        'Personal Assistant': (ColorPalette.CHARACTER_PERSONAL_ASSISTANT, ColorPalette.CHARACTER_PERSONAL_ASSISTANT_HIGHLIGHT),
    }

    # Compute each character's trait-frequency table once and reuse it for both the
    # global sort order and the chart (previously it was computed twice per character).
    _freq_by_character = {}
    for _name in characters:
        _freq_df, _ = S.transform_character_trait_frequency(char_df, _name)
        _freq_by_character[_name] = _freq_df

    # Total count per trait across all characters, so every chart shares one trait order.
    all_trait_counts = {}
    for _freq_df in _freq_by_character.values():
        for _row in _freq_df.iter_rows(named=True):
            all_trait_counts[_row['trait']] = all_trait_counts.get(_row['trait'], 0) + _row['count']
    # Descending by total count; ties keep first-seen order (stable sort).
    consistent_sort_order = sorted(all_trait_counts.keys(), key=lambda trait: -all_trait_counts[trait])

    # One chart per character, concatenated into a single markdown output.
    _sections = []
    for _name, _freq_df in _freq_by_character.items():
        _bar_color, _highlight_color = character_colors[_name]
        _chart = S.plot_single_character_trait_frequency(
            data=_freq_df,
            character_name=_name,
            bar_color=_bar_color,
            highlight_color=_highlight_color,
            trait_sort_order=consistent_sort_order,
        )
        _sections.append(f"""
{mo.ui.altair_chart(_chart)}
""")
    mo.md("".join(_sections))
    return character_colors, consistent_sort_order
# Static markdown: heading plus a working note written in Dutch (rendered as-is).
# Rough translation of the note, to be confirmed with the author: "see chat" /
# "example: if no. 1 and no. 2 do not differ significantly but both differ from
# no. 3, that is fine too - thinking along about how I can present it".
@app.cell(hide_code=True)
def _():
    mo.md(r"""
## Statistical significance best characters
zie chat
> voorbeeld: als de nr 1 en 2 niet significant verschillen maar wel van de nr 3 bijvoorbeeld is dat ook top. Beetje meedenkend over hoe ik het kan presenteren weetje wat ik bedoel?:)
>
""")
    return
# Empty placeholder cell (no code, no exported names).
@app.cell
def _():
    return
# Static markdown: heading for the per-segment weighted character ranking charts.
@app.cell
def _():
    mo.md(r"""
## Character Ranking Points (per customer segment)
""")
    return
@app.cell
def _(S, data):
    # One weighted-popularity chart per consumer group from utils.split_consumer_groups.
    _sections = []
    for _consumer_group, _consumer_df in utils.split_consumer_groups(data).items():
        # Build the display label outside the f-string: reusing the enclosing quote
        # character inside an f-string expression only parses on Python 3.12+.
        _group_label = _consumer_group.replace("_", " ").replace("Woth", "Worth")
        _char_rank = S.get_character_ranking(_consumer_df)[0]
        _char_rank_weighted = calculate_weighted_ranking_scores(_char_rank)
        _plot = S.plot_weighted_ranking_score(
            _char_rank_weighted,
            title=f'Most Popular Character - Weighted Popularity Score - CONSUMER: "{_group_label}"<br>(1st=3pts, 2nd=2pts, 3rd=1pt)',
            # The x-axis lists characters, not voices: match the other character charts.
            x_label='Character Personality',
        )
        _sections.append(f"""
{mo.ui.altair_chart(_plot)}
""")
    mo.md("".join(_sections))
    return
# Static markdown: heading for the per-consumer top-3 ranking distribution charts.
@app.cell
def _():
    mo.md(r"""
## Character Ranking Place 1-2-3 in one (per consumer)
""")
    return
@app.cell
def _(S, data):
    # One top-3 ranking distribution chart per consumer group.
    _sections = []
    for _consumer_group, _consumer_df in utils.split_consumer_groups(data).items():
        _char_rank = S.get_character_ranking(_consumer_df)[0]
        # Human-readable group name: underscores to spaces, "Woth" typo corrected.
        _group_label = _consumer_group.replace("_", " ").replace("Woth", "Worth")
        _plot = S.plot_top3_ranking_distribution(
            _char_rank,
            x_label='Character Personality',
            title=f'Character Personality: Rankings Top 3 - CONSUMER: "{_group_label}"',
        )
        _sections.append(f"""
{mo.ui.altair_chart(_plot)}
""")
    mo.md("".join(_sections))
    return
# Static markdown: heading for the per-consumer "times ranked 1st" charts.
@app.cell
def _():
    mo.md(r"""
## Character Ranking times 1st place (per consumer)
""")
    return
@app.cell
def _(S, data):
    # One "times ranked 1st" chart per consumer group.
    _sections = []
    for _consumer_group, _consumer_df in utils.split_consumer_groups(data).items():
        # Build the display label outside the f-string: reusing the enclosing quote
        # character inside an f-string expression only parses on Python 3.12+.
        _group_label = _consumer_group.replace("_", " ").replace("Woth", "Worth")
        _char_rank = S.get_character_ranking(_consumer_df)[0]
        _plot = S.plot_most_ranked_1(
            _char_rank,
            title=f'Most Popular Character - CONSUMER: "{_group_label}"<br>(Number of Times Ranked 1st)',
            x_label='Character Personality',
        )
        _sections.append(f"""
{mo.ui.altair_chart(_plot)}
""")
    mo.md("".join(_sections))
    return
# Static markdown: heading for the per-consumer personality trait word clouds.
@app.cell
def _():
    mo.md(r"""
## Predefined personality traits WordClouds per Consumer
""")
    return
@app.cell
def _(S, data):
    # One traits word cloud per consumer group, embedded directly in the markdown.
    _sections = []
    for _consumer_group, _consumer_df in utils.split_consumer_groups(data).items():
        _group_label = _consumer_group.replace("_", " ").replace("Woth", "Worth")
        _top8_traits = S.get_top_8_traits(_consumer_df)[0]
        _wordcloud = S.plot_traits_wordcloud(
            data=_top8_traits,
            column='Top_8_Traits',
            title=f'Most Prominent Personality Traits - CONSUMER: "{_group_label}"',
        )
        _sections.append(f"""
{_wordcloud}
""")
    mo.md("".join(_sections))
    return
# Static markdown: heading for the per-segment trait frequency charts.
@app.cell
def _():
    mo.md(r"""
## Frequency traits chosen - per consumer segment
""")
    return
@app.cell
def _(S, character_colors, consistent_sort_order, data):
    # Trait-frequency chart for one fixed character, repeated per consumer group.
    top_char = "The Coach"
    # The colour pair depends only on the character, so look it up once.
    _main_color, _highlight_color = character_colors[top_char]

    _sections = []
    for _consumer_group, _consumer_df in utils.split_consumer_groups(data).items():
        _group_label = _consumer_group.replace('_', ' ').replace("Woth", "Worth")
        _char_df = S.get_character_refine(_consumer_df)[0]
        _freq_df, _ = S.transform_character_trait_frequency(_char_df, top_char)
        _chart = S.plot_single_character_trait_frequency(
            data=_freq_df,
            character_name=top_char,
            bar_color=_main_color,
            highlight_color=_highlight_color,
            trait_sort_order=consistent_sort_order,
            # The closing double quote after the group name was previously swallowed
            # by the triple-quote terminator, leaving the title with an unbalanced quote.
            title=f"Top Personality Traits for '{top_char}' - CONSUMER: \"{_group_label}\"",
        )
        _sections.append(f"""
{mo.ui.altair_chart(_chart)}
""")
    mo.md("".join(_sections))
    return
# Static markdown: top-level heading for the spoken voice results.
@app.cell(hide_code=True)
def _():
    mo.md(r"""
---
# Spoken Voice Results
""")
    return
@app.cell
def _(S, data):
    # Top-3 voice rankings and their weighted scores, both exported for later cells.
    _voices_result = S.get_top_3_voices(data)
    top3_voices = _voices_result[0]
    top3_voices_weighted = calculate_weighted_ranking_scores(top3_voices)
    return top3_voices, top3_voices_weighted
# Static markdown: heading and clarifying note for the top-3 voice ranking chart.
@app.cell
def _():
    mo.md(r"""
## Which voice is ranked best in the ranking question for top 3?
(not best 3 out of 8 question)
""")
    return
@app.cell
def _(S, top3_voices):
    # Ranking distribution chart for the top-3 voices, wrapped in markdown.
    _chart = mo.ui.altair_chart(S.plot_ranking_distribution(top3_voices, x_label='Voice'))
    mo.md(f"""
{_chart}
""")
    return
# Static markdown: heading for the voice ranking significance section.
@app.cell
def _():
    mo.md(r"""
### Statistical significance for voice ranking
""")
    return
@app.cell
def _():
    # Disabled debug print of the first rows of `top3_voices`; the cell currently does nothing.
    # print(top3_voices.collect().head())
    return
@app.cell
def _():
    # Disabled analysis, kept for reference: pairwise significance of the top-3 voice
    # ranking (alpha 0.05, no correction) with a heatmap and a summary chart.
    # The cell currently does nothing.
    # NOTE(review): the disabled code uses `S` and `top3_voices`, which this cell does
    # not take as arguments - add them to the signature before re-enabling.
    # _pairwise_df, _metadata = S.compute_ranking_significance(
    # top3_voices,alpha=0.05,correction="none")
    # # View significant pairs
    # # print(pairwise_df.filter(pl.col('significant') == True))
    # # Create heatmap visualization
    # _heatmap = S.plot_significance_heatmap(
    # _pairwise_df,
    # metadata=_metadata,
    # title="Weighted Voice Ranking Significance<br>(Pairwise Comparisons)"
    # )
    # # Create summary bar chart
    # _summary = S.plot_significance_summary(
    # _pairwise_df,
    # metadata=_metadata
    # )
    # mo.md(f"""
    # {mo.ui.altair_chart(_heatmap)}
    # {mo.ui.altair_chart(_summary)}
    # """)
    return
# Static markdown: heading for the weighted voice popularity chart.
@app.cell
def _():
    mo.md(r"""
## Weighted Popularity Scores
""")
    return
@app.cell
def _(S, top3_voices_weighted):
    # Weighted popularity chart for the voices, wrapped in markdown.
    _title = "Most Popular Voice - Weighted Popularity Score<br>(1st = 3pts, 2nd = 2pts, 3rd = 1pt)"
    _chart = mo.ui.altair_chart(S.plot_weighted_ranking_score(top3_voices_weighted, title=_title))
    mo.md(f"""
{_chart}
""")
    return
# Empty placeholder cell (no code, no exported names).
@app.cell
def _():
    return
@app.cell
def _(top3_voices_weighted):
    # Debug preview: print the first rows of the weighted voice scores.
    _preview = top3_voices_weighted.head()
    print(_preview)
    return
# Empty placeholder cell (no code, no exported names).
@app.cell
def _():
    return
# Static markdown: heading for the voice scale 1-10 section.
@app.cell(hide_code=True)
def _():
    mo.md(r"""
## Voice Scale 1-10
""")
    return
@app.cell
def _(S, data):
    # Voice scale 1-10 data; the second element of the returned pair is discarded.
    _scale_result = S.get_voice_scale_1_10(data)
    voice_1_10, _ = _scale_result
    return (voice_1_10,)
@app.cell
def _(S, voice_1_10):
    # Average score per voice with counts, on a fixed 1-10 axis.
    _chart_options = {
        'x_label': 'Voice',
        'width': 1000,
        'domain': [1, 10],
        'title': "Voice General Impression (Scale 1-10)",
    }
    S.plot_average_scores_with_counts(voice_1_10, **_chart_options)
    return
# Static markdown: heading for the scale 1-10 significance charts.
@app.cell
def _():
    mo.md(r"""
### Statistical Significance (Scale 1-10)
""")
    return
@app.cell
def _(S, voice_1_10):
    # Pairwise significance of the 1-10 voice ratings, shown as heatmap + summary chart.
    # Alternatives noted by the original author: test_type "ttest", "chi2" or "auto";
    # correction "holm" or "none".
    _test_settings = {
        'test_type': "mannwhitney",
        'alpha': 0.05,
        'correction': "bonferroni",
    }
    pairwise_df, metadata = S.compute_pairwise_significance(voice_1_10, **_test_settings)

    # To inspect only the significant pairs:
    # print(pairwise_df.filter(pl.col('significant') == True))

    _heatmap = S.plot_significance_heatmap(
        pairwise_df,
        title="Voice Rating Significance<br>(Pairwise Comparisons)",
        metadata=metadata,
    )
    _summary = S.plot_significance_summary(pairwise_df, metadata=metadata)

    mo.md(f"""
{mo.ui.altair_chart(_heatmap)}
{mo.ui.altair_chart(_summary)}
""")
    return
# Run the notebook app when this file is executed directly as a script.
if __name__ == "__main__":
    app.run()