657 lines
14 KiB
Python
657 lines
14 KiB
Python
import marimo

# Version of marimo that generated this notebook file.
__generated_with = "0.19.7"

# Full-width layout so wide polars frames and charts render without clipping.
app = marimo.App(width="full")


# Setup cell: everything imported here is in scope for every cell below.
with app.setup:
    import marimo as mo
    import polars as pl
    from pathlib import Path

    # Project-local helpers: response validation checks...
    from validation import check_progress, duration_validation, check_straight_liners
    # ...survey loading/plotting wrapper and ranking-score helpers...
    from utils import QualtricsSurvey, combine_exclusive_columns, calculate_weighted_ranking_scores
    import utils

    # ...and the predefined speaking-style definitions.
    from speaking_styles import SPEAKING_STYLES
|
|
@app.cell
def _():
    # File picker for the exported Qualtrics "Labels" CSV. Navigation is
    # locked to the exports directory and limited to one .csv selection.
    file_browser = mo.ui.file_browser(
        label="Select 'Labels' File",
        initial_path="./data/exports",
        filetypes=[".csv"],
        multiple=False,
        restrict_navigation=True,
    )
    file_browser
    return (file_browser,)
|
|
|
|
|
|
@app.cell
def _(file_browser):
    # Resolve the selected results file; halt this cell (and everything
    # downstream) with a warning until the user has picked one.
    _selection = file_browser.path(index=0)  # query the widget once, not twice
    mo.stop(_selection is None, mo.md("**⚠️ Please select a `_Labels.csv` file above to proceed**"))
    RESULTS_FILE = Path(_selection)
    # Survey definition (.qsf) that pairs with the exported results file.
    QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'
    return QSF_FILE, RESULTS_FILE
|
|
|
|
|
|
@app.cell
def _(QSF_FILE, RESULTS_FILE):
    # Build the survey wrapper and load the exported responses (lazily).
    S = QualtricsSurvey(RESULTS_FILE, QSF_FILE)
    try:
        data_all = S.load_data()
    except NotImplementedError as e:
        # Surface unsupported-export errors as a notebook message instead of
        # a traceback; mo.stop(True, ...) halts this cell and its dependents.
        mo.stop(True, mo.md(f"**⚠️ {str(e)}**"))
    return S, data_all


@app.cell(hide_code=True)
def _(RESULTS_FILE, data_all):
    # Summary header: which file is loaded and how many responses it holds.
    # NOTE: data_all.collect() materializes the lazy frame just for the count.
    mo.md(rf"""
    ---
    # Load Data

    **Dataset:** {Path(RESULTS_FILE).name}

    **Responses**: {data_all.collect().shape[0]}
    """)
    return
|
|
|
|
|
|
@app.cell
def _(S, data_all):
    # Straight-liner detection: flag respondents whose answers never vary
    # within a battery. max_score is the scale maximum for each battery.
    sl_ss_max_score = 5
    sl_v1_10_max_score = 10

    # The speaking-style items live in two batteries; join them per respondent.
    _ss_all = S.get_ss_green_blue(data_all)[0].join(S.get_ss_orange_red(data_all)[0], on='_recordId')
    _sl_ss_c, sl_ss_df = check_straight_liners(_ss_all, max_score=sl_ss_max_score)

    _sl_v1_10_c, sl_v1_10_df = check_straight_liners(
        S.get_voice_scale_1_10(data_all)[0],
        max_score=sl_v1_10_max_score
    )

    # Render all validation summaries: completion progress, response
    # duration, and the two straight-liner reports.
    mo.md(f"""

    {check_progress(data_all)}


    {duration_validation(data_all)}


    ## Speaking Style - Straight Liners
    {_sl_ss_c}


    ## Voice Score Scale 1-10 - Straight Liners
    {_sl_v1_10_c}
    """)
    return
|
|
|
|
|
|
@app.cell
def _(data_all):
    # Straight-liner exclusion is currently DISABLED: the commented code below
    # would drop flagged Voice Scale 1-10 respondents. For now every response
    # passes through unchanged.
    # # Drop any Voice Scale 1-10 responses with straight-lining, using sl_v1_10_df _responseId values
    # records_to_drop = sl_v1_10_df.select('Record ID').to_series().to_list()

    # data_validated = data_all.filter(~pl.col('_recordId').is_in(records_to_drop))

    # mo.md(f"""
    # Dropped `{len(records_to_drop)}` responses with straight-lining in Voice Scale 1-10 evaluation.
    # """)
    data_validated = data_all
    return (data_validated,)


@app.cell
def _():
    # Placeholder cell (intentionally empty).
    return


@app.cell(hide_code=True)
def _():
    # Placeholder cell (intentionally empty).
    #
    return


@app.cell
def _():
    # Section header for the missing-'Consumer' investigation below.
    mo.md(r"""
    ## Lucia confirmation missing 'Consumer' data
    """)
    return
|
|
|
|
|
|
@app.cell
def _(S, data_validated):
    # Materialize the demographics lazy frame; the bare name on the last
    # line makes marimo display the table as this cell's output.
    _demo_lazy = S.get_demographics(data_validated)[0]
    demographics = _demo_lazy.collect()
    demographics
    return (demographics,)
|
|
|
|
|
|
@app.cell(hide_code=True)
def _(demographics):
    # Record IDs of respondents whose 'Consumer' field is missing.
    _missing_consumer = demographics.filter(pl.col('Consumer').is_null())
    demographics_no_consumer = _missing_consumer['_recordId'].to_list()
    demographics_no_consumer
    return (demographics_no_consumer,)
|
|
|
|
|
|
@app.cell
def _(data_all, demographics_no_consumer):
    # check if the responses with missing 'Consumer type' in demographics are all business owners as Lucia mentioned
    # NOTE(review): assumes QID4 == 'Yes' means "is a business owner" — confirm against the QSF.
    # The assert is a deliberate notebook sanity check (it would be stripped under `python -O`).
    assert all(data_all.filter(pl.col('_recordId').is_in(demographics_no_consumer)).collect()['QID4'] == 'Yes'), "Not all respondents with missing 'Consumer' are business owners."
    return
|
|
|
|
|
|
@app.cell
def _():
    mo.md(r"""
    # Filter Data (Global corrections)
    """)
    return


@app.cell
def _(S):
    # Demographic filter form: one multiselect per dimension, each with every
    # option pre-selected so the default is "no filtering". Wrapping in
    # .form() means values only publish to .value on explicit submit.
    filter_form = mo.md('''

    {age}

    {gender}

    {ethnicity}

    {income}

    {consumer}
    '''
    ).batch(
        age=mo.ui.multiselect(options=S.options_age, value=S.options_age, label="Select Age Group(s):"),
        gender=mo.ui.multiselect(options=S.options_gender, value=S.options_gender, label="Select Gender(s):"),
        ethnicity=mo.ui.multiselect(options=S.options_ethnicity, value=S.options_ethnicity, label="Select Ethnicities:"),
        income=mo.ui.multiselect(options=S.options_income, value=S.options_income, label="Select Income Group(s):"),
        consumer=mo.ui.multiselect(options=S.options_consumer, value=S.options_consumer, label="Select Consumer Groups:")
    ).form()
    mo.md(f'''
    ---

    # Data Filter

    {filter_form}
    ''')
    return
|
|
|
|
|
|
@app.cell
def _(data_validated):
    # Global filtering is currently DISABLED: the commented code would apply
    # the submitted filter_form values. For now `data` is the validated set.
    # mo.stop(filter_form.value is None, mo.md("**Please submit filter above to proceed**"))
    # _d = S.filter_data(data_validated, age=filter_form.value['age'], gender=filter_form.value['gender'], income=filter_form.value['income'], ethnicity=filter_form.value['ethnicity'], consumer=filter_form.value['consumer'])

    # # Stop execution and prevent other cells from running if no data is selected
    # mo.stop(len(_d.collect()) == 0, mo.md("**No Data available for current filter combination**"))
    # data = _d

    data = data_validated
    # Display the materialized frame as this cell's output.
    data.collect()
    return (data,)


@app.cell
def _():
    # Placeholder cell (intentionally empty).
    return


@app.cell
def _():
    # Check if all business owners are missing a 'Consumer type' in demographics
    # assert all([a is None for a in data_all.filter(pl.col('QID4') == 'Yes').collect()['Consumer'].unique()]) , "Not all business owners are missing 'Consumer type' in demographics."
    return


@app.cell
def _():
    mo.md(r"""
    # Demographic Distributions
    """)
    return
|
|
|
|
|
|
@app.cell
def _():
    # Demographic columns to plot below.
    # NOTE: 'Bussiness_Owner' is spelled exactly as the dataset's column name;
    # the chart title corrects the spelling at plot time.
    demo_plot_cols = [
        'Age',
        'Gender',
        # 'Race/Ethnicity',
        'Bussiness_Owner',
        'Consumer'
    ]
    return (demo_plot_cols,)
|
|
|
|
|
|
@app.cell
def _(S, data, demo_plot_cols):
    # Render one demographic-distribution chart per column in demo_plot_cols.
    # Hoisted: the demographics frame is loop-invariant (the original rebuilt
    # it on every iteration via S.get_demographics(data)[0]).
    _demo = S.get_demographics(data)[0]

    # `_c` is underscore-prefixed so it stays local to this cell; the original
    # bare `c` leaked into the notebook's global namespace (marimo treats
    # non-underscored cell names as globals, which can collide across cells).
    _parts = ["\n\n"]
    for _c in demo_plot_cols:
        _fig = S.plot_demographic_distribution(
            data=_demo,
            column=_c,
            # Fix the dataset's 'Bussiness' spelling for display only.
            title=f"{_c.replace('Bussiness', 'Business').replace('_', ' ')} Distribution of Survey Respondents"
        )
        _parts.append(f"""{mo.ui.altair_chart(_fig)}\n\n""")

    # Join once instead of quadratic string += in the loop.
    mo.md("".join(_parts))
    return
|
|
|
|
|
|
@app.cell
def _():
    mo.md(r"""
    ---

    # Brand Character Results
    """)
    return


@app.cell(disabled=True)
def _():
    mo.md(r"""
    ## Best performing: Original vs Refined frankenstein
    """)
    return


@app.cell(disabled=True)
def _(S, data):
    # Disabled: quick peek at the refined-character ranking frame.
    char_refine_rank = S.get_character_refine(data)[0]
    # print(char_rank.collect().head())
    print(char_refine_rank.collect().head())
    return


@app.cell(disabled=True)
def _():
    mo.md(r"""
    ## Character ranking points
    """)
    return
|
|
|
|
|
|
@app.cell
def _(S, char_rank):
    # Weighted popularity (1st=3pts, 2nd=2pts, 3rd=1pt). `char_rank` is
    # produced by a later cell; marimo resolves cells by dataflow, not order.
    char_rank_weighted = calculate_weighted_ranking_scores(char_rank)
    # NOTE(review): x_label='Voice' on a character chart — possibly should be
    # 'Character'; confirm intended label.
    S.plot_weighted_ranking_score(char_rank_weighted, title="Most Popular Character - Weighted Popularity Score<br>(1st=3pts, 2nd=2pts, 3rd=1pt)", x_label='Voice')
    return
|
|
|
|
|
|
@app.cell
def _():
    mo.md(r"""
    ## Character ranking 1-2-3
    """)
    return


@app.cell
def _(S, data):
    # Character-ranking frame consumed by several plotting cells.
    char_rank = S.get_character_ranking(data)[0]
    return (char_rank,)


@app.cell
def _(S, char_rank):
    # Distribution of 1st/2nd/3rd placements per character.
    S.plot_top3_ranking_distribution(char_rank, x_label='Character Personality', title='Character Personality: Rankings Top 3')
    return
|
|
|
|
|
|
@app.cell
def _():
    mo.md(r"""
    ### Statistical Significance Character Ranking
    """)
    return


@app.cell(disabled=True)
def _(S, char_rank):
    # Disabled: pairwise significance tests on the character ranking,
    # rendered as a heatmap plus a summary chart.
    _pairwise_df, _meta = S.compute_ranking_significance(char_rank)

    # print(_pairwise_df.columns)

    mo.md(f"""

    {mo.ui.altair_chart(S.plot_significance_heatmap(_pairwise_df, metadata=_meta))}

    {mo.ui.altair_chart(S.plot_significance_summary(_pairwise_df, metadata=_meta))}
    """)
    return


@app.cell(disabled=True)
def _():
    mo.md(r"""
    ## Character Ranking: times 1st place
    """)
    return
|
|
|
|
|
|
@app.cell
def _(S, char_rank):
    # Count of 1st-place votes per character.
    S.plot_most_ranked_1(char_rank, title="Most Popular Character<br>(Number of Times Ranked 1st)", x_label='Character Personality')
    return


@app.cell
def _():
    mo.md(r"""
    ## Prominent predefined personality traits wordcloud
    """)
    return


@app.cell
def _(S, data):
    # Word cloud over the eight most-selected predefined traits.
    top8_traits = S.get_top_8_traits(data)[0]
    S.plot_traits_wordcloud(
        data=top8_traits,
        column='Top_8_Traits',
        title="Most Prominent Personality Traits",
    )
    return


@app.cell
def _():
    mo.md(r"""
    ## Trait frequency per brand character
    """)
    return


@app.cell
def _(S, data):
    # Refined-character frame consumed by the per-character trait plots below.
    char_df = S.get_character_refine(data)[0]
    return (char_df,)
|
|
|
|
|
|
@app.cell
def _(S, char_df):
    # Trait-frequency bar charts, one per brand character, sharing a single
    # trait sort order so the bars line up across all four charts.
    #
    # All working names are underscore-prefixed: the original bare names
    # (characters, char, freq_df, ...) leaked into the notebook's global
    # namespace, where marimo forbids redefinition by any other cell.
    from collections import Counter

    from theme import ColorPalette

    _characters = ['Bank Teller', 'Familiar Friend', 'The Coach', 'Personal Assistant']
    _character_colors = {
        'Bank Teller': (ColorPalette.CHARACTER_BANK_TELLER, ColorPalette.CHARACTER_BANK_TELLER_HIGHLIGHT),
        'Familiar Friend': (ColorPalette.CHARACTER_FAMILIAR_FRIEND, ColorPalette.CHARACTER_FAMILIAR_FRIEND_HIGHLIGHT),
        'The Coach': (ColorPalette.CHARACTER_COACH, ColorPalette.CHARACTER_COACH_HIGHLIGHT),
        'Personal Assistant': (ColorPalette.CHARACTER_PERSONAL_ASSISTANT, ColorPalette.CHARACTER_PERSONAL_ASSISTANT_HIGHLIGHT),
    }

    # Compute each character's trait-frequency frame ONCE; the original called
    # S.transform_character_trait_frequency twice per character.
    _freq_by_char = {
        _char: S.transform_character_trait_frequency(char_df, _char)[0]
        for _char in _characters
    }

    # Consistent sort order: traits ranked by total count across characters.
    # Counter replaces the manual dict.get(..., 0) accumulation; ties keep
    # first-encountered order, matching the original stable sort.
    _totals = Counter()
    for _freq_df in _freq_by_char.values():
        for _row in _freq_df.iter_rows(named=True):
            _totals[_row['trait']] += _row['count']
    _consistent_sort_order = [_trait for _trait, _count in _totals.most_common()]

    # Generate 4 plots (one per character) and join into one markdown output.
    _parts = []
    for _char in _characters:
        _main_color, _highlight_color = _character_colors[_char]
        _chart = S.plot_single_character_trait_frequency(
            data=_freq_by_char[_char],
            character_name=_char,
            bar_color=_main_color,
            highlight_color=_highlight_color,
            trait_sort_order=_consistent_sort_order,
        )
        _parts.append(f"""
    {mo.ui.altair_chart(_chart)}


    """)

    mo.md("".join(_parts))
    return
|
|
|
|
|
|
@app.cell(disabled=True)
def _():
    # Disabled note (quoted chat is in Dutch): discusses how to present
    # pairwise significance among the top-ranked characters, e.g. "if #1 and
    # #2 don't differ significantly but both differ from #3, that's worth
    # showing too". Quoted text is runtime output and is kept verbatim.
    mo.md(r"""
    ## Statistical significance best characters

    zie chat
    > voorbeeld: als de nr 1 en 2 niet significant verschillen maar wel van de nr 3 bijvoorbeeld is dat ook top. Beetje meedenkend over hoe ik het kan presenteren weetje wat ik bedoel?:)
    >
    """)
    return


@app.cell(disabled=True)
def _():
    # Placeholder cell (intentionally empty).
    return


@app.cell
def _():
    # Placeholder cell (intentionally empty).
    return


@app.cell
def _():
    mo.md(r"""
    ---

    # Spoken Voice Results
    """)
    return
|
|
|
|
|
|
@app.cell
def _():
    mo.md(r"""
    ## Top 8 Most Chosen out of 18
    """)
    return


@app.cell
def _(S, data):
    # Voice selection funnel frame (18 -> 8 -> 3), used by two plots below.
    v_18_8_3 = S.get_18_8_3(data)[0]
    return (v_18_8_3,)


@app.cell
def _(S, v_18_8_3):
    # How often each of the 18 voices made a respondent's top 8.
    S.plot_voice_selection_counts(v_18_8_3, title="Top 8 Voice Selection from 18 Voices", x_label='Voice')
    return
|
|
|
|
|
|
@app.cell
def _():
    mo.md(r"""
    ## Top 3 most chosen out of 8
    """)
    return


@app.cell
def _(S, v_18_8_3):
    # How often each of the 8 shortlisted voices made a respondent's top 3.
    S.plot_top3_selection_counts(v_18_8_3, title="Top 3 Voice Selection Counts from 8 Voices", x_label='Voice')
    return


@app.cell(hide_code=True)
def _():
    mo.md(r"""
    ## Which voice is ranked best in the ranking question for top 3?

    (not best 3 out of 8 question)
    """)
    return
|
|
|
|
|
|
@app.cell
def _(S, data):
    # Top-3 voice ranking frame plus its weighted (3/2/1-point) score.
    _ranking_result = S.get_top_3_voices(data)
    top3_voices = _ranking_result[0]
    top3_voices_weighted = calculate_weighted_ranking_scores(top3_voices)
    return top3_voices, top3_voices_weighted
|
|
|
|
|
|
@app.cell
def _(S, top3_voices):
    # 1st/2nd/3rd placement distribution per voice.
    S.plot_ranking_distribution(top3_voices, x_label='Voice', title="Distribution of Voice Rankings (1st, 2nd, 3rd)")
    return


@app.cell
def _():
    mo.md(r"""
    ### Statistical significance for voice ranking
    """)
    return


@app.cell
def _():
    # Debug scratch cell (disabled print kept for reference).
    # print(top3_voices.collect().head())
    return
|
|
|
|
|
|
@app.cell
def _():
    # Disabled draft: pairwise significance for the voice ranking (heatmap
    # plus summary chart). Kept commented out pending review.
    # _pairwise_df, _metadata = S.compute_ranking_significance(
    #     top3_voices,alpha=0.05,correction="none")

    # # View significant pairs
    # # print(pairwise_df.filter(pl.col('significant') == True))

    # # Create heatmap visualization
    # _heatmap = S.plot_significance_heatmap(
    #     _pairwise_df,
    #     metadata=_metadata,
    #     title="Weighted Voice Ranking Significance<br>(Pairwise Comparisons)"
    # )

    # # Create summary bar chart
    # _summary = S.plot_significance_summary(
    #     _pairwise_df,
    #     metadata=_metadata
    # )

    # mo.md(f"""
    # {mo.ui.altair_chart(_heatmap)}

    # {mo.ui.altair_chart(_summary)}
    # """)
    return
|
|
|
|
|
|
@app.cell(hide_code=True)
def _():
    mo.md(r"""
    ## Weighted Popularity Scores
    """)
    return
|
|
|
|
|
|
@app.cell
def _(S, top3_voices_weighted):
    # Weighted popularity bar chart for the voices, embedded in markdown.
    _weighted_chart = S.plot_weighted_ranking_score(top3_voices_weighted, title="Most Popular Voice - Weighted Popularity Score<br>(1st = 3pts, 2nd = 2pts, 3rd = 1pt)")
    _embedded = mo.ui.altair_chart(_weighted_chart)

    mo.md(f"""
    {_embedded}
    """)
    return
|
|
|
|
|
|
@app.cell
def _():
    ## Voice Ranked 1st the most
    # NOTE(review): the heading above is a plain comment, so this cell renders
    # nothing — it was likely intended to be a mo.md(...) section header.
    return
|
|
|
|
|
|
@app.cell
def _(S, top3_voices):
    # Count of 1st-place votes per voice.
    S.plot_most_ranked_1(top3_voices, title="Most Popular Voice<br>(Number of Times Ranked 1st)", x_label='Voice')
    return


@app.cell
def _():
    mo.md(r"""
    ## Voice Scale 1-10
    """)
    return
|
|
|
|
|
|
@app.cell
def _(S, data):
    # Voice impression scores on the 1-10 scale; take the frame via the same
    # [0]-indexing pattern the sibling data-access cells use.
    voice_1_10 = S.get_voice_scale_1_10(data)[0]
    S.plot_average_scores_with_counts(voice_1_10, x_label='Voice', domain=[1,10], title="Voice General Impression (Scale 1-10)")
    return (voice_1_10,)
|
|
|
|
|
|
@app.cell(disabled=True)
def _():
    mo.md(r"""
    ### Statistical Significance (Scale 1-10)
    """)
    return


@app.cell(disabled=True)
def _(S, voice_1_10):
    # Disabled: pairwise significance tests across the 1-10 voice scores.
    # Compute pairwise significance tests
    pairwise_df, metadata = S.compute_pairwise_significance(
        voice_1_10,
        test_type="mannwhitney",  # or "ttest", "chi2", "auto"
        alpha=0.05,
        correction="bonferroni"  # or "holm", "none"
    )

    # View significant pairs
    # print(pairwise_df.filter(pl.col('significant') == True))

    # Create heatmap visualization
    _heatmap = S.plot_significance_heatmap(
        pairwise_df,
        metadata=metadata,
        title="Voice Rating Significance<br>(Pairwise Comparisons)"
    )

    # Create summary bar chart
    _summary = S.plot_significance_summary(
        pairwise_df,
        metadata=metadata
    )

    mo.md(f"""
    {mo.ui.altair_chart(_heatmap)}

    {mo.ui.altair_chart(_summary)}
    """)
    return
|
|
|
|
|
|
@app.cell
def _():
    # Placeholder cell (intentionally empty).
    return


if __name__ == "__main__":
    # Standard marimo entry point: serves/executes the notebook when the
    # file is run directly as a script.
    app.run()
|