325 lines
6.2 KiB
Python
325 lines
6.2 KiB
Python
import marimo
|
|
|
|
# Version stamp stored alongside the notebook (presumably written by marimo on save).
__generated_with = "0.19.2"
# Notebook application object; each `@app.cell` function below is registered on it.
app = marimo.App(width="medium")
|
|
|
|
|
|
@app.cell
def _():
    # Standard library
    from pathlib import Path

    # Third-party
    import marimo as mo
    import polars as pl

    # Survey helpers from `utils` (presumably this project's own module)
    from utils import JPMCSurvey, combine_exclusive_columns

    return JPMCSurvey, combine_exclusive_columns, mo, pl
|
|
|
|
|
|
@app.cell
def _(mo):
    # The outline element is the last expression, so it becomes the cell output.
    _outline = mo.outline()
    _outline
    return
|
|
|
|
|
|
@app.cell
def _():
    # Both input files sit in the same export folder, so the folder is spelled once.
    _export_dir = 'data/exports/OneDrive_2026-01-21/Soft Launch Data'
    RESULTS_FILE = f'{_export_dir}/JPMC_Chase Brand Personality_Quant Round 1_January 21, 2026_Soft Launch_Labels.csv'
    QSF_FILE = f'{_export_dir}/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'
    # RESULTS_FILE = 'data/exports/OneDrive_1_1-16-2026/JPMC_Chase Brand Personality_Quant Round 1_TestData_Labels.csv'
    return QSF_FILE, RESULTS_FILE
|
|
|
|
|
|
@app.cell
def _(JPMCSurvey, QSF_FILE, RESULTS_FILE):
    # Build the survey wrapper from the results file and the QSF definition.
    survey = JPMCSurvey(RESULTS_FILE, QSF_FILE)
    data = survey.load_data()
    # Collect only for display; `data` itself (not the collected result) is
    # what downstream cells receive.
    _loaded = data.collect()
    _loaded
    return data, survey
|
|
|
|
|
|
@app.cell
def _(survey):
    # Display the survey's `qid_descr_map` attribute as the cell output.
    _qid_descriptions = survey.qid_descr_map
    _qid_descriptions
    return
|
|
|
|
|
|
# NOTE(review): this cell is registered through `_unparsable_cell` and its
# stored source is the incomplete expression `data.` -- presumably an abandoned
# edit. Complete it or delete the cell.
app._unparsable_cell(
    r"""
    data.
    """,
    name="_"
)
|
|
|
|
|
|
# Markdown cell: top-level "Data Validation" section heading.
@app.cell
def _(mo):
    mo.md(r"""
    # Data Validation
    """)
    return
|
|
|
|
|
|
@app.cell
def _(data, mo, pl):
    # Validation: every response must have progress == 100.
    def check_progress(data):
        # Gather the distinct progress values and compare them against [100].
        # Merely counting distinct values is not enough: a dataset in which
        # every response sits at the same partial value (e.g. 50) also has
        # exactly one distinct value but is not complete. Empty data and null
        # progress values fall through to the warning as well.
        progress_values = (
            data.select(pl.col('progress').unique()).collect()['progress'].to_list()
        )
        if progress_values == [100]:
            return mo.md("""## ✅ All responses are complete (progress = 100) """)

        return mo.md("## ⚠️ There are incomplete responses (progress < 100) ⚠️")

    check_progress(data)
    return
|
|
|
|
|
|
@app.cell
def _(data, mo, pl):

    def duration_validation(data):
        # Flag responses whose 'duration' lies more than three standard
        # deviations away from the mean and report them as markdown.
        duration_stats = data.select(
            pl.col('duration').mean().alias('mean_duration'),
            pl.col('duration').std().alias('std_duration'),
        ).collect()
        mean_duration = duration_stats['mean_duration'][0]
        std_duration = duration_stats['std_duration'][0]

        # The mean of an empty column and the sample standard deviation of
        # fewer than two values are null; without this guard the threshold
        # arithmetic below would raise a TypeError on None.
        if mean_duration is None or std_duration is None:
            return mo.md("## ⚠️ Not enough responses to check for duration outliers ⚠️")

        upper_outlier_threshold = mean_duration + 3 * std_duration
        lower_outlier_threshold = mean_duration - 3 * std_duration

        flagged = data.with_columns(
            (
                (pl.col('duration') > upper_outlier_threshold)
                | (pl.col('duration') < lower_outlier_threshold)
            ).alias('outlier_duration')
        )

        # Keep only the rows whose outlier flag is set (the flag column is
        # boolean, so it can be used as the filter predicate directly).
        outlier_data = flagged.filter(pl.col('outlier_duration')).collect()

        if outlier_data.shape[0] == 0:
            return mo.md("## ✅ No duration outliers detected")

        return mo.md(f"""
        ## ⚠️ Duration Outliers Detected ⚠️
        - Mean Duration: {mean_duration:.2f} seconds (approximately {mean_duration/60:.2f} minutes)
        - Standard Deviation of Duration: {std_duration:.2f} seconds
        - Upper Outlier Threshold (Mean + 3*Std): {upper_outlier_threshold:.2f} seconds
        - Lower Outlier Threshold (Mean - 3*Std): {lower_outlier_threshold:.2f} seconds
        - Number of Outlier Responses: {outlier_data.shape[0]}

        Outliers:

        {mo.ui.table(outlier_data)}


        **⚠️ NOTE: These have not been removed from the dataset ⚠️**

        """)

    duration_validation(data)
    return
|
|
|
|
|
|
# Markdown cell: top-level "Data Analysis" section heading.
@app.cell
def _(mo):
    mo.md(r"""
    # Data Analysis
    """)
    return
|
|
|
|
|
|
# Markdown cell: "Demographics" subsection heading.
@app.cell
def _(mo):
    mo.md(r"""
    ## Demographics
    """)
    return
|
|
|
|
|
|
@app.cell
def _(data, survey):
    # Take the first element of the get_demographics result and collect it for display.
    _demographics = survey.get_demographics(data)[0]
    _demographics.collect()
    return
|
|
|
|
|
|
# Markdown cell: "Top 8 traits" subsection heading.
@app.cell
def _(mo):
    mo.md(r"""
    ## Top 8 traits
    """)
    return
|
|
|
|
|
|
@app.cell
def _(data, survey):
    # Take the first element of the get_top_8_traits result and collect it for display.
    _top_8_traits = survey.get_top_8_traits(data)[0]
    _top_8_traits.collect()
    return
|
|
|
|
|
|
# Markdown cell: "Top 3 traits" subsection heading.
@app.cell
def _(mo):
    mo.md(r"""
    ## Top 3 traits
    """)
    return
|
|
|
|
|
|
@app.cell
def _(data, survey):
    # Take the first element of the get_top_3_traits result and collect it for display.
    _top_3_traits = survey.get_top_3_traits(data)[0]
    _top_3_traits.collect()
    return
|
|
|
|
|
|
# Markdown cell: "Character Ranking" subsection heading.
@app.cell
def _(mo):
    mo.md(r"""
    ## Character Ranking
    """)
    return
|
|
|
|
|
|
@app.cell
def _(data, survey):
    # Take the first element of the get_character_ranking result and collect it for display.
    _character_ranking = survey.get_character_ranking(data)[0]
    _character_ranking.collect()
    return
|
|
|
|
|
|
# Markdown cell: "Voices 18 -> 8 -> 3" subsection heading.
@app.cell
def _(mo):
    mo.md(r"""
    ## Voices 18 -> 8 -> 3
    """)
    return
|
|
|
|
|
|
@app.cell
def _(data, survey):
    # Take the first element of the get_18_8_3 result and collect it for display.
    _voices_18_8_3 = survey.get_18_8_3(data)[0]
    _voices_18_8_3.collect()
    return
|
|
|
|
|
|
# Markdown cell: "Voice Scales 1-10" subsection heading.
@app.cell
def _(mo):
    mo.md(r"""
    ## Voice Scales 1-10
    """)
    return
|
|
|
|
|
|
@app.cell
def _(data, survey):
    # Collect the first element of the get_voice_scale_1_10 result; the
    # collected frame is both displayed and exported as `vscales`.
    _voice_scales = survey.get_voice_scale_1_10(data)[0]
    vscales = _voice_scales.collect()
    vscales
    return (vscales,)
|
|
|
|
|
|
@app.cell
def _(plot_average_scores_with_counts, vscales):
    # NOTE(review): `plot_average_scores_with_counts` is not imported or defined
    # by any cell visible in this file -- confirm where it should come from.
    _chart = plot_average_scores_with_counts(
        vscales,
        x_label='Voice',
        width=1000,
    )
    _chart
    return
|
|
|
|
|
|
# Markdown cell: "SS Green Blue" subsection heading.
@app.cell
def _(mo):
    mo.md(r"""
    ## SS Green Blue
    """)
    return
|
|
|
|
|
|
@app.cell
def _(data, survey):
    # get_ss_green_blue returns a pair; only the first element is used here.
    _lf, _choice_map = survey.get_ss_green_blue(data)
    # _lf.collect()
    # Print just the head of the collected frame rather than displaying all of it.
    _preview = _lf.collect().head()
    print(_preview)
    return
|
|
|
|
|
|
@app.cell
def _(df):
    # NOTE(review): `df` is not defined by any cell visible in this file, so
    # this cell presumably cannot run as-is -- confirm the intended source or
    # remove the cell.

    df
    return
|
|
|
|
|
|
# Markdown cell: "Top 3 Voices" subsection heading.
@app.cell
def _(mo):
    mo.md(r"""
    ## Top 3 Voices
    """)
    return
|
|
|
|
|
|
@app.cell
def _(data, survey):
    # Collect the first element of the get_top_3_voices result; the collected
    # frame is both displayed and exported as `top3_voices`.
    _top_3_voices = survey.get_top_3_voices(data)[0]
    top3_voices = _top_3_voices.collect()
    top3_voices
    return (top3_voices,)
|
|
|
|
|
|
@app.cell
def _():
    # Scratch cell: its only statement is a disabled debug print, so it
    # currently produces no output.

    # print(top3_voices.head())
    return
|
|
|
|
|
|
@app.cell
def _(plot_top3_ranking_distribution, top3_voices):
    # NOTE(review): `plot_top3_ranking_distribution` is not imported or defined
    # by any cell visible in this file -- confirm where it should come from.
    _chart = plot_top3_ranking_distribution(
        top3_voices,
        x_label='Voice',
        width=1000,
    )
    _chart
    return
|
|
|
|
|
|
# Markdown cell: "SS Orange / Red" subsection heading.
@app.cell
def _(mo):
    mo.md(r"""
    ## SS Orange / Red
    """)
    return
|
|
|
|
|
|
@app.cell
def _(data, survey):
    # get_ss_orange_red returns a pair; only the first element is used here.
    # The second is bound to an underscore-prefixed (cell-local) name, as in
    # the SS green/blue cell, so this cell does not publish an unused
    # notebook-level `choice_map` variable.
    _lf, _choice_map = survey.get_ss_orange_red(data)
    _d = _lf.collect()
    _d
    return
|
|
|
|
|
|
# Markdown cell (code hidden): "Character Refine" subsection heading.
@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    ## Character Refine
    """)
    return
|
|
|
|
|
|
@app.cell
def _(data, survey):
    # Keep the uncollected first element of get_top_8_traits as
    # `traits_original` for the later join; collect it here only for display.
    _top_8_result = survey.get_top_8_traits(data)
    traits_original = _top_8_result[0]
    traits_original.collect()
    return (traits_original,)
|
|
|
|
|
|
@app.cell
def _(data, survey):
    # Keep the uncollected first element of get_character_refine as
    # `traits_refined`; collect it here only for display.
    _refine_result = survey.get_character_refine(data)
    traits_refined = _refine_result[0]

    traits_refined.collect()
    return (traits_refined,)
|
|
|
|
|
|
@app.cell
def _(combine_exclusive_columns, traits_refined):
    # Collect the refined traits, then pass them through
    # combine_exclusive_columns with 'Top_8_Traits_Refined' as the target column name.
    _refined_df = traits_refined.collect()
    traits_refined_comb = combine_exclusive_columns(
        _refined_df,
        target_col_name='Top_8_Traits_Refined',
    )
    traits_refined_comb
    return (traits_refined_comb,)
|
|
|
|
|
|
@app.cell
def _(traits_original, traits_refined_comb):
    # Join original and refined traits on '_recordId' so they can be compared
    # side by side, then print the first rows of the result.
    _refined_lazy = traits_refined_comb.lazy()
    traits_comparison = traits_original.join(_refined_lazy, on='_recordId')
    _preview = traits_comparison.collect().head()
    print(_preview)
    return
|
|
|
|
|
|
# Run the notebook's cells when this file is executed directly as a script.
if __name__ == "__main__":
    app.run()
|