import marimo

__generated_with = "0.19.2"
app = marimo.App(width="medium")


@app.cell
def _():
    import marimo as mo
    import pandas as pd
    from pathlib import Path

    import utils
    return Path, mo, pd, utils


@app.cell
def _(Path):
    # Earlier exports, kept for reference:
    # results_file = Path('data/exports/OneDrive_1_1-16-2026/JPMC_Chase Brand Personality_Quant Round 1_TestData_Labels.csv')
    # results_file = Path('data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase Brand Personality_Quant Round 1_January 21, 2026_Soft Launch_Labels.csv')
    results_file = Path('data/exports/1-23-26/JPMC_Chase Brand Personality_Quant Round 1_January 23, 2026_Labels.csv')
    qsf_file = 'data/exports/OneDrive_1_1-16-2026/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'
    return qsf_file, results_file


@app.cell
def _(qsf_file, results_file, utils):
    survey = utils.QualtricsSurvey(results_file, qsf_file)
    data_all = survey.load_data()
    return (survey,)


@app.cell
def _(mo):
    mo.md(
        r"""
        # Mapping Question <-> Internal-ID

        Question texts are too long to use as dataframe column headers, so we map each short internal ID to its question description instead.
        """
    )
    return


@app.cell
def _(survey):
    qid_descr_map = survey.qid_descr_map
    qid_descr_map
    return (qid_descr_map,)


@app.cell
def _(mo):
    mo.md(r"""## Save the mapping to a validation CSV so it can be compared across exports""")
    return


@app.cell(hide_code=True)
def _(pd, qid_descr_map, results_file):
    validate_df = (
        pd.DataFrame.from_dict(qid_descr_map, orient='index', columns=['Description'])
        .reset_index()
        .rename(columns={'index': 'ImportID'})
    )
    # Tag each row with the export it came from
    validate_df['SourceFile'] = results_file.as_posix()
    validate_df
    return (validate_df,)


@app.cell
def _(mo):
    mo.md(r"""# Compare with other exports""")
    return


@app.cell
def _(Path, pd, validate_df):
    validate_record_csv = Path('./validation_qid_descr_map.csv')
    if not validate_record_csv.exists():
        validate_df.to_csv(validate_record_csv, index=False)
        combined_df = validate_df
    else:
        existing_df = pd.read_csv(validate_record_csv)
        combined_df = pd.concat([existing_df, validate_df])
        # Drop rows that are full duplicates, keeping the most recent occurrence
        combined_df = combined_df.drop_duplicates(keep='last').reset_index(drop=True)
        combined_df.to_csv(validate_record_csv, index=False)
    # Sort rows by ImportID for easier side-by-side inspection
    combined_df = combined_df.sort_values(by='ImportID').reset_index(drop=True)
    combined_df
    return (validate_record_csv,)


@app.cell
def _(mo):
    mo.md(r"""## Identify mismatches""")
    return


@app.function
def validate_mappings(_df):
    """Flag ImportIDs whose descriptions disagree across exports, and
    ImportIDs that are absent from at least one export."""
    validation_issues = {
        'MismatchedDescriptions': [],
        'MissingImportID': [],
    }
    n_sources = _df['SourceFile'].nunique()
    for import_id, group in _df.groupby('ImportID'):
        descriptions = group['Description'].unique()
        # Mismatched descriptions: the same ImportID maps to different text
        if len(descriptions) > 1:
            validation_issues['MismatchedDescriptions'].append({
                'ImportID': import_id,
                'Descriptions': descriptions.tolist(),
                'SourceFiles': group['SourceFile'].tolist(),
            })
        # Missing ImportID: the ID does not appear in every source file
        if group['SourceFile'].nunique() < n_sources:
            validation_issues['MissingImportID'].append({
                'ImportID': import_id,
                'Descriptions': descriptions.tolist(),
                'SourceFiles': group['SourceFile'].tolist(),
            })
    return validation_issues
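@app.cell
def _(mo):
    mo.md(
        r"""
        A quick sanity check of `validate_mappings` on synthetic rows (hypothetical data, not from any export): `Q2`'s description differs between the two files and `Q3` appears in only one of them, so each issue type should fire exactly once.
        """
    )
    return


@app.cell
def _(pd):
    # Synthetic mapping spanning two hypothetical source files
    _toy = pd.DataFrame({
        'ImportID': ['Q1', 'Q2', 'Q3', 'Q1', 'Q2'],
        'Description': ['Age', 'Region', 'Income', 'Age', 'State'],
        'SourceFile': ['a.csv', 'a.csv', 'a.csv', 'b.csv', 'b.csv'],
    })
    # Expect Q2 under MismatchedDescriptions and Q3 under MissingImportID
    validate_mappings(_toy)
    return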
@app.cell
def _(pd, validate_record_csv):
    # As-is (no modifications for known issues)
    _df = pd.read_csv(validate_record_csv)
    validation_issues = validate_mappings(_df)
    validation_issues
    return


@app.cell
def _(mo):
    mo.md(r"""## Inspect & Dismiss Errors""")
    return


@app.cell
def _(pd, validate_record_csv):
    # Known issue with the 'OneDrive_1_1-16-2026' export: each description is
    # prepended with a ' - ' string. Drop that prefix, then re-compare.
    _df = pd.read_csv(validate_record_csv)
    # Strip the prefix only on rows whose SourceFile contains 'OneDrive_1_1-16-2026'
    _mask = _df['SourceFile'].str.contains('OneDrive_1_1-16-2026')
    _df.loc[_mask, 'Description'] = _df.loc[_mask, 'Description'].apply(
        lambda x: ' - '.join(x.split(' - ')[1:]) if ' - ' in x else x
    )
    validation_issues_fixed = validate_mappings(_df)
    validation_issues_fixed
    return


if __name__ == "__main__":
    app.run()