import marimo

__generated_with = "0.19.2"

app = marimo.App(width="medium")


@app.cell
def _():
    # Cell-scoped imports, re-exported for use by the other cells.
    # (plotly was previously imported here but never returned/used, so it
    # has been dropped.)
    import marimo as mo
    import polars as pl
    import pandas as pd
    from pathlib import Path
    return Path, mo, pd, pl


@app.cell
def _(Path):
    # Earlier test-data export, kept for reference; the validation cells below
    # reconcile mappings across exports.
    # results_file = Path('data/exports/OneDrive_1_1-16-2026/JPMC_Chase Brand Personality_Quant Round 1_TestData_Labels.csv')
    results_file = Path('data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase Brand Personality_Quant Round 1_January 21, 2026_Soft Launch_Labels.csv')
    return (results_file,)


@app.cell
def _(mo):
    mo.md(
        r"""
    # Mapping Question <-> Internal-ID

    Questions are too long to use as headers of the df
    """
    )
    return


@app.cell
def _(pd, results_file):
    # Build qid_descr_map: ImportID -> human-readable question description.
    # The two exports have different layouts, so parsing branches on the path.
    if '1_1-16-2026' in results_file.as_posix():
        # The 1-16 test export already uses ImportID-style headers; the first
        # data row carries the question descriptions.
        df_questions = pd.read_csv(results_file, nrows=1)
        df_questions
        qid_descr_map = df_questions.iloc[0].to_dict()
    else:
        import json

        # Qualtrics layout for this export:
        #   row 0: editor question names (ie 'B_VOICE SEL. 18-8') -> skipped
        #   row 1: question content -> becomes the header after skiprows=1
        #   row 2: export metadata (ie '{"ImportId":"startDate","timeZone":"America/Denver"}')
        df_questions = pd.read_csv(results_file, nrows=1, skiprows=1)

        def extract_qid(val):
            """Return the ImportId from a Qualtrics export-metadata JSON blob."""
            if isinstance(val, str) and val.startswith('{') and val.endswith('}'):
                # json.loads replaces eval(): identical result for these
                # metadata blobs, without executing arbitrary CSV content.
                val = json.loads(val)
            return val['ImportId']

        # Transpose so each (description, metadata) pair becomes a row.
        df_questions = df_questions.T.reset_index()
        df_questions.columns = ['Description', 'export_metadata']
        df_questions['ImportID'] = df_questions['export_metadata'].apply(extract_qid)
        df_questions = df_questions[['ImportID', 'Description']]
        qid_descr_map = dict(zip(df_questions['ImportID'], df_questions['Description']))
    qid_descr_map
    return (qid_descr_map,)


@app.cell
def _(mo):
    mo.md(
        r"""
    ## Save mapping to a validation.csv so it can be compared across exports
    """
    )
    return


@app.cell(hide_code=True)
def _(pd, qid_descr_map, results_file):
    # One row per ImportID, tagged with the export it came from so mappings
    # can be compared across exports.
    validate_df = (
        pd.DataFrame.from_dict(qid_descr_map, orient='index', columns=['Description'])
        .reset_index()
        .rename(columns={'index': 'ImportID'})
    )
    validate_df['SourceFile'] = results_file.as_posix()
    validate_df
    return (validate_df,)


@app.cell
def _(mo):
    mo.md(
        r"""
    # Compare with other exports
    """
    )
    return


@app.cell
def _(Path, pd, validate_df):
    # Append this export's mapping to the running validation record, de-duped.
    validate_record_csv = Path('./data/exports/validation_qid_descr_map.csv')
    if not validate_record_csv.exists():
        validate_df.to_csv(validate_record_csv, index=False)
        combined_df = validate_df
    else:
        existing_df = pd.read_csv(validate_record_csv)
        combined_df = pd.concat([existing_df, validate_df])
        # remove records that are full duplicates, keeping the last one
        combined_df = combined_df.drop_duplicates(keep='last').reset_index(drop=True)
        combined_df.to_csv(validate_record_csv, index=False)
    # Sort rows by ImportID
    combined_df = combined_df.sort_values(by='ImportID').reset_index(drop=True)
    combined_df
    return (validate_record_csv,)


@app.cell
def _(mo):
    mo.md(
        r"""
    ## Identify mismatches
    """
    )
    return


@app.function
def validate_mappings(_df):
    """Flag cross-export inconsistencies in the ImportID->Description table.

    Expects columns 'ImportID', 'Description', 'SourceFile'. Returns a dict:
      - 'MismatchedDescriptions': ImportIDs whose description text differs
        between source files.
      - 'MissingImportID': ImportIDs absent from at least one source file.
    """
    validation_issues = {
        'MismatchedDescriptions': [],
        'MissingImportID': []
    }
    # Loop-invariant: total number of distinct exports in the record.
    n_sources = _df['SourceFile'].nunique()
    for import_id, group in _df.groupby('ImportID'):
        # Check for mismatched descriptions
        descriptions = group['Description'].unique()
        if len(descriptions) > 1:
            validation_issues['MismatchedDescriptions'].append({
                'ImportID': import_id,
                'Descriptions': descriptions.tolist(),
                'SourceFiles': group['SourceFile'].tolist()
            })
        # Check for new or missing ImportIDs
        source_files = group['SourceFile'].unique()
        if len(source_files) < n_sources:
            validation_issues['MissingImportID'].append({
                'ImportID': import_id,
                'Descriptions': descriptions.tolist(),
                'SourceFiles': group['SourceFile'].tolist()
            })
    return validation_issues


@app.cell
def _(pd, validate_record_csv):
    # As-is (no modifications for known issues)
    _df = pd.read_csv(validate_record_csv)
    validation_issues = validate_mappings(_df)
    validation_issues
    return


@app.cell
def _(mo):
    mo.md(
        r"""
    ## Process (Dismiss) Errors
    """
    )
    return


@app.cell
def _(pd, validate_record_csv):
    # Known issue with the 'OneDrive_1_1-16-2026' export, where each of the
    # descriptions is prepended with a ' - ' string. Drop that, then recompare.
    _df = pd.read_csv(validate_record_csv)
    # Strip the prefix from descriptions in rows sourced from that export.
    _mask = _df['SourceFile'].str.contains('OneDrive_1_1-16-2026')
    _df.loc[_mask, 'Description'] = _df.loc[_mask, 'Description'].apply(
        lambda x: ' - '.join(x.split(' - ')[1:]) if ' - ' in x else x
    )
    validation_issues_fixed = validate_mappings(_df)
    validation_issues_fixed
    return


@app.cell
def _():
    return


@app.cell
def _(mo):
    mo.md(
        r"""
    # Process Data
    """
    )
    return


@app.cell
def _(pl, results_file):
    # NOTE(review): the mapping cells above show TWO preamble rows after the
    # header (question text + ImportId metadata); confirm whether
    # skip_rows_after_header should be 2 here rather than 1.
    df = pl.read_csv(results_file, has_header=True, skip_rows_after_header=1)
    df
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
    # Answers Decoding

    Pipeline to decode the ranking of voices. Currently saved as QID's, they need to be remapped back to their actual values so that the analysis can be performed.

    ie: `GQIK26_G0_x8_RANK` -> Refers to question `Top 3 Traits_0_8_RANK - What are the important traits for the Chase AI virtual assistant?` and thus the #8 option
    """
    )
    return


@app.cell
def _(mo):
    mo.md(
        r"""
    ## TODO: Create a python function for each of the questions. ie `def QID63()`.

    Each function should return a Polars query, that can be added to an existing query.

    Ideas:

    - Map column name to include the Voice number (VID) (ie the questions that only have 1 voice). The VID is in this case often included in the question description
    - `QID_x_GROUP` Contains the rankings of the values, stored in order. The following columns (ie `QID26_G0_x1_RANK`) are redundant and not necessary for us. The function should drop the unnecessary columns to clean up
    """
    )
    return


@app.cell
def _():
    return


if __name__ == "__main__":
    app.run()