240 lines
7.4 KiB
Python
240 lines
7.4 KiB
Python
import marimo
|
|
|
|
# marimo version stamp stored with the notebook (presumably written by marimo when it generated the file).
__generated_with = "0.19.2"
# Application object that the @app.cell / @app.function decorators in this file register with.
app = marimo.App(width="medium")
|
|
|
|
|
|
@app.cell
def _():
    # Standard library
    from pathlib import Path

    # Third-party
    import marimo as mo
    import pandas as pd
    import plotly as plt  # NOTE(review): `plt` is not returned and no other cell takes it as an input
    import polars as pl

    # Names handed to the other cells of the notebook.
    return Path, mo, pd, pl
|
|
|
|
|
|
@app.cell
def _(Path):
    # Export CSV that the rest of the notebook reads.
    # Alternative export, kept so the path can be switched back by hand:
    # 'data/exports/OneDrive_1_1-16-2026/JPMC_Chase Brand Personality_Quant Round 1_TestData_Labels.csv'
    _export_csv = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase Brand Personality_Quant Round 1_January 21, 2026_Soft Launch_Labels.csv'
    results_file = Path(_export_csv)
    return (results_file,)
|
|
|
|
|
|
@app.cell
|
|
def _(mo):
|
|
mo.md(r"""
|
|
# Mapping Question <-> Internal-ID
|
|
Questions are too long to use as headers of the df
|
|
""")
|
|
return
|
|
|
|
|
|
@app.cell
def _(pd, results_file):
    # Build `qid_descr_map`; how it is read depends on which export `results_file` points at.
    if '1_1-16-2026' in results_file.as_posix():
        # Header row plus the single row below it: map each header name to that row's value.
        df_questions = pd.read_csv(results_file, nrows=1)
        qid_descr_map = df_questions.iloc[0].to_dict()
    else:
        # Row layout of this export:
        #   row 1: Qualtrics Editor question names (ie 'B_VOICE SEL. 18-8')
        #   row 2: the question content
        #   row 3: the export metadata (ie '{"ImportId":"startDate","timeZone":"America/Denver"}')
        # skiprows=1 makes row 2 the header; nrows=1 keeps only row 3 as data.
        df_questions = pd.read_csv(results_file, nrows=1, skiprows=1)

        def extract_qid(val):
            """Return the 'ImportId' stored in one export-metadata cell.

            Raises:
                ValueError: if the cell is not a string holding a JSON object.
                KeyError: if the JSON object has no 'ImportId' entry.
            """
            import json

            if not (isinstance(val, str) and val.startswith('{') and val.endswith('}')):
                raise ValueError(f"Expected an export-metadata JSON object, got {val!r}")
            try:
                # Parse as data; the cell text comes from a file and must never be executed.
                metadata = json.loads(val)
            except json.JSONDecodeError as err:
                raise ValueError(f"Export metadata is not valid JSON: {val!r}") from err
            return metadata['ImportId']

        # Transpose so every question becomes a row: the old header (question text)
        # lands in the index and the metadata row becomes the only column.
        df_questions = df_questions.T.reset_index()
        df_questions.columns = ['Description', 'export_metadata']
        df_questions['ImportID'] = df_questions['export_metadata'].apply(extract_qid)

        df_questions = df_questions[['ImportID', 'Description']]

        qid_descr_map = dict(zip(df_questions['ImportID'], df_questions['Description']))

    # Last expression of the cell, so the mapping is displayed for either export.
    qid_descr_map
    return (qid_descr_map,)
|
|
|
|
|
|
@app.cell
|
|
def _(mo):
|
|
mo.md(r"""
|
|
## Save mapping to a validation.csv so it can be compared across exports
|
|
""")
|
|
return
|
|
|
|
|
|
@app.cell(hide_code=True)
def _(pd, qid_descr_map, results_file):
    # Turn the mapping into a table: dict keys become the 'ImportID' column,
    # dict values the 'Description' column.
    _by_import_id = pd.DataFrame.from_dict(qid_descr_map, orient='index', columns=['Description'])
    validate_df = _by_import_id.reset_index().rename(columns={'index': 'ImportID'})

    # Record which export every row came from.
    validate_df['SourceFile'] = results_file.as_posix()

    validate_df
    return (validate_df,)
|
|
|
|
|
|
@app.cell
|
|
def _(mo):
|
|
mo.md(r"""
|
|
# Compare with other exports
|
|
""")
|
|
return
|
|
|
|
|
|
@app.cell
def _(Path, pd, validate_df):
    # CSV that accumulates the mapping records of every export this notebook has been run on.
    validate_record_csv = Path('./data/exports/validation_qid_descr_map.csv')

    if validate_record_csv.exists():
        # Append the current records to the stored ones; rows identical in every
        # column are collapsed, keeping the last occurrence.
        existing_df = pd.read_csv(validate_record_csv)
        combined_df = (
            pd.concat([existing_df, validate_df])
            .drop_duplicates(keep='last')
            .reset_index(drop=True)
        )
        combined_df.to_csv(validate_record_csv, index=False)
    else:
        # No record file yet: start it with the current export only.
        validate_df.to_csv(validate_record_csv, index=False)
        combined_df = validate_df

    # Display only: order the rows by ImportID (the file on disk keeps its own order).
    combined_df = combined_df.sort_values(by='ImportID').reset_index(drop=True)
    combined_df
    return (validate_record_csv,)
|
|
|
|
|
|
@app.cell
|
|
def _(mo):
|
|
mo.md(r"""
|
|
## Identify mismatches
|
|
""")
|
|
return
|
|
|
|
|
|
@app.function
|
|
def validate_mappings(_df):
    """Report ImportIDs whose description or presence differs between source files.

    Args:
        _df: DataFrame with 'ImportID', 'Description' and 'SourceFile' columns.

    Returns:
        A dict with two lists. Every entry is a dict with the keys 'ImportID',
        'Descriptions' (distinct descriptions seen for that ID) and
        'SourceFiles' (the source file of every row of that ID).

        * 'MismatchedDescriptions': IDs that have more than one distinct description.
        * 'MissingImportID': IDs that occur in fewer distinct source files than
          `_df` contains overall.
    """
    # Loop-invariant: number of distinct source files in the whole table.
    total_source_files = len(_df['SourceFile'].unique())

    validation_issues = {
        'MismatchedDescriptions': [],
        'MissingImportID': [],
    }

    for import_id, group in _df.groupby('ImportID'):
        descriptions = group['Description'].unique()
        source_files = group['SourceFile'].unique()

        # Same ID, different wording in at least two records.
        if len(descriptions) > 1:
            validation_issues['MismatchedDescriptions'].append({
                'ImportID': import_id,
                'Descriptions': descriptions.tolist(),
                'SourceFiles': group['SourceFile'].tolist(),
            })

        # ID not present in every source file (new in some export, or dropped from one).
        if len(source_files) < total_source_files:
            validation_issues['MissingImportID'].append({
                'ImportID': import_id,
                'Descriptions': descriptions.tolist(),
                'SourceFiles': group['SourceFile'].tolist(),
            })

    return validation_issues
|
|
|
|
|
|
@app.cell
def _(pd, validate_record_csv):
    # Validate the stored records exactly as written (no adjustments for known issues).
    validation_issues = validate_mappings(pd.read_csv(validate_record_csv))

    validation_issues
    return
|
|
|
|
|
|
@app.cell
|
|
def _(mo):
|
|
mo.md(r"""
|
|
## Process (Dismiss) Errors
|
|
""")
|
|
return
|
|
|
|
|
|
@app.cell
def _(pd, validate_record_csv):
    # Known issue with the 'OneDrive_1_1-16-2026' export: each description is prefixed with a
    # '<Qualtrics Editor Question name> - ' string. Strip that prefix, then compare again.
    _df = pd.read_csv(validate_record_csv)

    # Rows that come from the affected export. The match is literal and computed once;
    # na=False keeps rows with a missing SourceFile out of the mask.
    _affected = _df['SourceFile'].str.contains('OneDrive_1_1-16-2026', regex=False, na=False)

    # Keep everything after the first ' - '. A description without the separator is
    # returned unchanged, and a missing description stays missing instead of raising.
    _df.loc[_affected, 'Description'] = (
        _df.loc[_affected, 'Description'].str.split(' - ', n=1, regex=False).str[-1]
    )

    validation_issues_fixed = validate_mappings(_df)
    validation_issues_fixed
    return
|
|
|
|
|
|
@app.cell
def _():
    # Empty cell: no code, no inputs, no outputs.
    return
|
|
|
|
|
|
@app.cell
|
|
def _(mo):
|
|
mo.md(r"""
|
|
# Process Data
|
|
""")
|
|
return
|
|
|
|
|
|
@app.cell
def _(pl, results_file):
    # Load the export with polars: the first row supplies the column names and the
    # single row right after it is skipped.
    # NOTE(review): the comments in the mapping cell describe three leading rows for this
    # export (question names, question content, export metadata). With only one row
    # skipped, the metadata row may end up as the first data row - confirm against the file.
    df = pl.read_csv(
        results_file,
        skip_rows_after_header=1,
        has_header=True,
    )

    df
    return
|
|
|
|
|
|
@app.cell(hide_code=True)
|
|
def _(mo):
|
|
mo.md(r"""
|
|
# Answers Decoding
|
|
|
|
Pipeline to decode the ranking of voices. The rankings are currently saved as QIDs and need to be remapped back to their actual values so that the analysis can be performed, e.g.:
|
|
|
|
`GQIK26_G0_x8_RANK` -> Refers to question `Top 3 Traits_0_8_RANK - What are the important traits for the Chase AI virtual assistant?` and thus the #8 option
|
|
""")
|
|
return
|
|
|
|
|
|
@app.cell
|
|
def _(mo):
|
|
mo.md(r"""
|
|
## TODO:
|
|
|
|
Create a python function for each of the questions. ie `def QID63()`. Each function should return a Polars query, that can be added to an existing query.
|
|
|
|
Ideas:
|
|
- Map column name to include the Voice number (VID) (ie the questions that only have 1 voice). The VID is in this case often included in the question description
|
|
- `QID_x_GROUP` Contains the rankings of the values, stored in order. The following columns (ie `QID26_G0_x1_RANK`) are redundant and not necessary for us. The function should drop the unnecessary columns to clean up
|
|
<!-- - Translate the RANK values back to the actual VID, and create an aggregate column that contains a list of the VIDs in order. ie: [V34, V56, V81].
|
|
- Use the first line of the question description (see `qid_descr_map`) to get the `"DataExportTag"`, which is a property that can be found in the `.qsf` file to inspect the choice number and its corresponding VID
|
|
- "`VOICE SEL. 8-3_0_5_RANK`" refers to `"DataExportTag": "VOICE SEL. 8-3"`, `Group 0` (not important for this), `Choice 5`, and the value in the cell refers to the Rank assigned to that voice
|
|
- QSF file example to retrieve the VID: `"SurveyElements" -> (Find item where "Payload"["DataExportTag"] == "VOICE SEL. 8-3") -> "Payload" -> "Choices" -> "5" -> "Display" -> (Extract 'Voice <xx>' from the HTML)` -->
|
|
""")
|
|
return
|
|
|
|
|
|
@app.cell
def _():
    # Empty cell: no code, no inputs, no outputs.
    return
|
|
|
|
|
|
# Run the notebook's app when this file is executed directly as a script.
if __name__ == "__main__":
    app.run()
|