Validation of data exports with reference file

2026-01-22 08:51:27 +01:00
parent 62b57ae862
commit 18ada6ca66
5 changed files with 9899 additions and 23 deletions
--- a/00_qualtrics_validation.py
+++ b/00_qualtrics_validation.py
@@ -11,23 +11,225 @@ def _():
    import pandas as pd
    import plotly as plt
    from pathlib import Path
-    return Path, pl
+    return Path, mo, pd, pl
@app.cell
 def _(Path):
-    results_file = Path('./data/VB_Qualtrics_labels.csv')
+    # results_file = Path('data/exports/OneDrive_1_1-16-2026/JPMC_Chase Brand Personality_Quant Round 1_TestData_Labels.csv')
-
+    results_file = Path('data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase Brand Personality_Quant Round 1_January 21, 2026_Soft Launch_Labels.csv')
    return (results_file,)
@app.cell
 def _(mo):
    # Notebook heading for the question <-> internal-ID mapping section.
    mo.md(r"""
    # Mapping Question <-> Internal-ID
    Questions are too long to use as headers of the df
    """)
    return
@app.cell
 def _(pd, results_file):
    # Build `qid_descr_map` (column identifier -> question description) from the
    # header rows of the Qualtrics export selected by `results_file`.
    if '1_1-16-2026' in results_file.as_posix():
        # For this export the first row is used as header and the first data
        # row is taken as the description of each column.
        df_questions = pd.read_csv(results_file, nrows=1)
        qid_descr_map = df_questions.iloc[0].to_dict()
    else:
        # Row 1 contains the Qualtrics Editor question names (ie 'B_VOICE SEL. 18-8')
        # Row 2 contains the question content
        # Row 3 contains the export metadata (ie '{"ImportId":"startDate","timeZone":"America/Denver"}')
        # Skipping row 1 makes the question content the header and the
        # metadata the single data row that is read.
        df_questions = pd.read_csv(results_file, nrows=1, skiprows=1)

        def extract_qid(val):
            """Return the 'ImportId' entry of one export-metadata JSON string.

            Raises:
                ValueError: if `val` is not a string holding a JSON object.
            """
            # Function-scope import: no new notebook-level name is introduced.
            import json
            if not (isinstance(val, str) and val.startswith('{') and val.endswith('}')):
                raise ValueError(f'Export metadata is not a JSON object: {val!r}')
            # NOTE(review): this previously used eval(val), which executes
            # whatever the CSV contains and fails on JSON literals such as
            # true/false/null. The metadata is JSON, so parse it as JSON.
            return json.loads(val)['ImportId']

        # Transpose so every export column becomes one row: (description, metadata).
        df_questions = df_questions.T.reset_index()
        df_questions.columns = ['Description', 'export_metadata']
        df_questions['ImportID'] = df_questions['export_metadata'].apply(extract_qid)
        df_questions = df_questions[['ImportID', 'Description']]
        qid_descr_map = dict(zip(df_questions['ImportID'], df_questions['Description']))
    # Bare expression kept from the original (presumably the cell's displayed output).
    qid_descr_map
    return (qid_descr_map,)
@app.cell
 def _(mo):
    # Notebook heading for the step that stores the mapping as a validation CSV.
    mo.md(r"""
    ## Save mapping to a validation.csv so it can be compared across exports
    """)
    return
@app.cell(hide_code=True)
 def _(pd, qid_descr_map, results_file):
    # Tabulate the ImportID -> Description mapping, one row per ImportID, and
    # tag every row with the export file the mapping was read from.
    validate_df = (
        pd.DataFrame.from_dict(qid_descr_map, orient='index', columns=['Description'])
        .reset_index()
        .rename(columns={'index': 'ImportID'})
        .assign(SourceFile=results_file.as_posix())
    )
    validate_df
    return (validate_df,)
@app.cell
 def _(mo):
    # Notebook heading for the cross-export comparison section.
    mo.md(r"""
    # Compare with other exports
    """)
    return
@app.cell
 def _(Path, pd, validate_df):
    # Persist this export's mapping in the shared validation record, merging
    # it with any rows already stored there by earlier runs.
    validate_record_csv = Path('./data/exports/validation_qid_descr_map.csv')
    if validate_record_csv.exists():
        existing_df = pd.read_csv(validate_record_csv)
        # Append the new rows, then drop rows that are full duplicates (keeping the last one).
        combined_df = (
            pd.concat([existing_df, validate_df])
            .drop_duplicates(keep='last')
            .reset_index(drop=True)
        )
    else:
        # No record yet: this export's mapping becomes the record.
        combined_df = validate_df
    combined_df.to_csv(validate_record_csv, index=False)
    # Sorting by ImportID happens after the write, so it only affects the frame shown here.
    combined_df = combined_df.sort_values(by='ImportID').reset_index(drop=True)
    combined_df
    return (validate_record_csv,)
@app.cell
 def _(mo):
    # Notebook heading for the mismatch-detection section.
    mo.md(r"""
    ## Identify mismatches
    """)
    return
@app.function
 def validate_mappings(_df):
    """Compare ImportID -> Description mappings across export files.

    Args:
        _df: DataFrame with 'ImportID', 'Description' and 'SourceFile' columns.

    Returns:
        dict with two lists of issue records. Every record holds the
        'ImportID', its distinct 'Descriptions' and the 'SourceFiles' of the
        rows it appears in:
        - 'MismatchedDescriptions': ImportIDs with more than one distinct
          description.
        - 'MissingImportID': ImportIDs found in fewer distinct source files
          than the whole frame contains.
    """
    validation_issues = {
        'MismatchedDescriptions': [],
        'MissingImportID': []
    }
    # Loop-invariant: number of distinct source files in the whole frame.
    total_source_files = len(_df['SourceFile'].unique())
    for import_id, group in _df.groupby('ImportID'):
        descriptions = group['Description'].unique().tolist()
        group_files = group['SourceFile'].tolist()
        # Check for mismatched descriptions
        if len(descriptions) > 1:
            validation_issues['MismatchedDescriptions'].append({
                'ImportID': import_id,
                'Descriptions': list(descriptions),
                'SourceFiles': list(group_files)
            })
        # Check for new or missing ImportIDs
        if len(group['SourceFile'].unique()) < total_source_files:
            validation_issues['MissingImportID'].append({
                'ImportID': import_id,
                'Descriptions': list(descriptions),
                'SourceFiles': list(group_files)
            })
    return validation_issues
@app.cell
 def _(pd, validate_record_csv):
    # Validate the stored record exactly as written (no corrections for known issues).
    validation_issues = validate_mappings(pd.read_csv(validate_record_csv))
    validation_issues
    return
@app.cell
 def _(mo):
    # Notebook heading for the section that handles (dismisses) known validation errors.
    mo.md(r"""
    ## Process (Dismiss) Errors
    """)
    return
@app.cell
 def _(pd, validate_record_csv):
    # Known issue with the 'OneDrive_1_1-16-2026' export: each description is
    # prefixed with a '<Qualtrics Editor question name> - ' string. Drop that
    # prefix, then compare again.
    _df = pd.read_csv(validate_record_csv)
    # Build the row mask once. Match the export name literally (not as a
    # regex) and treat rows without a SourceFile as not affected.
    _affected = _df['SourceFile'].str.contains('OneDrive_1_1-16-2026', regex=False, na=False)
    # Keep everything after the first ' - '. Descriptions without the
    # separator, and missing (NaN) descriptions, are left untouched instead
    # of raising a TypeError.
    _df.loc[_affected, 'Description'] = _df.loc[_affected, 'Description'].apply(
        lambda x: x.partition(' - ')[2] if isinstance(x, str) and ' - ' in x else x
    )
    validation_issues_fixed = validate_mappings(_df)
    validation_issues_fixed
    return
@app.cell
 def _():
    # Empty placeholder cell: takes no inputs and defines nothing.
    return
@app.cell
 def _(mo):
    # Notebook heading for the data-processing section.
    mo.md(r"""
    # Process Data
    """)
    return
@app.cell
 def _(pl, results_file):
-    df = pl.read_csv(results_file, has_header=True, skip_rows=1)
+    df = pl.read_csv(results_file, has_header=True, skip_rows_after_header=1)
    df
    return
@app.cell(hide_code=True)
 def _(mo):
    # Notebook text explaining why the ranking columns must be decoded back to their actual values.
    mo.md(r"""
    # Answers Decoding
    Pipeline to decode the ranking of voices. Currently saved as QID's, they need to be remapped back to their actual values so that the analysis can be performed. ie:
    `GQIK26_G0_x8_RANK` -> Refers to question `Top 3 Traits_0_8_RANK - What are the important traits for the Chase AI virtual assistant?` and thus the #8 option
    """)
    return
@app.cell
 def _(mo):
    # Notebook TODO list for the per-question decoding functions; the trailing
    # ideas are wrapped in an HTML comment inside the markdown text.
    mo.md(r"""
    ## TODO:
    Create a python function for each of the questions. ie `def QID63()`. Each function should return a Polars query, that can be added to an existing query.
    Ideas:
    - Map column name to include the Voice number (VID) (ie the questions that only have 1 voice). The VID is in this case often included in the question description
    - `QID_x_GROUP` Contains the rankings of the values, stored in order. The following columns (ie `QID26_G0_x1_RANK`) are redundant and not necessary for us. The function should drop the unnecessary columns  to clean up
    <!-- - Translate the RANK values back to the actual VID, and create an aggregate column that contains a list of the VIDs in order. ie: [V34, V56, V81].
      - Use the first line of the question description (see `qid_descr_map`) to get the `"DataExportTag"`, which is a property that can be found in the `.qsf` file to inspect the choice number and it's corresponding VID
      - "`VOICE SEL. 8-3_0_5_RANK`" refers to `"DataExportTag": "VOICE SEL. 8-3"`, `Group 0` (not important for this), `Choice 5`, and the value in the cell refers to the Rank assigned to that voice
      - QSF file example to retrieve the VID: `"SurveyElements" -> (Find item where "Payload"["DataExportTag"] == "VOICE SEL. 8-3") -> "Payload" -> "Choices" -> "5" -> "Display" -> (Extract 'Voice <xx>' from the HTML)` -->
    """)
    return
@app.cell
 def _():
    # Empty placeholder cell: takes no inputs and defines nothing.
    return
--- a/01_ingest_qualtrics_export.py
+++ b/01_ingest_qualtrics_export.py
@@ -10,39 +10,53 @@ def _():
    import polars as pl
    import sqlite3
    from pathlib import Path
-    return Path, pl, sqlite3
+    return Path, pl
@app.cell
-def _(Path, pl):
+def _():
-    results_file = Path('./data/VB_Qualtrics_labels.csv')
+    # RESULTS_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase Brand Personality_Quant Round 1_January 21, 2026_Soft Launch_Labels.csv'
    RESULTS_FILE = 'data/exports/OneDrive_1_1-16-2026/JPMC_Chase Brand Personality_Quant Round 1_TestData_Labels.csv'
    return (RESULTS_FILE,)
@app.cell
 def _(Path, RESULTS_FILE, pl):
    results_file = Path(RESULTS_FILE)
    df = pl.read_csv(results_file, skip_rows=0)
    df
-    return (df,)
+    return df, results_file
@app.cell
-def _(df, sqlite3):
+def _(df, pl, results_file):
    colset = set(df.columns)
    this_df_verify = pl.DataFrame({'column_names': [colset], 'results_file': results_file.as_posix()})
    this_df_verify
    return (this_df_verify,)
-    # Create table if not exists with columns from csv
+
-    with sqlite3.connect("data/qualtrics_JP.db") as conn:
+@app.cell
-        # interact with database
+def _(Path, pl, this_df_verify):
-        q= f'''
+    verification_record = Path('./data/exports/verification.csv')
-        CREATE TABLE IF NOT EXISTS qualtrics_raw(
+    if verification_record.exists():
-            {', '.join(list(df.columns))}
+        verify_df = pl.read_csv(verification_record)
-        );
+
-        '''
+        verify_df = pl.concat([verify_df, this_df_verify], how='vertical')
-        print(q)
+
-        conn.execute(q)
+        # save verify_df
        verify_df.write_csv(verification_record)
    else:
        verify_df = this_df_verify
    # append this_df_verify to verify_df
    verify_df
    return
@app.cell
 def _():
    # Create a SQLAlchemy engine for the SQLite database URL below.
    # The bare `return` exposes none of these names from the cell.
    import sqlalchemy
    DATABASE_URL = "sqlite:///./data/qualtrics_JP.db"
    engine = sqlalchemy.create_engine(DATABASE_URL)
    return
--- a/Checks.md
+++ b/Checks.md
@@ -0,0 +1,14 @@
 # QID - Speaking Style Checks
 QID29 (A_VOICE SEL. 18-8): 'Intro'
 QID101 (B_VOICE SEL. 18-8): 'Intro'
 QID36 (VOICE SEL. 8-3): '8-3 + AB (Ranking)'
 QID38,41,45,46-60 (Voice xx Scale 1-10): 'Green Blue'
 QID42,43,61-76 (Vxx SS Green-Blue): 'Green Blue'
 QID98 (Ranking Top 3 Voices): '8-3 + AB (Ranking)'
 QID77-94 (SS Orange-Red): 'Orange Red'
--- a/docs/VoiceSS-texts.md
+++ b/docs/VoiceSS-texts.md
@@ -0,0 +1,17 @@
 Intro:
    - "Welcome to our digital voice assistant. This call may be recorded..."
 Scales:
    - "Please tell us your debit card PIN..."
 Green Blue SS:
    - "I noticed your spending this week is higher than usual..."
 Orange Red SS:
    - "I noticed some unusual activity on your account, for your security could you confirm if you ..."
 8-3 + AB (Ranking):
    - "Please tell us your debit card PIN, the same PIN you use at the ATM..."
--- a/validation_qid_descr_map.csv
+++ b/validation_qid_descr_map.csv