Validation of data exports with reference file

This commit is contained in:
2026-01-22 08:51:27 +01:00
parent 62b57ae862
commit 18ada6ca66
5 changed files with 9899 additions and 23 deletions

View File

@@ -11,23 +11,225 @@ def _():
import pandas as pd
import plotly as plt
from pathlib import Path
return Path, pl
return Path, mo, pd, pl
@app.cell
def _(Path):
results_file = Path('./data/VB_Qualtrics_labels.csv')
# results_file = Path('data/exports/OneDrive_1_1-16-2026/JPMC_Chase Brand Personality_Quant Round 1_TestData_Labels.csv')
results_file = Path('data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase Brand Personality_Quant Round 1_January 21, 2026_Soft Launch_Labels.csv')
return (results_file,)
@app.cell
def _(mo):
mo.md(r"""
# Mapping Question <-> Internal-ID
Questions are too long to use as headers of the df
""")
return
@app.cell
def _(pd, results_file):
if '1_1-16-2026' in results_file.as_posix():
df_questions = pd.read_csv(results_file, nrows=1)
df_questions
qid_descr_map = df_questions.iloc[0].to_dict()
qid_descr_map
else:
# First row contains Qualtrics Editor question names (ie 'B_VOICE SEL. 18-8')
# Second row which contains the question content
# Third row contains the Export Metadata (ie '{"ImportId":"startDate","timeZone":"America/Denver"}')
df_questions = pd.read_csv(results_file, nrows=1, skiprows=1)
def extract_qid(val):
    """Pull the Qualtrics 'ImportId' out of an export-metadata cell.

    Metadata cells look like '{"ImportId":"startDate","timeZone":"..."}'.
    Returns the ImportId string, or None for values that are not
    JSON-object strings (e.g. NaN / non-metadata headers).
    """
    import json  # local import keeps this notebook-cell helper self-contained
    if isinstance(val, str) and val.startswith('{') and val.endswith('}'):
        # BUG FIX: was eval(val). eval on file-sourced text is unsafe, and it
        # also chokes on JSON-only tokens (null/true/false); the Qualtrics
        # metadata row is JSON, so parse it as JSON.
        val = json.loads(val)
        return val['ImportId']
# transpose df_questions
df_questions = df_questions.T.reset_index()
df_questions.columns = ['Description', 'export_metadata']
df_questions['ImportID'] = df_questions['export_metadata'].apply(extract_qid)
df_questions = df_questions[['ImportID', 'Description']]
qid_descr_map = dict(zip(df_questions['ImportID'], df_questions['Description']))
qid_descr_map
return (qid_descr_map,)
@app.cell
def _(mo):
mo.md(r"""
## Save mapping to a validation.csv so it can be compared across exports
""")
return
@app.cell(hide_code=True)
def _(pd, qid_descr_map, results_file):
validate_df = pd.DataFrame.from_dict(qid_descr_map, orient='index', columns=['Description']).reset_index().rename(columns={'index': 'ImportID'})
# add column source_file
validate_df['SourceFile'] = results_file.as_posix()
validate_df
return (validate_df,)
@app.cell
def _(mo):
mo.md(r"""
# Compare with other exports
""")
return
@app.cell
def _(Path, pd, validate_df):
validate_record_csv = Path('./data/exports/validation_qid_descr_map.csv')
if not validate_record_csv.exists():
validate_df.to_csv(validate_record_csv, index=False)
combined_df = validate_df
else:
existing_df = pd.read_csv(validate_record_csv)
combined_df = pd.concat([existing_df, validate_df])
# remove records that are full duplicates, keeping the last one
combined_df = combined_df.drop_duplicates(keep='last').reset_index(drop=True)
combined_df.to_csv(validate_record_csv, index=False)
# Sort rows by ImportID
combined_df = combined_df.sort_values(by='ImportID').reset_index(drop=True)
combined_df
return (validate_record_csv,)
@app.cell
def _(mo):
mo.md(r"""
## Identify mismatches
""")
return
@app.function
def validate_mappings(_df):
    """Cross-check the accumulated ImportID -> Description records across exports.

    Parameters
    ----------
    _df : pandas.DataFrame
        Must contain columns 'ImportID', 'Description', 'SourceFile'.

    Returns
    -------
    dict
        'MismatchedDescriptions': ImportIDs whose Description text differs
            between source files.
        'MissingImportID': ImportIDs absent from at least one source file
            (questions that were added or removed between exports).
    """
    validation_issues = {
        'MismatchedDescriptions': [],
        'MissingImportID': [],
    }
    # Hoisted out of the loop: the number of distinct exports is loop-invariant,
    # so compute it once instead of re-scanning the whole frame per ImportID.
    total_sources = _df['SourceFile'].nunique()
    for import_id, group in _df.groupby('ImportID'):
        descriptions = group['Description'].unique()
        # Same ImportID mapped to different question texts across exports.
        if len(descriptions) > 1:
            validation_issues['MismatchedDescriptions'].append({
                'ImportID': import_id,
                'Descriptions': descriptions.tolist(),
                'SourceFiles': group['SourceFile'].tolist(),
            })
        # ImportID not present in every export: either newly added or dropped.
        if group['SourceFile'].nunique() < total_sources:
            validation_issues['MissingImportID'].append({
                'ImportID': import_id,
                'Descriptions': descriptions.tolist(),
                'SourceFiles': group['SourceFile'].tolist(),
            })
    return validation_issues
@app.cell
def _(pd, validate_record_csv):
# As-is (no modifications for known issues)
_df = pd.read_csv(validate_record_csv)
validation_issues = validate_mappings(_df)
validation_issues
return
@app.cell
def _(mo):
mo.md(r"""
## Process (Dismiss) Errors
""")
return
@app.cell
def _(pd, validate_record_csv):
# Known issue with the 'OneDrive_1_1-16-2026' export, where each of the descriptions is prepended with a '<Qualtrics Editor Question name> - ' string. Drop that then, recompare
_df = pd.read_csv(validate_record_csv)
# Remove the prefix from the descriptions in rows where the SourceFile contains 'OneDrive_1_1-16-2026'
_df.loc[_df['SourceFile'].str.contains('OneDrive_1_1-16-2026'), 'Description'] = _df.loc[_df['SourceFile'].str.contains('OneDrive_1_1-16-2026'), 'Description'].apply(lambda x: ' - '.join(x.split(' - ')[1:]) if ' - ' in x else x)
validation_issues_fixed = validate_mappings(_df)
validation_issues_fixed
return
@app.cell
def _():
return
@app.cell
def _(mo):
mo.md(r"""
# Process Data
""")
return
@app.cell
def _(pl, results_file):
df = pl.read_csv(results_file, has_header=True, skip_rows=1)
df = pl.read_csv(results_file, has_header=True, skip_rows_after_header=1)
df
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
# Answers Decoding
Pipeline to decode the ranking of voices. Currently saved as QID's, they need to be remapped back to their actual values so that the analysis can be performed. ie:
`GQIK26_G0_x8_RANK` -> Refers to question `Top 3 Traits_0_8_RANK - What are the important traits for the Chase AI virtual assistant?` and thus the #8 option
""")
return
@app.cell
def _(mo):
mo.md(r"""
## TODO:
Create a python function for each of the questions. ie `def QID63()`. Each function should return a Polars query, that can be added to an existing query.
Ideas:
- Map column name to include the Voice number (VID) (ie the questions that only have 1 voice). The VID is in this case often included in the question description
- `QID_x_GROUP` Contains the rankings of the values, stored in order. The following columns (ie `QID26_G0_x1_RANK`) are redundant and not necessary for us. The function should drop the unnecessary columns to clean up
<!-- - Translate the RANK values back to the actual VID, and create an aggregate column that contains a list of the VIDs in order. ie: [V34, V56, V81].
- Use the first line of the question description (see `qid_descr_map`) to get the `"DataExportTag"`, which is a property that can be found in the `.qsf` file to inspect the choice number and it's corresponding VID
- "`VOICE SEL. 8-3_0_5_RANK`" refers to `"DataExportTag": "VOICE SEL. 8-3"`, `Group 0` (not important for this), `Choice 5`, and the value in the cell refers to the Rank assigned to that voice
- QSF file example to retrieve the VID: `"SurveyElements" -> (Find item where "Payload"["DataExportTag"] == "VOICE SEL. 8-3") -> "Payload" -> "Choices" -> "5" -> "Display" -> (Extract 'Voice <xx>' from the HTML)` -->
""")
return
@app.cell
def _():
return

View File

@@ -10,39 +10,53 @@ def _():
import polars as pl
import sqlite3
from pathlib import Path
return Path, pl, sqlite3
return Path, pl
@app.cell
def _(Path, pl):
results_file = Path('./data/VB_Qualtrics_labels.csv')
def _():
# RESULTS_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase Brand Personality_Quant Round 1_January 21, 2026_Soft Launch_Labels.csv'
RESULTS_FILE = 'data/exports/OneDrive_1_1-16-2026/JPMC_Chase Brand Personality_Quant Round 1_TestData_Labels.csv'
return (RESULTS_FILE,)
@app.cell
def _(Path, RESULTS_FILE, pl):
results_file = Path(RESULTS_FILE)
df = pl.read_csv(results_file, skip_rows=0)
df
return (df,)
return df, results_file
@app.cell
def _(df, sqlite3):
def _(df, pl, results_file):
colset = set(df.columns)
this_df_verify = pl.DataFrame({'column_names': [colset], 'results_file': results_file.as_posix()})
this_df_verify
return (this_df_verify,)
# Create table if not exists with columns from csv
with sqlite3.connect("data/qualtrics_JP.db") as conn:
# interact with database
q= f'''
CREATE TABLE IF NOT EXISTS qualtrics_raw(
{', '.join(list(df.columns))}
);
'''
print(q)
conn.execute(q)
@app.cell
def _(Path, pl, this_df_verify):
    # Append this export's column-set fingerprint to the running verification
    # log so schemas can be compared across exports.
    # NOTE(review): reconstructed from diff residue — confirm against the
    # original notebook before merging.
    verification_record = Path('./data/exports/verification.csv')
    if verification_record.exists():
        verify_df = pl.read_csv(verification_record)
        verify_df = pl.concat([verify_df, this_df_verify], how='vertical')
    else:
        verify_df = this_df_verify
    # BUG FIX: the CSV was only written inside the exists() branch, so the
    # very first run never created the record. Write unconditionally.
    verify_df.write_csv(verification_record)
    verify_df
    return
@app.cell
def _():
# Experimental: open a SQLAlchemy engine on the local Qualtrics SQLite DB.
# NOTE(review): `engine` is created but not returned from this cell, so no
# other marimo cell can use it — presumably scaffolding for a later
# database-export step; confirm intent before keeping.
import sqlalchemy
DATABASE_URL = "sqlite:///./data/qualtrics_JP.db"
engine = sqlalchemy.create_engine(DATABASE_URL)
return

View File

@@ -0,0 +1,14 @@
# QID - Speaking Style Checks
QID29 (A_VOICE SEL. 18-8): 'Intro'
QID101 (B_VOICE SEL. 18-8): 'Intro'
QID36 (VOICE SEL. 8-3): '8-3 + AB (Ranking)'
QID38,41,45,46-60 (Voice xx Scale 1-10): 'Green Blue'
QID42,43,61-76 (Vxx SS Green-Blue): 'Green Blue'
QID98 (Ranking Top 3 Voices): '8-3 + AB (Ranking)'
QID77-94 (SS Orange-Red): "Orange Red"

17
docs/VoiceSS-texts.md Normal file
View File

@@ -0,0 +1,17 @@
Intro:
- "Welcome to our digital voice assistant. This call may be recorded..."
Scales:
- "Please tell us your debit card PIN..."
Green Blue SS:
- "I noticed your spending this week is higher than usual..."
Red Orange SS:
- "I noticed some unusual activity on your account, for your security could you confirm if you ..."
8-3 + AB (Ranking):
- "Please tell us your debit card PIN, the same PIN you use at the ATM..."

9629
validation_qid_descr_map.csv Normal file

File diff suppressed because it is too large Load Diff