move common ingest functions to utils

2026-01-22 11:59:48 +01:00
parent 18ada6ca66
commit b8642e9de8
3 changed files with 136 additions and 126 deletions
--- a/00_qualtrics_validation.py
+++ b/00_qualtrics_validation.py
@@ -11,7 +11,9 @@ def _():
    import pandas as pd
    import plotly as plt
    from pathlib import Path
-    return Path, mo, pd, pl
+
+    from utils import extract_qid_descr_map
+    return Path, extract_qid_descr_map, mo, pd


@app.cell
@@ -31,35 +33,8 @@ def _(mo):


@app.cell
-def _(pd, results_file):
-    if '1_1-16-2026' in results_file.as_posix():
-        df_questions = pd.read_csv(results_file, nrows=1)
-        df_questions
-    
-        qid_descr_map = df_questions.iloc[0].to_dict()
-        qid_descr_map
-
-    else:
-        # First row contains Qualtrics Editor question names (ie 'B_VOICE SEL. 18-8')
-    
-        # Second row which contains the question content
-        # Third row contains the Export Metadata (ie '{"ImportId":"startDate","timeZone":"America/Denver"}')
-        df_questions = pd.read_csv(results_file, nrows=1, skiprows=1)
-
-        def extract_qid(val):
-            if isinstance(val, str) and val.startswith('{') and val.endswith('}'):
-                val = eval(val)
-            return val['ImportId']
-
-        # transpose df_questions
-        df_questions = df_questions.T.reset_index()
-        df_questions.columns = ['Description', 'export_metadata']
-        df_questions['ImportID'] = df_questions['export_metadata'].apply(extract_qid)
-
-        df_questions = df_questions[['ImportID', 'Description']]
-
-        qid_descr_map = dict(zip(df_questions['ImportID'], df_questions['Description']))
-
+def _(extract_qid_descr_map, results_file):
+    qid_descr_map = extract_qid_descr_map(results_file)
    qid_descr_map
    return (qid_descr_map,)

@@ -92,7 +67,7 @@ def _(mo):

@app.cell
 def _(Path, pd, validate_df):
-    validate_record_csv = Path('./data/exports/validation_qid_descr_map.csv')
+    validate_record_csv = Path('./validation_qid_descr_map.csv')

    if not validate_record_csv.exists():
        validate_df.to_csv(validate_record_csv, index=False)
@@ -135,7 +110,7 @@ def validate_mappings(_df):
                'Descriptions': descriptions.tolist(),
                'SourceFiles': group['SourceFile'].tolist()
            })
-    
+
        # Check for new or missing ImportIDs
        source_files = group['SourceFile'].unique()
        if len(source_files) < len(_df['SourceFile'].unique()):
@@ -160,14 +135,14 @@ def _(pd, validate_record_csv):
@app.cell
 def _(mo):
    mo.md(r"""
-    ## Process (Dismiss) Errors
+    ## Inspect & Dismiss Errors
    """)
    return


@app.cell
 def _(pd, validate_record_csv):
-    # Known issue with the 'OneDrive_1_1-16-2026' export, where each of the descriptions is prepended with a '<Qualtrics Editor Question name> - ' string. Drop that then, recompare
+    # Known issue with the 'OneDrive_1_1-16-2026' export: every description is prefixed with a '<Qualtrics Editor Question name> - ' string. Strip that prefix, then re-run the comparison

    _df = pd.read_csv(validate_record_csv)

@@ -176,62 +151,6 @@ def _(pd, validate_record_csv):

    validation_issues_fixed = validate_mappings(_df)
    validation_issues_fixed
-
-    return
-
-
-@app.cell
-def _():
-    return
-
-
-@app.cell
-def _(mo):
-    mo.md(r"""
-    # Process Data
-    """)
-    return
-
-
-@app.cell
-def _(pl, results_file):
-    df = pl.read_csv(results_file, has_header=True, skip_rows_after_header=1)
-    df
-    return
-
-
-@app.cell(hide_code=True)
-def _(mo):
-    mo.md(r"""
-    # Answers Decoding
-
-    Pipeline to decode the ranking of voices. Currently saved as QID's, they need to be remapped back to their actual values so that the analysis can be performed. ie:
-
-    `GQIK26_G0_x8_RANK` -> Refers to question `Top 3 Traits_0_8_RANK - What are the important traits for the Chase AI virtual assistant?` and thus the #8 option
-    """)
-    return
-
-
-@app.cell
-def _(mo):
-    mo.md(r"""
-    ## TODO:
-
-    Create a python function for each of the questions. ie `def QID63()`. Each function should return a Polars query, that can be added to an existing query.
-
-    Ideas:
-    - Map column name to include the Voice number (VID) (ie the questions that only have 1 voice). The VID is in this case often included in the question description
-    - `QID_x_GROUP` Contains the rankings of the values, stored in order. The following columns (ie `QID26_G0_x1_RANK`) are redundant and not necessary for us. The function should drop the unnecessary columns  to clean up
-    <!-- - Translate the RANK values back to the actual VID, and create an aggregate column that contains a list of the VIDs in order. ie: [V34, V56, V81].
-      - Use the first line of the question description (see `qid_descr_map`) to get the `"DataExportTag"`, which is a property that can be found in the `.qsf` file to inspect the choice number and it's corresponding VID
-      - "`VOICE SEL. 8-3_0_5_RANK`" refers to `"DataExportTag": "VOICE SEL. 8-3"`, `Group 0` (not important for this), `Choice 5`, and the value in the cell refers to the Rank assigned to that voice
-      - QSF file example to retrieve the VID: `"SurveyElements" -> (Find item where "Payload"["DataExportTag"] == "VOICE SEL. 8-3") -> "Payload" -> "Choices" -> "5" -> "Display" -> (Extract 'Voice <xx>' from the HTML)` -->
-    """)
-    return
-
-
-@app.cell
-def _():
    return