move common ingest functions to utils

2026-01-22 11:59:48 +01:00
parent 18ada6ca66
commit b8642e9de8
3 changed files with 136 additions and 126 deletions
--- a/01_ingest_qualtrics_export.py
+++ b/01_ingest_qualtrics_export.py
@@ -8,55 +8,72 @@ app = marimo.App(width="medium")
 def _():
    import marimo as mo
    import polars as pl
-    import sqlite3
    from pathlib import Path
-    return Path, pl
+
+    from utils import extract_qid_descr_map, load_csv_with_qid_headers
+    return extract_qid_descr_map, load_csv_with_qid_headers, mo


@app.cell
 def _():
-    # RESULTS_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase Brand Personality_Quant Round 1_January 21, 2026_Soft Launch_Labels.csv'
-    RESULTS_FILE = 'data/exports/OneDrive_1_1-16-2026/JPMC_Chase Brand Personality_Quant Round 1_TestData_Labels.csv'
+    RESULTS_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase Brand Personality_Quant Round 1_January 21, 2026_Soft Launch_Labels.csv'
+    # RESULTS_FILE = 'data/exports/OneDrive_1_1-16-2026/JPMC_Chase Brand Personality_Quant Round 1_TestData_Labels.csv'
    return (RESULTS_FILE,)


@app.cell
-def _(Path, RESULTS_FILE, pl):
-    results_file = Path(RESULTS_FILE)
-    df = pl.read_csv(results_file, skip_rows=0)
-    df
-    return df, results_file
-
-
-@app.cell
-def _(df, pl, results_file):
-    colset = set(df.columns)
-    this_df_verify = pl.DataFrame({'column_names': [colset], 'results_file': results_file.as_posix()})
-    this_df_verify
-    return (this_df_verify,)
-
-
-@app.cell
-def _(Path, pl, this_df_verify):
-    verification_record = Path('./data/exports/verification.csv')
-    if verification_record.exists():
-        verify_df = pl.read_csv(verification_record)
-
-        verify_df = pl.concat([verify_df, this_df_verify], how='vertical')
-
-        # save verify_df
-        verify_df.write_csv(verification_record)
-    
-    else:
-        verify_df = this_df_verify
-
-    # append this_df_verify to verify_df
-    verify_df
+def _(RESULTS_FILE, extract_qid_descr_map):
+    qid_descr_map = extract_qid_descr_map(RESULTS_FILE)
+    qid_descr_map
    return


@app.cell
-def _():
+def _(RESULTS_FILE, load_csv_with_qid_headers):
+    df = load_csv_with_qid_headers(RESULTS_FILE)
+    df
+    return
+
+
+@app.cell
+def _(mo):
+    mo.md(r"""
+    # Data Cleanup
+
+    - Remove incomplete responses (progress < 100)
+    - Flag outliers based on duration (add column)
+    - Flag responses that give the same rating for everything (indicates lack of engagement)
+    """)
+    return
+
+
+@app.cell
+def _(mo):
+    mo.md(r"""
+    # Answers Decoding
+
+    Pipeline to decode the ranking of voices. The rankings are currently stored under QIDs, which need to be mapped back to their actual values before the analysis can be performed, e.g.:
+
+    `QID26_G0_x8_RANK` -> refers to the question `Top 3 Traits_0_8_RANK - What are the important traits for the Chase AI virtual assistant?`, i.e. option #8 of that question
+    """)
+    return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""
+    ## TODO:
+
+    Create a Python function for each of the questions, e.g. `def QID63()`. Each function should return a Polars query that can be added to an existing query.
+
+    Ideas:
+    - Map the column name to include the Voice number (VID) (i.e. for the questions that only have 1 voice). In this case the VID is often included in the question description
+    - `QID_x_GROUP` contains the rankings of the values, stored in order. The columns that follow it (e.g. `QID26_G0_x1_RANK`) are redundant and not needed. The function should drop these unnecessary columns to clean up
+    <!-- - Translate the RANK values back to the actual VID, and create an aggregate column that contains a list of the VIDs in order. ie: [V34, V56, V81].
+      - Use the first line of the question description (see `qid_descr_map`) to get the `"DataExportTag"`, which is a property that can be found in the `.qsf` file to inspect the choice number and its corresponding VID
+      - "`VOICE SEL. 8-3_0_5_RANK`" refers to `"DataExportTag": "VOICE SEL. 8-3"`, `Group 0` (not important for this), `Choice 5`, and the value in the cell refers to the Rank assigned to that voice
+      - QSF file example to retrieve the VID: `"SurveyElements" -> (Find item where "Payload"["DataExportTag"] == "VOICE SEL. 8-3") -> "Payload" -> "Choices" -> "5" -> "Display" -> (Extract 'Voice <xx>' from the HTML)` -->
+    """)
    return