move common ingest functions to utils

This commit is contained in:
2026-01-22 11:59:48 +01:00
parent 18ada6ca66
commit b8642e9de8
3 changed files with 136 additions and 126 deletions

View File

@@ -8,55 +8,72 @@ app = marimo.App(width="medium")
def _():
import marimo as mo
import polars as pl
import sqlite3
from pathlib import Path
return Path, pl
from utils import extract_qid_descr_map, load_csv_with_qid_headers
return extract_qid_descr_map, load_csv_with_qid_headers, mo
# Cell: selects the survey export CSV. RESULTS_FILE is the only value returned,
# and it is the argument later passed to extract_qid_descr_map and
# load_csv_with_qid_headers.
# NOTE(review): RESULTS_FILE is assigned twice below (TestData export first,
# Soft Launch export second), so the second assignment is the one that takes
# effect. Each assignment also has a commented-out twin. This looks like the
# removed and added lines of a diff shown together - confirm which single
# assignment is meant to remain.
@app.cell
def _():
# RESULTS_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase Brand Personality_Quant Round 1_January 21, 2026_Soft Launch_Labels.csv'
RESULTS_FILE = 'data/exports/OneDrive_1_1-16-2026/JPMC_Chase Brand Personality_Quant Round 1_TestData_Labels.csv'
RESULTS_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase Brand Personality_Quant Round 1_January 21, 2026_Soft Launch_Labels.csv'
# RESULTS_FILE = 'data/exports/OneDrive_1_1-16-2026/JPMC_Chase Brand Personality_Quant Round 1_TestData_Labels.csv'
return (RESULTS_FILE,)
# Cell: wraps RESULTS_FILE in a Path, reads it with pl.read_csv into `df`, and
# returns both `df` and the Path (`results_file`).
# skip_rows=0 is passed explicitly, i.e. no leading rows are skipped.
# The bare `df` line is an expression statement; presumably it is the cell's
# displayed output in the notebook - confirm.
@app.cell
def _(Path, RESULTS_FILE, pl):
results_file = Path(RESULTS_FILE)
df = pl.read_csv(results_file, skip_rows=0)
df
return df, results_file
# Cell: builds `this_df_verify`, a DataFrame with two fields: `column_names`
# (the set of df.columns, wrapped in a one-element list) and `results_file`
# (the path rendered with as_posix()). Returns `this_df_verify`.
# NOTE(review): `column_names` carries a Python set as its single value -
# verify that pl.DataFrame accepts a set here.
@app.cell
def _(df, pl, results_file):
colset = set(df.columns)
this_df_verify = pl.DataFrame({'column_names': [colset], 'results_file': results_file.as_posix()})
this_df_verify
return (this_df_verify,)
# NOTE(review): this span has two `def _(...)` headers under a single
# `@app.cell` decorator, which looks like removed and added diff lines shown
# together - confirm against the real file before editing.
# First part (parameters Path, pl, this_df_verify): if
# ./data/exports/verification.csv exists, it is read, `this_df_verify` is
# concatenated onto it vertically, and the result is written back to the same
# path. Otherwise `verify_df` is just `this_df_verify` and nothing is written.
# Second part (parameters RESULTS_FILE, extract_qid_descr_map): calls
# extract_qid_descr_map(RESULTS_FILE), binds the result to `qid_descr_map`,
# and leaves it as a bare expression. The trailing `return` returns nothing.
@app.cell
def _(Path, pl, this_df_verify):
verification_record = Path('./data/exports/verification.csv')
if verification_record.exists():
verify_df = pl.read_csv(verification_record)
verify_df = pl.concat([verify_df, this_df_verify], how='vertical')
# save verify_df
verify_df.write_csv(verification_record)
else:
verify_df = this_df_verify
# append this_df_verify to verify_df
verify_df
def _(RESULTS_FILE, extract_qid_descr_map):
qid_descr_map = extract_qid_descr_map(RESULTS_FILE)
qid_descr_map
return
# NOTE(review): two `def _` headers follow one `@app.cell` here (`def _():`
# and the parameterised one) - likely an old and a new diff line shown
# together; confirm which header is current.
# The parameterised cell calls load_csv_with_qid_headers(RESULTS_FILE), binds
# the result to `df`, and leaves `df` as a bare expression. The trailing
# `return` returns nothing.
@app.cell
def _():
def _(RESULTS_FILE, load_csv_with_qid_headers):
df = load_csv_with_qid_headers(RESULTS_FILE)
df
return
# Markdown cell: passes a raw string to mo.md with a "Data Cleanup" heading
# and three bullet points (remove incomplete responses, flag duration
# outliers, flag responses with identical ratings throughout).
# The cell contains text only; no cleanup code is present in it.
@app.cell
def _(mo):
mo.md(r"""
# Data Cleanup
- Remove incomplete responses (progress < 100)
- Flag outliers based on duration (add column)
- Flag responses that give the same rating for everything (indicates lack of engagement)
""")
return
# Markdown cell: passes a raw string to mo.md with an "Answers Decoding"
# heading. The text describes remapping QID-style column names back to the
# question and option they refer to, with one worked example.
# The cell contains text only; no decoding code is present in it.
@app.cell
def _(mo):
mo.md(r"""
# Answers Decoding
Pipeline to decode the ranking of voices. Currently saved as QID's, they need to be remapped back to their actual values so that the analysis can be performed. ie:
`GQIK26_G0_x8_RANK` -> Refers to question `Top 3 Traits_0_8_RANK - What are the important traits for the Chase AI virtual assistant?` and thus the #8 option
""")
return
# Markdown cell (decorated with hide_code=True): passes a raw string to mo.md
# holding a TODO list for per-question decoding functions.
# NOTE(review): the last idea and its sub-bullets sit inside an HTML comment
# (`<!-- ... -->`), so they are presumably not shown when the markdown is
# rendered - confirm that hiding them is intended.
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
## TODO:
Create a python function for each of the questions. ie `def QID63()`. Each function should return a Polars query, that can be added to an existing query.
Ideas:
- Map column name to include the Voice number (VID) (ie the questions that only have 1 voice). The VID is in this case often included in the question description
- `QID_x_GROUP` Contains the rankings of the values, stored in order. The following columns (ie `QID26_G0_x1_RANK`) are redundant and not necessary for us. The function should drop the unnecessary columns to clean up
<!-- - Translate the RANK values back to the actual VID, and create an aggregate column that contains a list of the VIDs in order. ie: [V34, V56, V81].
- Use the first line of the question description (see `qid_descr_map`) to get the `"DataExportTag"`, which is a property that can be found in the `.qsf` file to inspect the choice number and it's corresponding VID
- "`VOICE SEL. 8-3_0_5_RANK`" refers to `"DataExportTag": "VOICE SEL. 8-3"`, `Group 0` (not important for this), `Choice 5`, and the value in the cell refers to the Rank assigned to that voice
- QSF file example to retrieve the VID: `"SurveyElements" -> (Find item where "Payload"["DataExportTag"] == "VOICE SEL. 8-3") -> "Payload" -> "Choices" -> "5" -> "Display" -> (Extract 'Voice <xx>' from the HTML)` -->
""")
return