move common ingest functions to utils
This commit is contained in:
@@ -8,55 +8,72 @@ app = marimo.App(width="medium")
|
||||
def _():
|
||||
import marimo as mo
|
||||
import polars as pl
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
return Path, pl
|
||||
|
||||
from utils import extract_qid_descr_map, load_csv_with_qid_headers
|
||||
return extract_qid_descr_map, load_csv_with_qid_headers, mo
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
# RESULTS_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase Brand Personality_Quant Round 1_January 21, 2026_Soft Launch_Labels.csv'
|
||||
RESULTS_FILE = 'data/exports/OneDrive_1_1-16-2026/JPMC_Chase Brand Personality_Quant Round 1_TestData_Labels.csv'
|
||||
RESULTS_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase Brand Personality_Quant Round 1_January 21, 2026_Soft Launch_Labels.csv'
|
||||
# RESULTS_FILE = 'data/exports/OneDrive_1_1-16-2026/JPMC_Chase Brand Personality_Quant Round 1_TestData_Labels.csv'
|
||||
return (RESULTS_FILE,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(Path, RESULTS_FILE, pl):
|
||||
results_file = Path(RESULTS_FILE)
|
||||
df = pl.read_csv(results_file, skip_rows=0)
|
||||
df
|
||||
return df, results_file
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(df, pl, results_file):
|
||||
colset = set(df.columns)
|
||||
this_df_verify = pl.DataFrame({'column_names': [colset], 'results_file': results_file.as_posix()})
|
||||
this_df_verify
|
||||
return (this_df_verify,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(Path, pl, this_df_verify):
|
||||
verification_record = Path('./data/exports/verification.csv')
|
||||
if verification_record.exists():
|
||||
verify_df = pl.read_csv(verification_record)
|
||||
|
||||
verify_df = pl.concat([verify_df, this_df_verify], how='vertical')
|
||||
|
||||
# save verify_df
|
||||
verify_df.write_csv(verification_record)
|
||||
|
||||
else:
|
||||
verify_df = this_df_verify
|
||||
|
||||
# append this_df_verify to verify_df
|
||||
verify_df
|
||||
def _(RESULTS_FILE, extract_qid_descr_map):
|
||||
qid_descr_map = extract_qid_descr_map(RESULTS_FILE)
|
||||
qid_descr_map
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
def _(RESULTS_FILE, load_csv_with_qid_headers):
|
||||
df = load_csv_with_qid_headers(RESULTS_FILE)
|
||||
df
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# Data Cleanup
|
||||
|
||||
- Remove incomplete responses (progress < 100)
|
||||
- Flag outliers based on duration (add column)
|
||||
- Flag responses that give the same rating for everything (indicates lack of engagement)
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# Answers Decoding
|
||||
|
||||
Pipeline to decode the ranking of voices. The answers are currently stored as QIDs and need to be remapped to their actual values before the analysis can be performed, e.g.:
|
||||
|
||||
`GQIK26_G0_x8_RANK` -> Refers to question `Top 3 Traits_0_8_RANK - What are the important traits for the Chase AI virtual assistant?` and thus the #8 option
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## TODO:
|
||||
|
||||
Create a Python function for each of the questions, e.g. `def QID63()`. Each function should return a Polars query that can be added to an existing query.
|
||||
|
||||
Ideas:
|
||||
- Map the column name to include the Voice number (VID), i.e. for the questions that only have one voice. In this case the VID is often included in the question description.
|
||||
- `QID_x_GROUP` contains the rankings of the values, stored in order. The columns that follow it (e.g. `QID26_G0_x1_RANK`) are redundant and not needed. The function should drop these columns to clean up.
|
||||
<!-- - Translate the RANK values back to the actual VID, and create an aggregate column that contains a list of the VIDs in order. ie: [V34, V56, V81].
|
||||
- Use the first line of the question description (see `qid_descr_map`) to get the `"DataExportTag"`, a property found in the `.qsf` file that can be used to look up the choice number and its corresponding VID
|
||||
- "`VOICE SEL. 8-3_0_5_RANK`" refers to `"DataExportTag": "VOICE SEL. 8-3"`, `Group 0` (not important for this), `Choice 5`, and the value in the cell refers to the Rank assigned to that voice
|
||||
- QSF file example to retrieve the VID: `"SurveyElements" -> (Find item where "Payload"["DataExportTag"] == "VOICE SEL. 8-3") -> "Payload" -> "Choices" -> "5" -> "Display" -> (Extract 'Voice <xx>' from the HTML)` -->
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user