move common ingest functions to utils
This commit is contained in:
@@ -11,7 +11,9 @@ def _():
|
||||
import pandas as pd
|
||||
import plotly as plt
|
||||
from pathlib import Path
|
||||
return Path, mo, pd, pl
|
||||
|
||||
from utils import extract_qid_descr_map
|
||||
return Path, extract_qid_descr_map, mo, pd
|
||||
|
||||
|
||||
@app.cell
|
||||
@@ -31,35 +33,8 @@ def _(mo):
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(pd, results_file):
|
||||
if '1_1-16-2026' in results_file.as_posix():
|
||||
df_questions = pd.read_csv(results_file, nrows=1)
|
||||
df_questions
|
||||
|
||||
qid_descr_map = df_questions.iloc[0].to_dict()
|
||||
qid_descr_map
|
||||
|
||||
else:
|
||||
# First row contains Qualtrics Editor question names (ie 'B_VOICE SEL. 18-8')
|
||||
|
||||
# Second row which contains the question content
|
||||
# Third row contains the Export Metadata (ie '{"ImportId":"startDate","timeZone":"America/Denver"}')
|
||||
df_questions = pd.read_csv(results_file, nrows=1, skiprows=1)
|
||||
|
||||
def extract_qid(val):
|
||||
if isinstance(val, str) and val.startswith('{') and val.endswith('}'):
|
||||
val = eval(val)
|
||||
return val['ImportId']
|
||||
|
||||
# transpose df_questions
|
||||
df_questions = df_questions.T.reset_index()
|
||||
df_questions.columns = ['Description', 'export_metadata']
|
||||
df_questions['ImportID'] = df_questions['export_metadata'].apply(extract_qid)
|
||||
|
||||
df_questions = df_questions[['ImportID', 'Description']]
|
||||
|
||||
qid_descr_map = dict(zip(df_questions['ImportID'], df_questions['Description']))
|
||||
|
||||
def _(extract_qid_descr_map, results_file):
|
||||
qid_descr_map = extract_qid_descr_map(results_file)
|
||||
qid_descr_map
|
||||
return (qid_descr_map,)
|
||||
|
||||
@@ -92,7 +67,7 @@ def _(mo):
|
||||
|
||||
@app.cell
|
||||
def _(Path, pd, validate_df):
|
||||
validate_record_csv = Path('./data/exports/validation_qid_descr_map.csv')
|
||||
validate_record_csv = Path('./validation_qid_descr_map.csv')
|
||||
|
||||
if not validate_record_csv.exists():
|
||||
validate_df.to_csv(validate_record_csv, index=False)
|
||||
@@ -135,7 +110,7 @@ def validate_mappings(_df):
|
||||
'Descriptions': descriptions.tolist(),
|
||||
'SourceFiles': group['SourceFile'].tolist()
|
||||
})
|
||||
|
||||
|
||||
# Check for new or missing ImportIDs
|
||||
source_files = group['SourceFile'].unique()
|
||||
if len(source_files) < len(_df['SourceFile'].unique()):
|
||||
@@ -160,14 +135,14 @@ def _(pd, validate_record_csv):
|
||||
@app.cell
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## Process (Dismiss) Errors
|
||||
## Inspect & Dismiss Errors
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(pd, validate_record_csv):
|
||||
# Known issue with the 'OneDrive_1_1-16-2026' export, where each of the descriptions is prepended with a '<Qualtrics Editor Question name> - ' string. Drop that then, recompare
|
||||
# Known issue with the 'OneDrive_1_1-16-2026' export, where each of the descriptions is prepended with a '<Qualtrics Editor Question name> - ' string. Drop that, then recompare
|
||||
|
||||
_df = pd.read_csv(validate_record_csv)
|
||||
|
||||
@@ -176,62 +151,6 @@ def _(pd, validate_record_csv):
|
||||
|
||||
validation_issues_fixed = validate_mappings(_df)
|
||||
validation_issues_fixed
|
||||
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# Process Data
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(pl, results_file):
|
||||
df = pl.read_csv(results_file, has_header=True, skip_rows_after_header=1)
|
||||
df
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# Answers Decoding
|
||||
|
||||
Pipeline to decode the ranking of voices. The rankings are currently saved as QIDs and need to be remapped back to their actual values so that the analysis can be performed, e.g.:
|
||||
|
||||
`GQIK26_G0_x8_RANK` -> Refers to question `Top 3 Traits_0_8_RANK - What are the important traits for the Chase AI virtual assistant?` and thus the #8 option
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## TODO:
|
||||
|
||||
Create a Python function for each of the questions, e.g. `def QID63()`. Each function should return a Polars query that can be added to an existing query.
|
||||
|
||||
Ideas:
|
||||
- Map column name to include the Voice number (VID) (ie the questions that only have 1 voice). The VID is in this case often included in the question description
|
||||
- `QID_x_GROUP` Contains the rankings of the values, stored in order. The following columns (ie `QID26_G0_x1_RANK`) are redundant and not necessary for us. The function should drop the unnecessary columns to clean up
|
||||
<!-- - Translate the RANK values back to the actual VID, and create an aggregate column that contains a list of the VIDs in order. ie: [V34, V56, V81].
|
||||
- Use the first line of the question description (see `qid_descr_map`) to get the `"DataExportTag"`, which is a property that can be found in the `.qsf` file to inspect the choice number and its corresponding VID
|
||||
- "`VOICE SEL. 8-3_0_5_RANK`" refers to `"DataExportTag": "VOICE SEL. 8-3"`, `Group 0` (not important for this), `Choice 5`, and the value in the cell refers to the Rank assigned to that voice
|
||||
- QSF file example to retrieve the VID: `"SurveyElements" -> (Find item where "Payload"["DataExportTag"] == "VOICE SEL. 8-3") -> "Payload" -> "Choices" -> "5" -> "Display" -> (Extract 'Voice <xx>' from the HTML)` -->
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
return
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user