move common ingest functions to utils
This commit is contained in:
74
utils.py
Normal file
74
utils.py
Normal file
@@ -0,0 +1,74 @@
|
||||
import polars as pl
|
||||
from pathlib import Path
|
||||
import pandas as pd
|
||||
from typing import Union
|
||||
|
||||
def extract_qid(val):
    """Extract the 'ImportId' from a string representation of a dictionary.

    Qualtrics export metadata cells look like
    '{"ImportId":"startDate","timeZone":"America/Denver"}'.

    Parameters:
        val: A cell value. Only strings that start with '{' and end with '}'
            are parsed; anything else is ignored.

    Returns:
        The value stored under the 'ImportId' key, or None when ``val`` is
        not a brace-delimited string.

    Raises:
        KeyError: If the parsed dictionary has no 'ImportId' key.
        ValueError, SyntaxError: If the text is neither valid JSON nor a
            plain Python literal.
    """
    # Imported locally so this function does not depend on module-level
    # imports that the rest of the file may not have.
    import ast
    import json

    if not (isinstance(val, str) and val.startswith('{') and val.endswith('}')):
        return None

    # SECURITY: this previously used eval(), which executes arbitrary code
    # embedded in the CSV cell. The metadata is JSON, so parse it as data.
    try:
        parsed = json.loads(val)
    except json.JSONDecodeError:
        # Keep accepting Python-style dict literals (e.g. single-quoted keys)
        # that eval() used to handle, but without executing any code.
        parsed = ast.literal_eval(val)

    return parsed['ImportId']
|
||||
|
||||
|
||||
def extract_qid_descr_map(results_file: Union[str, Path]) -> dict:
    """Build a lookup from Qualtrics ImportID to question description.

    Parameters:
        results_file: Location of the results CSV, as a string or Path.

    Returns:
        dict: For files whose path contains '1_1-16-2026', the first data row
        keyed by the CSV header. Otherwise, a mapping of the ImportID parsed
        from each column's export metadata to that column's description.
    """
    csv_path = Path(results_file) if isinstance(results_file, str) else results_file

    # Files matching this marker are handled separately: the header row and
    # the first data row are returned directly as key/value pairs.
    if '1_1-16-2026' in csv_path.as_posix():
        first_row_frame = pd.read_csv(csv_path, nrows=1)
        return first_row_frame.iloc[0].to_dict()

    # Layout of the remaining exports:
    #   row 1: Qualtrics Editor question names (ie 'B_VOICE SEL. 18-8')
    #   row 2: the question content
    #   row 3: the Export Metadata (ie '{"ImportId":"startDate","timeZone":"America/Denver"}')
    # Skipping row 1 makes row 2 the header and row 3 the only data row.
    header_frame = pd.read_csv(csv_path, nrows=1, skiprows=1)

    # Transpose so each question becomes one record: the former column label
    # is the description and the single cell is the metadata string.
    per_question = header_frame.T.reset_index()
    per_question.columns = ['Description', 'export_metadata']

    import_ids = per_question['export_metadata'].apply(extract_qid)
    return dict(zip(import_ids, per_question['Description']))
|
||||
|
||||
|
||||
def load_csv_with_qid_headers(file_path: Union[str, Path]) -> pl.DataFrame:
    """
    Load CSV where column headers are in row 3 as dict strings with ImportId.

    The 3rd row contains metadata like '{"ImportId":"startDate","timeZone":"America/Denver"}'.
    This function extracts the ImportId from each column and uses it as the column name.
    Rows 1-3 are treated as header material; every row from row 4 onward is
    returned as data.

    Parameters:
        file_path (Path): Path to the CSV file to load.

    Returns:
        pl.DataFrame: Polars DataFrame with ImportId as column names.
    """
    if isinstance(file_path, str):
        file_path = Path(file_path)

    # Read the 3rd row (index 2) which contains the metadata dictionaries.
    # header=None yields the raw cell values instead of treating them as
    # column names.
    df_meta = pd.read_csv(file_path, nrows=1, skiprows=2, header=None)

    # Extract the ImportId from each column's metadata cell.
    new_columns = [extract_qid(val) for val in df_meta.iloc[0]]

    # Read the actual data starting from row 4 (skip the 3 header rows).
    # has_header=False is required: polars parses a header at the skip_rows
    # offset by default, which would consume row 4 -- the first real data
    # row -- as column names and silently drop it from the result.
    df = pl.read_csv(file_path, skip_rows=3, has_header=False)

    # Replace the auto-generated column names with the extracted ImportIds.
    df.columns = new_columns

    return df
|
||||
Reference in New Issue
Block a user