move common ingest functions to utils

2026-01-22 11:59:48 +01:00
parent 18ada6ca66
commit b8642e9de8
3 changed files with 136 additions and 126 deletions
--- a/utils.py
+++ b/utils.py
@@ -0,0 +1,74 @@
+import ast
+import json
+from pathlib import Path
+from typing import Union
+
+import pandas as pd
+import polars as pl
+
+def extract_qid(val):
+    """Extract the 'ImportId' from a dictionary or its string representation.
+
+    Parameters:
+    val (str | dict): Either a mapping that has an 'ImportId' key, or a string
+        wrapped in braces holding such a mapping, e.g.
+        '{"ImportId":"startDate","timeZone":"America/Denver"}'.
+
+    Returns:
+    The value stored under 'ImportId'.
+
+    Raises:
+    KeyError: If the mapping has no 'ImportId' key.
+    TypeError: If val is not a mapping and cannot be parsed into one.
+    ValueError, SyntaxError: If a brace-wrapped string is neither valid JSON
+        nor a valid Python literal.
+    """
+    if isinstance(val, str):
+        text = val.strip()
+        if text.startswith('{') and text.endswith('}'):
+            # The text comes from a data file, so it is parsed rather than
+            # evaluated: eval() would execute arbitrary code and also fails
+            # on the JSON literals true/false/null.
+            try:
+                val = json.loads(text)
+            except json.JSONDecodeError:
+                # Fall back for Python-style dict strings (single quotes).
+                val = ast.literal_eval(text)
+
+    try:
+        return val['ImportId']
+    except TypeError as err:
+        raise TypeError(
+            f"Cannot extract 'ImportId' from {val!r}: expected a mapping or "
+            "a string representation of one"
+        ) from err
+
+
+def extract_qid_descr_map(results_file: Union[str, Path]) -> dict:
+    """Extract mapping of Qualtrics ImportID to Question Description from results file.
+
+    Two file layouts are handled:
+
+    * Paths containing '1_1-16-2026': the header row supplies the keys and
+      the first data row supplies the values.
+    * Any other path: row 1 (Qualtrics Editor question names, e.g.
+      'B_VOICE SEL. 18-8') is skipped, row 2 (question content) supplies the
+      descriptions, and row 3 (export metadata such as
+      '{"ImportId":"startDate","timeZone":"America/Denver"}') supplies the
+      ImportIds.
+
+    Parameters:
+    results_file (Union[str, Path]): Path to the results CSV file.
+
+    Returns:
+    dict: Mapping of ImportId to question description.
+    """
+    results_file = Path(results_file)
+
+    # NOTE(review): the layout is selected by a hard-coded fragment of the
+    # export name - confirm this is still the intended discriminator.
+    if '1_1-16-2026' in results_file.as_posix():
+        df_questions = pd.read_csv(results_file, nrows=1)
+        return df_questions.iloc[0].to_dict()
+
+    # skiprows=1 makes row 2 the header (descriptions); nrows=1 then reads
+    # row 3 (export metadata) as the only data row.
+    df_questions = pd.read_csv(results_file, nrows=1, skiprows=1)
+    descriptions = df_questions.columns
+    import_ids = [extract_qid(meta) for meta in df_questions.iloc[0]]
+
+    return dict(zip(import_ids, descriptions))
+
+
+def load_csv_with_qid_headers(file_path: Union[str, Path]) -> pl.DataFrame:
+    """
+    Load CSV where column headers are in row 3 as dict strings with ImportId.
+
+    The 3rd row contains metadata like '{"ImportId":"startDate","timeZone":"America/Denver"}'.
+    This function extracts the ImportId from each column and uses it as the column name.
+    Rows 1-3 are dropped; every row from row 4 onwards is returned as data.
+
+    Parameters:
+    file_path (Union[str, Path]): Path to the CSV file to load.
+
+    Returns:
+    pl.DataFrame: Polars DataFrame with ImportId as column names.
+    """
+    file_path = Path(file_path)
+
+    # Read the 3rd row (index 2) which contains the metadata dictionaries.
+    # header=None keeps the cells as raw values instead of column labels.
+    df_meta = pd.read_csv(file_path, nrows=1, skiprows=2, header=None)
+
+    # Extract ImportIds from each column value in this row
+    new_columns = [extract_qid(val) for val in df_meta.iloc[0]]
+
+    # Polars parses a header at the skip_rows offset by default, which would
+    # consume the first data row (row 4) as column names and drop it from the
+    # result. has_header=False keeps that row as data.
+    df = pl.read_csv(file_path, skip_rows=3, has_header=False)
+
+    # Replace the autogenerated column names with the extracted ImportIds
+    df.columns = new_columns
+
+    return df