Files
JPMC-quant/utils.py

74 lines
2.6 KiB
Python

import ast
import json
from pathlib import Path
from typing import Union

import pandas as pd
import polars as pl
def extract_qid(val):
    """Extract the 'ImportId' from a string representation of a dictionary.

    Parameters:
        val: A cell value. Only strings that start with '{' and end with '}'
            are parsed, e.g.
            '{"ImportId":"startDate","timeZone":"America/Denver"}'.

    Returns:
        The value stored under the 'ImportId' key, or None when ``val`` is
        not a brace-delimited string (e.g. NaN, None, plain text).

    Raises:
        ValueError, SyntaxError: If the string looks like a dict but cannot
            be parsed as JSON or as a Python literal.
        KeyError: If the parsed dictionary has no 'ImportId' key.
    """
    if not (isinstance(val, str) and val.startswith('{') and val.endswith('}')):
        return None

    # The metadata cells are JSON, so parse them as JSON first. This also
    # handles true/false/null, which eval() would reject with a NameError.
    try:
        parsed = json.loads(val)
    except ValueError:
        # Fall back for Python-repr style dicts (single quotes). Unlike
        # eval(), literal_eval only accepts literals and never runs code
        # embedded in the file being loaded.
        parsed = ast.literal_eval(val)
    return parsed['ImportId']
def extract_qid_descr_map(results_file: Union[str, Path]) -> dict:
    """Extract mapping of Qualtrics ImportID to Question Description from results file.

    Parameters:
        results_file: Path to the results CSV, as a string or Path.

    Returns:
        dict: For files whose path contains '1_1-16-2026', the header row
        mapped to the values of the first data row. For every other file,
        the ImportId parsed from the third row mapped to the text of the
        second row in the same column. If two columns yield the same
        ImportId, the later column wins.

    Raises:
        ValueError: If a regular export has no export-metadata row.
    """
    results_file = Path(results_file)

    if '1_1-16-2026' in results_file.as_posix():
        # This export is read with its first row as the header; the header is
        # paired with the first data row as-is.
        # NOTE(review): presumably the header already holds ImportIDs and the
        # first data row holds the descriptions -- confirm against the file.
        first_row = pd.read_csv(results_file, nrows=1)
        return first_row.iloc[0].to_dict()

    # First row contains Qualtrics Editor question names (ie 'B_VOICE SEL. 18-8')
    # Second row which contains the question content
    # Third row contains the Export Metadata (ie '{"ImportId":"startDate","timeZone":"America/Denver"}')
    # Skipping row 1 makes row 2 the header and row 3 the only data row.
    # NOTE(review): pandas rewrites repeated or blank header cells (e.g. 'X.1',
    # 'Unnamed: 3'), so such descriptions come back altered -- confirm whether
    # that matters for callers.
    descr_and_meta = pd.read_csv(results_file, nrows=1, skiprows=1)
    if descr_and_meta.empty:
        raise ValueError(f"No export metadata row found in {results_file}")

    export_metadata = descr_and_meta.iloc[0]
    return {
        extract_qid(metadata): description
        for description, metadata in export_metadata.items()
    }
def load_csv_with_qid_headers(file_path: Union[str, Path]) -> pl.DataFrame:
    """
    Load CSV where column headers are in row 3 as dict strings with ImportId.

    The 3rd row contains metadata like '{"ImportId":"startDate","timeZone":"America/Denver"}'.
    This function extracts the ImportId from each column and uses it as the column name.
    Every row after the 3rd is returned as data.

    Parameters:
        file_path (str | Path): Path to the CSV file to load.

    Returns:
        pl.DataFrame: Polars DataFrame with ImportId as column names.
    """
    if isinstance(file_path, str):
        file_path = Path(file_path)

    # Read the 3rd row (index 2) which contains the metadata dictionaries.
    # header=None keeps them as cell values instead of column names.
    metadata_row = pd.read_csv(file_path, nrows=1, skiprows=2, header=None).iloc[0]

    # Extract the ImportId from each column's metadata cell.
    new_columns = [extract_qid(cell) for cell in metadata_row]

    # Read the actual data starting from row 4 (skip first 3 rows).
    # has_header=False is required here: polars parses the header at the
    # skip_rows offset, so with the default it would consume row 4 -- the
    # first data row -- as a header, and that row would be lost once the
    # columns are renamed below.
    df = pl.read_csv(file_path, skip_rows=3, has_header=False)

    # Rename columns with the extracted ImportIds.
    df.columns = new_columns
    return df