initial plots

This commit is contained in:
2026-01-22 20:48:59 +01:00
parent b8642e9de8
commit dbcade215b
3 changed files with 642 additions and 59 deletions

339
utils.py
View File

@@ -2,6 +2,27 @@ import polars as pl
import json
import re
from pathlib import Path
from typing import Tuple, Union

import pandas as pd
def extract_voice_label(html_str: str) -> Union[str, None]:
    """
    Extract voice label from HTML string and convert to short format.

    Parameters:
        html_str (str): HTML string containing voice label in format "Voice N"

    Returns:
        str or None: Voice label in format "VN" (e.g., "V14"). None is returned
        when html_str is not a string or contains no "Voice N" label.

    Example:
        >>> extract_voice_label('<span style="...">Voice 14<br />...')
        'V14'
    """
    # A missing cell (None / NaN) carries no label; report it the same way as
    # "no match" instead of letting re.search raise a TypeError.
    if not isinstance(html_str, str):
        return None

    match = re.search(r'Voice (\d+)', html_str)
    return f"V{match.group(1)}" if match else None
def extract_qid(val):
"""Extracts the 'ImportId' from a string representation of a dictionary."""
@@ -11,64 +32,286 @@ def extract_qid(val):
return val['ImportId']
def extract_qid_descr_map(results_file: Union[str, Path]) -> dict:
"""Extract mapping of Qualtrics ImportID to Question Description from results file."""
if isinstance(results_file, str):
results_file = Path(results_file)
if '1_1-16-2026' in results_file.as_posix():
df_questions = pd.read_csv(results_file, nrows=1)
df_questions
class JPMCSurvey:
"""Class to handle JPMorgan Chase survey data."""
return df_questions.iloc[0].to_dict()
def __init__(self, data_path: Union[str, Path], qsf_path: Union[str, Path]):
    """Remember where the survey files live and eagerly load their metadata.

    Parameters:
        data_path: Location of the results CSV (str or Path).
        qsf_path: Location of the QSF file (str or Path).
    """
    # Plain strings are promoted to Path objects; anything else is stored as given.
    self.data_filepath = Path(data_path) if isinstance(data_path, str) else data_path
    self.qsf_filepath = Path(qsf_path) if isinstance(qsf_path, str) else qsf_path

    self.qid_descr_map = self._extract_qid_descr_map()
    self.qsf: dict = self._load_qsf()
def _extract_qid_descr_map(self) -> dict:
    """Build the question lookup from the leading rows of the results file.

    Returns:
        dict: For a '1_1-16-2026' export, the first data row as
        {column name: value}. For every other export,
        {ImportID: {'QName': ..., 'Description': ...}}.
    """
    if '1_1-16-2026' in self.data_filepath.as_posix():
        first_row = pd.read_csv(self.data_filepath, nrows=1)
        return first_row.iloc[0].to_dict()

    # The header line holds the Qualtrics Editor question names (ie 'B_VOICE SEL. 18-8'),
    # the line after it the question content, and the next one the Export Metadata
    # (ie '{"ImportId":"startDate","timeZone":"America/Denver"}')
    header_rows = pd.read_csv(self.data_filepath, nrows=2)

    # Transpose so that every survey column becomes one record
    per_column = header_rows.T.reset_index()
    per_column.columns = ['QName', 'Description', 'export_metadata']
    per_column['ImportID'] = per_column['export_metadata'].apply(extract_qid)

    # Result is keyed by ImportID; each value is a {'QName': ..., 'Description': ...} dict
    indexed = per_column[['ImportID', 'QName', 'Description']].set_index('ImportID')
    return indexed[['QName', 'Description']].T.to_dict()
def _load_qsf(self) -> dict:
    """Read the QSF file as UTF-8 and return its parsed JSON content."""
    with open(self.qsf_filepath, 'r', encoding='utf-8') as qsf_file:
        return json.load(qsf_file)
def _get_qsf_question_by_QID(self, QID: str) -> dict:
    """Return the single QSF SurveyElement whose 'PrimaryAttribute' equals QID.

    Parameters:
        QID (str): Value to match against each element's 'PrimaryAttribute'.

    Returns:
        dict: The matching SurveyElement.

    Raises:
        ValueError: If no element matches, or if more than one does.
    """
    matches = []
    for element in self.qsf['SurveyElements']:
        if element['PrimaryAttribute'] == QID:
            matches.append(element)

    if not matches:
        raise ValueError(f"SurveyElement with 'PrimaryAttribute': '{QID}' not found in QSF.")
    if len(matches) > 1:
        raise ValueError(f"Multiple SurveyElements with 'PrimaryAttribute': '{QID}' found in QSF: \n{matches}")
    return matches[0]
else:
# First row contains Qualtrics Editor question names (ie 'B_VOICE SEL. 18-8')
# Second row which contains the question content
# Third row contains the Export Metadata (ie '{"ImportId":"startDate","timeZone":"America/Denver"}')
df_questions = pd.read_csv(results_file, nrows=1, skiprows=1)
def load_data(self) -> pl.LazyFrame:
    """
    Load the results CSV, naming every column after the ImportId found in row 3.

    The 3rd row contains metadata like '{"ImportId":"startDate","timeZone":"America/Denver"}'.
    The ImportId is extracted from each cell of that row and used as the column name.
    The file is read from self.data_filepath.

    Returns:
        pl.LazyFrame: Polars LazyFrame with ImportId as column names, holding
        every row from row 4 onwards.

    Raises:
        NotImplementedError: If the file path contains '1_1-16-2026'.
    """
    if '1_1-16-2026' in self.data_filepath.as_posix():
        raise NotImplementedError("This method does not support the '1_1-16-2026' export format.")

    # Read the 3rd row (index 2) which contains the metadata dictionaries
    # Use header=None to get raw values instead of treating them as column names
    df_meta = pd.read_csv(self.data_filepath, nrows=1, skiprows=2, header=None)

    # Extract ImportIds from each column value in this row
    new_columns = [extract_qid(val) for val in df_meta.iloc[0]]

    # Now read the actual data starting from row 4 (skip first 3 rows).
    # has_header=False is required: polars parses the header at the skip_rows
    # offset, so with the default the first response row would be consumed as
    # column names and silently dropped from the data.
    df = pl.read_csv(self.data_filepath, skip_rows=3, has_header=False)

    # Rename columns with the extracted ImportIds
    df.columns = new_columns

    return df.lazy()
# transpose df_questions
df_questions = df_questions.T.reset_index()
df_questions.columns = ['Description', 'export_metadata']
df_questions['ImportID'] = df_questions['export_metadata'].apply(extract_qid)
df_questions = df_questions[['ImportID', 'Description']]
return dict(zip(df_questions['ImportID'], df_questions['Description']))
def _get_subset(self, q: pl.LazyFrame, QIDs, rename_cols=True) -> pl.LazyFrame:
    """Select the given question columns, optionally renaming them to their QName.

    Parameters:
        q (pl.LazyFrame): Query to select from.
        QIDs: Column names to keep.
        rename_cols (bool): When True, rename each selected column that has an
            entry in qid_descr_map to that entry's 'QName'.

    Returns:
        pl.LazyFrame: The selection, renamed if requested.
    """
    subset = q.select(QIDs)
    if not rename_cols:
        return subset

    # Columns without an entry in the map keep their original name
    new_names = {}
    for qid in QIDs:
        if qid in self.qid_descr_map:
            new_names[qid] = self.qid_descr_map[qid]['QName']
    return subset.rename(new_names)
def load_csv_with_qid_headers(file_path: Union[str, Path]) -> pl.DataFrame:
"""
Load CSV where column headers are in row 3 as dict strings with ImportId.
The 3rd row contains metadata like '{"ImportId":"startDate","timeZone":"America/Denver"}'.
This function extracts the ImportId from each column and uses it as the column name.
Parameters:
file_path (Path): Path to the CSV file to load.
Returns:
pl.DataFrame: Polars DataFrame with ImportId as column names.
"""
if isinstance(file_path, str):
file_path = Path(file_path)
def get_demographics(self, q: pl.LazyFrame) -> Tuple[pl.LazyFrame, None]:
    """Extract columns containing the demographics.

    Columns are renamed to their QName where qid_descr_map has an entry for them.

    Parameters:
        q (pl.LazyFrame): Query that contains the demographic columns.

    Returns:
        tuple: (subquery, None). The subquery can be chained with other polars
        queries; the second element is always None for this selection.
    """
    QIDs = ['QID1', 'QID2', 'QID3', 'QID4', 'QID13', 'QID14', 'QID15', 'QID16', 'QID17', 'Consumer']
    return self._get_subset(q, QIDs), None
# Read the 3rd row (index 2) which contains the metadata dictionaries
# Use header=None to get raw values instead of treating them as column names
df_meta = pd.read_csv(file_path, nrows=1, skiprows=2, header=None)
def get_top_8_traits(self, q: pl.LazyFrame) -> Tuple[pl.LazyFrame, None]:
    """Extract columns containing the top 8 characteristics are most important for this Chase virtual assistant to have.

    Parameters:
        q (pl.LazyFrame): Query that contains column 'QID25'.

    Returns:
        tuple: (subquery, None). The subquery holds 'QID25' renamed to
        'Top_8_Traits' and can be chained with other polars queries; the
        second element is always None for this selection.
    """
    QIDs = ['QID25']
    return self._get_subset(q, QIDs, rename_cols=False).rename({'QID25': 'Top_8_Traits'}), None
def get_top_3_traits(self, q: pl.LazyFrame) -> Tuple[pl.LazyFrame, None]:
    """Extract columns containing the top 3 characteristics that the Chase virtual assistant should prioritize.

    Parameters:
        q (pl.LazyFrame): Query that contains column 'QID26_0_GROUP'.

    Returns:
        tuple: (subquery, None). The subquery holds 'QID26_0_GROUP' renamed to
        'Top_3_Traits' and can be chained with other polars queries; the
        second element is always None for this selection.
    """
    QIDs = ['QID26_0_GROUP']
    return self._get_subset(q, QIDs, rename_cols=False).rename({'QID26_0_GROUP': 'Top_3_Traits'}), None
# Extract ImportIds from each column value in this row
new_columns = [extract_qid(val) for val in df_meta.iloc[0]]
# Now read the actual data starting from row 4 (skip first 3 rows)
df = pl.read_csv(file_path, skip_rows=3)
def get_character_ranking(self, q: pl.LazyFrame) -> Tuple[pl.LazyFrame, None]:
    """Extract columns containing the ranking of characteristics for the Chase virtual assistant.

    Parameters:
        q (pl.LazyFrame): Query that contains the 'QID27_<n>' columns.

    Returns:
        tuple: (subquery, None). The subquery holds the columns renamed to
        'Character_Ranking_<name>' (spaces in the name replaced by '_') and can
        be chained with other polars queries; the second element is always None.

    Raises:
        ValueError: If 'QID27' is missing from, or duplicated in, the QSF.
    """
    # Requires QSF to map "Character Ranking_2" to the actual character
    cfg = self._get_qsf_question_by_QID('QID27')['Payload']

    # RecodeValues maps choice key -> column suffix; VariableNaming maps choice key -> name
    QIDs_map = {f'QID27_{v}': cfg['VariableNaming'][k] for k, v in cfg['RecodeValues'].items()}
    QIDs_rename = {qid: f'Character_Ranking_{name.replace(" ", "_")}' for qid, name in QIDs_map.items()}

    return self._get_subset(q, list(QIDs_rename), rename_cols=False).rename(QIDs_rename), None
# Rename columns with the extracted ImportIds
df.columns = new_columns
def get_18_8_3(self, q: pl.LazyFrame) -> Tuple[pl.LazyFrame, None]:
    """Extract columns containing the 18-8-3 feedback for the Chase virtual assistant.

    Parameters:
        q (pl.LazyFrame): Query that contains columns 'QID29', 'QID101' and 'QID36_0_GROUP'.

    Returns:
        tuple: (subquery, None). The subquery holds the three columns renamed to
        '18-8_Set-A', '18-8_Set-B' and '8-3_Ranked' and can be chained with
        other polars queries; the second element is always None.
    """
    QIDs = ['QID29', 'QID101', 'QID36_0_GROUP']

    rename_dict = {
        'QID29': '18-8_Set-A',
        'QID101': '18-8_Set-B',
        'QID36_0_GROUP': '8-3_Ranked',
    }
    return self._get_subset(q, QIDs, rename_cols=False).rename(rename_dict), None
return df
def get_voice_scale_1_10(self, q: pl.LazyFrame) -> Tuple[pl.LazyFrame, None]:
    """Extract columns containing the Voice Scale 1-10 ratings for the Chase virtual assistant.

    Every column whose QName contains 'Scale 1-10_1' is selected.

    Parameters:
        q (pl.LazyFrame): Query that contains the matching columns.

    Returns:
        tuple: (subquery, None). The subquery holds the columns renamed to
        'Voice_Scale_1_10__V<n>' and can be chained with other polars queries;
        the second element is always None.
    """
    QIDs_map = {}

    for qid, val in self.qid_descr_map.items():
        if 'Scale 1-10_1' in val['QName']:
            # Convert "Voice 16 Scale 1-10_1" to "Voice_Scale_1_10__V16"
            # (the second whitespace-separated token of the QName is the voice number)
            QIDs_map[qid] = f"Voice_Scale_1_10__V{val['QName'].split()[1]}"

    return self._get_subset(q, list(QIDs_map.keys()), rename_cols=False).rename(QIDs_map), None
def get_ss_green_blue(self, q: pl.LazyFrame) -> Tuple[pl.LazyFrame, dict]:
    """Extract columns containing the SS Green/Blue ratings for the Chase virtual assistant.

    Every column whose QName contains 'SS Green-Blue' is selected.

    Parameters:
        q (pl.LazyFrame): Query that contains the matching columns.

    Returns:
        tuple: (subquery, choices_map). The subquery holds the columns renamed
        to 'SS_Green_Blue__<voice>__Choice_<n>' and can be chained with other
        polars queries. choices_map maps each new column name to the 'Display'
        value of that choice in the QSF.

    Raises:
        ValueError: If 'QID35', or the base QID of a matching column, is missing
            from or duplicated in the QSF.
    """
    # Payloads keyed by base QID so each question is looked up in the QSF once
    # instead of once per column. QID35 is resolved up front, as before, so a
    # QSF without it still fails immediately.
    payloads = {'QID35': self._get_qsf_question_by_QID('QID35')['Payload']}

    QIDs_map = {}
    choices_map = {}
    for qid, val in self.qid_descr_map.items():
        if 'SS Green-Blue' not in val['QName']:
            continue

        base_qid = qid.split('_')[0]
        if base_qid not in payloads:
            payloads[base_qid] = self._get_qsf_question_by_QID(base_qid)['Payload']
        cfg = payloads[base_qid]

        # ie: "V14 SS Green-Blue_1"
        qname_parts = val['QName'].split()
        voice = qname_parts[0]
        trait_num = qname_parts[-1].split('_')[-1]

        new_name = f"SS_Green_Blue__{voice}__Choice_{trait_num}"
        QIDs_map[qid] = new_name
        choices_map[new_name] = cfg['Choices'][trait_num]['Display']

    return self._get_subset(q, list(QIDs_map.keys()), rename_cols=False).rename(QIDs_map), choices_map
def get_top_3_voices(self, q: pl.LazyFrame) -> Tuple[pl.LazyFrame, None]:
    """Extract columns containing the top 3 voice choices for the Chase virtual assistant.

    Every column whose QName contains 'Rank Top 3 Voices' is selected.

    Parameters:
        q (pl.LazyFrame): Query that contains the matching columns.

    Returns:
        tuple: (subquery, None). The subquery holds the columns renamed to
        'Top_3_Voices_ranking__<voice>' and can be chained with other polars
        queries; the second element is always None.

    Raises:
        ValueError: If a required question is missing from or duplicated in the
            QSF, or if a matching question's DynamicChoices Locator does not
            point at QID36.
    """
    QIDs_map = {}

    # Choice key -> short voice label ("V14"), taken from the QID36 choice texts
    cfg36 = self._get_qsf_question_by_QID('QID36')['Payload']
    choice_voice_map = {k: extract_voice_label(v['Display']) for k, v in cfg36['Choices'].items()}

    for qid, val in self.qid_descr_map.items():
        if 'Rank Top 3 Voices' in val['QName']:

            cfg = self._get_qsf_question_by_QID(qid.split('_')[0])['Payload']
            voice_num = val['QName'].split('_')[-1]

            # Validate that the DynamicChoices Locator is as expected
            if cfg['DynamicChoices']['Locator'] != r"q://QID36/ChoiceGroup/SelectedChoicesInGroup/1":
                raise ValueError(f"Unexpected DynamicChoices Locator for QID '{qid}': {cfg['DynamicChoices']['Locator']}")

            # extract the voice from the QID36 config
            voice = choice_voice_map[voice_num]

            # Convert "Top 3 Voices_1" to "Top_3_Voices_ranking__V14"
            QIDs_map[qid] = f"Top_3_Voices_ranking__{voice}"

    return self._get_subset(q, list(QIDs_map.keys()), rename_cols=False).rename(QIDs_map), None
def get_ss_orange_red(self, q: pl.LazyFrame) -> Tuple[pl.LazyFrame, dict]:
    """Extract columns containing the SS Orange/Red ratings for the Chase virtual assistant.

    Every column whose QName contains 'SS Orange-Red' is selected.

    Parameters:
        q (pl.LazyFrame): Query that contains the matching columns.

    Returns:
        tuple: (subquery, choices_map). The subquery holds the columns renamed
        to 'SS_Orange_Red__<voice>__Choice_<n>' and can be chained with other
        polars queries. choices_map maps each new column name to the 'Display'
        value of that choice in the QSF.

    Raises:
        ValueError: If 'QID40', or the base QID of a matching column, is missing
            from or duplicated in the QSF.
    """
    # Payloads keyed by base QID so each question is looked up in the QSF once
    # instead of once per column. QID40 is resolved up front, as before, so a
    # QSF without it still fails immediately.
    payloads = {'QID40': self._get_qsf_question_by_QID('QID40')['Payload']}

    QIDs_map = {}
    choices_map = {}
    for qid, val in self.qid_descr_map.items():
        if 'SS Orange-Red' not in val['QName']:
            continue

        base_qid = qid.split('_')[0]
        if base_qid not in payloads:
            payloads[base_qid] = self._get_qsf_question_by_QID(base_qid)['Payload']
        cfg = payloads[base_qid]

        # ie: "V14 SS Orange-Red_1"
        qname_parts = val['QName'].split()
        voice = qname_parts[0]
        trait_num = qname_parts[-1].split('_')[-1]

        new_name = f"SS_Orange_Red__{voice}__Choice_{trait_num}"
        QIDs_map[qid] = new_name
        choices_map[new_name] = cfg['Choices'][trait_num]['Display']

    return self._get_subset(q, list(QIDs_map.keys()), rename_cols=False).rename(QIDs_map), choices_map
def get_character_refine(self, q: pl.LazyFrame) -> pl.LazyFrame:
"""Extract columns containing the character refine feedback for the Chase virtual assistant.
Returns subquery that can be chained with other polars queries.
"""
QIDs = ['QID29', 'QID101', 'QID36_0_GROUP']
rename_dict = {
'QID29': '18-8_Set-A',
'QID101': '18-8_Set-B',
'QID36_0_GROUP': '8-3_Ranked'
}