setup complete framework of analysis

This commit is contained in:
2026-01-23 09:53:59 +01:00
parent 42f2d775c7
commit 5327b50ab0
5 changed files with 465 additions and 6 deletions

View File

@@ -32,6 +32,26 @@ def extract_qid(val):
return val['ImportId']
def combine_exclusive_columns(df: pl.DataFrame, id_col: str = "_recordId", target_col_name: str = "combined_value") -> pl.DataFrame:
    """Collapse mutually exclusive value columns into a single column.

    Every column of ``df`` other than ``id_col`` is treated as a value
    column. Each row is expected to have at most one of those columns
    populated (non-null).

    Args:
        df: Frame holding the identifier column plus the value columns.
        id_col: Name of the identifier column that is carried through as-is.
        target_col_name: Name given to the merged output column.

    Returns:
        A frame with two columns: ``id_col`` and ``target_col_name``, the
        latter built with ``pl.coalesce`` over the value columns.

    Raises:
        ValueError: If any row has more than one value column populated.
    """
    value_cols = [name for name in df.columns if name != id_col]

    # Per-row count of value columns that are non-null.
    populated_per_row = df.select(
        pl.sum_horizontal(pl.col(value_cols).is_not_null())
    ).to_series()

    has_conflict = (populated_per_row > 1).any()
    if has_conflict:
        raise ValueError("Invalid Data: Multiple columns populated for a single record row.")

    # Validation passed, so coalescing the value columns loses no data.
    merged = pl.coalesce(value_cols).alias(target_col_name)
    return df.select([pl.col(id_col), merged])
@@ -144,6 +164,35 @@ class JPMCSurvey:
return q.select(QIDs).rename(rename_dict)
def filter_data(self, q: pl.LazyFrame, age:list=None, gender:list=None, consumer:list=None, ethnicity:list=None, income:list=None) -> pl.LazyFrame:
    """Restrict the frame to rows matching the requested groups.

    Each argument is an optional list of accepted values for one column;
    an argument left as ``None`` applies no restriction on that column.
    The restrictions that are given are applied one after another, so a
    row must satisfy all of them to be kept.

    Column used per argument:
        - age:       'QID1'
        - gender:    'QID2'
        - consumer:  'Consumer'
        - ethnicity: 'QID3'
        - income:    'QID15'

    Returns the filtered polars LazyFrame.
    """
    # (column name, accepted values) in the order the filters are applied.
    criteria = (
        ('QID1', age),
        ('QID2', gender),
        ('Consumer', consumer),
        ('QID3', ethnicity),
        ('QID15', income),
    )

    for column, accepted in criteria:
        if accepted is None:
            # No restriction requested for this column.
            continue
        q = q.filter(pl.col(column).is_in(accepted))

    return q
def get_demographics(self, q: pl.LazyFrame) -> Union[pl.LazyFrame, None]:
"""Extract columns containing the demographics.