Set up complete framework of analysis
This commit is contained in:
49
utils.py
49
utils.py
@@ -32,6 +32,26 @@ def extract_qid(val):
|
||||
return val['ImportId']
|
||||
|
||||
|
||||
def combine_exclusive_columns(df: pl.DataFrame, id_col: str = "_recordId", target_col_name: str = "combined_value") -> pl.DataFrame:
    """
    Collapse every column other than ``id_col`` into one column.

    The columns are expected to be mutually exclusive per row: at most one
    of them may hold a non-null value in any given row. The merged column
    takes the first non-null value across those columns (null when every
    one of them is null).

    Args:
        df: Frame holding ``id_col`` plus the columns to merge.
        id_col: Name of the identifier column that is carried through as-is.
        target_col_name: Name given to the merged column in the result.

    Returns:
        A frame with exactly two columns: ``id_col`` and ``target_col_name``.

    Raises:
        ValueError: If any row has more than one of the merged columns
            populated.
    """
    value_columns = [name for name in df.columns if name != id_col]

    # Per-row count of populated (non-null) value columns.
    populated_per_row = df.select(
        pl.sum_horizontal(pl.col(value_columns).is_not_null())
    ).to_series()

    has_conflict = (populated_per_row > 1).any()
    if has_conflict:
        raise ValueError("Invalid Data: Multiple columns populated for a single record row.")

    # With exclusivity verified, coalesce picks the single populated value.
    merged_expr = pl.coalesce(value_columns).alias(target_col_name)
    return df.select([pl.col(id_col), merged_expr])
|
||||
|
||||
|
||||
|
||||
@@ -144,6 +164,35 @@ class JPMCSurvey:
|
||||
|
||||
return q.select(QIDs).rename(rename_dict)
|
||||
|
||||
def filter_data(self, q: pl.LazyFrame, age:list=None, gender:list=None, consumer:list=None, ethnicity:list=None, income:list=None) -> pl.LazyFrame:
    """Narrow the responses down to the requested groups.

    Every keyword argument is an optional list of accepted values. An
    argument left as None adds no restriction; any other value (including
    an empty list) is applied as an ``is_in`` filter on its column. The
    filters are chained, so a row must satisfy all of the supplied ones.

    Argument to column mapping:
    - age: 'QID1'
    - gender: 'QID2'
    - consumer: 'Consumer'
    - ethnicity: 'QID3'
    - income: 'QID15'

    Returns the filtered polars LazyFrame.
    """
    # (column, allowed values) pairs, kept in the order the filters are applied.
    selections = (
        ('QID1', age),
        ('QID2', gender),
        ('Consumer', consumer),
        ('QID3', ethnicity),
        ('QID15', income),
    )

    for column, allowed in selections:
        if allowed is None:
            continue
        q = q.filter(pl.col(column).is_in(allowed))

    return q
|
||||
|
||||
def get_demographics(self, q: pl.LazyFrame) -> Union[pl.LazyFrame, None]:
|
||||
"""Extract columns containing the demographics.
|
||||
|
||||
Reference in New Issue
Block a user