Set up the complete analysis framework

2026-01-23 09:53:59 +01:00
parent 42f2d775c7
commit 5327b50ab0
5 changed files with 465 additions and 6 deletions
--- a/utils.py
+++ b/utils.py
@@ -32,6 +32,26 @@ def extract_qid(val):
    return val['ImportId']


+def combine_exclusive_columns(df: pl.DataFrame, id_col: str = "_recordId", target_col_name: str = "combined_value") -> pl.DataFrame:
+    """Collapse mutually exclusive value columns into a single column.
+
+    Every column other than ``id_col`` is treated as a value column. Each row
+    may have at most one of them populated (non-null); that value ends up in
+    ``target_col_name``.
+
+    Args:
+        df: Frame holding ``id_col`` plus the columns to merge.
+        id_col: Name of the identifier column, carried over unchanged.
+        target_col_name: Name given to the merged output column.
+
+    Returns:
+        A two-column frame: ``id_col`` and ``target_col_name``. Rows without
+        any populated value column get a null in ``target_col_name``.
+
+    Raises:
+        ValueError: If any row has more than one non-null value column.
+    """
+    merge_cols = [c for c in df.columns if c != id_col]
+
+    # With no value columns there is nothing for sum_horizontal/coalesce to
+    # operate on, so return an explicit all-null target column instead.
+    if not merge_cols:
+        return df.select([
+            pl.col(id_col),
+            pl.lit(None).alias(target_col_name),
+        ])
+
+    # Validate: count non-null cells per row across the value columns.
+    row_counts = df.select(
+        pl.sum_horizontal(pl.col(merge_cols).is_not_null())
+    ).to_series()
+
+    if (row_counts > 1).any():
+        raise ValueError("Invalid Data: Multiple columns populated for a single record row.")
+
+    # At most one value per row is populated, so coalesce picks it (or null).
+    return df.select([
+        pl.col(id_col),
+        pl.coalesce(merge_cols).alias(target_col_name)
+    ])



@@ -144,6 +164,35 @@ class JPMCSurvey:
        
        return q.select(QIDs).rename(rename_dict)

+    def filter_data(self, q: pl.LazyFrame, age:list=None, gender:list=None, consumer:list=None, ethnicity:list=None, income:list=None) -> pl.LazyFrame:
+        """Restrict the rows of ``q`` to the requested groups.
+
+        Each argument is an optional list of accepted values; None (the
+        default) leaves that dimension unfiltered. All supplied filters are
+        applied, so a row must satisfy every one of them.
+
+        Column checked for each parameter:
+        - age: 'QID1'
+        - gender: 'QID2'
+        - consumer: 'Consumer'
+        - ethnicity: 'QID3'
+        - income: 'QID15'
+
+        Returns the filtered polars LazyFrame.
+        """
+        # (column name, accepted values) pairs, applied in this order.
+        criteria = (
+            ('QID1', age),
+            ('QID2', gender),
+            ('Consumer', consumer),
+            ('QID3', ethnicity),
+            ('QID15', income),
+        )
+
+        for column, allowed in criteria:
+            if allowed is None:
+                continue
+            q = q.filter(pl.col(column).is_in(allowed))
+
+        return q

    def get_demographics(self, q: pl.LazyFrame) -> Union[pl.LazyFrame, None]:
        """Extract columns containing the demographics.