split consumer groups best character
This commit is contained in:
34
utils.py
34
utils.py
@@ -1716,3 +1716,37 @@ def process_voice_ranking_data(
|
||||
])
|
||||
|
||||
return result.collect()
|
||||
|
||||
|
||||
def split_consumer_groups(df: Union[pl.LazyFrame, pl.DataFrame]) -> dict[str, pl.DataFrame]:
    """
    Split a frame into one DataFrame per consumer group, merging A/B subgroups.

    A trailing ``_A`` or ``_B`` on the ``Consumer`` value is stripped to form
    the group name, so both subgroups land in the same entry.

    Mappings:
    - Mass_A, Mass_B -> Mass
    - Lower_Mass_A, Lower_Mass_B -> Lower_Mass
    - MassAffluent_A, MassAffluent_B -> MassAffluent
    - Mix_of_Affluent..._A, ..._B -> Mix_of_Affluent...

    Values without such a suffix form a group under their unchanged name.
    Rows whose ``Consumer`` is null are not included in any group.

    Args:
        df: Eager or lazy frame containing a ``Consumer`` column. A LazyFrame
            is collected before splitting.

    Returns:
        Dict mapping each group name to the rows belonging to it, with keys
        in order of first appearance in the data. Every returned frame keeps
        the input columns plus the derived ``Consumer_Group`` column.

    Raises:
        ValueError: If the frame has no ``Consumer`` column.
    """
    if isinstance(df, pl.LazyFrame):
        df = df.collect()

    if "Consumer" not in df.columns:
        raise ValueError("Column 'Consumer' not found in DataFrame")

    # Derive the group name by dropping a trailing _A / _B suffix.
    df_clean = df.with_columns(
        pl.col("Consumer")
        .str.replace(r"_[AB]$", "")
        .alias("Consumer_Group")
    )

    # maintain_order=True keeps the dict's key order deterministic
    # (first appearance); plain unique() gives no ordering guarantee.
    unique_groups = (
        df_clean["Consumer_Group"]
        .drop_nulls()
        .unique(maintain_order=True)
        .to_list()
    )

    return {
        group: df_clean.filter(pl.col("Consumer_Group") == group)
        for group in unique_groups
    }
|
||||
|
||||
Reference in New Issue
Block a user