BC results per consumer

This commit is contained in:
2026-02-02 22:59:26 +01:00
parent bda4d54231
commit 3ac330263f
5 changed files with 467 additions and 345 deletions

View File

@@ -1718,35 +1718,40 @@ def process_voice_ranking_data(
return result.collect()
def split_consumer_groups(
    df: Union[pl.LazyFrame, pl.DataFrame],
    col: str = "Consumer",
) -> dict[str, pl.DataFrame]:
    """
    Split a dataframe into one sub-frame per group found in a column.

    If ``col`` is 'Consumer', a trailing ``_A`` or ``_B`` suffix is stripped
    before grouping, so A/B subgroups are combined, e.g.:
    - Mass_A, Mass_B -> Mass
    - Lower_Mass_A, Lower_Mass_B -> Lower_Mass
    - MassAffluent_A, MassAffluent_B -> MassAffluent
    For any other column the unique values are used as-is.

    Args:
        df: Input frame. A LazyFrame is collected before splitting.
        col: Name of the column to group by. Defaults to "Consumer".

    Returns:
        A dict mapping each non-null group value to the rows belonging to
        that group. Every sub-frame keeps the original columns plus a helper
        column named ``f"{col}_Group"`` holding the group value. Keys appear
        in order of first occurrence in the data. Rows whose group value is
        null are not included in any sub-frame.

    Raises:
        ValueError: If ``col`` is not a column of ``df``.
    """
    if isinstance(df, pl.LazyFrame):
        df = df.collect()

    if col not in df.columns:
        raise ValueError(f"Column '{col}' not found in DataFrame")

    group_col_alias = f"{col}_Group"

    if col == "Consumer":
        # Collapse A/B subgroups by dropping a trailing _A or _B suffix.
        group_expr = pl.col(col).str.replace(r"_[AB]$", "")
    else:
        # Any other column is grouped on its raw values.
        group_expr = pl.col(col)

    df_clean = df.with_columns(group_expr.alias(group_col_alias))

    # maintain_order=True keeps the dict's key order deterministic
    # (first appearance) instead of depending on unique()'s arbitrary order.
    unique_groups = (
        df_clean[group_col_alias]
        .drop_nulls()
        .unique(maintain_order=True)
        .to_list()
    )

    return {
        group: df_clean.filter(pl.col(group_col_alias) == group)
        for group in unique_groups
    }