speaking style trait scores vertical

This commit is contained in:
2026-01-23 12:26:47 +01:00
parent 424355f4a1
commit 84a0f8052e
5 changed files with 615 additions and 90 deletions

100
utils.py
View File

@@ -3,7 +3,6 @@ from pathlib import Path
import pandas as pd
from typing import Union
import json
import re
def extract_voice_label(html_str: str) -> str:
@@ -57,13 +56,13 @@ def combine_exclusive_columns(df: pl.DataFrame, id_col: str = "_recordId", targe
def calculate_weighted_ranking_scores(df: pl.DataFrame) -> pl.DataFrame:
"""
Calculate weighted scores for character rankings.
Calculate weighted scores for character or voice rankings.
Points system: 1st place = 3 pts, 2nd place = 2 pts, 3rd place = 1 pt.
Parameters
----------
df : pl.DataFrame
DataFrame containing character ranking columns.
DataFrame containing character/voice ranking columns.
Returns
-------
@@ -71,8 +70,8 @@ def calculate_weighted_ranking_scores(df: pl.DataFrame) -> pl.DataFrame:
DataFrame with columns 'Character' and 'Weighted Score', sorted by score.
"""
scores = []
# Identify columns related to Character Ranking
ranking_cols = [c for c in df.columns if 'Character_Ranking' in c]
# Identify ranking columns (assume all columns except _recordId)
ranking_cols = [c for c in df.columns if c != '_recordId']
for col in ranking_cols:
# Calculate score:
@@ -84,7 +83,7 @@ def calculate_weighted_ranking_scores(df: pl.DataFrame) -> pl.DataFrame:
weighted_score = (r1_count * 3) + (r2_count * 2) + (r3_count * 1)
# Clean name
clean_name = col.replace('Character_Ranking_', '').replace('_', ' ').strip()
clean_name = col.replace('Character_Ranking_', '').replace('Top_3_Voices_ranking__', '').replace('_', ' ').strip()
scores.append({
'Character': clean_name,
@@ -413,6 +412,95 @@ class JPMCSurvey:
QIDs = ['QID44', 'QID97', 'QID95', 'QID96']
return self._get_subset(q, QIDs, rename_cols=True), None
def process_speaking_style_data(
    df: Union[pl.LazyFrame, pl.DataFrame],
    trait_map: dict[str, str]
) -> pl.DataFrame:
    """
    Process speaking style columns from wide to long format and map trait descriptions.

    Parses columns with format: SS_{StyleGroup}__{Voice}__{ChoiceID}
    Example: SS_Orange_Red__V14__Choice_1

    Parameters
    ----------
    df : pl.LazyFrame or pl.DataFrame
        Input dataframe containing a `_recordId` column and SS_* columns.
    trait_map : dict
        Dictionary mapping column names to trait descriptions.
        Keys should be full column names like "SS_Orange_Red__V14__Choice_1".
        Keys that do not match that format are ignored. A description of the
        form "Left : Right" is split on ':' into the two anchors.

    Returns
    -------
    pl.DataFrame
        Long-format dataframe with columns:
        _recordId, Style_Group, Voice, Choice_ID, score,
        Description, Left_Anchor, Right_Anchor.
        `score` is cast to Int64 (values that cannot be cast become null).
        The three description columns are always present; they are null for
        rows whose (Style_Group, Choice_ID) has no entry in `trait_map`,
        including the case where `trait_map` provides no usable entry at all.
    """
    # Normalize input to LazyFrame
    lf = df.lazy() if isinstance(df, pl.DataFrame) else df

    # 1. Melt SS_ columns into one row per (_recordId, original column name)
    melted = lf.melt(
        id_vars=["_recordId"],
        value_vars=pl.col("^SS_.*$"),
        variable_name="full_col_name",
        value_name="score"
    )

    # 2. Extract components from column name
    # Regex captures: Style_Group (e.g. SS_Orange_Red), Voice (e.g. V14), Choice_ID (e.g. Choice_1)
    pattern = r"^(?P<Style_Group>SS_.+?)__(?P<Voice>.+?)__(?P<Choice_ID>Choice_\d+)$"

    processed = melted.with_columns(
        pl.col("full_col_name").str.extract_groups(pattern)
    ).unnest("full_col_name")

    # 3. Create Mapping Lookup from the provided dictionary
    # We map (Style_Group, Choice_ID) -> Description. The voice is deliberately
    # not part of the key, so the first description seen for a
    # (Style_Group, Choice_ID) pair is used for every voice.
    name_re = re.compile(pattern)
    mapping_data = []
    seen = set()

    for col_name, desc in trait_map.items():
        match = name_re.match(col_name)
        if match is None:
            continue

        key = (match["Style_Group"], match["Choice_ID"])
        if key in seen:
            continue
        seen.add(key)

        # Parse description into anchors if possible (Left : Right).
        # Only the first two ':'-separated segments are used.
        parts = desc.split(':')
        left_anchor = parts[0].strip()
        right_anchor = parts[1].strip() if len(parts) > 1 else ""

        mapping_data.append({
            "Style_Group": key[0],
            "Choice_ID": key[1],
            "Description": desc,
            "Left_Anchor": left_anchor,
            "Right_Anchor": right_anchor
        })

    # An explicit schema keeps the mapping frame well-typed even when it is
    # empty, so the join below always adds the description columns and the
    # function returns the same set of columns regardless of `trait_map`.
    mapping_schema = {
        "Style_Group": pl.Utf8,
        "Choice_ID": pl.Utf8,
        "Description": pl.Utf8,
        "Left_Anchor": pl.Utf8,
        "Right_Anchor": pl.Utf8,
    }
    mapping_lf = pl.LazyFrame(mapping_data, schema=mapping_schema)

    # 4. Join Data with Mapping
    result = processed.join(
        mapping_lf,
        on=["Style_Group", "Choice_ID"],
        how="left"
    )

    # 5. Cast score to Int (non-strict: uncastable values become null)
    result = result.with_columns(
        pl.col("score").cast(pl.Int64, strict=False)
    )

    return result.collect()