speaking style trait scores vertical
This commit is contained in:
100
utils.py
100
utils.py
@@ -3,7 +3,6 @@ from pathlib import Path
|
||||
import pandas as pd
|
||||
from typing import Union
|
||||
import json
|
||||
|
||||
import re
|
||||
|
||||
def extract_voice_label(html_str: str) -> str:
|
||||
@@ -57,13 +56,13 @@ def combine_exclusive_columns(df: pl.DataFrame, id_col: str = "_recordId", targe
|
||||
|
||||
def calculate_weighted_ranking_scores(df: pl.DataFrame) -> pl.DataFrame:
|
||||
"""
|
||||
Calculate weighted scores for character rankings.
|
||||
Calculate weighted scores for character or voice rankings.
|
||||
Points system: 1st place = 3 pts, 2nd place = 2 pts, 3rd place = 1 pt.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
df : pl.DataFrame
|
||||
DataFrame containing character ranking columns.
|
||||
DataFrame containing character/ voice ranking columns.
|
||||
|
||||
Returns
|
||||
-------
|
||||
@@ -71,8 +70,8 @@ def calculate_weighted_ranking_scores(df: pl.DataFrame) -> pl.DataFrame:
|
||||
DataFrame with columns 'Character' and 'Weighted Score', sorted by score.
|
||||
"""
|
||||
scores = []
|
||||
# Identify columns related to Character Ranking
|
||||
ranking_cols = [c for c in df.columns if 'Character_Ranking' in c]
|
||||
# Identify ranking columns (assume all columns except _recordId)
|
||||
ranking_cols = [c for c in df.columns if c != '_recordId']
|
||||
|
||||
for col in ranking_cols:
|
||||
# Calculate score:
|
||||
@@ -84,7 +83,7 @@ def calculate_weighted_ranking_scores(df: pl.DataFrame) -> pl.DataFrame:
|
||||
weighted_score = (r1_count * 3) + (r2_count * 2) + (r3_count * 1)
|
||||
|
||||
# Clean name
|
||||
clean_name = col.replace('Character_Ranking_', '').replace('_', ' ').strip()
|
||||
clean_name = col.replace('Character_Ranking_', '').replace('Top_3_Voices_ranking__', '').replace('_', ' ').strip()
|
||||
|
||||
scores.append({
|
||||
'Character': clean_name,
|
||||
@@ -413,6 +412,95 @@ class JPMCSurvey:
|
||||
QIDs = ['QID44', 'QID97', 'QID95', 'QID96']
|
||||
|
||||
return self._get_subset(q, QIDs, rename_cols=True), None
|
||||
|
||||
|
||||
def process_speaking_style_data(
    df: Union[pl.LazyFrame, pl.DataFrame],
    trait_map: dict[str, str]
) -> pl.DataFrame:
    """
    Reshape speaking style columns from wide to long format and attach trait descriptions.

    Parses columns with format: SS_{StyleGroup}__{Voice}__{ChoiceID}
    Example: SS_Orange_Red__V14__Choice_1

    Parameters
    ----------
    df : pl.LazyFrame or pl.DataFrame
        Input dataframe containing a `_recordId` column and SS_* columns.
    trait_map : dict
        Dictionary mapping column names to trait descriptions.
        Keys should be full column names like "SS_Orange_Red__V14__Choice_1".
        Keys that do not match the column-name format are ignored. When several
        keys share the same (Style_Group, Choice_ID) pair, the first one wins.
        A description of the form "Left : Right" is additionally split into
        its two anchors.

    Returns
    -------
    pl.DataFrame
        Long-format dataframe with columns:
        _recordId, Style_Group, Voice, Choice_ID, score, Description, Left_Anchor, Right_Anchor

        `score` is always cast to Int64 (values that cannot be converted become null).
        The description columns are always present; they are null for rows whose
        (Style_Group, Choice_ID) pair has no entry in `trait_map`.
    """
    # Normalize input to LazyFrame so the whole pipeline is collected once.
    lf = df.lazy() if isinstance(df, pl.DataFrame) else df

    # 1. Melt SS_ columns: one row per (_recordId, original column name).
    # NOTE(review): value_vars is passed as a regex column expression; confirm
    # the installed polars version accepts this form in `melt`.
    melted = lf.melt(
        id_vars=["_recordId"],
        value_vars=pl.col("^SS_.*$"),
        variable_name="full_col_name",
        value_name="score"
    )

    # 2. Extract components from column name
    # Regex captures: Style_Group (e.g. SS_Orange_Red), Voice (e.g. V14), Choice_ID (e.g. Choice_1)
    pattern = r"^(?P<Style_Group>SS_.+?)__(?P<Voice>.+?)__(?P<Choice_ID>Choice_\d+)$"

    # The cast lives here (not after the join) so that it is applied on every
    # path, including when trait_map yields no usable mapping.
    processed = (
        melted
        .with_columns(pl.col("full_col_name").str.extract_groups(pattern))
        .unnest("full_col_name")
        .with_columns(pl.col("score").cast(pl.Int64, strict=False))
    )

    # 3. Build the (Style_Group, Choice_ID) -> description lookup.
    # A dict keyed by the pair gives first-wins de-duplication while keeping
    # the insertion order of trait_map.
    lookup = {}
    for col_name, desc in trait_map.items():
        match = re.match(pattern, col_name)
        if match is None:
            continue

        groups = match.groupdict()
        key = (groups["Style_Group"], groups["Choice_ID"])
        if key in lookup:
            continue

        # Parse description into anchors if possible (Left : Right).
        # str.split always yields at least one element, so only the right
        # anchor needs a fallback.
        parts = desc.split(':')
        lookup[key] = {
            "Style_Group": groups["Style_Group"],
            "Choice_ID": groups["Choice_ID"],
            "Description": desc,
            "Left_Anchor": parts[0].strip(),
            "Right_Anchor": parts[1].strip() if len(parts) > 1 else ""
        }

    # An explicit schema keeps the mapping frame well-formed even when it has
    # no rows, so the join below yields null description columns instead of
    # the output silently losing them.
    mapping_schema = {
        "Style_Group": pl.Utf8,
        "Choice_ID": pl.Utf8,
        "Description": pl.Utf8,
        "Left_Anchor": pl.Utf8,
        "Right_Anchor": pl.Utf8
    }
    mapping_rows = list(lookup.values())
    mapping_lf = pl.LazyFrame(mapping_rows or None, schema=mapping_schema)

    # 4. Join Data with Mapping (left join: keep every melted row).
    result = processed.join(
        mapping_lf,
        on=["Style_Group", "Choice_ID"],
        how="left"
    )

    return result.collect()
|
||||
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user