speaking style trait scores vertical

2026-01-23 12:26:47 +01:00
parent 424355f4a1
commit 84a0f8052e
5 changed files with 615 additions and 90 deletions
--- a/utils.py
+++ b/utils.py
@@ -3,7 +3,6 @@ from pathlib import Path
 import pandas as pd
 from typing import Union
 import json
-
 import re

 def extract_voice_label(html_str: str) -> str:
@@ -57,13 +56,13 @@ def combine_exclusive_columns(df: pl.DataFrame, id_col: str = "_recordId", targe

 def calculate_weighted_ranking_scores(df: pl.DataFrame) -> pl.DataFrame:
    """
-    Calculate weighted scores for character rankings.
+    Calculate weighted scores for character or voice rankings.
    Points system: 1st place = 3 pts, 2nd place = 2 pts, 3rd place = 1 pt.

    Parameters
    ----------
    df : pl.DataFrame
-        DataFrame containing character ranking columns.
+        DataFrame containing character/voice ranking columns.

    Returns
    -------
@@ -71,8 +70,8 @@ def calculate_weighted_ranking_scores(df: pl.DataFrame) -> pl.DataFrame:
        DataFrame with columns 'Character' and 'Weighted Score', sorted by score.
    """
    scores = []
-    # Identify columns related to Character Ranking
-    ranking_cols = [c for c in df.columns if 'Character_Ranking' in c]
+    # Identify ranking columns (assume all columns except _recordId)
+    ranking_cols = [c for c in df.columns if c != '_recordId']

    for col in ranking_cols:
        # Calculate score:
@@ -84,7 +83,7 @@ def calculate_weighted_ranking_scores(df: pl.DataFrame) -> pl.DataFrame:
        weighted_score = (r1_count * 3) + (r2_count * 2) + (r3_count * 1)
        
        # Clean name
-        clean_name = col.replace('Character_Ranking_', '').replace('_', ' ').strip()
+        clean_name = col.replace('Character_Ranking_', '').replace('Top_3_Voices_ranking__', '').replace('_', ' ').strip()
        
        scores.append({
            'Character': clean_name,
@@ -413,6 +412,95 @@ class JPMCSurvey:
        QIDs = ['QID44', 'QID97', 'QID95', 'QID96']
        
        return self._get_subset(q, QIDs, rename_cols=True), None
+
+
+def process_speaking_style_data(
+    df: Union[pl.LazyFrame, pl.DataFrame],
+    trait_map: dict[str, str]
+) -> pl.DataFrame:
+    """
+    Reshape speaking style columns from wide to long format and attach trait text.
+
+    Parses columns with format: SS_{StyleGroup}__{Voice}__{ChoiceID}
+    Example: SS_Orange_Red__V14__Choice_1
+
+    Parameters
+    ----------
+    df : pl.LazyFrame or pl.DataFrame
+        Input dataframe containing "_recordId" and the SS_* columns.
+    trait_map : dict
+        Dictionary mapping column names to trait descriptions.
+        Keys should be full column names like "SS_Orange_Red__V14__Choice_1";
+        keys that do not follow that format are ignored. Descriptions are
+        looked up per (Style_Group, Choice_ID), and the first description seen
+        for a pair wins. A description written as "Left : Right" is split on
+        its first colon into the two anchors.
+
+    Returns
+    -------
+    pl.DataFrame
+        Long-format dataframe with columns:
+        _recordId, Style_Group, Voice, Choice_ID, score, Description,
+        Left_Anchor, Right_Anchor.
+        "score" is cast to Int64; values that cannot be cast become null.
+        The three description columns are null for rows without a mapping,
+        and are present (all null) even when trait_map matches nothing.
+    """
+    # Normalize input to LazyFrame
+    lf = df.lazy() if isinstance(df, pl.DataFrame) else df
+
+    # 1. Melt SS_ columns: one row per (_recordId, SS_* column)
+    melted = lf.melt(
+        id_vars=["_recordId"],
+        value_vars=pl.col("^SS_.*$"),
+        variable_name="full_col_name",
+        value_name="score"
+    )
+
+    # 2. Extract components from column name
+    # Regex captures: Style_Group (e.g. SS_Orange_Red), Voice (e.g. V14), Choice_ID (e.g. Choice_1)
+    pattern = r"^(?P<Style_Group>SS_.+?)__(?P<Voice>.+?)__(?P<Choice_ID>Choice_\d+)$"
+
+    processed = melted.with_columns(
+        pl.col("full_col_name").str.extract_groups(pattern)
+    ).unnest("full_col_name")
+
+    # 3. Create Mapping Lookup from the provided dictionary
+    # We map (Style_Group, Choice_ID) -> Description; a dict keyed on that pair
+    # keeps only the first description seen for it.
+    name_re = re.compile(pattern)
+    lookup: dict[tuple[str, str], dict[str, str]] = {}
+    for col_name, desc in trait_map.items():
+        match = name_re.match(col_name)
+        if match is None:
+            continue
+        key = (match["Style_Group"], match["Choice_ID"])
+        if key in lookup:
+            continue
+        # partition() splits on the first colon only, so a right anchor that
+        # itself contains ":" is kept whole instead of being truncated.
+        left_anchor, _, right_anchor = desc.partition(":")
+        lookup[key] = {
+            "Style_Group": key[0],
+            "Choice_ID": key[1],
+            "Description": desc,
+            "Left_Anchor": left_anchor.strip(),
+            "Right_Anchor": right_anchor.strip(),
+        }
+
+    # The explicit schema lets an empty lookup still produce a joinable frame,
+    # so every call returns the same set of columns.
+    mapping_cols = ("Style_Group", "Choice_ID", "Description", "Left_Anchor", "Right_Anchor")
+    mapping_lf = pl.LazyFrame(
+        list(lookup.values()),
+        schema={name: pl.Utf8 for name in mapping_cols},
+    )
+
+    # 4. Join Data with Mapping, then 5. cast score to Int (strict=False: bad values -> null)
+    result = processed.join(mapping_lf, on=["Style_Group", "Choice_ID"], how="left")
+    result = result.with_columns(pl.col("score").cast(pl.Int64, strict=False))
+
+    return result.collect()