base correlations

2026-02-03 01:32:06 +01:00
parent 1dce4db909
commit 2408d06098
4 changed files with 257 additions and 0 deletions
--- a/utils.py
+++ b/utils.py
@@ -1676,6 +1676,69 @@ def join_voice_and_style_data(
        how="inner"
    )

+
+def transform_speaking_style_color_correlation(
+    joined_df: pl.LazyFrame | pl.DataFrame,
+    speaking_styles: dict[str, list[str]],
+    target_column: str = "Voice_Scale_Score"
+) -> tuple[pl.DataFrame, dict | None]: 
+    """Aggregate speaking style correlation by color (Green, Blue, Orange, Red).
+    
+    Original use-case: "I want to create high-level correlation plots between
+    'green, blue, orange, red' speaking styles and the 'voice scale scores'.
+    I want to go to one plot with one bar for each color."
+    
+    This function calculates the mean correlation per speaking style color by
+    averaging the correlations of all traits within each color.
+    
+    Parameters
+    ----------
+    joined_df : pl.LazyFrame or pl.DataFrame
+        Pre-fetched data from joining speaking style data with target data.
+        Must have columns: Right_Anchor, score, and the target_column
+    speaking_styles : dict
+        Dictionary mapping color names to their constituent traits.
+        Typically imported from speaking_styles.SPEAKING_STYLES
+    target_column : str
+        The column to correlate against speaking style scores.
+        Default: "Voice_Scale_Score" (for voice scale 1-10)
+        Alternative: "Ranking_Points" (for top 3 voice ranking)
+    
+    Returns
+    -------
+    tuple[pl.DataFrame, dict | None]
+        (DataFrame with columns [Color, correlation, n_traits], None)
+    """
+    if isinstance(joined_df, pl.LazyFrame):
+        joined_df = joined_df.collect()
+    
+    color_correlations = []
+    
+    for color, traits in speaking_styles.items():
+        trait_corrs = []
+        for trait in traits:
+            # Filter to this specific trait
+            subset = joined_df.filter(pl.col("Right_Anchor") == trait)
+            valid_data = subset.select(["score", target_column]).drop_nulls()
+            
+            if valid_data.height > 1:
+                corr_val = valid_data.select(pl.corr("score", target_column)).item()
+                if corr_val is not None:
+                    trait_corrs.append(corr_val)
+        
+        # Average across all traits for this color
+        if trait_corrs:
+            avg_corr = sum(trait_corrs) / len(trait_corrs)
+            color_correlations.append({
+                "Color": color,
+                "correlation": avg_corr,
+                "n_traits": len(trait_corrs)
+            })
+    
+    result_df = pl.DataFrame(color_correlations)
+    return result_df, None
+
+
 def process_voice_ranking_data(
    df: Union[pl.LazyFrame, pl.DataFrame]
 ) -> pl.DataFrame: