correlation start

2026-01-27 17:22:16 +01:00
parent 393c527656
commit fd4cb4b596
9 changed files with 5375 additions and 24 deletions
--- a/utils.py
+++ b/utils.py
@@ -506,3 +506,118 @@ def process_speaking_style_data(



+
+def process_voice_scale_data(
+    df: Union[pl.LazyFrame, pl.DataFrame]
+) -> pl.DataFrame:
+    """
+    Reshape the wide Voice Scale columns into one row per record and voice.
+
+    Every column whose name matches ``Voice_Scale_1_10__V{Voice}`` (for
+    example ``Voice_Scale_1_10__V14``) is melted into rows. The voice label
+    is rebuilt as ``"V"`` followed by the digits captured from the column
+    name.
+
+    Parameters
+    ----------
+    df : pl.LazyFrame or pl.DataFrame
+        Wide data containing ``_recordId`` and the Voice Scale columns.
+        An eager frame is converted to a lazy one before processing.
+
+    Returns
+    -------
+    pl.DataFrame
+        Long-format dataframe with columns:
+        _recordId, Voice, Voice_Scale_Score
+    """
+    if isinstance(df, pl.DataFrame):
+        frame = df.lazy()
+    else:
+        frame = df
+
+    # Wide -> long: one row per (_recordId, source column).
+    long_scores = frame.melt(
+        id_vars=["_recordId"],
+        value_vars=pl.col("^Voice_Scale_1_10__V.*$"),
+        variable_name="full_col_name",
+        value_name="Voice_Scale_Score"
+    )
+
+    # "Voice_Scale_1_10__V14" -> "14" -> "V14"
+    voice_digits = pl.col("full_col_name").str.extract(r"V(\d+)", 1)
+    voice_label = ("V" + voice_digits).alias("Voice")
+
+    # Non-strict cast: the score is kept as Float64 without raising on
+    # values that cannot be converted.
+    score = pl.col("Voice_Scale_Score").cast(pl.Float64, strict=False)
+
+    return long_scores.select(["_recordId", voice_label, score]).collect()
+
+def join_voice_and_style_data(
+    processed_style_data: pl.DataFrame,
+    processed_voice_data: pl.DataFrame
+) -> pl.DataFrame:
+    """
+    Combine the long-format Speaking Style rows with the Voice Scale 1-10 rows.
+
+    The two frames are inner-joined, so only (_recordId, Voice) pairs that
+    appear in both inputs are kept.
+
+    Parameters
+    ----------
+    processed_style_data : pl.DataFrame
+        Result of process_speaking_style_data
+    processed_voice_data : pl.DataFrame
+        Result of process_voice_scale_data
+
+    Returns
+    -------
+    pl.DataFrame
+        Merged dataframe with columns from both, joined on _recordId and Voice.
+    """
+    join_keys = ["_recordId", "Voice"]
+    merged = processed_style_data.join(
+        processed_voice_data,
+        on=join_keys,
+        how="inner"
+    )
+    return merged
+
+def process_voice_ranking_data(
+    df: Union[pl.LazyFrame, pl.DataFrame]
+) -> pl.DataFrame:
+    """
+    Reshape the wide Voice Ranking columns to long format and score each rank.
+
+    Every column whose name matches ``Top_3_Voices_ranking__V{Voice}`` is
+    melted into rows, and the rank is translated to points:
+    1st place = 3 pts, 2nd place = 2 pts, 3rd place = 1 pt. Any other
+    value, including null (voice not ranked), scores 0.
+
+    Parameters
+    ----------
+    df : pl.LazyFrame or pl.DataFrame
+        Wide data containing ``_recordId`` and the ranking columns.
+        An eager frame is converted to a lazy one before processing.
+
+    Returns
+    -------
+    pl.DataFrame
+        Long-format dataframe with columns:
+        _recordId, Voice, Ranking_Points
+    """
+    if isinstance(df, pl.DataFrame):
+        frame = df.lazy()
+    else:
+        frame = df
+
+    # Wide -> long: one row per (_recordId, source column).
+    long_ranks = frame.melt(
+        id_vars=["_recordId"],
+        value_vars=pl.col("^Top_3_Voices_ranking__V.*$"),
+        variable_name="full_col_name",
+        value_name="rank"
+    )
+
+    # "Top_3_Voices_ranking__V14" -> "14" -> "V14"
+    voice_digits = pl.col("full_col_name").str.extract(r"V(\d+)", 1)
+    voice_label = ("V" + voice_digits).alias("Voice")
+
+    # Rank values are 1, 2, 3 for position in the top 3; a null rank matches
+    # none of the branches and therefore falls through to 0 points.
+    rank = pl.col("rank")
+    ranking_points = (
+        pl.when(rank == 1).then(3)
+        .when(rank == 2).then(2)
+        .when(rank == 3).then(1)
+        .otherwise(0)
+        .alias("Ranking_Points")
+    )
+
+    return long_ranks.select(["_recordId", voice_label, ranking_points]).collect()