correlation start

This commit is contained in:
2026-01-27 17:22:16 +01:00
parent 393c527656
commit fd4cb4b596
9 changed files with 5375 additions and 24 deletions

115
utils.py
View File

@@ -506,3 +506,118 @@ def process_speaking_style_data(
def process_voice_scale_data(
    df: Union[pl.LazyFrame, pl.DataFrame]
) -> pl.DataFrame:
    """
    Reshape the Voice Scale (1-10) columns from wide to long format.

    Every column whose name matches ``Voice_Scale_1_10__V{Voice}``
    (for example ``Voice_Scale_1_10__V14``) is unpivoted so that each
    ``_recordId`` / voice pair becomes one row. The voice label
    (``V14``) is rebuilt from the digits that follow the ``V`` in the
    original column name.

    Parameters
    ----------
    df : pl.LazyFrame or pl.DataFrame
        Wide-format data holding ``_recordId`` and the Voice Scale
        columns. An eager frame is converted to a lazy one first.

    Returns
    -------
    pl.DataFrame
        Long-format dataframe with columns:
        _recordId, Voice, Voice_Scale_Score
    """
    frame = df
    if isinstance(frame, pl.DataFrame):
        frame = frame.lazy()

    # Expressions used by the pipeline below.
    voice_number = pl.col("full_col_name").str.extract(r"V(\d+)", 1)
    voice_label = "V" + pl.col("Voice_Num")
    # Non-strict cast keeps the score as Float64 (the source scores are
    # expected to be f64 already).
    score = pl.col("Voice_Scale_Score").cast(pl.Float64, strict=False)

    # Wide -> long: one row per (_recordId, voice column).
    long_form = frame.melt(
        id_vars=["_recordId"],
        value_vars=pl.col("^Voice_Scale_1_10__V.*$"),
        variable_name="full_col_name",
        value_name="Voice_Scale_Score",
    )

    # "Voice" depends on "Voice_Num", so it needs its own with_columns step.
    with_number = long_form.with_columns(voice_number.alias("Voice_Num"))
    with_label = with_number.with_columns(voice_label.alias("Voice"))

    output_columns = ["_recordId", "Voice", score]
    return with_label.select(output_columns).collect()
def join_voice_and_style_data(
    processed_style_data: pl.DataFrame,
    processed_voice_data: pl.DataFrame
) -> pl.DataFrame:
    """
    Combine processed Speaking Style data with Voice Scale 1-10 data.

    The two frames are inner-joined, so only ``_recordId`` / ``Voice``
    pairs present in both inputs are kept.

    Parameters
    ----------
    processed_style_data : pl.DataFrame
        Result of process_speaking_style_data
    processed_voice_data : pl.DataFrame
        Result of process_voice_scale_data

    Returns
    -------
    pl.DataFrame
        Merged dataframe with columns from both, joined on _recordId and Voice.
    """
    join_keys = ["_recordId", "Voice"]
    merged = processed_style_data.join(
        processed_voice_data,
        on=join_keys,
        how="inner",
    )
    return merged
def process_voice_ranking_data(
    df: Union[pl.LazyFrame, pl.DataFrame]
) -> pl.DataFrame:
    """
    Reshape the Voice Ranking columns to long format and score them.

    Every column whose name matches ``Top_3_Voices_ranking__V{Voice}``
    is unpivoted so that each ``_recordId`` / voice pair becomes one
    row, and the rank is translated into points:
    1st place = 3 pts, 2nd place = 2 pts, 3rd place = 1 pt.
    Any other rank value, including null (voice not ranked), scores 0.

    Parameters
    ----------
    df : pl.LazyFrame or pl.DataFrame
        Wide-format data holding ``_recordId`` and the ranking columns.
        An eager frame is converted to a lazy one first.

    Returns
    -------
    pl.DataFrame
        Long-format dataframe with columns:
        _recordId, Voice, Ranking_Points
    """
    frame = df
    if isinstance(frame, pl.DataFrame):
        frame = frame.lazy()

    # Expressions used by the pipeline below.
    voice_number = pl.col("full_col_name").str.extract(r"V(\d+)", 1)
    voice_label = "V" + pl.col("Voice_Num")

    # Rank values are 1, 2, 3 for the position in the top 3; everything
    # else falls through to 0 points.
    rank = pl.col("rank")
    ranking_points = (
        pl.when(rank == 1).then(3)
        .when(rank == 2).then(2)
        .when(rank == 3).then(1)
        .otherwise(0)
    )

    # Wide -> long: one row per (_recordId, voice column).
    long_form = frame.melt(
        id_vars=["_recordId"],
        value_vars=pl.col("^Top_3_Voices_ranking__V.*$"),
        variable_name="full_col_name",
        value_name="rank",
    )

    # "Voice" depends on "Voice_Num", so it needs its own with_columns step.
    with_number = long_form.with_columns(voice_number.alias("Voice_Num"))
    with_label = with_number.with_columns(voice_label.alias("Voice"))
    scored = with_label.with_columns(ranking_points.alias("Ranking_Points"))

    output_columns = ["_recordId", "Voice", "Ranking_Points"]
    return scored.select(output_columns).collect()