correlation start
This commit is contained in:
115
utils.py
115
utils.py
@@ -506,3 +506,118 @@ def process_speaking_style_data(
|
||||
|
||||
|
||||
|
||||
|
||||
def process_voice_scale_data(
    df: Union[pl.LazyFrame, pl.DataFrame]
) -> pl.DataFrame:
    """
    Reshape the wide Voice Scale ratings into one row per record and voice.

    Source columns are expected to be named ``Voice_Scale_1_10__V{Voice}``,
    for example ``Voice_Scale_1_10__V14``.

    Parameters
    ----------
    df : pl.LazyFrame or pl.DataFrame
        Wide data holding ``_recordId`` plus the Voice Scale columns.
        An eager frame is converted to a lazy one before processing.

    Returns
    -------
    pl.DataFrame
        Long-format dataframe with columns:
        _recordId, Voice, Voice_Scale_Score
    """
    lazy_input = df if not isinstance(df, pl.DataFrame) else df.lazy()

    # Wide -> long: every matching Voice Scale column becomes its own row.
    scale_columns = pl.col("^Voice_Scale_1_10__V.*$")
    long_scores = lazy_input.melt(
        id_vars=["_recordId"],
        value_vars=scale_columns,
        variable_name="full_col_name",
        value_name="Voice_Scale_Score",
    )

    # Rebuild the voice label ("V" + digits) from the melted column name.
    voice_digits = pl.col("full_col_name").str.extract(r"V(\d+)", 1)
    voice_label = ("V" + voice_digits).alias("Voice")

    # Scores stay floating point; the cast is non-strict (strict=False).
    score_as_float = pl.col("Voice_Scale_Score").cast(pl.Float64, strict=False)

    long_format = long_scores.select([
        "_recordId",
        voice_label,
        score_as_float,
    ])

    return long_format.collect()
|
||||
|
||||
def join_voice_and_style_data(
    processed_style_data: pl.DataFrame,
    processed_voice_data: pl.DataFrame
) -> pl.DataFrame:
    """
    Combine processed Speaking Style data with Voice Scale 1-10 data.

    Parameters
    ----------
    processed_style_data : pl.DataFrame
        Output of process_speaking_style_data.
    processed_voice_data : pl.DataFrame
        Output of process_voice_scale_data.

    Returns
    -------
    pl.DataFrame
        Inner join of the two inputs on _recordId and Voice, so only
        (_recordId, Voice) pairs present in both frames are kept.
    """
    join_keys = ["_recordId", "Voice"]

    merged = processed_style_data.join(
        processed_voice_data,
        on=join_keys,
        how="inner",
    )
    return merged
|
||||
|
||||
def process_voice_ranking_data(
    df: Union[pl.LazyFrame, pl.DataFrame]
) -> pl.DataFrame:
    """
    Reshape the wide Voice Ranking columns to long format and score them.

    Source columns are expected to be named ``Top_3_Voices_ranking__V{Voice}``.
    Each rank is translated to points: rank 1 -> 3 pts, rank 2 -> 2 pts,
    rank 3 -> 1 pt. Any other value, including a missing rank, yields 0.

    Parameters
    ----------
    df : pl.LazyFrame or pl.DataFrame
        Wide data holding ``_recordId`` plus the ranking columns.
        An eager frame is converted to a lazy one before processing.

    Returns
    -------
    pl.DataFrame
        Long-format dataframe with columns:
        _recordId, Voice, Ranking_Points
    """
    lazy_input = df if not isinstance(df, pl.DataFrame) else df.lazy()

    # Wide -> long: every matching ranking column becomes its own row.
    ranking_columns = pl.col("^Top_3_Voices_ranking__V.*$")
    long_ranks = lazy_input.melt(
        id_vars=["_recordId"],
        value_vars=ranking_columns,
        variable_name="full_col_name",
        value_name="rank",
    )

    # Rebuild the voice label ("V" + digits) from the melted column name.
    voice_digits = pl.col("full_col_name").str.extract(r"V(\d+)", 1)
    voice_label = ("V" + voice_digits).alias("Voice")

    # Points are the inverse of the rank position; unranked rows fall
    # through every branch and land on the 0 default.
    rank = pl.col("rank")
    ranking_points = (
        pl.when(rank == 1).then(3)
        .when(rank == 2).then(2)
        .when(rank == 3).then(1)
        .otherwise(0)
        .alias("Ranking_Points")
    )

    long_format = long_ranks.select([
        "_recordId",
        voice_label,
        ranking_points,
    ])

    return long_format.collect()
|
||||
|
||||
Reference in New Issue
Block a user