correlation start
This commit is contained in:
@@ -12,15 +12,24 @@ def _():
|
||||
import plotly as plt
|
||||
from pathlib import Path
|
||||
|
||||
from utils import extract_qid_descr_map
|
||||
return Path, extract_qid_descr_map, mo, pd
|
||||
import utils
|
||||
return Path, mo, pd, utils
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(Path):
|
||||
# results_file = Path('data/exports/OneDrive_1_1-16-2026/JPMC_Chase Brand Personality_Quant Round 1_TestData_Labels.csv')
|
||||
results_file = Path('data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase Brand Personality_Quant Round 1_January 21, 2026_Soft Launch_Labels.csv')
|
||||
return (results_file,)
|
||||
# results_file = Path('data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase Brand Personality_Quant Round 1_January 21, 2026_Soft Launch_Labels.csv')
|
||||
results_file = Path('data/exports/1-23-26/JPMC_Chase Brand Personality_Quant Round 1_January 23, 2026_Labels.csv')
|
||||
qsf_file = 'data/exports/OneDrive_1_1-16-2026/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'
|
||||
return qsf_file, results_file
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(qsf_file, results_file, utils):
|
||||
survey = utils.JPMCSurvey(results_file, qsf_file)
|
||||
data_all = survey.load_data()
|
||||
return (survey,)
|
||||
|
||||
|
||||
@app.cell
|
||||
@@ -33,8 +42,8 @@ def _(mo):
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(extract_qid_descr_map, results_file):
|
||||
qid_descr_map = extract_qid_descr_map(results_file)
|
||||
def _(survey):
|
||||
qid_descr_map = survey.qid_descr_map
|
||||
qid_descr_map
|
||||
return (qid_descr_map,)
|
||||
|
||||
|
||||
@@ -14,25 +14,27 @@ def _():
|
||||
from utils import JPMCSurvey, combine_exclusive_columns, calculate_weighted_ranking_scores
|
||||
from plots import plot_average_scores_with_counts, plot_top3_ranking_distribution, plot_ranking_distribution, plot_most_ranked_1, plot_weighted_ranking_score, plot_voice_selection_counts, plot_top3_selection_counts
|
||||
|
||||
import plots as plts
|
||||
import utils as utl
|
||||
import plots
|
||||
import utils
|
||||
|
||||
from speaking_styles import SPEAKING_STYLES
|
||||
return (
|
||||
JPMCSurvey,
|
||||
Path,
|
||||
SPEAKING_STYLES,
|
||||
calculate_weighted_ranking_scores,
|
||||
check_progress,
|
||||
duration_validation,
|
||||
mo,
|
||||
pl,
|
||||
plot_average_scores_with_counts,
|
||||
plot_most_ranked_1,
|
||||
plot_ranking_distribution,
|
||||
plot_top3_ranking_distribution,
|
||||
plot_top3_selection_counts,
|
||||
plot_voice_selection_counts,
|
||||
plot_weighted_ranking_score,
|
||||
plts,
|
||||
utl,
|
||||
plots,
|
||||
utils,
|
||||
)
|
||||
|
||||
|
||||
@@ -47,7 +49,7 @@ def _():
|
||||
def _(JPMCSurvey, QSF_FILE, RESULTS_FILE):
|
||||
survey = JPMCSurvey(RESULTS_FILE, QSF_FILE)
|
||||
data_all = survey.load_data()
|
||||
data_all.collect()
|
||||
# data_all.collect()
|
||||
return data_all, survey
|
||||
|
||||
|
||||
@@ -298,7 +300,7 @@ def _(mo):
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(data, survey, utl):
|
||||
def _(data, survey, utils):
|
||||
ss_or, choice_map_or = survey.get_ss_orange_red(data)
|
||||
ss_gb, choice_map_gb = survey.get_ss_green_blue(data)
|
||||
|
||||
@@ -309,12 +311,12 @@ def _(data, survey, utl):
|
||||
choice_map = {**choice_map_or, **choice_map_gb}
|
||||
# print(_d.head())
|
||||
# print(choice_map)
|
||||
ss_long = utl.process_speaking_style_data(ss_all, choice_map)
|
||||
return (ss_long,)
|
||||
ss_long = utils.process_speaking_style_data(ss_all, choice_map)
|
||||
return choice_map, ss_all, ss_long
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mo, pl, plts, ss_long):
|
||||
def _(mo, pl, plots, ss_long):
|
||||
content = """### How does each voice score for each “speaking style labeled trait”?"""
|
||||
|
||||
for i, trait in enumerate(ss_long.select("Description").unique().to_series().to_list()):
|
||||
@@ -323,7 +325,7 @@ def _(mo, pl, plts, ss_long):
|
||||
content += f"""
|
||||
### {i+1}) {trait.replace(":", " ↔ ")}
|
||||
|
||||
{mo.ui.plotly(plts.plot_speaking_style_trait_scores(trait_d, title=trait.replace(":", " ↔ "), height=550))}
|
||||
{mo.ui.plotly(plots.plot_speaking_style_trait_scores(trait_d, title=trait.replace(":", " ↔ "), height=550))}
|
||||
"""
|
||||
|
||||
mo.md(content)
|
||||
@@ -339,17 +341,17 @@ def _(mo):
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(data, mo, plot_average_scores_with_counts, survey):
|
||||
def _(data, mo, plots, survey):
|
||||
vscales = survey.get_voice_scale_1_10(data)[0].collect()
|
||||
plot_average_scores_with_counts(vscales, x_label='Voice', width=1000)
|
||||
# plot_average_scores_with_counts(vscales, x_label='Voice', width=1000)
|
||||
|
||||
mo.md(f"""
|
||||
|
||||
### How does each voice score on a scale from 1-10?
|
||||
|
||||
{mo.ui.plotly(plot_average_scores_with_counts(vscales, x_label='Voice', width=1000))}
|
||||
{mo.ui.plotly(plots.plot_average_scores_with_counts(vscales, x_label='Voice', width=1000))}
|
||||
""")
|
||||
return
|
||||
return (vscales,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
@@ -373,16 +375,57 @@ def _(mo):
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
### Total Results
|
||||
### How to Interpret These Correlation Results
|
||||
Each bar represents the Pearson correlation coefficient (r) between a speaking style trait rating (1-5 scale) and the overall Voice Scale rating (1-10).
|
||||
|
||||
- [ ] 4 correlation diagrams
|
||||
**Reading the Chart**
|
||||
|
||||
| Correlation Value | Interpretation |
|
||||
|-----------|----------|
|
||||
| r > 0 (Green bars)| Positive correlation — voices rated higher on this trait tend to receive higher Voice Scale scores|
|
||||
| r < 0 (Red bars)| Negative correlation — voices rated higher on this trait tend to receive lower Voice Scale scores|
|
||||
| r ≈ 0| No relationship — this trait doesn't predict Voice Scale ratings|
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(choice_map, ss_all, utils, vscales):
|
||||
df_style = utils.process_speaking_style_data(ss_all.collect(), choice_map)
|
||||
df_voice_long = utils.process_voice_scale_data(vscales)
|
||||
|
||||
joined_df = df_style.join(df_voice_long, on=["_recordId", "Voice"], how="inner")
|
||||
# df_voice_long
|
||||
return df_style, joined_df
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(SPEAKING_STYLES, joined_df, mo, plots):
|
||||
_content = """### Total Results
|
||||
|
||||
"""
|
||||
|
||||
for style, traits in SPEAKING_STYLES.items():
|
||||
# print(f"Correlation plot for {style}...")
|
||||
fig = plots.plot_speaking_style_correlation(
|
||||
df=joined_df,
|
||||
style_color=style,
|
||||
style_traits=traits,
|
||||
title=f"Correlation: Speaking Style {style} and Voice Scale 1-10"
|
||||
)
|
||||
_content += f"""
|
||||
#### Speaking Style **{style}**:
|
||||
|
||||
{mo.ui.plotly(fig)}
|
||||
|
||||
"""
|
||||
mo.md(_content)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
@@ -425,6 +468,30 @@ def _(mo):
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(SPEAKING_STYLES, df_style, mo, plots, top3_voices, utils):
|
||||
df_ranking = utils.process_voice_ranking_data(top3_voices)
|
||||
joined = df_style.join(df_ranking, on=['_recordId', 'Voice'], how='inner')
|
||||
|
||||
|
||||
_content = """## Correlations Voice Speaking Styles <-> Voice Ranking Points
|
||||
|
||||
"""
|
||||
|
||||
for _style, _traits in SPEAKING_STYLES.items():
|
||||
_fig = plots.plot_speaking_style_ranking_correlation(joined, _style, _traits)
|
||||
_content += f"""
|
||||
|
||||
#### Speaking Style **{_style}**:
|
||||
|
||||
{mo.ui.plotly(_fig)}
|
||||
|
||||
"""
|
||||
|
||||
mo.md(_content)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
|
||||
BIN
docs/Speaking Style Traits Quantitative test design.pdf
Normal file
BIN
docs/Speaking Style Traits Quantitative test design.pdf
Normal file
Binary file not shown.
60
example_correlation_plots.py
Normal file
60
example_correlation_plots.py
Normal file
@@ -0,0 +1,60 @@
|
||||
|
||||
import polars as pl
|
||||
from utils import JPMCSurvey, process_speaking_style_data, process_voice_scale_data, join_voice_and_style_data
|
||||
from plots import plot_speaking_style_correlation
|
||||
from speaking_styles import SPEAKING_STYLES
|
||||
|
||||
# Default inputs for this example: a survey "Values" CSV export and the QSF
# survey definition. Point these at a different export to analyse other data.
RESULTS_FILE = "data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase Brand Personality_Quant Round 1_January 21, 2026_Soft Launch_Values.csv"
QSF_FILE = "data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf"


def main() -> None:
    """Load the survey export and show one correlation bar chart per speaking style.

    Pipeline: load the survey, pull out the speaking-style and voice-scale
    question groups, reshape both to long format, join them on respondent and
    voice, then plot the per-trait correlations for every entry in
    SPEAKING_STYLES.
    """
    # 1. Initialize survey and load data. Any construction error propagates
    #    unchanged; there is nothing useful to recover from here.
    survey = JPMCSurvey(RESULTS_FILE, QSF_FILE)
    data = survey.load_data()

    # 2. Extract the wide-format question groups (each call returns the data
    #    plus a companion mapping; the voice-scale mapping is not needed here).
    ss_gb, map_gb = survey.get_ss_green_blue(data)
    ss_or, map_or = survey.get_ss_orange_red(data)
    voice_scale, _ = survey.get_voice_scale_1_10(data)

    # 3. Reshape wide -> long and stack both speaking-style halves.
    #    NOTE(review): pl.concat needs both inputs to be the same frame kind;
    #    process_speaking_style_data is presumed to return eager DataFrames.
    df_style_all = pl.concat([
        process_speaking_style_data(ss_gb, map_gb),
        process_speaking_style_data(ss_or, map_or),
    ])
    df_voice_long = process_voice_scale_data(voice_scale)

    # 4. Join style scores with voice-scale scores.
    joined_df = join_voice_and_style_data(df_style_all, df_voice_long)

    # 5. One figure per style colour. In Marimo/Jupyter, return `fig` or use
    #    `mo.ui.plotly(fig)` instead of `fig.show()`.
    for style, traits in SPEAKING_STYLES.items():
        print(f"Generating plot for {style}...")
        fig = plot_speaking_style_correlation(
            df=joined_df,
            style_color=style,
            style_traits=traits,
        )
        fig.show()


if __name__ == "__main__":
    main()
|
||||
220
plots.py
220
plots.py
@@ -854,3 +854,223 @@ def plot_speaking_style_trait_scores(
|
||||
font=dict(size=11)
|
||||
)
|
||||
return fig
|
||||
|
||||
def plot_speaking_style_correlation(
    df: pl.DataFrame,
    style_color: str,
    style_traits: list[str],
    title="Speaking style and voice scale 1-10 correlations"
) -> go.Figure:
    """
    Plot the correlation between Speaking Style Trait Scores and the Voice Scale as a bar chart.

    One bar is drawn per trait in `style_traits`. Traits with fewer than two
    complete (score, Voice_Scale_Score) pairs are left out of the chart.

    Parameters
    ----------
    df : pl.DataFrame
        Joined dataframe containing 'Right_Anchor', 'score' (Trait Score), and 'Voice_Scale_Score'.
    style_color : str
        The name of the style (e.g., 'Green', 'Blue'). Only used in the
        title of the empty "No data" figure.
    style_traits : list[str]
        List of trait descriptions to include in the plot. Each entry is
        matched exactly against the 'Right_Anchor' column.
    title : str
        Title of the figure.

    Returns
    -------
    go.Figure
        Bar chart with the y-axis fixed to [-1, 1]; bars are green for
        correlations >= 0 and red otherwise. If no trait has enough data, an
        empty figure titled "No data for {style_color} Style" is returned.
    """
    import math  # function-scope import: only needed for the NaN check below

    trait_names = []
    correlations = []

    # 1. Calculate one Pearson correlation per trait
    for trait in style_traits:
        valid_data = (
            df.filter(pl.col("Right_Anchor") == trait)
            .select(["score", "Voice_Scale_Score"])
            .drop_nulls()
        )

        # A correlation needs at least two observations
        if valid_data.height <= 1:
            continue

        corr_val = valid_data.select(pl.corr("score", "Voice_Scale_Score")).item()

        # An undefined correlation (None, or NaN e.g. when a column is constant)
        # is plotted as 0.0 instead of producing a "nan" label.
        if corr_val is None or math.isnan(corr_val):
            corr_val = 0.0

        trait_names.append(trait)
        correlations.append(corr_val)

    # 2. Nothing to plot: return an empty, titled figure
    if not correlations:
        fig = go.Figure()
        fig.update_layout(title=f"No data for {style_color} Style")
        return fig

    # 3. Build the bar chart
    colors = ["green" if val >= 0 else "red" for val in correlations]

    # Traits are long "A | B | C" strings; break at each "|" so the tick
    # labels wrap instead of overlapping.
    x_labels = [
        "<br>".join(part.strip() for part in trait.split("|"))
        for trait in trait_names
    ]

    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=x_labels,
        y=correlations,
        text=[f"{val:.2f}" for val in correlations],
        textposition='outside',
        marker_color=colors,
        hovertemplate="<b>%{customdata}</b><br>Correlation: %{y:.2f}<extra></extra>",
        customdata=trait_names  # full, unwrapped trait text on hover
    ))

    fig.update_layout(
        title=title,
        yaxis_title="Correlation",
        yaxis=dict(range=[-1, 1], zeroline=True, zerolinecolor="black"),
        xaxis=dict(tickangle=0),  # keep labels horizontal
        height=400,
        width=1000,
        template="plotly_white",
        showlegend=False
    )

    return fig
|
||||
|
||||
|
||||
def plot_speaking_style_ranking_correlation(
    df: pl.DataFrame,
    style_color: str,
    style_traits: list[str],
    title: str = None
) -> go.Figure:
    """
    Plot the correlation between Speaking Style Trait Scores and Voice Ranking Points as a bar chart.

    One bar is drawn per trait in `style_traits`. Traits with fewer than two
    complete (score, Ranking_Points) pairs are left out of the chart.

    Parameters
    ----------
    df : pl.DataFrame
        Joined dataframe containing 'Right_Anchor', 'score' (Trait Score), and 'Ranking_Points'.
    style_color : str
        The name of the style (e.g., 'Green', 'Blue'). Used in the default
        title and in the title of the empty "No data" figure.
    style_traits : list[str]
        List of trait descriptions to include in the plot. Each entry is
        matched exactly against the 'Right_Anchor' column.
    title : str, optional
        Custom title for the plot. If None, a default that names
        `style_color` is used.

    Returns
    -------
    go.Figure
        Bar chart with the y-axis fixed to [-1, 1]; bars are green for
        correlations >= 0 and red otherwise. If no trait has enough data, an
        empty figure titled "No data for {style_color} Style" is returned.
    """
    import math  # function-scope import: only needed for the NaN check below

    if title is None:
        title = f"Speaking style {style_color} and voice ranking points correlations"

    trait_names = []
    correlations = []

    # 1. Calculate one Pearson correlation per trait
    for trait in style_traits:
        valid_data = (
            df.filter(pl.col("Right_Anchor") == trait)
            .select(["score", "Ranking_Points"])
            .drop_nulls()
        )

        # A correlation needs at least two observations
        if valid_data.height <= 1:
            continue

        corr_val = valid_data.select(pl.corr("score", "Ranking_Points")).item()

        # An undefined correlation (None, or NaN e.g. when a column is constant)
        # is plotted as 0.0 instead of producing a "nan" label.
        if corr_val is None or math.isnan(corr_val):
            corr_val = 0.0

        trait_names.append(trait)
        correlations.append(corr_val)

    # 2. Nothing to plot: return an empty, titled figure
    if not correlations:
        fig = go.Figure()
        fig.update_layout(title=f"No data for {style_color} Style")
        return fig

    # 3. Build the bar chart
    colors = ["green" if val >= 0 else "red" for val in correlations]

    # Traits are long "A | B | C" strings; break at each "|" so the tick
    # labels wrap instead of overlapping.
    x_labels = [
        "<br>".join(part.strip() for part in trait.split("|"))
        for trait in trait_names
    ]

    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=x_labels,
        y=correlations,
        text=[f"{val:.2f}" for val in correlations],
        textposition='outside',
        marker_color=colors,
        hovertemplate="<b>%{customdata}</b><br>Correlation: %{y:.2f}<extra></extra>",
        customdata=trait_names  # full, unwrapped trait text on hover
    ))

    fig.update_layout(
        title=title,
        yaxis_title="Correlation",
        yaxis=dict(range=[-1, 1], zeroline=True, zerolinecolor="black"),
        xaxis=dict(tickangle=0),  # keep labels horizontal
        height=400,
        width=1000,
        template="plotly_white",
        showlegend=False
    )

    return fig
|
||||
|
||||
33
speaking_styles.py
Normal file
33
speaking_styles.py
Normal file
@@ -0,0 +1,33 @@
|
||||
|
||||
"""
|
||||
Mapping of Speaking Styles (Colors) to their constituent Traits (Positive side).
|
||||
Derived from "Speaking Style Traits Quantitative test design.pdf".
|
||||
"""
|
||||
|
||||
# Style colour -> ordered list of trait descriptions, each a "|"-separated
# group of adjectives.
# NOTE(review): these strings are compared by exact equality elsewhere, so
# spellings such as "Knowledgable" presumably mirror the survey text; confirm
# against the survey export before "correcting" any of them.
SPEAKING_STYLES = {
    # Green: 3 traits
    "Green": [
        "Friendly | Conversational | Down-to-earth",
        "Approachable | Familiar | Warm",
        "Optimistic | Benevolent | Positive | Appreciative",
    ],
    # Blue: 5 traits
    "Blue": [
        "Proactive | Cooperative",
        "Knowledgable | Resourceful | Savvy",
        "Clear | Straightforward | Direct",
        "Confident | Competent",
        "Respectable | Respectful",
    ],
    # Orange: 4 traits
    "Orange": [
        "Attentive | Helpful | Caring | Deliberate",
        "Reassuring | Empowering",
        "Progressive | Guiding | Intentional",
        "Patient | Open-minded",
    ],
    # Red: 4 traits
    "Red": [
        "Trustworthy | Reliable | Dependable",
        "Calm | Steady/Stable | Controlled",
        "Transparent | Upright | Altruistic",
        "Adaptive | Flexible",
    ],
}
|
||||
|
||||
115
utils.py
115
utils.py
@@ -506,3 +506,118 @@ def process_speaking_style_data(
|
||||
|
||||
|
||||
|
||||
|
||||
def process_voice_scale_data(
    df: Union[pl.LazyFrame, pl.DataFrame]
) -> pl.DataFrame:
    """
    Reshape the wide Voice Scale columns into one row per record and voice.

    Every column whose name matches ``^Voice_Scale_1_10__V.*$``
    (e.g. ``Voice_Scale_1_10__V14``) is melted into rows keyed by
    ``_recordId``. The voice label is rebuilt as "V" plus the digits captured
    from the column name.

    Parameters
    ----------
    df : pl.LazyFrame or pl.DataFrame
        Wide-format data containing '_recordId' and the Voice Scale columns.

    Returns
    -------
    pl.DataFrame
        Collected long-format dataframe with columns:
        _recordId, Voice, Voice_Scale_Score (cast to Float64, non-strict).
    """
    # Work lazily regardless of what the caller handed in
    if isinstance(df, pl.DataFrame):
        frame = df.lazy()
    else:
        frame = df

    # Wide -> long: one row per (_recordId, voice column)
    long_form = frame.melt(
        id_vars=["_recordId"],
        value_vars=pl.col("^Voice_Scale_1_10__V.*$"),
        variable_name="full_col_name",
        value_name="Voice_Scale_Score"
    )

    # "…__V14" -> "14" -> "V14"
    voice_digits = pl.col("full_col_name").str.extract(r"V(\d+)", 1)
    voice_label = ("V" + voice_digits).alias("Voice")

    # Non-strict cast: values that cannot be converted become null
    scale_score = pl.col("Voice_Scale_Score").cast(pl.Float64, strict=False)

    return long_form.select(["_recordId", voice_label, scale_score]).collect()
|
||||
|
||||
def join_voice_and_style_data(
    processed_style_data: pl.DataFrame,
    processed_voice_data: pl.DataFrame
) -> pl.DataFrame:
    """
    Inner-join Speaking Style rows with Voice Scale 1-10 rows.

    Parameters
    ----------
    processed_style_data : pl.DataFrame
        Result of process_speaking_style_data
    processed_voice_data : pl.DataFrame
        Result of process_voice_scale_data

    Returns
    -------
    pl.DataFrame
        Rows present in both inputs, matched on _recordId and Voice, carrying
        the columns of both frames.
    """
    # A style rating and a voice rating belong together when they come from
    # the same respondent record and refer to the same voice.
    join_keys = ["_recordId", "Voice"]
    merged = processed_style_data.join(processed_voice_data, on=join_keys, how="inner")
    return merged
|
||||
|
||||
def process_voice_ranking_data(
    df: Union[pl.LazyFrame, pl.DataFrame]
) -> pl.DataFrame:
    """
    Reshape the wide Voice Ranking columns to long format and score each rank.

    Every column whose name matches ``^Top_3_Voices_ranking__V.*$`` is melted
    into rows keyed by ``_recordId``. Ranks are turned into points:
    rank 1 -> 3, rank 2 -> 2, rank 3 -> 1, anything else (including null,
    i.e. not ranked) -> 0.

    Parameters
    ----------
    df : pl.LazyFrame or pl.DataFrame
        Wide-format data containing '_recordId' and the ranking columns.

    Returns
    -------
    pl.DataFrame
        Collected long-format dataframe with columns:
        _recordId, Voice, Ranking_Points
    """
    # Work lazily regardless of what the caller handed in
    if isinstance(df, pl.DataFrame):
        frame = df.lazy()
    else:
        frame = df

    # Wide -> long: one row per (_recordId, ranking column)
    long_form = frame.melt(
        id_vars=["_recordId"],
        value_vars=pl.col("^Top_3_Voices_ranking__V.*$"),
        variable_name="full_col_name",
        value_name="rank"
    )

    # "…__V14" -> "14" -> "V14"
    voice_digits = pl.col("full_col_name").str.extract(r"V(\d+)", 1)
    voice_label = ("V" + voice_digits).alias("Voice")

    # Better rank -> more points; unranked voices fall through to 0
    rank = pl.col("rank")
    ranking_points = (
        pl.when(rank == 1).then(3)
        .when(rank == 2).then(2)
        .when(rank == 3).then(1)
        .otherwise(0)
        .alias("Ranking_Points")
    )

    return long_form.select(["_recordId", voice_label, ranking_points]).collect()
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
20
voices.py
Normal file
20
voices.py
Normal file
@@ -0,0 +1,20 @@
|
||||
Voice Reference Gender
|
||||
Voice 14 Female
|
||||
Voice 04 Female
|
||||
Voice 08 Female
|
||||
Voice 77 Female
|
||||
|
||||
Voice 48 Female
|
||||
Voice 82 Female
|
||||
Voice 89 Female
|
||||
Voice 91 Emily (Current IVR Voice) Female
|
||||
Voice 34 Male
|
||||
Voice 69 Male
|
||||
Voice 45 Male
|
||||
Voice 46 Male
|
||||
Voice 54 Male
|
||||
Voice 74 Male
|
||||
Voice 81 Male
|
||||
Voice 86 Male
|
||||
Voice 88 Male
|
||||
Voice 16 Male
|
||||
Reference in New Issue
Block a user