correlation start

2026-01-27 17:22:16 +01:00
parent 393c527656
commit fd4cb4b596
9 changed files with 5375 additions and 24 deletions
--- a/02_quant_analysis.py
+++ b/02_quant_analysis.py
@@ -14,25 +14,27 @@ def _():
    from utils import JPMCSurvey, combine_exclusive_columns, calculate_weighted_ranking_scores
    from plots import plot_average_scores_with_counts, plot_top3_ranking_distribution, plot_ranking_distribution, plot_most_ranked_1, plot_weighted_ranking_score, plot_voice_selection_counts, plot_top3_selection_counts

-    import plots as plts
-    import utils as utl
+    import plots
+    import utils
+
+    from speaking_styles import SPEAKING_STYLES
    return (
        JPMCSurvey,
        Path,
+        SPEAKING_STYLES,
        calculate_weighted_ranking_scores,
        check_progress,
        duration_validation,
        mo,
        pl,
-        plot_average_scores_with_counts,
        plot_most_ranked_1,
        plot_ranking_distribution,
        plot_top3_ranking_distribution,
        plot_top3_selection_counts,
        plot_voice_selection_counts,
        plot_weighted_ranking_score,
-        plts,
-        utl,
+        plots,
+        utils,
    )


@@ -47,7 +49,7 @@ def _():
 def _(JPMCSurvey, QSF_FILE, RESULTS_FILE):
    survey = JPMCSurvey(RESULTS_FILE, QSF_FILE)
    data_all = survey.load_data()
-    data_all.collect()
+    # data_all.collect()
    return data_all, survey


@@ -298,7 +300,7 @@ def _(mo):


@app.cell
-def _(data, survey, utl):
+def _(data, survey, utils):
    ss_or, choice_map_or = survey.get_ss_orange_red(data)
    ss_gb, choice_map_gb = survey.get_ss_green_blue(data)

@@ -309,12 +311,12 @@ def _(data, survey, utl):
    choice_map = {**choice_map_or, **choice_map_gb}
    # print(_d.head())
    # print(choice_map)
-    ss_long = utl.process_speaking_style_data(ss_all, choice_map)
-    return (ss_long,)
+    ss_long = utils.process_speaking_style_data(ss_all, choice_map)
+    return choice_map, ss_all, ss_long


@app.cell
-def _(mo, pl, plts, ss_long):
+def _(mo, pl, plots, ss_long):
    content = """### How does each voice score for each “speaking style labeled trait”?"""

    for i, trait in enumerate(ss_long.select("Description").unique().to_series().to_list()):
@@ -323,7 +325,7 @@ def _(mo, pl, plts, ss_long):
        content += f"""
    ### {i+1}) {trait.replace(":", " ↔ ")}

-    {mo.ui.plotly(plts.plot_speaking_style_trait_scores(trait_d, title=trait.replace(":", " ↔ "), height=550))}
+    {mo.ui.plotly(plots.plot_speaking_style_trait_scores(trait_d, title=trait.replace(":", " ↔ "), height=550))}
    """

    mo.md(content)
@@ -339,17 +341,17 @@ def _(mo):


@app.cell
-def _(data, mo, plot_average_scores_with_counts, survey):
+def _(data, mo, plots, survey):
    vscales = survey.get_voice_scale_1_10(data)[0].collect()
-    plot_average_scores_with_counts(vscales, x_label='Voice', width=1000)
+    # plot_average_scores_with_counts(vscales, x_label='Voice', width=1000)

    mo.md(f"""

    ### How does each voice score on a scale from 1-10?

-    {mo.ui.plotly(plot_average_scores_with_counts(vscales, x_label='Voice', width=1000))}
+    {mo.ui.plotly(plots.plot_average_scores_with_counts(vscales, x_label='Voice', width=1000))}
    """)
-    return
+    return (vscales,)


@app.cell(hide_code=True)
@@ -373,16 +375,57 @@ def _(mo):
    return


-@app.cell
+@app.cell(hide_code=True)
 def _(mo):
    mo.md(r"""
-    ### Total Results
+    ### How to Interpret These Correlation Results
+    Each bar represents the Pearson correlation coefficient (r) between a speaking style trait rating (1-5 scale) and the overall Voice Scale rating (1-10).

-    - [ ] 4 correlation diagrams
+    **Reading the Chart**
+
+    | Correlation Value |	Interpretation |
+    |-----------|----------|
+    | r > 0 (Green bars)| 	Positive correlation — voices rated higher on this trait tend to receive higher Voice Scale scores|
+    | r < 0 (Red bars)| 	Negative correlation — voices rated higher on this trait tend to receive lower Voice Scale scores|
+    | r ≈ 0| 	No relationship — this trait doesn't predict Voice Scale ratings|
    """)
    return


+@app.cell
+def _(choice_map, ss_all, utils, vscales):
+    df_style = utils.process_speaking_style_data(ss_all.collect(), choice_map)
+    df_voice_long = utils.process_voice_scale_data(vscales)
+
+    joined_df = df_style.join(df_voice_long, on=["_recordId", "Voice"], how="inner")
+    # df_voice_long
+    return df_style, joined_df
+
+
+@app.cell
+def _(SPEAKING_STYLES, joined_df, mo, plots):
+    _content = """### Total Results
+
+    """
+
+    for style, traits in SPEAKING_STYLES.items():
+        # print(f"Correlation plot for {style}...")
+        fig = plots.plot_speaking_style_correlation(
+            df=joined_df,
+            style_color=style,
+            style_traits=traits,
+            title=f"Correlation: Speaking Style {style} and Voice Scale 1-10"
+        )
+        _content += f"""
+    #### Speaking Style **{style}**:
+    
+    {mo.ui.plotly(fig)}
+
+    """
+    mo.md(_content)
+    return
+
+
@app.cell
 def _(mo):
    mo.md(r"""
@@ -425,6 +468,30 @@ def _(mo):
    return


+@app.cell
+def _(SPEAKING_STYLES, df_style, mo, plots, top3_voices, utils):
+    df_ranking = utils.process_voice_ranking_data(top3_voices)
+    joined = df_style.join(df_ranking, on=['_recordId', 'Voice'], how='inner')
+
+
+    _content = """## Correlations Voice Speaking Styles <-> Voice Ranking Points
+
+    """
+
+    for _style, _traits in SPEAKING_STYLES.items():
+        _fig = plots.plot_speaking_style_ranking_correlation(joined, _style, _traits)
+        _content += f"""
+
+        #### Speaking Style **{_style}**:
+    
+        {mo.ui.plotly(_fig)}
+    
+        """
+
+    mo.md(_content)
+    return
+
+
@app.cell(hide_code=True)
 def _(mo):
    mo.md(r"""