statistical tests

2026-02-02 21:47:37 +01:00
parent 29df6a4bd9
commit f2c659c266
9 changed files with 1679 additions and 47 deletions
--- a/03_quant_report.py
+++ b/03_quant_report.py
@@ -44,14 +44,14 @@ def _(QSF_FILE, RESULTS_FILE):


@app.cell(hide_code=True)
-def _():
-    mo.md(r"""
+def _(RESULTS_FILE, data_all):
+    mo.md(rf"""
    ---
    # Load Data

-    **Dataset:** `{Path(RESULTS_FILE).name}`
+    **Dataset:** {Path(RESULTS_FILE).name}

-    **Responses**: `{data_all.collect().shape[0]}`
+    **Responses**: {data_all.collect().shape[0]}
    """)
    return

@@ -112,11 +112,9 @@ def _():


@app.cell
-def _(data_validated):
-    data = data_validated
-
-    data.collect()
-    return (data,)
+def _():
+    # Placeholder cell: the `data = data_validated` assignment that used to live
+    # here is now made in the "Filter Data (Global corrections)" cell.
+    return


@app.cell(hide_code=True)
@@ -130,8 +128,8 @@ def _():


@app.cell
-def _(S, data):
-    demographics = S.get_demographics(data)[0].collect()
+def _(S, data_validated):
+    demographics = S.get_demographics(data_validated)[0].collect()
    demographics
    return (demographics,)

@@ -148,7 +146,7 @@ def _():
 def _(demographics):
    # Demographics where 'Consumer' is null
    demographics_no_consumer = demographics.filter(pl.col('Consumer').is_null())['_recordId'].to_list()
-    # demographics_no_consumer
+    demographics_no_consumer
    return (demographics_no_consumer,)


@@ -160,9 +158,26 @@ def _(data_all, demographics_no_consumer):


@app.cell
-def _(data_all):
+def _():
+    mo.md(r"""
+    # Filter Data (Global corrections)
+    """)
+    return
+
+
+@app.cell
+def _(data_validated):
+    # Defines `data`, the frame the downstream analysis cells take as input.
+    # Currently a pass-through of `data_validated`: the filter that would drop
+    # rows whose 'Consumer' column is null is disabled (kept below for reference).
+    # data = data_validated.filter(pl.col('Consumer').is_not_null())
+    data = data_validated
+    # Collected as the cell's last expression (marimo displays it as the cell output).
+    data.collect()
+    return (data,)
+
+
+@app.cell
+def _():
    # Check if all business owners are missing a 'Consumer type' in demographics
-    assert all([a is None for a in data_all.filter(pl.col('QID4') == 'Yes').collect()['Consumer'].unique()]) , "Not all business owners are missing 'Consumer type' in demographics."
+    # assert all([a is None for a in data_all.filter(pl.col('QID4') == 'Yes').collect()['Consumer'].unique()]) , "Not all business owners are missing 'Consumer type' in demographics."
    return


@@ -187,14 +202,14 @@ def _():


@app.cell
-def _(S, demo_plot_cols, demographics):
+def _(S, data, demo_plot_cols):
    _content = """
    ## Demographic Distributions

    """
    for c in demo_plot_cols:
        _fig = S.plot_demographic_distribution(
-            data=demographics,
+            data=S.get_demographics(data)[0],
            column=c,
            title=f"{c.replace('Bussiness', 'Business').replace('_', ' ')} Distribution of Survey Respondents"
        )
@@ -265,6 +280,22 @@ def _(S, char_rank):
    return


+@app.cell
+def _(S, char_rank):
+    # Runs the ranking significance computation on `char_rank` and renders the
+    # result as two charts (heatmap, then summary) under one markdown heading.
+    _pairwise_df, _meta = S.compute_ranking_significance(char_rank)
+
+    mo.md(f"""
+    ### Statistical Significance Character Ranking
+
+    {mo.ui.altair_chart(S.plot_significance_heatmap(_pairwise_df, metadata=_meta))}
+
+    {mo.ui.altair_chart(S.plot_significance_summary(_pairwise_df, metadata=_meta))}
+    """)
+    return
+
+
@app.cell
 def _():
    mo.md(r"""
@@ -307,28 +338,69 @@ def _():


@app.cell
-def _():
-    # Join respondent 
+def _(S, data):
+    char_df = S.get_character_refine(data)[0]
+    return (char_df,)
+
+
+@app.cell
+def _(S, char_df):
+    # Renders one trait-frequency chart per character. All charts share a single
+    # trait order (traits sorted by their total count across the characters).
+    from theme import ColorPalette
+
+    characters = ['Bank Teller', 'Familiar Friend', 'The Coach', 'Personal Assistant']
+    # character name -> (bar color, highlight color)
+    character_colors = {
+        'Bank Teller': (ColorPalette.CHARACTER_BANK_TELLER, ColorPalette.CHARACTER_BANK_TELLER_HIGHLIGHT),
+        'Familiar Friend': (ColorPalette.CHARACTER_FAMILIAR_FRIEND, ColorPalette.CHARACTER_FAMILIAR_FRIEND_HIGHLIGHT),
+        'The Coach': (ColorPalette.CHARACTER_COACH, ColorPalette.CHARACTER_COACH_HIGHLIGHT),
+        'Personal Assistant': (ColorPalette.CHARACTER_PERSONAL_ASSISTANT, ColorPalette.CHARACTER_PERSONAL_ASSISTANT_HIGHLIGHT),
+    }
+
+    # Compute each character's frequency table once; it feeds both the shared
+    # sort order and the per-character chart (previously computed twice).
+    _freq_by_character = {
+        _name: S.transform_character_trait_frequency(char_df, _name)[0]
+        for _name in characters
+    }
+
+    # Build consistent sort order (by total frequency across all characters)
+    all_trait_counts = {}
+    for _freq_df in _freq_by_character.values():
+        for _row in _freq_df.iter_rows(named=True):
+            all_trait_counts[_row['trait']] = all_trait_counts.get(_row['trait'], 0) + _row['count']
+
+    consistent_sort_order = sorted(all_trait_counts, key=lambda trait: -all_trait_counts[trait])
+
+    _content = ""
+    # Generate one plot per character, in the order given by `characters`
+    for _char, _freq_df in _freq_by_character.items():
+        _bar_color, _highlight_color = character_colors[_char]
+        _chart = S.plot_single_character_trait_frequency(
+            data=_freq_df,
+            character_name=_char,
+            bar_color=_bar_color,
+            highlight_color=_highlight_color,
+            trait_sort_order=consistent_sort_order,
+        )
+        _content += f"""
+        {mo.ui.altair_chart(_chart)}
+
+
+    """
+
+    mo.md(_content)
    return


@app.cell
 def _():
    mo.md(r"""
-    ---
+    ## Statistical significance best characters

-    # Spoken Voice Results
+    zie chat
+    > voorbeeld: als de nr 1 en 2 niet significant verschillen maar wel van de nr 3 bijvoorbeeld is dat ook top. Beetje meedenkend over hoe ik het kan presenteren weetje wat ik bedoel?:)
+    >
    """)
    return


-@app.cell(hide_code=True)
+@app.cell
 def _():
-    mo.md(r"""
-    ---
-
-    # Brand Character Results
-    """)
    return


@@ -342,5 +414,174 @@ def _():
    return


+@app.cell
+def _(S, data):
+    # Only the first element of the `S.get_top_3_voices` result is kept.
+    top3_voices = S.get_top_3_voices(data)[0]
+    # NOTE(review): `calculate_weighted_ranking_scores` is not a cell argument;
+    # confirm it is provided by the notebook's setup/imports, otherwise this
+    # line raises NameError.
+    top3_voices_weighted = calculate_weighted_ranking_scores(top3_voices)
+    return top3_voices, top3_voices_weighted
+
+
+@app.cell
+def _():
+    mo.md(r"""
+    ## Which voice is ranked best in the ranking question for top 3?
+
+    (not best 3 out of 8 question)
+    """)
+    return
+
+
+@app.cell
+def _(S, top3_voices):
+    _plot = S.plot_ranking_distribution(top3_voices, x_label='Voice')
+    mo.md(f"""
+    {mo.ui.altair_chart(_plot)}
+    """)
+    return
+
+
+@app.cell
+def _():
+    mo.md(r"""
+    ### Statistical significance for voice ranking
+    """)
+    return
+
+
+@app.cell
+def _():
+    # print(top3_voices.collect().head())
+    return
+
+
+@app.cell
+def _():
+
+    # _pairwise_df, _metadata = S.compute_ranking_significance(
+    #     top3_voices,alpha=0.05,correction="none")
+
+    # # View significant pairs
+    # # print(pairwise_df.filter(pl.col('significant') == True))
+
+    # # Create heatmap visualization
+    # _heatmap = S.plot_significance_heatmap(
+    #     _pairwise_df, 
+    #     metadata=_metadata,
+    #     title="Weighted Voice Ranking Significance<br>(Pairwise Comparisons)"
+    # )
+
+    # # Create summary bar chart
+    # _summary = S.plot_significance_summary(
+    #     _pairwise_df,
+    #     metadata=_metadata
+    # )
+
+    # mo.md(f"""
+    # {mo.ui.altair_chart(_heatmap)}
+
+    # {mo.ui.altair_chart(_summary)}
+    # """)
+    return
+
+
+@app.cell
+def _():
+    mo.md(r"""
+    ## Weighted Popularity Scores
+    """)
+    return
+
+
+@app.cell
+def _(S, top3_voices_weighted):
+    _plot = S.plot_weighted_ranking_score(top3_voices_weighted, title="Most Popular Voice - Weighted Popularity Score<br>(1st = 3pts, 2nd = 2pts, 3rd = 1pt)")
+
+    mo.md(f"""
+    {mo.ui.altair_chart(_plot)}
+    """)
+    return
+
+
+@app.cell
+def _():
+    return
+
+
+@app.cell
+def _(top3_voices_weighted):
+    print(top3_voices_weighted.head())
+    return
+
+
+@app.cell
+def _():
+    return
+
+
+@app.cell(hide_code=True)
+def _():
+    mo.md(r"""
+    ## Voice Scale 1-10
+    """)
+    return
+
+
+@app.cell
+def _(S, data):
+    # Voice scale (1-10) data; the second value returned by S is discarded.
+    voice_1_10, _ = S.get_voice_scale_1_10(data)
+    return (voice_1_10,)
+
+
+@app.cell
+def _(S, voice_1_10):
+    S.plot_average_scores_with_counts(voice_1_10, x_label='Voice', width=1000, domain=[1,10], title="Voice General Impression (Scale 1-10)")
+    return
+
+
+@app.cell
+def _():
+    mo.md(r"""
+    ### Statistical Significance (Scale 1-10)
+    """)
+    return
+
+
+@app.cell
+def _(S, voice_1_10):
+    # Pairwise significance tests on `voice_1_10`, rendered as a heatmap followed
+    # by a summary chart. Intermediates are `_`-prefixed so they stay local to
+    # this cell, matching the character-ranking significance cell.
+    _pairwise_df, _metadata = S.compute_pairwise_significance(
+        voice_1_10,
+        test_type="mannwhitney",  # or "ttest", "chi2", "auto"
+        alpha=0.05,
+        correction="bonferroni"   # or "holm", "none"
+    )
+
+    # View significant pairs
+    # print(_pairwise_df.filter(pl.col('significant') == True))
+
+    # Create heatmap visualization
+    _heatmap = S.plot_significance_heatmap(
+        _pairwise_df, 
+        metadata=_metadata,
+        title="Voice Rating Significance<br>(Pairwise Comparisons)"
+    )
+
+    # Create summary bar chart
+    _summary = S.plot_significance_summary(
+        _pairwise_df,
+        metadata=_metadata
+    )
+
+    mo.md(f"""
+    {mo.ui.altair_chart(_heatmap)}
+
+    {mo.ui.altair_chart(_summary)}
+    """)
+    return
+
+
 if __name__ == "__main__":
    app.run()