Set up complete framework of analysis
This commit is contained in:
@@ -10,10 +10,11 @@ def _():
|
||||
import polars as pl
|
||||
from pathlib import Path
|
||||
|
||||
from utils import JPMCSurvey
|
||||
from utils import JPMCSurvey, combine_exclusive_columns
|
||||
from plots import plot_average_scores_with_counts, plot_top3_ranking_distribution
|
||||
return (
|
||||
JPMCSurvey,
|
||||
combine_exclusive_columns,
|
||||
mo,
|
||||
pl,
|
||||
plot_average_scores_with_counts,
|
||||
@@ -38,8 +39,8 @@ def _(JPMCSurvey, QSF_FILE, RESULTS_FILE):
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
# survey.qid_descr_map
|
||||
def _(survey):
|
||||
survey.qid_descr_map
|
||||
return
|
||||
|
||||
|
||||
@@ -286,15 +287,24 @@ def _(data, survey):
|
||||
@app.cell
|
||||
def _(data, survey):
|
||||
traits_refined = survey.get_character_refine(data)[0]
|
||||
|
||||
traits_refined.collect()
|
||||
|
||||
return (traits_refined,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(traits_original, traits_refined):
|
||||
def _(combine_exclusive_columns, traits_refined):
|
||||
traits_refined_comb = combine_exclusive_columns(traits_refined.collect(), target_col_name='Top_8_Traits_Refined')
|
||||
traits_refined_comb
|
||||
return (traits_refined_comb,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(traits_original, traits_refined_comb):
|
||||
# merge the two dataframes side by side for comparison
|
||||
traits_comparison = traits_original.join(traits_refined, on='_recordId')
|
||||
traits_comparison.collect()
|
||||
traits_comparison = traits_original.join(traits_refined_comb.lazy(), on='_recordId')
|
||||
print(traits_comparison.collect().head())
|
||||
return
|
||||
|
||||
|
||||
|
||||
342
02_quant_analysis.py
Normal file
342
02_quant_analysis.py
Normal file
@@ -0,0 +1,342 @@
|
||||
import marimo
|
||||
|
||||
__generated_with = "0.19.2"
|
||||
app = marimo.App(width="medium")
|
||||
|
||||
|
||||
@app.cell
|
||||
def _():
|
||||
import marimo as mo
|
||||
import polars as pl
|
||||
from pathlib import Path
|
||||
|
||||
from validation import check_progress, duration_validation
|
||||
from utils import JPMCSurvey, combine_exclusive_columns
|
||||
from plots import plot_average_scores_with_counts, plot_top3_ranking_distribution
|
||||
return (
|
||||
JPMCSurvey,
|
||||
Path,
|
||||
check_progress,
|
||||
duration_validation,
|
||||
mo,
|
||||
plot_average_scores_with_counts,
|
||||
)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# Load Data
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
def _(Path, mo):
    # Soft-launch export paths; update both together when a new Qualtrics
    # export lands (results CSV and its matching QSF survey definition).
    RESULTS_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase Brand Personality_Quant Round 1_January 21, 2026_Soft Launch_Labels.csv'
    QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'
    # Show only the file name (not the full path) in the notebook output.
    mo.md(f"**Dataset:** `{Path(RESULTS_FILE).name}`")
    return QSF_FILE, RESULTS_FILE
|
||||
|
||||
|
||||
@app.cell
def _(JPMCSurvey, QSF_FILE, RESULTS_FILE):
    # Load survey responses together with QSF metadata. The trailing
    # collect() call is this cell's displayed output (full, unfiltered data).
    survey = JPMCSurvey(RESULTS_FILE, QSF_FILE)
    data_all = survey.load_data()
    data_all.collect()
    return data_all, survey
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## Data Validation
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(check_progress, data_all):
|
||||
check_progress(data_all)
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(data_all, duration_validation):
|
||||
duration_validation(data_all)
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
### ToDo: "straight-liner" detection and removal
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
|
||||
# Data Filter
|
||||
|
||||
Use to select a subset of the data for the following analysis
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
def _(data_all, survey):
    # All filters disabled (None) -> 'data' is the complete dataset.
    # Pass lists (e.g. age=['25-34']) to restrict all downstream analysis cells.
    data = survey.filter_data(data_all, age=None, gender=None, income=None, ethnicity=None, consumer=None)
    return (data,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
|
||||
# Analysis
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## Character personality ranking
|
||||
|
||||
1. Which character personality is ranked best?
|
||||
2. Which character personality is ranked number 1 the most?
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## Voice Ranking
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
Which 8 voices are chosen the most out of 18?
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
Which 3 voices are chosen the most out of 18? How many times does each voice end up in the top 3? ( this is based on the survey question where participants need to choose 3 out of the earlier selected 8 voices. So how often each of the 18 stimuli ended up in participants’ Top 3, after they first selected 8 out of 18.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
Which voice is ranked best in the ranking question for top 3.? (so not best 3 out of 8 question)
|
||||
- E.g. 1 point for place 3. 2 points for place 2 and 3 points for place 1. The voice with most points is ranked best.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
Which voice is ranked number 1 the most? (not always the voice with most points)
|
||||
|
||||
- Each of the 350 participants gives exactly one 1st-place vote.
|
||||
- Total Rank-1 votes = 350.
|
||||
- Voices are sorted from most to least 1st-place votes.
|
||||
- The top 3 voices with the most Rank-1 votes are colored blue.
|
||||
- This can differ from the points-based winners (3–2–1 totals), because a voice may receive many 2nd/3rd places but fewer 1st places.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## Voice Speaking Style - Perception Traits
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
How does each voice score for each “speaking style labeled trait”? Here you can find the speaking styles and traits: [Speaking Style Traits Quantitative test design.docx](https://voicebranding-my.sharepoint.com/:w:/g/personal/phoebe_voicebranding_ai/IQBfM_Z8PF98Qalz4lzIbJ3RAUCdc7waB32HZXCj7k3xfo0?e=rtFd27)
|
||||
|
||||
- There are 4 speaking styles: Green, Blue, Orange, Red.
|
||||
- There are 16 traits distributed across the 4 speaking styles.
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## Voice Scale 1-10
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
def _(data, mo, plot_average_scores_with_counts, survey):
    # Average 1-10 scores per voice, with response counts.
    vscales = survey.get_voice_scale_1_10(data)[0].collect()

    # Build the figure once and embed it below. (The original called the
    # plotting helper twice with identical arguments and discarded the
    # first result.)
    _fig = plot_average_scores_with_counts(vscales, x_label='Voice', width=1000)

    mo.md(f"""

    How does each voice score on a scale from 1-10?

    {mo.ui.plotly(_fig)}
    """)
    return
|
||||
|
||||
|
||||
@app.cell
def _(mo):
    # Placeholder cell: renders an empty markdown block. Appears to be a
    # leftover spacer from notebook editing — safe to delete in the marimo
    # editor if unwanted.
    mo.md(r"""

    """)
    return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## Correlations Voice Speaking Styles <-> Voice Scale 1-10
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
Let’s show how scoring better on these speaking styles correlates (or not) with better Voice Scale 1-10 evaluation. For each speaking style we show how the traits in these speaking styles correlate with Voice Scale 1-10 evaluation. This gives us a total of 4 correlation diagrams.
|
||||
|
||||
Example for speaking style green:
|
||||
- Trait 1: Friendly | Conversational | Down-to-earth
|
||||
- Trait 2: Approachable | Familiar | Warm
|
||||
- Trait 3: Optimistic | Benevolent | Positive | Appreciative
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
### Total Results
|
||||
|
||||
- [ ] 4 correlation diagrams
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
def _(mo):
    # Section header + TODO checklist (fixed typo: "seperately" -> "separately").
    mo.md(r"""
    ### Female / Male Voices considered separately

    - [ ] 4 correlation diagrams considering each speaking style (4) and all female voice results.
    - [ ] 4 correlation diagrams considering each speaking style (4) and all male voice results.
    """)
    return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## Correlations Voice Speaking Styles <-> Voice Ranking Points
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
def _(mo):
    # Analysis description (fixed typo: "Vocie" -> "Voice").
    mo.md(r"""
    Let’s show how scoring better on these speaking styles correlates (or not) with better Voice Ranking results. For each speaking style we show how the traits in these speaking styles correlate with voice ranking points. This gives us a total of 4 correlation diagrams.

    Example for speaking style green:
    - Trait 1: Friendly | Conversational | Down-to-earth
    - Trait 2: Approachable | Familiar | Warm
    - Trait 3: Optimistic | Benevolent | Positive | Appreciative
    """)
    return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
### Total Results
|
||||
|
||||
- [ ] 4 correlation diagrams
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
def _(mo):
    # Section header + TODO checklist (fixed typo: "seperately" -> "separately").
    mo.md(r"""
    ### Female / Male Voices considered separately

    - [ ] 4 correlation diagrams considering each speaking style (4) and all female voice results.
    - [ ] 4 correlation diagrams considering each speaking style (4) and all male voice results.
    """)
    return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## Correlation Heatmap all evaluations <-> voice acoustic data
|
||||
|
||||
- [ ] Heatmap for male voices
|
||||
- [ ] Heatmap for female voices
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## Most Prominent Character Personality Traits
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
The last question of the survey is about traits for the described character's personality. For each Character personality, we want to display the 8 most chosen character personality traits. This will give us a total of 4 diagrams, one for each character personality included in the test.
|
||||
|
||||
- [ ] Bank Teller
|
||||
- [ ] Familiar Friend
|
||||
- [ ] The Coach
|
||||
- [ ] Personal Assistant
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
---
|
||||
|
||||
# Results per subgroup
|
||||
|
||||
Use the dropdown selector at the top to filter the data and generate all the plots again
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app.run()
|
||||
4
layouts/02_quant_analysis.slides.json
Normal file
4
layouts/02_quant_analysis.slides.json
Normal file
@@ -0,0 +1,4 @@
|
||||
{
|
||||
"type": "slides",
|
||||
"data": {}
|
||||
}
|
||||
49
utils.py
49
utils.py
@@ -32,6 +32,26 @@ def extract_qid(val):
|
||||
return val['ImportId']
|
||||
|
||||
|
||||
def combine_exclusive_columns(df: pl.DataFrame, id_col: str = "_recordId", target_col_name: str = "combined_value") -> pl.DataFrame:
    """Collapse mutually exclusive columns into a single column.

    Every column except ``id_col`` is expected to be populated in at most one
    column per row (e.g. branched survey questions). The populated value is
    picked via ``coalesce`` and returned alongside the id column.

    Args:
        df: Input frame containing ``id_col`` plus the exclusive columns.
        id_col: Record-identifier column kept unchanged in the output.
        target_col_name: Name of the merged output column.

    Returns:
        A two-column DataFrame: ``id_col`` and ``target_col_name``.

    Raises:
        ValueError: If there are no columns to merge, or if any row has more
            than one of the exclusive columns populated.
    """
    merge_cols = [c for c in df.columns if c != id_col]
    if not merge_cols:
        # pl.coalesce() with no inputs would fail with a confusing error later.
        raise ValueError(f"No columns to combine besides id column {id_col!r}.")

    # Validate exclusivity: at most one non-null value per row.
    max_populated = df.select(
        pl.sum_horizontal(pl.col(merge_cols).is_not_null()).max()
    ).item()
    if max_populated is not None and max_populated > 1:
        raise ValueError("Invalid Data: Multiple columns populated for a single record row.")

    # coalesce picks the single populated value (or null when none are set).
    return df.select([
        pl.col(id_col),
        pl.coalesce(merge_cols).alias(target_col_name),
    ])
|
||||
|
||||
|
||||
|
||||
@@ -144,6 +164,35 @@ class JPMCSurvey:
|
||||
|
||||
return q.select(QIDs).rename(rename_dict)
|
||||
|
||||
def filter_data(self, q: pl.LazyFrame, age: list = None, gender: list = None, consumer: list = None, ethnicity: list = None, income: list = None) -> pl.LazyFrame:
    """Filter survey responses on demographic criteria.

    Each parameter is either ``None`` (no filtering on that dimension) or a
    list of allowed values; active filters are combined with logical AND.

    Args:
        q: Survey data as a polars LazyFrame.
        age: Age groups to include (column ``QID1``).
        gender: Genders to include (column ``QID2``).
        consumer: Consumer segments to include (column ``Consumer``).
        ethnicity: Ethnicities to include (column ``QID3``).
        income: Income brackets to include (column ``QID15``).

    Returns:
        The filtered polars LazyFrame (lazy; nothing is collected here).
    """
    # Map each filter argument to the survey column it constrains.
    # NOTE(review): the QID->demographic mapping is taken from the original
    # code — confirm against the QSF definition if question order changes.
    criteria = {
        'QID1': age,
        'QID2': gender,
        'Consumer': consumer,
        'QID3': ethnicity,
        'QID15': income,
    }
    for column, allowed in criteria.items():
        if allowed is not None:
            q = q.filter(pl.col(column).is_in(allowed))

    return q
|
||||
|
||||
def get_demographics(self, q: pl.LazyFrame) -> Union[pl.LazyFrame, None]:
|
||||
"""Extract columns containing the demographics.
|
||||
|
||||
54
validation.py
Normal file
54
validation.py
Normal file
@@ -0,0 +1,54 @@
|
||||
import marimo as mo
|
||||
import polars as pl
|
||||
|
||||
|
||||
def check_progress(data):
    """Check if all responses are complete based on 'progress' column.

    Args:
        data: Survey responses as a polars LazyFrame with a 'progress' column.

    Returns:
        A marimo markdown element summarising completeness.
    """
    # A single unique 'progress' value is treated as "all complete".
    # NOTE(review): this does not verify the value actually equals 100 — a
    # dataset where every row had e.g. progress == 50 would also report
    # complete. Confirm whether an explicit == 100 check is wanted.
    if data.collect().select(pl.col('progress').unique()).shape[0] == 1:
        return mo.md("""### Responses Complete: \n\n✅ All responses are complete (progress = 100) """)

    return mo.md("### Responses Complete: \n\n⚠️ There are incomplete responses (progress < 100) ⚠️")
|
||||
|
||||
|
||||
def duration_validation(data):
    """Validate response durations to identify outliers.

    Flags responses whose duration lies more than three standard deviations
    from the mean (in either direction) and reports them. Nothing is removed
    from the dataset.

    Args:
        data: Survey responses as a polars LazyFrame with a 'duration'
            column (seconds, per the report text below).

    Returns:
        A marimo markdown element describing the outliers found, or an
        all-clear message when there are none.
    """
    # Summary statistics for the 3-sigma outlier rule.
    duration_stats = data.select(
        pl.col('duration').mean().alias('mean_duration'),
        pl.col('duration').std().alias('std_duration')
    ).collect()
    mean_duration = duration_stats['mean_duration'][0]
    std_duration = duration_stats['std_duration'][0]
    upper_outlier_threshold = mean_duration + 3 * std_duration
    lower_outlier_threshold = mean_duration - 3 * std_duration

    # Filter directly on the boolean expression instead of materialising an
    # intermediate 'outlier_duration' flag column and comparing it == True
    # (the flag was constant True in the filtered result anyway).
    outlier_data = data.filter(
        (pl.col('duration') > upper_outlier_threshold)
        | (pl.col('duration') < lower_outlier_threshold)
    ).collect()

    if outlier_data.shape[0] == 0:
        return mo.md("### Duration Outliers: \n\n✅ No duration outliers detected")

    return mo.md(f"""
    ### Duration Outliers:

    **⚠️ Potential outliers detected based on response duration ⚠️**

    - Mean Duration: {mean_duration:.2f} seconds (approximately {mean_duration/60:.2f} minutes)
    - Standard Deviation of Duration: {std_duration:.2f} seconds
    - Upper Outlier Threshold (Mean + 3*Std): {upper_outlier_threshold:.2f} seconds
    - Lower Outlier Threshold (Mean - 3*Std): {lower_outlier_threshold:.2f} seconds
    - Number of Outlier Responses: {outlier_data.shape[0]}

    Outliers:

    {mo.ui.table(outlier_data)}


    **⚠️ NOTE: These have not been removed from the dataset ⚠️**

    """)
|
||||
|
||||
Reference in New Issue
Block a user