character refine

This commit is contained in:
2026-01-23 08:41:23 +01:00
parent 0e1126563e
commit 42f2d775c7
6 changed files with 319 additions and 70 deletions

View File

@@ -15,6 +15,7 @@ def _():
return (
JPMCSurvey,
mo,
pl,
plot_average_scores_with_counts,
plot_top3_ranking_distribution,
)
@@ -31,29 +32,88 @@ def _():
@app.cell
def _(JPMCSurvey, QSF_FILE, RESULTS_FILE):
survey = JPMCSurvey(RESULTS_FILE, QSF_FILE)
survey.qid_descr_map
return (survey,)
@app.cell
def _(survey):
data = survey.load_data()
df = data.collect()
data.collect()
return data, survey
df.select([q for q in df.columns if 'QID98' in q])
return (data,)
@app.cell
def _():
# Scratch cell: QID-to-description map inspection, kept commented out for reference.
# survey.qid_descr_map
return
@app.cell
def _(mo):
mo.md(r"""
# Data Cleanup
# Data Validation
""")
return
- Remove incomplete responses (progress < 100)
- Flag outliers based on duration (add column)
- Flag responses that give the same rating for everything (indicates lack of engagement)
@app.cell
def _(data, mo, pl):
    def check_progress(data):
        """Report whether every survey response is complete.

        Completeness means ``progress == 100`` for every row. The previous
        check only verified that a single distinct ``progress`` value exists,
        which would wrongly report success if all responses shared some other
        value (e.g. every row at 50); we now also confirm that the lone
        distinct value is actually 100.
        """
        unique_progress = data.select(pl.col('progress').unique()).collect()
        values = unique_progress['progress'].to_list()
        # NOTE(review): assumes `progress` is stored as a number — confirm against the dataset.
        if len(values) == 1 and values[0] == 100:
            return mo.md("""## ✅ All responses are complete (progress = 100) """)
        return mo.md("## ⚠️ There are incomplete responses (progress < 100) ⚠️")

    check_progress(data)
    return
@app.cell
def _(data, mo, pl):
    def duration_validation(data):
        """Flag survey durations outside mean ± 3 standard deviations.

        Outliers are reported for review only; nothing is removed from
        the dataset.
        """
        # Gather both summary statistics in a single pass over the lazy frame.
        stats = data.select(
            pl.col('duration').mean().alias('mean_duration'),
            pl.col('duration').std().alias('std_duration'),
        ).collect()
        mean_duration = stats['mean_duration'][0]
        std_duration = stats['std_duration'][0]

        # Classic 3-sigma band around the mean.
        upper_outlier_threshold = mean_duration + 3 * std_duration
        lower_outlier_threshold = mean_duration - 3 * std_duration

        # Annotate every row with a boolean outlier flag, then keep only
        # the flagged rows for display.
        flagged = data.with_columns(
            (
                (pl.col('duration') > upper_outlier_threshold)
                | (pl.col('duration') < lower_outlier_threshold)
            ).alias('outlier_duration')
        )
        outlier_data = flagged.filter(pl.col('outlier_duration') == True).collect()

        if outlier_data.shape[0] == 0:
            return mo.md("## ✅ No duration outliers detected")
        return mo.md(f"""
## ⚠️ Duration Outliers Detected ⚠️
- Mean Duration: {mean_duration:.2f} seconds (approximately {mean_duration/60:.2f} minutes)
- Standard Deviation of Duration: {std_duration:.2f} seconds
- Upper Outlier Threshold (Mean + 3*Std): {upper_outlier_threshold:.2f} seconds
- Lower Outlier Threshold (Mean - 3*Std): {lower_outlier_threshold:.2f} seconds
- Number of Outlier Responses: {outlier_data.shape[0]}
Outliers:
{mo.ui.table(outlier_data)}
**⚠️ NOTE: These have not been removed from the dataset ⚠️**
""")

    duration_validation(data)
    return
@app.cell
def _(mo):
# Section heading: top-level "Data Analysis" divider for the notebook.
mo.md(r"""
# Data Analysis
""")
return
@@ -61,64 +121,67 @@ def _(mo):
@app.cell
def _(mo):
mo.md(r"""
# Answers Decoding
Pipeline to decode the ranking of voices. Currently saved as QID's, they need to be remapped back to their actual values so that the analysis can be performed. ie:
`GQIK26_G0_x8_RANK` -> Refers to question `Top 3 Traits_0_8_RANK - What are the important traits for the Chase AI virtual assistant?` and thus the #8 option
## Demographics
""")
return
@app.cell(hide_code=True)
def _(mo):
# Hidden planning cell: design notes for the per-question decoding pipeline.
mo.md(r"""
## TODO:
Create a python function for each of the questions. ie `def QID63()`. Each function should return a Polars query, that can be added to an existing query.
Ideas:
- Map column name to include the Voice number (VID) (ie the questions that only have 1 voice). The VID is in this case often included in the question description
- `QID_x_GROUP` Contains the rankings of the values, stored in order. The following columns (ie `QID26_G0_x1_RANK`) are redundant and not necessary for us. The function should drop the unnecessary columns to clean up
<!-- - Translate the RANK values back to the actual VID, and create an aggregate column that contains a list of the VIDs in order. ie: [V34, V56, V81].
- Use the first line of the question description (see `qid_descr_map`) to get the `"DataExportTag"`, which is a property that can be found in the `.qsf` file to inspect the choice number and it's corresponding VID
- "`VOICE SEL. 8-3_0_5_RANK`" refers to `"DataExportTag": "VOICE SEL. 8-3"`, `Group 0` (not important for this), `Choice 5`, and the value in the cell refers to the Rank assigned to that voice
- QSF file example to retrieve the VID: `"SurveyElements" -> (Find item where "Payload"["DataExportTag"] == "VOICE SEL. 8-3") -> "Payload" -> "Choices" -> "5" -> "Display" -> (Extract 'Voice <xx>' from the HTML)` -->
""")
return
@app.cell
def _(survey):
# Inspect the raw QSF payload for question QID36 (uses a private survey helper).
cfg = survey._get_qsf_question_by_QID('QID36')['Payload']
# Last expression is displayed by the notebook.
cfg
return
@app.cell
def _(data, survey):
# Materialize and display the demographics table (first element of the returned tuple).
survey.get_demographics(data)[0].collect()
return
@app.cell
def _(mo):
# Section heading for the "Top 8 traits" analysis below.
mo.md(r"""
## Top 8 traits
""")
return
@app.cell
def _(data, survey):
# Materialize and display the Top-8 traits table.
survey.get_top_8_traits(data)[0].collect()
return
@app.cell
def _(mo):
# Section heading for the "Top 3 traits" analysis below.
mo.md(r"""
## Top 3 traits
""")
return
@app.cell
def _(data, survey):
# Materialize and display the Top-3 traits table.
survey.get_top_3_traits(data)[0].collect()
return
@app.cell
def _(mo):
# Section heading for the "Character Ranking" analysis below.
mo.md(r"""
## Character Ranking
""")
return
@app.cell
def _(data, survey):
# Materialize and display the character-ranking table.
survey.get_character_ranking(data)[0].collect()
return
@app.cell
def _(mo):
# Section heading for the voice-narrowing funnel (18 -> 8 -> 3 selection).
mo.md(r"""
## Voices 18 -> 8 -> 3
""")
return
@app.cell
def _(data, survey):
survey.get_18_8_3(data)[0].collect()
@@ -128,7 +191,7 @@ def _(data, survey):
@app.cell
def _(mo):
mo.md(r"""
# Voice Scales 1-10
## Voice Scales 1-10
""")
return
@@ -149,7 +212,7 @@ def _(plot_average_scores_with_counts, vscales):
@app.cell
def _(mo):
mo.md(r"""
# SS Green Blue
## SS Green Blue
""")
return
@@ -164,7 +227,7 @@ def _(data, survey):
@app.cell
def _(mo):
mo.md(r"""
# Top 3 Voices
## Top 3 Voices
""")
return
@@ -177,9 +240,9 @@ def _(data, survey):
@app.cell
def _(top3_voices):
def _():
print(top3_voices.head())
# print(top3_voices.head())
return
@@ -192,7 +255,7 @@ def _(plot_top3_ranking_distribution, top3_voices):
@app.cell
def _(mo):
mo.md(r"""
# SS Orange / Red
## SS Orange / Red
""")
return
@@ -205,5 +268,35 @@ def _(data, survey):
return
@app.cell(hide_code=True)
def _(mo):
# Section heading for the "Character Refine" comparison below.
mo.md(r"""
## Character Refine
""")
return
@app.cell
def _(data, survey):
# Lazy frame of the original Top-8 traits answers; collected here for display
# and exported for the side-by-side comparison cell.
traits_original = survey.get_top_8_traits(data)[0]
traits_original.collect()
return (traits_original,)
@app.cell
def _(data, survey):
# Lazy frame of the refined character traits; collected here for display
# and exported for the side-by-side comparison cell.
traits_refined = survey.get_character_refine(data)[0]
traits_refined.collect()
return (traits_refined,)
@app.cell
def _(traits_original, traits_refined):
# Join original and refined traits on the respondent id so the two answer
# sets sit side by side for comparison (default polars join semantics;
# NOTE(review): overlapping column names get a suffix — confirm acceptable).
traits_comparison = traits_original.join(traits_refined, on='_recordId')
traits_comparison.collect()
return
# Standard marimo entry point: run the notebook app when executed as a script.
if __name__ == "__main__":
app.run()