character refine

This commit is contained in:
2026-01-23 08:41:23 +01:00
parent 0e1126563e
commit 42f2d775c7
6 changed files with 319 additions and 70 deletions

View File

@@ -15,6 +15,7 @@ def _():
return (
JPMCSurvey,
mo,
pl,
plot_average_scores_with_counts,
plot_top3_ranking_distribution,
)
@@ -31,29 +32,88 @@ def _():
@app.cell
def _(JPMCSurvey, QSF_FILE, RESULTS_FILE):
survey = JPMCSurvey(RESULTS_FILE, QSF_FILE)
survey.qid_descr_map
return (survey,)
@app.cell
def _(survey):
data = survey.load_data()
df = data.collect()
data.collect()
return data, survey
df.select([q for q in df.columns if 'QID98' in q])
return (data,)
@app.cell
def _():
# Scratch cell: QID-to-description map inspection, kept commented out for reference.
# survey.qid_descr_map
return
@app.cell
def _(mo):
mo.md(r"""
# Data Cleanup
# Data Validation
""")
return
- Remove incomplete responses (progress < 100)
- Flag outliers based on duration (add column)
- Flag responses that give the same rating for everything (indicates lack of engagement)
@app.cell
def _(data, mo, pl):
    def check_progress(data):
        """Report whether every survey response is complete.

        Completeness means ``progress == 100`` for every row. The previous
        check only verified that a single distinct ``progress`` value exists,
        which would wrongly report success if all responses shared some other
        value (e.g. every row at 50); we now also confirm that the lone
        distinct value is actually 100.
        """
        unique_progress = data.select(pl.col('progress').unique()).collect()
        values = unique_progress['progress'].to_list()
        # NOTE(review): assumes `progress` is stored as a number — confirm against the dataset.
        if len(values) == 1 and values[0] == 100:
            return mo.md("""## ✅ All responses are complete (progress = 100) """)
        return mo.md("## ⚠️ There are incomplete responses (progress < 100) ⚠️")

    check_progress(data)
    return
@app.cell
def _(data, mo, pl):
    def duration_validation(data):
        """Flag survey durations outside mean ± 3 standard deviations.

        Outliers are reported for review only; nothing is removed from
        the dataset.
        """
        # Gather both summary statistics in a single pass over the lazy frame.
        stats = data.select(
            pl.col('duration').mean().alias('mean_duration'),
            pl.col('duration').std().alias('std_duration'),
        ).collect()
        mean_duration = stats['mean_duration'][0]
        std_duration = stats['std_duration'][0]

        # Classic 3-sigma band around the mean.
        upper_outlier_threshold = mean_duration + 3 * std_duration
        lower_outlier_threshold = mean_duration - 3 * std_duration

        # Annotate every row with a boolean outlier flag, then keep only
        # the flagged rows for display.
        flagged = data.with_columns(
            (
                (pl.col('duration') > upper_outlier_threshold)
                | (pl.col('duration') < lower_outlier_threshold)
            ).alias('outlier_duration')
        )
        outlier_data = flagged.filter(pl.col('outlier_duration') == True).collect()

        if outlier_data.shape[0] == 0:
            return mo.md("## ✅ No duration outliers detected")
        return mo.md(f"""
## ⚠️ Duration Outliers Detected ⚠️
- Mean Duration: {mean_duration:.2f} seconds (approximately {mean_duration/60:.2f} minutes)
- Standard Deviation of Duration: {std_duration:.2f} seconds
- Upper Outlier Threshold (Mean + 3*Std): {upper_outlier_threshold:.2f} seconds
- Lower Outlier Threshold (Mean - 3*Std): {lower_outlier_threshold:.2f} seconds
- Number of Outlier Responses: {outlier_data.shape[0]}
Outliers:
{mo.ui.table(outlier_data)}
**⚠️ NOTE: These have not been removed from the dataset ⚠️**
""")

    duration_validation(data)
    return
@app.cell
def _(mo):
# Section heading: top-level "Data Analysis" divider for the notebook.
mo.md(r"""
# Data Analysis
""")
return
@@ -61,64 +121,67 @@ def _(mo):
@app.cell
def _(mo):
mo.md(r"""
# Answers Decoding
Pipeline to decode the ranking of voices. Currently saved as QID's, they need to be remapped back to their actual values so that the analysis can be performed. ie:
`GQIK26_G0_x8_RANK` -> Refers to question `Top 3 Traits_0_8_RANK - What are the important traits for the Chase AI virtual assistant?` and thus the #8 option
## Demographics
""")
return
@app.cell(hide_code=True)
def _(mo):
# Hidden planning cell: design notes for the per-question decoding pipeline.
mo.md(r"""
## TODO:
Create a python function for each of the questions. ie `def QID63()`. Each function should return a Polars query, that can be added to an existing query.
Ideas:
- Map column name to include the Voice number (VID) (ie the questions that only have 1 voice). The VID is in this case often included in the question description
- `QID_x_GROUP` Contains the rankings of the values, stored in order. The following columns (ie `QID26_G0_x1_RANK`) are redundant and not necessary for us. The function should drop the unnecessary columns to clean up
<!-- - Translate the RANK values back to the actual VID, and create an aggregate column that contains a list of the VIDs in order. ie: [V34, V56, V81].
- Use the first line of the question description (see `qid_descr_map`) to get the `"DataExportTag"`, which is a property that can be found in the `.qsf` file to inspect the choice number and it's corresponding VID
- "`VOICE SEL. 8-3_0_5_RANK`" refers to `"DataExportTag": "VOICE SEL. 8-3"`, `Group 0` (not important for this), `Choice 5`, and the value in the cell refers to the Rank assigned to that voice
- QSF file example to retrieve the VID: `"SurveyElements" -> (Find item where "Payload"["DataExportTag"] == "VOICE SEL. 8-3") -> "Payload" -> "Choices" -> "5" -> "Display" -> (Extract 'Voice <xx>' from the HTML)` -->
""")
return
@app.cell
def _(survey):
# Inspect the raw QSF payload for question QID36 (uses a private survey helper).
cfg = survey._get_qsf_question_by_QID('QID36')['Payload']
# Last expression is displayed by the notebook.
cfg
return
@app.cell
def _(data, survey):
# Materialize and display the demographics table (first element of the returned tuple).
survey.get_demographics(data)[0].collect()
return
@app.cell
def _(mo):
# Section heading for the "Top 8 traits" analysis below.
mo.md(r"""
## Top 8 traits
""")
return
@app.cell
def _(data, survey):
# Materialize and display the Top-8 traits table.
survey.get_top_8_traits(data)[0].collect()
return
@app.cell
def _(mo):
# Section heading for the "Top 3 traits" analysis below.
mo.md(r"""
## Top 3 traits
""")
return
@app.cell
def _(data, survey):
# Materialize and display the Top-3 traits table.
survey.get_top_3_traits(data)[0].collect()
return
@app.cell
def _(mo):
# Section heading for the "Character Ranking" analysis below.
mo.md(r"""
## Character Ranking
""")
return
@app.cell
def _(data, survey):
# Materialize and display the character-ranking table.
survey.get_character_ranking(data)[0].collect()
return
@app.cell
def _(mo):
# Section heading for the voice-narrowing funnel (18 -> 8 -> 3 selection).
mo.md(r"""
## Voices 18 -> 8 -> 3
""")
return
@app.cell
def _(data, survey):
survey.get_18_8_3(data)[0].collect()
@@ -128,7 +191,7 @@ def _(data, survey):
@app.cell
def _(mo):
mo.md(r"""
# Voice Scales 1-10
## Voice Scales 1-10
""")
return
@@ -149,7 +212,7 @@ def _(plot_average_scores_with_counts, vscales):
@app.cell
def _(mo):
mo.md(r"""
# SS Green Blue
## SS Green Blue
""")
return
@@ -164,7 +227,7 @@ def _(data, survey):
@app.cell
def _(mo):
mo.md(r"""
# Top 3 Voices
## Top 3 Voices
""")
return
@@ -177,9 +240,9 @@ def _(data, survey):
@app.cell
def _(top3_voices):
def _():
print(top3_voices.head())
# print(top3_voices.head())
return
@@ -192,7 +255,7 @@ def _(plot_top3_ranking_distribution, top3_voices):
@app.cell
def _(mo):
mo.md(r"""
# SS Orange / Red
## SS Orange / Red
""")
return
@@ -205,5 +268,35 @@ def _(data, survey):
return
@app.cell(hide_code=True)
def _(mo):
# Section heading for the "Character Refine" comparison below.
mo.md(r"""
## Character Refine
""")
return
@app.cell
def _(data, survey):
# Lazy frame of the original Top-8 traits answers; collected here for display
# and exported for the side-by-side comparison cell.
traits_original = survey.get_top_8_traits(data)[0]
traits_original.collect()
return (traits_original,)
@app.cell
def _(data, survey):
# Lazy frame of the refined character traits; collected here for display
# and exported for the side-by-side comparison cell.
traits_refined = survey.get_character_refine(data)[0]
traits_refined.collect()
return (traits_refined,)
@app.cell
def _(traits_original, traits_refined):
# Join original and refined traits on the respondent id so the two answer
# sets sit side by side for comparison (default polars join semantics;
# NOTE(review): overlapping column names get a suffix — confirm acceptable).
traits_comparison = traits_original.join(traits_refined, on='_recordId')
traits_comparison.collect()
return
# Standard marimo entry point: run the notebook app when executed as a script.
if __name__ == "__main__":
app.run()