character refine
This commit is contained in:
@@ -15,6 +15,7 @@ def _():
|
||||
return (
|
||||
JPMCSurvey,
|
||||
mo,
|
||||
pl,
|
||||
plot_average_scores_with_counts,
|
||||
plot_top3_ranking_distribution,
|
||||
)
|
||||
@@ -31,29 +32,88 @@ def _():
|
||||
@app.cell
def _(JPMCSurvey, QSF_FILE, RESULTS_FILE):
    # Construct the survey object; arguments are passed positionally as
    # (RESULTS_FILE, QSF_FILE).
    survey = JPMCSurvey(RESULTS_FILE, QSF_FILE)

    # Evaluate the QID -> description map as the last expression of the cell.
    # NOTE(review): presumably this is what marimo renders as the cell output.
    _qid_descriptions = survey.qid_descr_map
    _qid_descriptions
    return (survey,)
|
||||
|
||||
|
||||
@app.cell
def _(survey):
    # Load the survey data; `data` is handed on to the other cells.
    # NOTE(review): `.collect()` suggests `load_data()` returns a polars
    # LazyFrame — confirm against JPMCSurvey.
    data = survey.load_data()

    # Materialise once and reuse the result for the preview below instead of
    # collecting a second time.
    df = data.collect()

    # Preview only the columns whose name contains 'QID98'.
    df.select([q for q in df.columns if 'QID98' in q])
    return (data,)
|
||||
@app.cell
|
||||
def _():
|
||||
# survey.qid_descr_map
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# Data Cleanup
|
||||
# Data Validation
|
||||
""")
|
||||
return
|
||||
|
||||
- Remove incomplete responses (progress < 100)
|
||||
- Flag outliers based on duration (add column)
|
||||
- Flag responses that give the same rating for everything (indicates lack of engagement)
|
||||
|
||||
@app.cell
def _(data, mo, pl):
    def check_progress(data):
        """Report whether every response has progress equal to 100.

        Args:
            data: Frame with a ``progress`` column.
                NOTE(review): assumed to be a polars LazyFrame with a numeric
                ``progress`` column — confirm against ``survey.load_data()``.

        Returns:
            A ``mo.md`` element: a success message when no incomplete response
            exists, otherwise a warning message.
        """
        # Look for rows that are actually incomplete. Checking only that a
        # single distinct value exists would also pass when every response
        # sits at, say, 50. A missing progress value counts as incomplete.
        incomplete = data.filter(
            pl.col('progress').is_null() | (pl.col('progress') != 100)
        ).collect()

        if incomplete.height == 0:
            return mo.md("""## ✅ All responses are complete (progress = 100) """)

        return mo.md("## ⚠️ There are incomplete responses (progress < 100) ⚠️")

    check_progress(data)

    return
|
||||
|
||||
|
||||
@app.cell
def _(data, mo, pl):

    def duration_validation(data):
        """Flag responses whose duration is more than 3 standard deviations from the mean.

        Args:
            data: Frame with a ``duration`` column.
                NOTE(review): assumed to be a polars LazyFrame with ``duration``
                in seconds (the report text says "seconds") — confirm.

        Returns:
            A ``mo.md`` element: a success message when nothing is flagged, a
            notice when the statistics cannot be computed, otherwise a report
            with the thresholds and a table of the flagged rows. The input data
            is never modified; flagged rows are only reported.
        """
        # Identify any outliers in duration
        duration_stats = data.select(
            pl.col('duration').mean().alias('mean_duration'),
            pl.col('duration').std().alias('std_duration'),
        ).collect()
        mean_duration = duration_stats['mean_duration'][0]
        std_duration = duration_stats['std_duration'][0]

        # With fewer than two responses the mean and/or standard deviation is
        # null, and the threshold arithmetic below would raise a TypeError.
        if mean_duration is None or std_duration is None:
            return mo.md("## ⚠️ Not enough responses to check for duration outliers ⚠️")

        upper_outlier_threshold = mean_duration + 3 * std_duration
        lower_outlier_threshold = mean_duration - 3 * std_duration

        is_outlier = (pl.col('duration') > upper_outlier_threshold) | (
            pl.col('duration') < lower_outlier_threshold
        )

        # Keep the flag column so it is visible in the rendered table, then
        # keep only the flagged rows (the boolean column is the predicate).
        outlier_data = (
            data.with_columns(is_outlier.alias('outlier_duration'))
            .filter(pl.col('outlier_duration'))
            .collect()
        )

        if outlier_data.height == 0:
            return mo.md("## ✅ No duration outliers detected")

        return mo.md(f"""
        ## ⚠️ Duration Outliers Detected ⚠️
        - Mean Duration: {mean_duration:.2f} seconds (approximately {mean_duration/60:.2f} minutes)
        - Standard Deviation of Duration: {std_duration:.2f} seconds
        - Upper Outlier Threshold (Mean + 3*Std): {upper_outlier_threshold:.2f} seconds
        - Lower Outlier Threshold (Mean - 3*Std): {lower_outlier_threshold:.2f} seconds
        - Number of Outlier Responses: {outlier_data.height}

        Outliers:

        {mo.ui.table(outlier_data)}


        **⚠️ NOTE: These have not been removed from the dataset ⚠️**

        """)

    duration_validation(data)
    return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# Data Analysis
|
||||
""")
|
||||
return
|
||||
|
||||
@@ -61,64 +121,67 @@ def _(mo):
|
||||
@app.cell
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# Answers Decoding
|
||||
|
||||
Pipeline to decode the ranking of voices. Currently saved as QID's, they need to be remapped back to their actual values so that the analysis can be performed. ie:
|
||||
|
||||
`GQIK26_G0_x8_RANK` -> Refers to question `Top 3 Traits_0_8_RANK - What are the important traits for the Chase AI virtual assistant?` and thus the #8 option
|
||||
## Demographics
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## TODO:
|
||||
|
||||
Create a python function for each of the questions. ie `def QID63()`. Each function should return a Polars query, that can be added to an existing query.
|
||||
|
||||
Ideas:
|
||||
- Map column name to include the Voice number (VID) (ie the questions that only have 1 voice). The VID is in this case often included in the question description
|
||||
- `QID_x_GROUP` Contains the rankings of the values, stored in order. The following columns (ie `QID26_G0_x1_RANK`) are redundant and not necessary for us. The function should drop the unnecessary columns to clean up
|
||||
<!-- - Translate the RANK values back to the actual VID, and create an aggregate column that contains a list of the VIDs in order. ie: [V34, V56, V81].
|
||||
- Use the first line of the question description (see `qid_descr_map`) to get the `"DataExportTag"`, which is a property that can be found in the `.qsf` file to inspect the choice number and it's corresponding VID
|
||||
- "`VOICE SEL. 8-3_0_5_RANK`" refers to `"DataExportTag": "VOICE SEL. 8-3"`, `Group 0` (not important for this), `Choice 5`, and the value in the cell refers to the Rank assigned to that voice
|
||||
- QSF file example to retrieve the VID: `"SurveyElements" -> (Find item where "Payload"["DataExportTag"] == "VOICE SEL. 8-3") -> "Payload" -> "Choices" -> "5" -> "Display" -> (Extract 'Voice <xx>' from the HTML)` -->
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
def _(survey):
    # Look up the QSF entry for question 'QID36' and keep only its 'Payload'.
    _question = survey._get_qsf_question_by_QID('QID36')
    cfg = _question['Payload']

    # Last expression of the cell, evaluated for inspection.
    cfg
    return
|
||||
|
||||
|
||||
@app.cell
def _(data, survey):
    # Element 0 of the get_demographics result is collected for display.
    # NOTE(review): what the remaining elements hold is not visible here.
    _demographics = survey.get_demographics(data)[0]
    _demographics.collect()
    return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## Top 8 traits
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
def _(data, survey):
    # Element 0 of the get_top_8_traits result is collected for display.
    _top_8_traits = survey.get_top_8_traits(data)[0]
    _top_8_traits.collect()
    return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## Top 3 traits
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
def _(data, survey):
    # Element 0 of the get_top_3_traits result is collected for display.
    _top_3_traits = survey.get_top_3_traits(data)[0]
    _top_3_traits.collect()
    return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## Character Ranking
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
def _(data, survey):
    # Element 0 of the get_character_ranking result is collected for display.
    _character_ranking = survey.get_character_ranking(data)[0]
    _character_ranking.collect()
    return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## Voices 18 -> 8 -> 3
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(data, survey):
|
||||
survey.get_18_8_3(data)[0].collect()
|
||||
@@ -128,7 +191,7 @@ def _(data, survey):
|
||||
@app.cell
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# Voice Scales 1-10
|
||||
## Voice Scales 1-10
|
||||
""")
|
||||
return
|
||||
|
||||
@@ -149,7 +212,7 @@ def _(plot_average_scores_with_counts, vscales):
|
||||
@app.cell
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# SS Green Blue
|
||||
## SS Green Blue
|
||||
""")
|
||||
return
|
||||
|
||||
@@ -164,7 +227,7 @@ def _(data, survey):
|
||||
@app.cell
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# Top 3 Voices
|
||||
## Top 3 Voices
|
||||
""")
|
||||
return
|
||||
|
||||
@@ -177,9 +240,9 @@ def _(data, survey):
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(top3_voices):
|
||||
def _():
|
||||
|
||||
print(top3_voices.head())
|
||||
# print(top3_voices.head())
|
||||
return
|
||||
|
||||
|
||||
@@ -192,7 +255,7 @@ def _(plot_top3_ranking_distribution, top3_voices):
|
||||
@app.cell
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# SS Orange / Red
|
||||
## SS Orange / Red
|
||||
""")
|
||||
return
|
||||
|
||||
@@ -205,5 +268,35 @@ def _(data, survey):
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## Character Refine
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
def _(data, survey):
    # Keep the un-collected element 0 of get_top_8_traits under the name
    # `traits_original`; it is returned for use by other cells.
    _top_8_result = survey.get_top_8_traits(data)
    traits_original = _top_8_result[0]

    # Collected copy is only evaluated here as the last expression of the cell.
    traits_original.collect()
    return (traits_original,)
|
||||
|
||||
|
||||
@app.cell
def _(data, survey):
    # Keep the un-collected element 0 of get_character_refine under the name
    # `traits_refined`; it is returned for use by other cells.
    _refine_result = survey.get_character_refine(data)
    traits_refined = _refine_result[0]

    # Collected copy is only evaluated here as the last expression of the cell.
    traits_refined.collect()
    return (traits_refined,)
|
||||
|
||||
|
||||
@app.cell
def _(traits_original, traits_refined):
    # Put the original and refined traits side by side for comparison by
    # joining them on the shared '_recordId' column.
    # NOTE(review): no `how` is passed, so the library's default join type
    # applies — confirm that rows present on only one side may be dropped.
    _join_key = '_recordId'
    traits_comparison = traits_original.join(traits_refined, on=_join_key)

    traits_comparison.collect()
    return
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app.run()
|
||||
|
||||
Reference in New Issue
Block a user