missing data analysis
This commit is contained in:
54
utils.py
54
utils.py
@@ -1115,6 +1115,60 @@ class QualtricsSurvey(QualtricsPlotsMixin):
|
||||
|
||||
return self._get_subset(q, list(QIDs_map.keys()), rename_cols=False).rename(QIDs_map), None
|
||||
|
||||
def get_top_3_voices_missing_ranking(
    self, q: pl.LazyFrame
) -> pl.DataFrame:
    """Identify respondents who completed the top-3 voice selection (QID36)
    but are missing the explicit ranking question (QID98).

    These respondents picked 3 voices in the selection step and have
    selection-order data in ``QID36_G0_*_RANK``, but all 18 ``QID98_*``
    ranking columns are null. This means ``get_top_3_voices()`` will
    return all-null rows for them, causing plots like
    ``plot_most_ranked_1`` to undercount.

    Parameters:
        q: The (optionally filtered) LazyFrame from ``load_data()``.

    Returns:
        A collected ``pl.DataFrame`` with exactly two columns:

        - ``_recordId`` – the respondent identifier
        - ``3_Ranked`` – text of the 3 voices they selected, as produced
          by ``get_18_8_3()``

        When no respondent is affected, an empty frame with those two
        columns (both ``pl.Utf8``) is returned. The QID36 selection-order
        values are *not* included in the result; they are not preference
        ranks and must not be used as a substitute for QID98.
    """
    # Get the top-3 ranking data (QID98-based). Collected eagerly because
    # the column list is needed to build the null predicate below.
    top3, _ = self.get_top_3_voices(q)
    top3_df = top3.collect()

    # Every column except the identifier is treated as a ranking column.
    ranking_cols = [c for c in top3_df.columns if c != '_recordId']

    # A respondent is "missing" when every ranking column is null.
    # ``is_null()`` never yields null itself, so ``all_horizontal`` is a
    # plain logical AND across the row. With no ranking columns at all the
    # condition is vacuously true, i.e. every respondent is reported.
    null_checks = [pl.col(c).is_null() for c in ranking_cols]
    all_null_expr = pl.all_horizontal(null_checks) if null_checks else pl.lit(True)

    missing_ids = top3_df.filter(all_null_expr).select('_recordId')

    if missing_ids.height == 0:
        # Nothing to enrich: skip the second query and return an empty
        # frame with the same column names as the populated result.
        return pl.DataFrame(schema={
            '_recordId': pl.Utf8,
            '3_Ranked': pl.Utf8,
        })

    # Enrich with the 3_Ranked text from the 18→8→3 question
    v_18_8_3, _ = self.get_18_8_3(q)
    v_df = v_18_8_3.collect()

    # NOTE(review): this is a left join, so a respondent with no matching
    # row (or a null ``3_Ranked``) in the 18→8→3 data is still returned,
    # with ``3_Ranked`` null. If such respondents never completed the
    # selection step they arguably should be excluded — confirm against
    # the survey flow before relying on the row count.
    return missing_ids.join(
        v_df.select(['_recordId', '3_Ranked']),
        on='_recordId',
        how='left',
    )
|
||||
|
||||
|
||||
def get_ss_orange_red(self, q: pl.LazyFrame) -> Union[pl.LazyFrame, dict]:
|
||||
"""Extract columns containing the SS Orange/Red ratings for the Chase virtual assistant.
|
||||
|
||||
Reference in New Issue
Block a user