missing data analysis

This commit is contained in:
2026-02-10 14:24:26 +01:00
parent 14e28cf368
commit 9dfab75925
5 changed files with 1477 additions and 7 deletions

View File

@@ -1115,6 +1115,60 @@ class QualtricsSurvey(QualtricsPlotsMixin):
return self._get_subset(q, list(QIDs_map.keys()), rename_cols=False).rename(QIDs_map), None
def get_top_3_voices_missing_ranking(
    self, q: pl.LazyFrame
) -> pl.DataFrame:
    """Identify respondents who completed the top-3 voice selection (QID36)
    but are missing the explicit ranking question (QID98).

    These respondents picked 3 voices in the selection step, but every
    ranking column returned by ``get_top_3_voices()`` (the ``QID98_*``
    columns) is null for them. ``get_top_3_voices()`` therefore yields
    all-null rows for these respondents, causing plots like
    ``plot_most_ranked_1`` to undercount.

    Parameters:
        q: The (optionally filtered) LazyFrame from ``load_data()``.

    Returns:
        A collected ``pl.DataFrame`` with exactly two columns:

        - ``_recordId``  the respondent identifier
        - ``3_Ranked``   the ``3_Ranked`` value from ``get_18_8_3()`` for
          that respondent (the text of the 3 voices they selected); null
          when ``get_18_8_3()`` has no row for the respondent, because
          the enrichment is a left join.

        The QID36 selection-order columns are *not* part of the result.
        When no respondent is affected, an empty frame with the same two
        columns (both ``pl.Utf8``) is returned.
    """
    # Ranking data; every column except the identifier is a ranking column.
    top3, _ = self.get_top_3_voices(q)
    top3_df = top3.collect()
    ranking_cols = [c for c in top3_df.columns if c != '_recordId']

    if ranking_cols:
        # A respondent is "missing" when all ranking columns are null.
        all_rankings_null = pl.all_horizontal(
            [pl.col(c).is_null() for c in ranking_cols]
        )
        missing_ids = top3_df.filter(all_rankings_null).select('_recordId')
    else:
        # No ranking columns at all: the "all null" condition is vacuously
        # true, so every respondent is reported as missing.
        missing_ids = top3_df.select('_recordId')

    if missing_ids.is_empty():
        # Skip the second query entirely when nobody is affected.
        return pl.DataFrame(schema={
            '_recordId': pl.Utf8,
            '3_Ranked': pl.Utf8,
        })

    # Enrich with the 3_Ranked text from the 18→8→3 question. Selecting
    # before collecting means only the two needed columns are materialized.
    v_18_8_3, _ = self.get_18_8_3(q)
    selections = v_18_8_3.select(['_recordId', '3_Ranked']).collect()

    return missing_ids.join(selections, on='_recordId', how='left')
def get_ss_orange_red(self, q: pl.LazyFrame) -> Union[pl.LazyFrame, dict]:
"""Extract columns containing the SS Orange/Red ratings for the Chase virtual assistant.