missing data analysis

2026-02-10 14:24:26 +01:00
parent 14e28cf368
commit 9dfab75925
5 changed files with 1477 additions and 7 deletions
--- a/utils.py
+++ b/utils.py
@@ -1115,6 +1115,60 @@ class QualtricsSurvey(QualtricsPlotsMixin):

        return self._get_subset(q, list(QIDs_map.keys()), rename_cols=False).rename(QIDs_map), None
    
+    def get_top_3_voices_missing_ranking(
+        self, q: pl.LazyFrame
+    ) -> pl.DataFrame:
+        """Identify respondents who completed the top-3 voice selection (QID36)
+        but are missing the explicit ranking question (QID98).
+
+        A respondent is reported when every ranking column returned by
+        ``get_top_3_voices()`` is null for them *and* their ``3_Ranked``
+        selection text from ``get_18_8_3()`` is non-null.  Such rows come back
+        all-null from ``get_top_3_voices()``, so plots like
+        ``plot_most_ranked_1`` may undercount.
+
+        Parameters:
+            q: The (optionally filtered) LazyFrame from ``load_data()``.
+
+        Returns:
+            A collected ``pl.DataFrame`` with columns:
+
+            - ``_recordId`` – the respondent identifier
+            - ``3_Ranked`` – comma-separated text of the 3 voices they selected
+
+            The frame has no rows when no respondent matches.
+        """
+        # Get the top-3 ranking data (QID98-based)
+        top3, _ = self.get_top_3_voices(q)
+        top3_df = top3.collect()
+
+        ranking_cols = [c for c in top3_df.columns if c != '_recordId']
+
+        # Respondents where every ranking column is null.  Seeding the fold
+        # with a literal True keeps the filter valid with zero ranking columns.
+        all_null_expr = pl.lit(True)
+        for col in ranking_cols:
+            all_null_expr = all_null_expr & pl.col(col).is_null()
+
+        missing_ids = top3_df.filter(all_null_expr).select('_recordId')
+
+        if missing_ids.height == 0:
+            return pl.DataFrame(schema={
+                '_recordId': pl.Utf8,
+                '3_Ranked': pl.Utf8,
+            })
+
+        # Enrich with the 3_Ranked text from the 18→8→3 question
+        v_18_8_3, _ = self.get_18_8_3(q)
+        v_df = v_18_8_3.collect()
+
+        # An inner join plus the null check keeps only respondents that have
+        # selection text, i.e. those who actually completed the selection step.
+        return missing_ids.join(
+            v_df.select(['_recordId', '3_Ranked']),
+            on='_recordId', how='inner',
+        ).filter(pl.col('3_Ranked').is_not_null())
+
    
    def get_ss_orange_red(self, q: pl.LazyFrame) -> Union[pl.LazyFrame, dict]:
        """Extract columns containing the SS Orange/Red ratings for the Chase virtual assistant.