Fix sample size: count rows of the filtered dataset, not of the transformed plot data

2026-02-03 20:48:34 +01:00
parent 9a587dcc4c
commit 36280a6ff8
3 changed files with 186 additions and 122 deletions
--- a/plots.py
+++ b/plots.py
@@ -178,8 +178,8 @@ class QualtricsPlotsMixin:
            # Use UPPERCASE for category name to distinguish from values
            parts.append(f"{display_name.upper()}: {val_str}")
        
-        # Get sample size (stored by _ensure_dataframe)
-        sample_size = getattr(self, '_last_sample_size', None)
+        # Get sample size from the filtered dataset (not from transformed plot data)
+        sample_size = self._get_filtered_sample_size()
        sample_prefix = f"Sample size: {sample_size}" if sample_size is not None else ""
        
        if not parts:
@@ -297,10 +297,7 @@ class QualtricsPlotsMixin:
        return chart

    def _ensure_dataframe(self, data: pl.LazyFrame | pl.DataFrame | None) -> pl.DataFrame:
-        """Ensure data is an eager DataFrame, collecting if necessary.
-        
-        Also stores the sample size on self._last_sample_size for use in filter descriptions.
-        """
+        """Ensure data is an eager DataFrame, collecting if necessary."""
        df = data if data is not None else getattr(self, 'data_filtered', None)
        if df is None:
             raise ValueError("No data provided and self.data_filtered is None.")
@@ -308,9 +305,21 @@ class QualtricsPlotsMixin:
        if isinstance(df, pl.LazyFrame):
            df = df.collect()
        
-        # Store sample size for filter description
-        self._last_sample_size = df.height
        return df
+    
+    def _get_filtered_sample_size(self) -> int | None:
+        """Get the sample size from the filtered dataset (self.data_filtered).
+        
+        This returns the number of respondents in the filtered dataset,
+        not the size of any transformed/aggregated data passed to plot functions.
+        """
+        data_filtered = getattr(self, 'data_filtered', None)
+        if data_filtered is None:
+            return None
+        
+        if isinstance(data_filtered, pl.LazyFrame):
+            return data_filtered.select(pl.len()).collect().item()
+        return data_filtered.height

    def _clean_voice_label(self, col_name: str) -> str:
        """Extract and clean voice name from column name for display.
@@ -681,7 +690,7 @@ class QualtricsPlotsMixin:
                ColorPalette.GENDER_FEMALE, ColorPalette.GENDER_FEMALE_NEUTRAL
            ]
            
-            chart = alt.Chart(stats_df).mark_bar().encode(
+            bars = alt.Chart(stats_df).mark_bar().encode(
                x=alt.X('item:N', title=x_label, sort='-y'),
                y=alt.Y('count:Q', title=y_label),
                color=alt.Color('gender_category:N',
@@ -692,14 +701,27 @@ class QualtricsPlotsMixin:
                    alt.Tooltip('count:Q', title='1st Place Votes'),
                    alt.Tooltip('gender:N', title='Gender')
                ]
-            ).properties(
+            )
+            
+            # Text overlay for counts
+            text = alt.Chart(stats_df).mark_text(
+                dy=-5,
+                color='black',
+                fontSize=10
+            ).encode(
+                x=alt.X('item:N', sort='-y'),
+                y=alt.Y('count:Q'),
+                text=alt.Text('count:Q')
+            )
+            
+            chart = (bars + text).properties(
                title=self._process_title(title),
                width=width or 800,
                height=height or getattr(self, 'plot_height', 400)
            )
        else:
            # Bar chart with conditional color
-            chart = alt.Chart(stats_df).mark_bar().encode(
+            bars = alt.Chart(stats_df).mark_bar().encode(
                x=alt.X('item:N', title=x_label, sort='-y'),
                y=alt.Y('count:Q', title=y_label),
                color=alt.Color('category:N',
@@ -710,7 +732,20 @@ class QualtricsPlotsMixin:
                    alt.Tooltip('item:N', title='Item'),
                    alt.Tooltip('count:Q', title='1st Place Votes')
                ]
-            ).properties(
+            )
+            
+            # Text overlay for counts
+            text = alt.Chart(stats_df).mark_text(
+                dy=-5,
+                color='black',
+                fontSize=10
+            ).encode(
+                x=alt.X('item:N', sort='-y'),
+                y=alt.Y('count:Q'),
+                text=alt.Text('count:Q')
+            )
+            
+            chart = (bars + text).properties(
                title=self._process_title(title),
                width=width or 800,
                height=height or getattr(self, 'plot_height', 400)
@@ -769,7 +804,7 @@ class QualtricsPlotsMixin:
        # Text overlay
        text = bars.mark_text(
            dy=-5,
-            color='white',
+            color='black',
            fontSize=11
        ).encode(
            text='Weighted Score:Q'