straight-liner plot analysis

2026-02-09 17:26:45 +01:00
parent 92c6fc03ab
commit 6c16993cb3
4 changed files with 897 additions and 24 deletions
--- a/plots.py
+++ b/plots.py
@@ -1115,6 +1115,7 @@ class QualtricsPlotsMixin:
        title: str = "Speaking Style Trait Analysis",
        height: int | None = None,
        width: int | str | None = None,
+        color_gender: bool = False,
    ) -> alt.Chart:
        """Plot scores for a single speaking style trait across multiple voices."""
        df = self._ensure_dataframe(data)
@@ -1156,36 +1157,71 @@ class QualtricsPlotsMixin:
            else:
                trait_description = ""

-        # Horizontal bar chart - use x2 to explicitly start bars at x=1
-        bars = alt.Chart(stats).mark_bar(color=ColorPalette.PRIMARY).encode(
-            x=alt.X('mean_score:Q', title='Average Score (1-5)', scale=alt.Scale(domain=[1, 5]), axis=alt.Axis(grid=True)),
-            x2=alt.datum(1),  # Bars start at x=1 (left edge of domain)
-            y=alt.Y('Voice:N', title='Voice', sort='-x', axis=alt.Axis(grid=False)),
-            tooltip=[
-                alt.Tooltip('Voice:N'),
-                alt.Tooltip('mean_score:Q', title='Average', format='.2f'),
-                alt.Tooltip('count:Q', title='Count')
-            ]
-        )
+        if color_gender:
+            stats['gender'] = stats['Voice'].apply(self._get_voice_gender)
+            
+            bars = alt.Chart(stats).mark_bar().encode(
+                x=alt.X('mean_score:Q', title='Average Score (1-5)', scale=alt.Scale(domain=[1, 5]), axis=alt.Axis(grid=True)),
+                x2=alt.datum(1),  # Bars start at x=1 (left edge of domain)
+                y=alt.Y('Voice:N', title='Voice', sort='-x', axis=alt.Axis(grid=False)),
+                color=alt.Color('gender:N',
+                               scale=alt.Scale(domain=['Male', 'Female'],
+                                             range=[ColorPalette.GENDER_MALE, ColorPalette.GENDER_FEMALE]),
+                               legend=alt.Legend(orient='top', direction='horizontal', title='Gender')),
+                tooltip=[
+                    alt.Tooltip('Voice:N'),
+                    alt.Tooltip('mean_score:Q', title='Average', format='.2f'),
+                    alt.Tooltip('count:Q', title='Count'),
+                    alt.Tooltip('gender:N', title='Gender')
+                ]
+            )
+            
+            text = alt.Chart(stats).mark_text(
+                align='left',
+                baseline='middle',
+                dx=5,
+                fontSize=12
+            ).encode(
+                x='mean_score:Q',
+                y=alt.Y('Voice:N', sort='-x'),
+                text='count:Q',
+                color=alt.condition(
+                    alt.datum.gender == 'Female',
+                    alt.value(ColorPalette.GENDER_FEMALE),
+                    alt.value(ColorPalette.GENDER_MALE)
+                )
+            )
+        else:
+            # Horizontal bar chart - use x2 to explicitly start bars at x=1
+            bars = alt.Chart(stats).mark_bar(color=ColorPalette.PRIMARY).encode(
+                x=alt.X('mean_score:Q', title='Average Score (1-5)', scale=alt.Scale(domain=[1, 5]), axis=alt.Axis(grid=True)),
+                x2=alt.datum(1),  # Bars start at x=1 (left edge of domain)
+                y=alt.Y('Voice:N', title='Voice', sort='-x', axis=alt.Axis(grid=False)),
+                tooltip=[
+                    alt.Tooltip('Voice:N'),
+                    alt.Tooltip('mean_score:Q', title='Average', format='.2f'),
+                    alt.Tooltip('count:Q', title='Count')
+                ]
+            )

-        # Count text at end of bars (right-aligned inside bar)
-        text = alt.Chart(stats).mark_text(
-            align='right',
-            baseline='middle',
-            color='white',
-            fontSize=12,
-            dx=-5  # Slight padding from bar end
-        ).encode(
-            x='mean_score:Q',
-            y=alt.Y('Voice:N', sort='-x'),
-            text='count:Q'
-        )
+            # Count text at end of bars
+            text = alt.Chart(stats).mark_text(
+                align='left',
+                baseline='middle',
+                color='black',
+                fontSize=12,
+                dx=5
+            ).encode(
+                x='mean_score:Q',
+                y=alt.Y('Voice:N', sort='-x'),
+                text='count:Q'
+            )

        # Combine layers
        chart = (bars + text).properties(
            title={
                "text": self._process_title(title),
-                "subtitle": [trait_description, "(Numbers on bars indicate respondent count)"]
+                "subtitle": [trait_description, "(Numbers near bars indicate respondent count)"]
            },
            width=width or 800,
            height=height or getattr(self, 'plot_height', 400)
@@ -1194,6 +1230,101 @@ class QualtricsPlotsMixin:
        chart = self._save_plot(chart, title)
        return chart

+    def plot_speaking_style_trait_scores_comparison(
+        self,
+        data_all: pl.LazyFrame | pl.DataFrame,
+        data_clean: pl.LazyFrame | pl.DataFrame,
+        trait_description: str | None = None,
+        title: str = "Speaking Style Trait Analysis (Comparison)",
+        height: int | None = None,
+        width: int | str | None = None,
+    ) -> alt.Chart:
+        """Plot mean trait scores per voice for all vs. cleaned respondents.
+
+        Draws grouped horizontal bars, one pair per voice: one bar for
+        ``data_all`` ("All Respondents") and one for ``data_clean``
+        ("Excl. Straight-Liners"). Voices are ordered by their mean score
+        in ``data_all``, highest first.
+
+        Parameters:
+            data_all: Frame with ``Voice`` and ``score`` columns covering
+                every respondent.
+            data_clean: Frame with the same columns, straight-liners
+                removed.
+            trait_description: Optional first subtitle line; an empty
+                string is used when omitted.
+            title: Chart title.
+            height: Chart height in pixels (falls back to
+                ``self.plot_height``, then 600).
+            width: Chart width in pixels (defaults to 800).
+
+        Returns:
+            The Altair chart. When either input frame is empty, a bare
+            "No data" text chart is returned instead and ``_save_plot``
+            is not called.
+        """
+
+        def get_stats(d, group_label):
+            """Aggregate mean score and count per voice, tagged with ``group_label``."""
+            df = self._ensure_dataframe(d)
+            if df.is_empty():
+                return None
+
+            return (
+                df.filter(pl.col("score").is_not_null())
+                .group_by("Voice")
+                .agg([
+                    pl.col("score").mean().alias("mean_score"),
+                    pl.col("score").count().alias("count"),
+                ])
+                .with_columns(pl.lit(group_label).alias("dataset"))
+                .to_pandas()
+            )
+
+        stats_all = get_stats(data_all, "All Respondents")
+        stats_clean = get_stats(data_clean, "Excl. Straight-Liners")
+
+        if stats_all is None or stats_clean is None:
+            return alt.Chart(pd.DataFrame({'text': ['No data']})).mark_text().encode(text='text:N')
+
+        # Stack both datasets; a fresh index avoids duplicated row labels.
+        stats = pd.concat([stats_all, stats_clean], ignore_index=True)
+
+        # Sort voices by the "All Respondents" mean (descending) so both
+        # bars of a pair share one stable position.
+        sort_order = stats_all.sort_values('mean_score', ascending=False)['Voice'].tolist()
+
+        # Gender plus dataset form the colour category.
+        # NOTE(review): the colour domain below assumes _get_voice_gender
+        # returns exactly 'Male' or 'Female' - confirm against the helper.
+        stats['gender'] = stats['Voice'].apply(self._get_voice_gender)
+        # Built with a comprehension rather than a row-wise apply so an
+        # empty stats frame does not break the column assignment.
+        stats['color_group'] = [
+            f"{gender} - {dataset}"
+            for gender, dataset in zip(stats['gender'], stats['dataset'])
+        ]
+
+        # Colour scale: one entry per gender/dataset combination.
+        domain = [
+            'Male - All Respondents', 'Male - Excl. Straight-Liners',
+            'Female - All Respondents', 'Female - Excl. Straight-Liners',
+        ]
+        range_colors = [
+            ColorPalette.GENDER_MALE, ColorPalette.GENDER_MALE_RANK_3,
+            ColorPalette.GENDER_FEMALE, ColorPalette.GENDER_FEMALE_RANK_3,
+        ]
+        dataset_order = ['All Respondents', 'Excl. Straight-Liners']
+
+        # Shared y encoding for the bar and text layers.
+        base = alt.Chart(stats).encode(
+            y=alt.Y('Voice:N', title='Voice', sort=sort_order, axis=alt.Axis(grid=False)),
+        )
+
+        bars = base.mark_bar().encode(
+            x=alt.X('mean_score:Q', title='Average Score (1-5)', scale=alt.Scale(domain=[1, 5]), axis=alt.Axis(grid=True)),
+            x2=alt.datum(1),  # Bars start at x=1 (left edge of domain)
+            yOffset=alt.YOffset('dataset:N', sort=dataset_order),
+            color=alt.Color('color_group:N',
+                            scale=alt.Scale(domain=domain, range=range_colors),
+                            legend=alt.Legend(title='Dataset', orient='top', columns=2)),
+            tooltip=[
+                alt.Tooltip('Voice:N'),
+                alt.Tooltip('dataset:N', title='Dataset'),
+                alt.Tooltip('mean_score:Q', title='Average', format='.2f'),
+                alt.Tooltip('count:Q', title='Count'),
+                alt.Tooltip('gender:N', title='Gender'),
+            ]
+        )
+
+        # Respondent count just past the end of each bar, in the bar's colour.
+        text = base.mark_text(align='left', baseline='middle', dx=5, fontSize=9).encode(
+            x=alt.X('mean_score:Q'),
+            yOffset=alt.YOffset('dataset:N', sort=dataset_order),
+            text=alt.Text('count:Q'),
+            color=alt.Color('color_group:N', scale=alt.Scale(domain=domain, range=range_colors), legend=None)
+        )
+
+        chart = (bars + text).properties(
+            title={
+                "text": self._process_title(title),
+                "subtitle": [trait_description if trait_description else "", "(Lighter shade = Straight-liners removed)"]
+            },
+            width=width or 800,
+            height=height or getattr(self, 'plot_height', 600)
+        )
+
+        chart = self._save_plot(chart, title)
+        return chart
+
    def plot_speaking_style_scale_correlation(
        self,
        style_color: str,
@@ -2495,5 +2626,214 @@ class QualtricsPlotsMixin:
            height=height or getattr(self, 'plot_height', 400),
        )
        
+        chart = self._save_plot(chart, title)
+        return chart
+
+    def plot_straight_liner_repeat_offenders(
+        self,
+        cumulative_df: pl.DataFrame | pd.DataFrame,
+        title: str = "Straight-Liner Repeat Offenders\n(Cumulative Distribution)",
+        height: int | None = None,
+        width: int | str | None = None,
+        total_respondents: int | None = None,
+    ) -> alt.Chart:
+        """Plot the cumulative distribution of straight-liner repeat offenders.
+
+        Shows how many respondents straight-lined at N or more question
+        groups, for every observed threshold.
+
+        Parameters:
+            cumulative_df: DataFrame with columns ``threshold`` (int),
+                ``count`` (int) and ``pct`` (float, 0-100).  Each row
+                represents "≥ threshold question groups".  The input is
+                not modified; a pandas copy is used for plotting.
+            title: Chart title.
+            height: Chart height in pixels (falls back to
+                ``self.plot_height``, then 400).
+            width: Chart width in pixels (defaults to 800).
+            total_respondents: If provided, shown in the subtitle for
+                context.
+
+        Returns:
+            The chart object returned by ``_save_plot``.
+        """
+        if isinstance(cumulative_df, pl.DataFrame):
+            plot_df = cumulative_df.to_pandas()
+        else:
+            plot_df = cumulative_df.copy()
+
+        # Build readable x-axis labels ("≥1", "≥2", …)
+        plot_df["label"] = [f"≥{t}" for t in plot_df["threshold"]]
+
+        # Combined bar label "N  (xx.x%)".  Built before any chart layer so
+        # every layer sees the column, and with a comprehension rather
+        # than a row-wise apply so an empty frame is handled as well.
+        plot_df["count_label"] = [
+            f"{int(count)}  ({pct:.1f}%)"
+            for count, pct in zip(plot_df["count"], plot_df["pct"])
+        ]
+
+        # Explicit sort order so Altair keeps ascending threshold
+        sort_order = plot_df.sort_values("threshold")["label"].tolist()
+
+        # --- Bars: respondent count ---
+        bars = alt.Chart(plot_df).mark_bar(
+            color=ColorPalette.PRIMARY
+        ).encode(
+            x=alt.X(
+                "label:N",
+                title="Number of Straight-Lined Question Groups",
+                sort=sort_order,
+                axis=alt.Axis(grid=False),
+            ),
+            y=alt.Y(
+                "count:Q",
+                title="Number of Respondents",
+                axis=alt.Axis(grid=True),
+            ),
+            tooltip=[
+                alt.Tooltip("label:N", title="Threshold"),
+                alt.Tooltip("count:Q", title="Respondents"),
+                alt.Tooltip("pct:Q", title="% of Total", format=".1f"),
+            ],
+        )
+
+        # --- Text: count + percentage above each bar ---
+        text = alt.Chart(plot_df).mark_text(
+            dy=-10, color="black", fontSize=11
+        ).encode(
+            x=alt.X("label:N", sort=sort_order),
+            y=alt.Y("count:Q"),
+            text=alt.Text("count_label:N"),
+        )
+
+        # --- Subtitle ---
+        subtitle_parts = []
+        if total_respondents is not None:
+            subtitle_parts.append(
+                f"Total respondents: {total_respondents}"
+            )
+        subtitle_parts.append(
+            "Each bar shows how many respondents straight-lined "
+            "at least that many question groups"
+        )
+        subtitle = " | ".join(subtitle_parts)
+
+        title_config = {
+            "text": self._process_title(title),
+            "subtitle": subtitle,
+            "subtitleColor": "gray",
+            "subtitleFontSize": 10,
+            "anchor": "start",
+        }
+
+        chart = alt.layer(bars, text).properties(
+            title=title_config,
+            width=width or 800,
+            height=height or getattr(self, "plot_height", 400),
+        )
+
+        chart = self._save_plot(chart, title)
+        return chart
+
+    def plot_straight_liner_per_question(
+        self,
+        per_question_df: pl.DataFrame | pd.DataFrame,
+        title: str = "Straight-Lining Frequency per Question Group",
+        height: int | None = None,
+        width: int | str | None = None,
+        total_respondents: int | None = None,
+    ) -> alt.Chart:
+        """Plot how often each question group is straight-lined.
+
+        Draws one horizontal bar per question group, largest count at the
+        top, with a "N  (xx.x%)" label to the right of each bar.
+
+        Parameters:
+            per_question_df: DataFrame with columns ``question`` (str,
+                human-readable name), ``count`` (int) and ``pct``
+                (float, 0-100).  Rows need not be pre-sorted; the bar
+                order is derived here from ``count``.  The input is not
+                modified; a pandas copy is used for plotting.
+            title: Chart title.
+            height: Chart height in pixels.  When omitted, the height
+                scales with the number of rows (22 px each, minimum 400).
+            width: Chart width in pixels (defaults to 700).
+            total_respondents: Shown in subtitle for context.
+
+        Returns:
+            The chart object returned by ``_save_plot``.
+        """
+        if isinstance(per_question_df, pl.DataFrame):
+            plot_df = per_question_df.to_pandas()
+        else:
+            plot_df = per_question_df.copy()
+
+        # Sort order: largest count at top. Altair y-axis nominal sort places
+        # the first list element at the top, so descending order is correct.
+        sort_order = plot_df.sort_values("count", ascending=False)["question"].tolist()
+
+        # Combined label "N  (xx.x%)".  A comprehension is used rather than
+        # a row-wise apply so an empty frame is handled as well.
+        plot_df["count_label"] = [
+            f"{int(count)}  ({pct:.1f}%)"
+            for count, pct in zip(plot_df["count"], plot_df["pct"])
+        ]
+
+        # --- Horizontal Bars ---
+        bars = alt.Chart(plot_df).mark_bar(
+            color=ColorPalette.PRIMARY,
+        ).encode(
+            y=alt.Y(
+                "question:N",
+                title=None,
+                sort=sort_order,
+                axis=alt.Axis(grid=False, labelLimit=250, labelAngle=0),
+            ),
+            x=alt.X(
+                "count:Q",
+                title="Number of Straight-Liners",
+                axis=alt.Axis(grid=True),
+            ),
+            tooltip=[
+                alt.Tooltip("question:N", title="Question"),
+                alt.Tooltip("count:Q", title="Straight-Liners"),
+                alt.Tooltip("pct:Q", title="% of Respondents", format=".1f"),
+            ],
+        )
+
+        # --- Text labels to the right of bars ---
+        text = alt.Chart(plot_df).mark_text(
+            align="left", dx=4, color="black", fontSize=10,
+        ).encode(
+            y=alt.Y("question:N", sort=sort_order),
+            x=alt.X("count:Q"),
+            text=alt.Text("count_label:N"),
+        )
+
+        # --- Subtitle ---
+        subtitle_parts = []
+        if total_respondents is not None:
+            subtitle_parts.append(f"Total respondents: {total_respondents}")
+        subtitle_parts.append(
+            "Count and share of respondents who straight-lined each question group"
+        )
+        subtitle = " | ".join(subtitle_parts)
+
+        title_config = {
+            "text": self._process_title(title),
+            "subtitle": subtitle,
+            "subtitleColor": "gray",
+            "subtitleFontSize": 10,
+            "anchor": "start",
+        }
+
+        # Scale height with number of questions for readable bar spacing
+        n_questions = len(plot_df)
+        auto_height = max(400, n_questions * 22)
+
+        chart = alt.layer(bars, text).properties(
+            title=title_config,
+            width=width or 700,
+            height=height or auto_height,
+        )
+
        chart = self._save_plot(chart, title)
        return chart