straight-liner plot analysis

2026-02-09 17:26:45 +01:00
parent 92c6fc03ab
commit 6c16993cb3
4 changed files with 897 additions and 24 deletions
--- a/.vscode/extensions.json
+++ b/.vscode/extensions.json
@@ -0,0 +1,5 @@
+{
+    "recommendations": [
+        "wakatime.vscode-wakatime"
+    ]
+}
--- a/XX_detailed_trait_analysis.py
+++ b/XX_detailed_trait_analysis.py
@@ -0,0 +1,263 @@
+"""Extra analyses of the traits"""
+# %% Imports
+
+import utils
+import polars as pl
+import argparse
+import json
+import re
+from pathlib import Path
+from validation import check_straight_liners
+
+
+# %% Fixed Variables
+RESULTS_FILE = 'data/exports/2-4-26/JPMC_Chase Brand Personality_Quant Round 1_February 4, 2026_Labels.csv'
+QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'
+
+
+# %% CLI argument parsing for batch automation
+# When run as script: uv run XX_detailed_trait_analysis.py --age '["<age option>", ...]'
+# Central filter configuration - add new filters here only
+# Format: 'cli_arg_name': 'QualtricsSurvey.options_* attribute name'
+FILTER_CONFIG = {
+    'age': 'options_age',
+    'gender': 'options_gender',
+    'ethnicity': 'options_ethnicity',
+    'income': 'options_income',
+    'consumer': 'options_consumer',
+    'business_owner': 'options_business_owner',
+    'ai_user': 'options_ai_user',
+    'investable_assets': 'options_investable_assets',
+    'industry': 'options_industry',
+}
+
+def parse_cli_args():
+    """Build the run configuration from CLI flags, or defaults when interactive.
+
+    When ``get_ipython`` is defined (IPython/Jupyter session) nothing is parsed:
+    every filter in FILTER_CONFIG is None, ``filter_name`` is None and
+    ``figures_dir`` is the default directory.  Otherwise the command line is
+    parsed and each supplied filter flag is decoded from its JSON string.
+
+    Returns:
+        argparse.Namespace with one attribute per FILTER_CONFIG key plus
+        ``filter_name`` and ``figures_dir``.
+    """
+    # Computed once so the argparse default and the interactive fallback
+    # cannot drift apart.
+    default_fig_dir = f'figures/traits-likert-analysis/{Path(RESULTS_FILE).parts[2]}'
+
+    try:
+        get_ipython()  # noqa: F821 # type: ignore
+    except NameError:
+        pass  # Plain script run: fall through to argparse
+    else:
+        # Interactive session: return namespace with all filters set to None
+        no_filters = {f: None for f in FILTER_CONFIG}
+        return argparse.Namespace(**no_filters, filter_name=None, figures_dir=default_fig_dir)
+
+    parser = argparse.ArgumentParser(description='Generate quant report with optional filters')
+
+    # Dynamically add filter arguments from config
+    for filter_name in FILTER_CONFIG:
+        parser.add_argument(f'--{filter_name}', type=str, default=None, help=f'JSON list of {filter_name} values')
+
+    parser.add_argument('--filter-name', type=str, default=None, help='Name for this filter combination (used for .txt description file)')
+    parser.add_argument('--figures-dir', type=str, default=default_fig_dir, help='Override the default figures directory')
+
+    args = parser.parse_args()
+    # Parse JSON strings to lists; a missing or empty flag means "no filter"
+    for filter_name in FILTER_CONFIG:
+        val = getattr(args, filter_name)
+        if not val:
+            setattr(args, filter_name, None)
+            continue
+        try:
+            setattr(args, filter_name, json.loads(val))
+        except json.JSONDecodeError as exc:
+            # Report through argparse (usage message, exit code 2) instead of
+            # a raw traceback.
+            parser.error(f'--{filter_name} is not valid JSON: {exc}')
+    return args
+
+cli_args = parse_cli_args()
+
+
+# %%
+S = utils.QualtricsSurvey(RESULTS_FILE, QSF_FILE, figures_dir=cli_args.figures_dir)
+data_all = S.load_data()
+
+
+# %% Build filtered dataset based on CLI args
+
+# CLI args: None means "no filter applied" - filter_data() will skip None filters
+
+# Build filter values dict dynamically from FILTER_CONFIG
+_active_filters = {filter_name: getattr(cli_args, filter_name) for filter_name in FILTER_CONFIG}
+
+_d = S.filter_data(data_all, **_active_filters)
+
+# Write filter description file if filter-name is provided
+if cli_args.filter_name and S.fig_save_dir:
+    # Writes two artefacts: a per-filter description file inside the slug
+    # directory, and one row in the shared filter_index.txt.
+
+    # Get the filter slug (e.g., "All_Respondents", "Cons-Starter", etc.)
+    _filter_slug = S._get_filter_slug()
+    _filter_slug_dir = S.fig_save_dir / _filter_slug
+    _filter_slug_dir.mkdir(parents=True, exist_ok=True)
+
+    # Build filter description
+    _filter_desc_lines = [
+        f"Filter: {cli_args.filter_name}",
+        "",
+        "Applied Filters:",
+    ]
+    _short_desc_parts = []
+    for filter_name, options_attr in FILTER_CONFIG.items():
+        all_options = getattr(S, options_attr)
+        values = _active_filters[filter_name]
+        display_name = filter_name.replace('_', ' ').title()
+        # None means no filter applied (same as "All")
+        if values is not None and values != all_options:
+            _short_desc_parts.append(f"{display_name}: {', '.join(values)}")
+            _filter_desc_lines.append(f"  {display_name}: {', '.join(values)}")
+        else:
+            _filter_desc_lines.append(f"  {display_name}: All")
+
+    # Write detailed description INSIDE the filter-slug directory
+    # Sanitize filter name for filename usage (replace / and other chars)
+    _safe_filter_name = re.sub(r'[^\w\s-]', '_', cli_args.filter_name)
+    _filter_file = _filter_slug_dir / f"{_safe_filter_name}.txt"
+    _filter_file.write_text('\n'.join(_filter_desc_lines))
+
+    # Append to summary index file at figures/<export_date>/filter_index.txt
+    _summary_file = S.fig_save_dir / "filter_index.txt"
+    _short_desc = "; ".join(_short_desc_parts) if _short_desc_parts else "All Respondents"
+    _summary_line = f"{_filter_slug}  |  {cli_args.filter_name}  |  {_short_desc}\n"
+
+    # Append or create the summary file
+    if _summary_file.exists():
+        # Avoid duplicate entries for same slug. Compare against the first
+        # (directory) column only: a substring test on the whole file would
+        # also match longer slugs, filter names or descriptions that merely
+        # contain this slug, and silently skip the new row.
+        _indexed_slugs = {
+            _row.split("  |  ", 1)[0]
+            for _row in _summary_file.read_text().splitlines()
+            if "  |  " in _row
+        }
+        if _filter_slug not in _indexed_slugs:
+            with _summary_file.open('a') as f:
+                f.write(_summary_line)
+    else:
+        _header = "Filter Index\n" + "=" * 80 + "\n\n"
+        _header += "Directory  |  Filter Name  |  Description\n"
+        _header += "-" * 80 + "\n"
+        _summary_file.write_text(_header + _summary_line)
+
+# Save to logical variable name for further analysis
+data = _d
+data.collect()
+
+# %% Voices per trait
+
+
+ss_or, choice_map_or = S.get_ss_orange_red(data)
+ss_gb, choice_map_gb = S.get_ss_green_blue(data)
+
+# Combine the data
+ss_all = ss_or.join(ss_gb, on='_recordId')
+_d = ss_all.collect()
+
+choice_map = {**choice_map_or, **choice_map_gb}
+# print(_d.head())
+# print(choice_map)
+ss_long = utils.process_speaking_style_data(ss_all, choice_map)
+
+
+# %% Create plots
+
+# One chart per distinct trait description; the ":" separator in the trait
+# text is shown as " ↔ " in the chart title.
+for trait in ss_long.select("Description").unique().to_series().to_list():
+    trait_d = ss_long.filter(pl.col("Description") == trait)
+
+    S.plot_speaking_style_trait_scores(trait_d, title=trait.replace(":", " ↔ "), height=550, color_gender=True)
+
+
+
+
+
+# %% Filter out straight-liner (PER TRAIT) and re-plot to see if any changes
+# Save with different filename suffix so we can compare with/without straight-liners
+
+print("\n--- Straight-lining Checks on TRAITS ---")
+sl_report_traits, sl_traits_df = check_straight_liners(ss_all, max_score=5)
+# Bare expression: displayed as the cell output when run interactively
+sl_traits_df
+
+# %%
+
+if sl_traits_df is not None and not sl_traits_df.is_empty():
+    sl_ids = sl_traits_df.select(pl.col("Record ID").unique()).to_series().to_list()
+    n_sl_groups = sl_traits_df.height
+    print(f"\nExcluding {n_sl_groups} straight-lined question blocks from {len(sl_ids)} respondents.")
+
+    # Create key in ss_long to match sl_traits_df for anti-join
+    # Question Group key in sl_traits_df is like "SS_Orange_Red__V14"
+    # ss_long has "Style_Group" and "Voice"
+    ss_long_w_key = ss_long.with_columns(
+        (pl.col("Style_Group") + "__" + pl.col("Voice")).alias("Question Group")
+    )
+
+    # Prepare filter table: Record ID + Question Group
+    sl_filter = sl_traits_df.select([
+        pl.col("Record ID").alias("_recordId"),
+        pl.col("Question Group")
+    ])
+
+    # Anti-join to remove specific question blocks that were straight-lined
+    # (only the matching respondent/question-group rows are dropped, not the
+    # respondent's other answers)
+    _join_keys = ["_recordId", "Question Group"]
+    ss_long_clean = ss_long_w_key.join(sl_filter, on=_join_keys, how="anti").drop("Question Group")
+
+    # Re-plot with suffix in title
+    print("Re-plotting traits (Cleaned)...")
+    for trait in ss_long_clean.select("Description").unique().to_series().to_list():
+        trait_d = ss_long_clean.filter(pl.col("Description") == trait)
+
+        # Modify title to create unique filename (and display title)
+        title_clean = trait.replace(":", " ↔ ") + " (Excl. Straight-Liners)"
+
+        S.plot_speaking_style_trait_scores(trait_d, title=title_clean, height=550, color_gender=True)
+else:
+    print("No straight-liners found on traits.")
+
+
+
+
+# %% Compare All vs Cleaned
+if sl_traits_df is not None and not sl_traits_df.is_empty():
+    print("Generating Comparison Plots (All vs Cleaned)...")
+
+    # Always apply the per-question-group filtering here to ensure consistency
+    # (Matches the logic used in the re-plotting section above)
+    print("Applying filter to remove straight-lined question blocks...")
+    _join_keys = ["_recordId", "Question Group"]
+    ss_long_w_key = ss_long.with_columns(
+        (pl.col("Style_Group") + "__" + pl.col("Voice")).alias("Question Group")
+    )
+    sl_filter = sl_traits_df.select([
+        pl.col("Record ID").alias("_recordId"),
+        pl.col("Question Group")
+    ])
+    ss_long_clean = ss_long_w_key.join(sl_filter, on=_join_keys, how="anti").drop("Question Group")
+
+    sl_ids = sl_traits_df.select(pl.col("Record ID").unique()).to_series().to_list()
+    _rows_removed = ss_long.height - ss_long_clean.height
+
+    # --- Verification Prints ---
+    print("\n--- Verification of Filter ---")
+    print(f"Original Row Count: {ss_long.height}")
+    print(f"Number of Straight-Liner Question Blocks: {sl_traits_df.height}")
+    print(f"Sample IDs affected: {sl_ids[:5]}")
+    print(f"Cleaned Row Count: {ss_long_clean.height}")
+    print(f"Rows Removed: {_rows_removed}")
+
+    # Verify removal: the inner join on the same keys counts the rows that
+    # match a straight-lined block, which should equal the rows the anti-join
+    # dropped. The keyed frame and filter table built above are reused as-is.
+    should_be_removed = ss_long_w_key.join(sl_filter, on=_join_keys, how="inner").height
+    print(f"Discrepancy Check (Should be 0): {_rows_removed - should_be_removed}")
+
+    # Show what was removed (the straight lining behavior)
+    print("\nSample of Straight-Liner Data (Values that caused removal):")
+    print(sl_traits_df.head(5))
+    print("-" * 30 + "\n")
+    # ---------------------------
+
+    for trait in ss_long.select("Description").unique().to_series().to_list():
+
+        # Get data for this trait from both datasets
+        trait_d_all = ss_long.filter(pl.col("Description") == trait)
+        trait_d_clean = ss_long_clean.filter(pl.col("Description") == trait)
+
+        # Plot comparison
+        title_comp = trait.replace(":", " ↔ ") + " (Impact of Straight-Liners)"
+
+        S.plot_speaking_style_trait_scores_comparison(
+            trait_d_all,
+            trait_d_clean,
+            title=title_comp,
+            height=600  # Slightly taller for grouped bars
+        )
+
--- a/XX_straight_liners.py
+++ b/XX_straight_liners.py
@@ -0,0 +1,265 @@
+"""Extra analyses of the straight-liners"""
+# %% Imports
+
+import utils
+import polars as pl
+import argparse
+import json
+import re
+from pathlib import Path
+from validation import check_straight_liners
+
+
+# %% Fixed Variables
+RESULTS_FILE = 'data/exports/2-4-26/JPMC_Chase Brand Personality_Quant Round 1_February 4, 2026_Labels.csv'
+QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'
+
+
+# %% CLI argument parsing for batch automation
+# When run as script: uv run XX_straight_liners.py --age '["<age option>", ...]'
+# Central filter configuration - add new filters here only
+# Format: 'cli_arg_name': 'QualtricsSurvey.options_* attribute name'
+FILTER_CONFIG = {
+    'age': 'options_age',
+    'gender': 'options_gender',
+    'ethnicity': 'options_ethnicity',
+    'income': 'options_income',
+    'consumer': 'options_consumer',
+    'business_owner': 'options_business_owner',
+    'ai_user': 'options_ai_user',
+    'investable_assets': 'options_investable_assets',
+    'industry': 'options_industry',
+}
+
+def parse_cli_args():
+    """Build the run configuration from CLI flags, or defaults when interactive.
+
+    When ``get_ipython`` is defined (IPython/Jupyter session) nothing is parsed:
+    every filter in FILTER_CONFIG is None, ``filter_name`` is None and
+    ``figures_dir`` is the default directory.  Otherwise the command line is
+    parsed and each supplied filter flag is decoded from its JSON string.
+
+    Returns:
+        argparse.Namespace with one attribute per FILTER_CONFIG key plus
+        ``filter_name`` and ``figures_dir``.
+    """
+    # Computed once so the argparse default and the interactive fallback
+    # cannot drift apart.
+    default_fig_dir = f'figures/straight-liner-analysis/{Path(RESULTS_FILE).parts[2]}'
+
+    try:
+        get_ipython()  # noqa: F821 # type: ignore
+    except NameError:
+        pass  # Plain script run: fall through to argparse
+    else:
+        # Interactive session: return namespace with all filters set to None
+        no_filters = {f: None for f in FILTER_CONFIG}
+        return argparse.Namespace(**no_filters, filter_name=None, figures_dir=default_fig_dir)
+
+    parser = argparse.ArgumentParser(description='Generate quant report with optional filters')
+
+    # Dynamically add filter arguments from config
+    for filter_name in FILTER_CONFIG:
+        parser.add_argument(f'--{filter_name}', type=str, default=None, help=f'JSON list of {filter_name} values')
+
+    parser.add_argument('--filter-name', type=str, default=None, help='Name for this filter combination (used for .txt description file)')
+    parser.add_argument('--figures-dir', type=str, default=default_fig_dir, help='Override the default figures directory')
+
+    args = parser.parse_args()
+    # Parse JSON strings to lists; a missing or empty flag means "no filter"
+    for filter_name in FILTER_CONFIG:
+        val = getattr(args, filter_name)
+        if not val:
+            setattr(args, filter_name, None)
+            continue
+        try:
+            setattr(args, filter_name, json.loads(val))
+        except json.JSONDecodeError as exc:
+            # Report through argparse (usage message, exit code 2) instead of
+            # a raw traceback.
+            parser.error(f'--{filter_name} is not valid JSON: {exc}')
+    return args
+
+cli_args = parse_cli_args()
+
+
+# %%
+S = utils.QualtricsSurvey(RESULTS_FILE, QSF_FILE, figures_dir=cli_args.figures_dir)
+data_all = S.load_data()
+
+
+# %% Build filtered dataset based on CLI args
+
+# CLI args: None means "no filter applied" - filter_data() will skip None filters
+
+# Build filter values dict dynamically from FILTER_CONFIG
+_active_filters = {filter_name: getattr(cli_args, filter_name) for filter_name in FILTER_CONFIG}
+
+_d = S.filter_data(data_all, **_active_filters)
+
+# Write filter description file if filter-name is provided
+if cli_args.filter_name and S.fig_save_dir:
+    # Writes two artefacts: a per-filter description file inside the slug
+    # directory, and one row in the shared filter_index.txt.
+
+    # Get the filter slug (e.g., "All_Respondents", "Cons-Starter", etc.)
+    _filter_slug = S._get_filter_slug()
+    _filter_slug_dir = S.fig_save_dir / _filter_slug
+    _filter_slug_dir.mkdir(parents=True, exist_ok=True)
+
+    # Build filter description
+    _filter_desc_lines = [
+        f"Filter: {cli_args.filter_name}",
+        "",
+        "Applied Filters:",
+    ]
+    _short_desc_parts = []
+    for filter_name, options_attr in FILTER_CONFIG.items():
+        all_options = getattr(S, options_attr)
+        values = _active_filters[filter_name]
+        display_name = filter_name.replace('_', ' ').title()
+        # None means no filter applied (same as "All")
+        if values is not None and values != all_options:
+            _short_desc_parts.append(f"{display_name}: {', '.join(values)}")
+            _filter_desc_lines.append(f"  {display_name}: {', '.join(values)}")
+        else:
+            _filter_desc_lines.append(f"  {display_name}: All")
+
+    # Write detailed description INSIDE the filter-slug directory
+    # Sanitize filter name for filename usage (replace / and other chars)
+    _safe_filter_name = re.sub(r'[^\w\s-]', '_', cli_args.filter_name)
+    _filter_file = _filter_slug_dir / f"{_safe_filter_name}.txt"
+    _filter_file.write_text('\n'.join(_filter_desc_lines))
+
+    # Append to summary index file at figures/<export_date>/filter_index.txt
+    _summary_file = S.fig_save_dir / "filter_index.txt"
+    _short_desc = "; ".join(_short_desc_parts) if _short_desc_parts else "All Respondents"
+    _summary_line = f"{_filter_slug}  |  {cli_args.filter_name}  |  {_short_desc}\n"
+
+    # Append or create the summary file
+    if _summary_file.exists():
+        # Avoid duplicate entries for same slug. Compare against the first
+        # (directory) column only: a substring test on the whole file would
+        # also match longer slugs, filter names or descriptions that merely
+        # contain this slug, and silently skip the new row.
+        _indexed_slugs = {
+            _row.split("  |  ", 1)[0]
+            for _row in _summary_file.read_text().splitlines()
+            if "  |  " in _row
+        }
+        if _filter_slug not in _indexed_slugs:
+            with _summary_file.open('a') as f:
+                f.write(_summary_line)
+    else:
+        _header = "Filter Index\n" + "=" * 80 + "\n\n"
+        _header += "Directory  |  Filter Name  |  Description\n"
+        _header += "-" * 80 + "\n"
+        _summary_file.write_text(_header + _summary_line)
+
+# Save to logical variable name for further analysis
+data = _d
+data.collect()
+
+
+# %% Determine straight-liner repeat offenders
+# Extract question groups with renamed columns that check_straight_liners expects.
+# The raw `data` has QID-based column names; the getter methods rename them to
+# patterns like SS_Green_Blue__V14__Choice_1, Voice_Scale_1_10__V48, etc.
+
+ss_or, _ = S.get_ss_orange_red(data)
+ss_gb, _ = S.get_ss_green_blue(data)
+vs, _ = S.get_voice_scale_1_10(data)
+
+# Combine all question groups into one wide LazyFrame (joined on _recordId)
+all_questions = ss_or.join(ss_gb, on='_recordId').join(vs, on='_recordId')
+
+# Run straight-liner detection across all question groups
+# max_score=5 catches all speaking-style straight-lining (1-5 scale)
+# and voice-scale values ≤5 on the 1-10 scale
+print("Running straight-liner detection across all question groups...")
+sl_report, sl_df = check_straight_liners(all_questions, max_score=5)
+
+# %% Quantify repeat offenders
+# sl_df has one row per (Record ID, Question Group) that was straight-lined.
+# Group by Record ID to count how many question groups each person SL'd.
+
+if sl_df is not None and not sl_df.is_empty():
+    total_respondents = data.select(pl.len()).collect().item()
+
+    # Per-respondent count of straight-lined question groups
+    respondent_sl_counts = (
+        sl_df
+        .group_by("Record ID")
+        .agg(pl.len().alias("sl_count"))
+        .sort("sl_count", descending=True)
+    )
+
+    max_sl = respondent_sl_counts["sl_count"].max()
+    print(f"\nTotal respondents: {total_respondents}")
+    print(f"Respondents who straight-lined at least 1 question group: "
+          f"{respondent_sl_counts.height}")
+    print(f"Maximum question groups straight-lined by one person: {max_sl}")
+    print()
+
+    # Build cumulative distribution: for each threshold N, count respondents
+    # who straight-lined >= N question groups
+    cumulative_rows = []
+    for threshold in range(1, max_sl + 1):
+        count = respondent_sl_counts.filter(
+            pl.col("sl_count") >= threshold
+        ).height
+        pct = (count / total_respondents) * 100
+        cumulative_rows.append({
+            "threshold": threshold,
+            "count": count,
+            "pct": pct,
+        })
+        print(
+            f"  ≥{threshold} question groups straight-lined: "
+            f"{count} respondents ({pct:.1f}%)"
+        )
+
+    cumulative_df = pl.DataFrame(cumulative_rows)
+    print(f"\n{cumulative_df}")
+
+    # %% Save cumulative data to CSV
+    _filter_slug = S._get_filter_slug()
+    _csv_dir = Path(S.fig_save_dir) / _filter_slug
+    _csv_dir.mkdir(parents=True, exist_ok=True)
+
+    _csv_path = _csv_dir / "straight_liner_repeat_offenders.csv"
+    cumulative_df.write_csv(_csv_path)
+    print(f"Saved cumulative data to {_csv_path}")
+
+    # %% Plot the cumulative distribution
+    S.plot_straight_liner_repeat_offenders(
+        cumulative_df,
+        total_respondents=total_respondents,
+    )
+
+    # %% Per-question straight-lining frequency
+    # Build human-readable question group names from the raw keys
+    def _humanise_question_group(key: str) -> str:
+        """Turn an internal question-group key into a display label.
+
+        Examples:
+            SS_Green_Blue__V14  → Green/Blue – V14
+            SS_Orange_Red__V48  → Orange/Red – V48
+            Voice_Scale_1_10    → Voice Scale (1-10)
+
+        Any other key is returned with its underscores replaced by spaces.
+        """
+        if key == "Voice_Scale_1_10":
+            return "Voice Scale (1-10)"
+
+        # Speaking-style keys: "<prefix><voice>", where the prefix ends in "__"
+        style_prefixes = (
+            ("SS_Green_Blue__", "Green/Blue"),
+            ("SS_Orange_Red__", "Orange/Red"),
+        )
+        for prefix, colour_pair in style_prefixes:
+            if key.startswith(prefix):
+                voice = key.split("__")[1]
+                return f"{colour_pair} – {voice}"
+
+        # Fallback: replace underscores
+        return key.replace("_", " ")
+
+    per_question_counts = (
+        sl_df
+        .group_by("Question Group")
+        .agg(pl.col("Record ID").n_unique().alias("count"))
+        .sort("count", descending=True)
+        .with_columns(
+            (pl.col("count") / total_respondents * 100).alias("pct")
+        )
+    )
+
+    # Add human-readable names
+    per_question_counts = per_question_counts.with_columns(
+        pl.col("Question Group").map_elements(
+            _humanise_question_group, return_dtype=pl.Utf8
+        ).alias("question")
+    )
+
+    print("\n--- Per-Question Straight-Lining Frequency ---")
+    print(per_question_counts)
+
+    # Save per-question data to CSV
+    _csv_path_pq = _csv_dir / "straight_liner_per_question.csv"
+    per_question_counts.write_csv(_csv_path_pq)
+    print(f"Saved per-question data to {_csv_path_pq}")
+
+    # Plot
+    S.plot_straight_liner_per_question(
+        per_question_counts,
+        total_respondents=total_respondents,
+    )
+
+    # %% Show the top repeat offenders (respondents with most SL'd groups)
+    print("\n--- Top Repeat Offenders ---")
+    print(respondent_sl_counts.head(20))
+
+else:
+    print("No straight-liners detected in the dataset.")
--- a/plots.py
+++ b/plots.py
@@ -1115,6 +1115,7 @@ class QualtricsPlotsMixin:
        title: str = "Speaking Style Trait Analysis",
        height: int | None = None,
        width: int | str | None = None,
+        color_gender: bool = False,
    ) -> alt.Chart:
        """Plot scores for a single speaking style trait across multiple voices."""
        df = self._ensure_dataframe(data)
@@ -1156,6 +1157,41 @@ class QualtricsPlotsMixin:
            else:
                trait_description = ""

+        if color_gender:
+            stats['gender'] = stats['Voice'].apply(self._get_voice_gender)
+            
+            bars = alt.Chart(stats).mark_bar().encode(
+                x=alt.X('mean_score:Q', title='Average Score (1-5)', scale=alt.Scale(domain=[1, 5]), axis=alt.Axis(grid=True)),
+                x2=alt.datum(1),  # Bars start at x=1 (left edge of domain)
+                y=alt.Y('Voice:N', title='Voice', sort='-x', axis=alt.Axis(grid=False)),
+                color=alt.Color('gender:N',
+                               scale=alt.Scale(domain=['Male', 'Female'],
+                                             range=[ColorPalette.GENDER_MALE, ColorPalette.GENDER_FEMALE]),
+                               legend=alt.Legend(orient='top', direction='horizontal', title='Gender')),
+                tooltip=[
+                    alt.Tooltip('Voice:N'),
+                    alt.Tooltip('mean_score:Q', title='Average', format='.2f'),
+                    alt.Tooltip('count:Q', title='Count'),
+                    alt.Tooltip('gender:N', title='Gender')
+                ]
+            )
+            
+            text = alt.Chart(stats).mark_text(
+                align='left',
+                baseline='middle',
+                dx=5,
+                fontSize=12
+            ).encode(
+                x='mean_score:Q',
+                y=alt.Y('Voice:N', sort='-x'),
+                text='count:Q',
+                color=alt.condition(
+                    alt.datum.gender == 'Female',
+                    alt.value(ColorPalette.GENDER_FEMALE),
+                    alt.value(ColorPalette.GENDER_MALE)
+                )
+            )
+        else:
            # Horizontal bar chart - use x2 to explicitly start bars at x=1
            bars = alt.Chart(stats).mark_bar(color=ColorPalette.PRIMARY).encode(
                x=alt.X('mean_score:Q', title='Average Score (1-5)', scale=alt.Scale(domain=[1, 5]), axis=alt.Axis(grid=True)),
@@ -1168,13 +1204,13 @@ class QualtricsPlotsMixin:
                ]
            )

-        # Count text at end of bars (right-aligned inside bar)
+            # Count text at end of bars
            text = alt.Chart(stats).mark_text(
-            align='right',
+                align='left',
                baseline='middle',
-            color='white',
+                color='black',
                fontSize=12,
-            dx=-5  # Slight padding from bar end
+                dx=5
            ).encode(
                x='mean_score:Q',
                y=alt.Y('Voice:N', sort='-x'),
@@ -1185,7 +1221,7 @@ class QualtricsPlotsMixin:
        chart = (bars + text).properties(
            title={
                "text": self._process_title(title),
-                "subtitle": [trait_description, "(Numbers on bars indicate respondent count)"]
+                "subtitle": [trait_description, "(Numbers near bars indicate respondent count)"]
            },
            width=width or 800,
            height=height or getattr(self, 'plot_height', 400)
@@ -1194,6 +1230,101 @@ class QualtricsPlotsMixin:
        chart = self._save_plot(chart, title)
        return chart

+    def plot_speaking_style_trait_scores_comparison(
+        self,
+        data_all: pl.LazyFrame | pl.DataFrame,
+        data_clean: pl.LazyFrame | pl.DataFrame,
+        trait_description: str | None = None,
+        title: str = "Speaking Style Trait Analysis (Comparison)",
+        height: int | None = None,
+        width: int | str | None = None,
+    ) -> alt.Chart:
+        """Plot per-voice mean scores for two datasets as grouped horizontal bars.
+
+        For each input, rows with a null ``score`` are dropped, then the mean
+        score and the score count are computed per ``Voice``.  Each voice gets
+        one bar per dataset ("All Respondents" vs "Excl. Straight-Liners"),
+        coloured by voice gender and dataset.  Voices are ordered by the
+        "All Respondents" mean, descending.
+
+        Parameters:
+            data_all: Frame with ``Voice`` and ``score`` columns for all
+                respondents.
+            data_clean: Same columns, with straight-lined answers removed.
+            trait_description: Optional first subtitle line; an empty line is
+                used when omitted.
+            title: Chart title, also passed to ``_save_plot``.
+            height: Chart height; falls back to ``self.plot_height``, then 600.
+            width: Chart width; defaults to 800.
+
+        Returns:
+            The chart returned by ``_save_plot``.  If either input frame is
+            empty, a placeholder "No data" text chart is returned instead and
+            ``_save_plot`` is not called.
+        """
+        # Dataset labels are reused for the bar offset order and the colour domain
+        label_all = "All Respondents"
+        label_clean = "Excl. Straight-Liners"
+        dataset_order = [label_all, label_clean]
+
+        # Helper to process each dataframe
+        def get_stats(d, group_label):
+            df = self._ensure_dataframe(d)
+            if df.is_empty():
+                return None
+
+            return (
+                df.filter(pl.col("score").is_not_null())
+                .group_by("Voice")
+                .agg([
+                    pl.col("score").mean().alias("mean_score"),
+                    pl.col("score").count().alias("count")
+                ])
+                .with_columns(pl.lit(group_label).alias("dataset"))
+                .to_pandas()
+            )
+
+        stats_all = get_stats(data_all, label_all)
+        stats_clean = get_stats(data_clean, label_clean)
+
+        if stats_all is None or stats_clean is None:
+            return alt.Chart(pd.DataFrame({'text': ['No data']})).mark_text().encode(text='text:N')
+
+        # Combine; ignore_index avoids duplicate row labels from the two frames
+        stats = pd.concat([stats_all, stats_clean], ignore_index=True)
+
+        # Determine sort order using "All Respondents" data (Desc)
+        sort_order = stats_all.sort_values('mean_score', ascending=False)['Voice'].tolist()
+
+        # Add gender and combined category for color
+        stats['gender'] = stats['Voice'].apply(self._get_voice_gender)
+        stats['color_group'] = stats.apply(
+            lambda x: f"{x['gender']} - {x['dataset']}", axis=1
+        )
+
+        # Define Color Scale (one entry per gender/dataset combination)
+        domain = [
+            'Male - All Respondents', 'Male - Excl. Straight-Liners',
+            'Female - All Respondents', 'Female - Excl. Straight-Liners'
+        ]
+        range_colors = [
+            ColorPalette.GENDER_MALE, ColorPalette.GENDER_MALE_RANK_3,
+            ColorPalette.GENDER_FEMALE, ColorPalette.GENDER_FEMALE_RANK_3
+        ]
+        color_scale = alt.Scale(domain=domain, range=range_colors)
+
+        # Base chart
+        base = alt.Chart(stats).encode(
+            y=alt.Y('Voice:N', title='Voice', sort=sort_order, axis=alt.Axis(grid=False)),
+        )
+
+        bars = base.mark_bar().encode(
+            x=alt.X('mean_score:Q', title='Average Score (1-5)', scale=alt.Scale(domain=[1, 5]), axis=alt.Axis(grid=True)),
+            x2=alt.datum(1),  # Bars start at x=1 (left edge of domain)
+            yOffset=alt.YOffset('dataset:N', sort=dataset_order),
+            color=alt.Color('color_group:N',
+                           scale=color_scale,
+                           legend=alt.Legend(title='Dataset', orient='top', columns=2)),
+            tooltip=[
+                alt.Tooltip('Voice:N'),
+                alt.Tooltip('dataset:N', title='Dataset'),
+                alt.Tooltip('mean_score:Q', title='Average', format='.2f'),
+                alt.Tooltip('count:Q', title='Count'),
+                alt.Tooltip('gender:N', title='Gender')
+            ]
+        )
+
+        # Count labels just past the end of each bar, in the bar's own colour
+        text = base.mark_text(align='left', baseline='middle', dx=5, fontSize=9).encode(
+            x=alt.X('mean_score:Q'),
+            yOffset=alt.YOffset('dataset:N', sort=dataset_order),
+            text=alt.Text('count:Q'),
+            color=alt.Color('color_group:N', scale=color_scale, legend=None)
+        )
+
+        chart = (bars + text).properties(
+            title={
+                "text": self._process_title(title),
+                "subtitle": [trait_description if trait_description else "", "(Lighter shade = Straight-liners removed)"]
+            },
+            width=width or 800,
+            height=height or getattr(self, 'plot_height', 600)
+        )
+
+        chart = self._save_plot(chart, title)
+        return chart
+
+
    def plot_speaking_style_scale_correlation(
        self,
        style_color: str,
@@ -2497,3 +2628,212 @@ class QualtricsPlotsMixin:
        
        chart = self._save_plot(chart, title)
        return chart
+
+    def plot_straight_liner_repeat_offenders(
+        self,
+        cumulative_df: pl.DataFrame | pd.DataFrame,
+        title: str = "Straight-Liner Repeat Offenders\n(Cumulative Distribution)",
+        height: int | None = None,
+        width: int | str | None = None,
+        total_respondents: int | None = None,
+    ) -> alt.Chart:
+        """Plot the cumulative distribution of straight-liner repeat offenders.
+
+        Shows how many respondents straight-lined at N or more question
+        groups, for every observed threshold.
+
+        Parameters:
+            cumulative_df: DataFrame with columns ``threshold`` (int),
+                ``count`` (int) and ``pct`` (float, 0-100).  Each row
+                represents "≥ threshold question groups".  The caller's
+                frame is not modified; plotting works on a pandas copy.
+            title: Chart title; also passed unprocessed to ``_save_plot``.
+            height: Chart height in pixels.  Falls back to
+                ``self.plot_height`` if that attribute exists, else 400.
+            width: Chart width in pixels.  Defaults to 800.
+            total_respondents: If provided, shown in the subtitle for
+                context.
+
+        Returns:
+            The layered bar + text chart, as returned by
+            ``_save_plot``.
+        """
+        if isinstance(cumulative_df, pl.DataFrame):
+            plot_df = cumulative_df.to_pandas()
+        else:
+            plot_df = cumulative_df.copy()
+
+        # Build readable x-axis labels ("≥1", "≥2", …)
+        plot_df["label"] = plot_df["threshold"].apply(lambda t: f"≥{t}")
+
+        # Explicit sort order so Altair keeps ascending threshold
+        sort_order = plot_df.sort_values("threshold")["label"].tolist()
+
+        # Combined "N  (xx.x%)" label.  It must exist before the text layer
+        # is built, so it is computed up front and the layer is built once.
+        # A comprehension replaces the row-wise ``DataFrame.apply``, which is
+        # slow and does not yield a column-shaped result for an empty frame.
+        plot_df["count_label"] = [
+            f"{int(n)}  ({share:.1f}%)"
+            for n, share in zip(plot_df["count"], plot_df["pct"])
+        ]
+
+        # --- Bars: respondent count ---
+        bars = alt.Chart(plot_df).mark_bar(
+            color=ColorPalette.PRIMARY
+        ).encode(
+            x=alt.X(
+                "label:N",
+                title="Number of Straight-Lined Question Groups",
+                sort=sort_order,
+                axis=alt.Axis(grid=False),
+            ),
+            y=alt.Y(
+                "count:Q",
+                title="Number of Respondents",
+                axis=alt.Axis(grid=True),
+            ),
+            tooltip=[
+                alt.Tooltip("label:N", title="Threshold"),
+                alt.Tooltip("count:Q", title="Respondents"),
+                alt.Tooltip("pct:Q", title="% of Total", format=".1f"),
+            ],
+        )
+
+        # --- Text: count + percentage above each bar ---
+        # Same x/y fields and sort as the bars so each label sits on its bar.
+        text = alt.Chart(plot_df).mark_text(
+            dy=-10, color="black", fontSize=11
+        ).encode(
+            x=alt.X("label:N", sort=sort_order),
+            y=alt.Y("count:Q"),
+            text=alt.Text("count_label:N"),
+        )
+
+        # --- Subtitle ---
+        # Optional respondent total first, then the fixed explanation.
+        subtitle_parts = []
+        if total_respondents is not None:
+            subtitle_parts.append(
+                f"Total respondents: {total_respondents}"
+            )
+        subtitle_parts.append(
+            "Each bar shows how many respondents straight-lined "
+            "at least that many question groups"
+        )
+        subtitle = " | ".join(subtitle_parts)
+
+        title_config = {
+            "text": self._process_title(title),
+            "subtitle": subtitle,
+            "subtitleColor": "gray",
+            "subtitleFontSize": 10,
+            "anchor": "start",
+        }
+
+        chart = alt.layer(bars, text).properties(
+            title=title_config,
+            width=width or 800,
+            height=height or getattr(self, "plot_height", 400),
+        )
+
+        # _save_plot gets the raw title; its return value is the result.
+        chart = self._save_plot(chart, title)
+        return chart
+
+    def plot_straight_liner_per_question(
+        self,
+        per_question_df: pl.DataFrame | pd.DataFrame,
+        title: str = "Straight-Lining Frequency per Question Group",
+        height: int | None = None,
+        width: int | str | None = None,
+        total_respondents: int | None = None,
+    ) -> alt.Chart:
+        """Plot how often each question group is straight-lined.
+
+        Parameters:
+            per_question_df: DataFrame with columns ``question`` (str,
+                human-readable name), ``count`` (int) and ``pct``
+                (float, 0-100).  Row order is irrelevant: bars are sorted
+                here by descending ``count``.  The caller's frame is not
+                modified; plotting works on a pandas copy.
+            title: Chart title; also passed unprocessed to ``_save_plot``.
+            height: Chart height in pixels.  Defaults to 22 px per
+                question, with a minimum of 400.
+            width: Chart width in pixels.  Defaults to 700.
+            total_respondents: Shown in subtitle for context.
+
+        Returns:
+            The layered bar + text chart, as returned by ``_save_plot``.
+        """
+        if isinstance(per_question_df, pl.DataFrame):
+            plot_df = per_question_df.to_pandas()
+        else:
+            plot_df = per_question_df.copy()
+
+        # Sort order: largest count at top. Altair y-axis nominal sort places
+        # the first list element at the top, so descending order is correct.
+        sort_order = plot_df.sort_values("count", ascending=False)["question"].tolist()
+
+        # Combined label  "N  (xx.x%)".  A comprehension replaces the row-wise
+        # ``DataFrame.apply``, which is slow and does not yield a column-shaped
+        # result for an empty frame.
+        plot_df["count_label"] = [
+            f"{int(n)}  ({share:.1f}%)"
+            for n, share in zip(plot_df["count"], plot_df["pct"])
+        ]
+
+        # --- Horizontal Bars ---
+        bars = alt.Chart(plot_df).mark_bar(
+            color=ColorPalette.PRIMARY,
+        ).encode(
+            y=alt.Y(
+                "question:N",
+                title=None,
+                sort=sort_order,
+                axis=alt.Axis(grid=False, labelLimit=250, labelAngle=0),
+            ),
+            x=alt.X("count:Q", title="Number of Straight-Liners", axis=alt.Axis(grid=True)),
+            tooltip=[
+                alt.Tooltip("question:N", title="Question"),
+                alt.Tooltip("count:Q", title="Straight-Liners"),
+                alt.Tooltip("pct:Q", title="% of Respondents", format=".1f"),
+            ],
+        )
+
+        # --- Text labels to the right of bars ---
+        text = alt.Chart(plot_df).mark_text(
+            align="left", dx=4, color="black", fontSize=10,
+        ).encode(
+            y=alt.Y("question:N", sort=sort_order),
+            x=alt.X("count:Q"),
+            text=alt.Text("count_label:N"),
+        )
+
+        # --- Subtitle ---
+        subtitle_parts = []
+        if total_respondents is not None:
+            subtitle_parts.append(f"Total respondents: {total_respondents}")
+        subtitle_parts.append(
+            "Count and share of respondents who straight-lined each question group"
+        )
+        subtitle = " | ".join(subtitle_parts)
+
+        title_config = {
+            "text": self._process_title(title),
+            "subtitle": subtitle,
+            "subtitleColor": "gray",
+            "subtitleFontSize": 10,
+            "anchor": "start",
+        }
+
+        # Scale height with number of questions for readable bar spacing
+        auto_height = max(400, len(plot_df) * 22)
+
+        chart = alt.layer(bars, text).properties(
+            title=title_config,
+            width=width or 700,
+            height=height or auto_height,
+        )
+
+        return self._save_plot(chart, title)