From 6c16993cb3a9e7db4434bafac076c1ab090263bb Mon Sep 17 00:00:00 2001 From: Luigi Maiorano Date: Mon, 9 Feb 2026 17:26:45 +0100 Subject: [PATCH] straight-liner plot analysis --- .vscode/extensions.json | 5 + XX_detailed_trait_analysis.py | 263 +++++++++++++++++++++++ XX_straight_liners.py | 265 +++++++++++++++++++++++ plots.py | 388 +++++++++++++++++++++++++++++++--- 4 files changed, 897 insertions(+), 24 deletions(-) create mode 100644 .vscode/extensions.json create mode 100644 XX_detailed_trait_analysis.py create mode 100644 XX_straight_liners.py diff --git a/.vscode/extensions.json b/.vscode/extensions.json new file mode 100644 index 0000000..9c50cb7 --- /dev/null +++ b/.vscode/extensions.json @@ -0,0 +1,5 @@ +{ + "recommendations": [ + "wakatime.vscode-wakatime" + ] +} \ No newline at end of file diff --git a/XX_detailed_trait_analysis.py b/XX_detailed_trait_analysis.py new file mode 100644 index 0000000..b053306 --- /dev/null +++ b/XX_detailed_trait_analysis.py @@ -0,0 +1,263 @@ +"""Extra analyses of the traits""" +# %% Imports + +import utils +import polars as pl +import argparse +import json +import re +from pathlib import Path +from validation import check_straight_liners + + +# %% Fixed Variables +RESULTS_FILE = 'data/exports/2-4-26/JPMC_Chase Brand Personality_Quant Round 1_February 4, 2026_Labels.csv' +QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf' + + +# %% CLI argument parsing for batch automation +# When run as script: uv run XX_statistical_significance.script.py --age '["18 +# Central filter configuration - add new filters here only +# Format: 'cli_arg_name': 'QualtricsSurvey.options_* attribute name' +FILTER_CONFIG = { + 'age': 'options_age', + 'gender': 'options_gender', + 'ethnicity': 'options_ethnicity', + 'income': 'options_income', + 'consumer': 'options_consumer', + 'business_owner': 'options_business_owner', + 'ai_user': 'options_ai_user', + 'investable_assets': 'options_investable_assets', + 'industry': 'options_industry', +} + +def parse_cli_args(): + parser = argparse.ArgumentParser(description='Generate quant report with optional filters') + + # Dynamically add filter arguments from config + for filter_name in FILTER_CONFIG: + parser.add_argument(f'--{filter_name}', type=str, default=None, help=f'JSON list of {filter_name} values') + + parser.add_argument('--filter-name', type=str, default=None, help='Name for this filter combination (used for .txt description file)') + parser.add_argument('--figures-dir', type=str, default=f'figures/traits-likert-analysis/{Path(RESULTS_FILE).parts[2]}', help='Override the default figures directory') + + # Only parse if running as script (not in Jupyter/interactive) + try: + # Check if running in Jupyter by looking for ipykernel + get_ipython() # noqa: F821 # type: ignore + # Return namespace with all filters set to None + no_filters = {f: None for f in FILTER_CONFIG} + # Use the same default as argparse + default_fig_dir = f'figures/traits-likert-analysis/{Path(RESULTS_FILE).parts[2]}' + return argparse.Namespace(**no_filters, filter_name=None, figures_dir=default_fig_dir) + except NameError: + args = parser.parse_args() + # Parse JSON strings to lists + for filter_name in FILTER_CONFIG: + val = getattr(args, filter_name) + setattr(args, filter_name, json.loads(val) if val else None) + return args + +cli_args = parse_cli_args() + + +# %% +S = utils.QualtricsSurvey(RESULTS_FILE, QSF_FILE, figures_dir=cli_args.figures_dir) +data_all = S.load_data() + + +# %% Build filtered dataset based on CLI args + +# CLI args: None means "no filter applied" - filter_data() will skip None filters + +# Build filter values dict dynamically from FILTER_CONFIG +_active_filters = {filter_name: getattr(cli_args, filter_name) for filter_name in FILTER_CONFIG} + +_d = S.filter_data(data_all, **_active_filters) + +# Write filter description file if filter-name is provided +if cli_args.filter_name and S.fig_save_dir: + # Get the filter slug (e.g., "All_Respondents", "Cons-Starter", etc.) + _filter_slug = S._get_filter_slug() + _filter_slug_dir = S.fig_save_dir / _filter_slug + _filter_slug_dir.mkdir(parents=True, exist_ok=True) + + # Build filter description + _filter_desc_lines = [ + f"Filter: {cli_args.filter_name}", + "", + "Applied Filters:", + ] + _short_desc_parts = [] + for filter_name, options_attr in FILTER_CONFIG.items(): + all_options = getattr(S, options_attr) + values = _active_filters[filter_name] + display_name = filter_name.replace('_', ' ').title() + # None means no filter applied (same as "All") + if values is not None and values != all_options: + _short_desc_parts.append(f"{display_name}: {', '.join(values)}") + _filter_desc_lines.append(f" {display_name}: {', '.join(values)}") + else: + _filter_desc_lines.append(f" {display_name}: All") + + # Write detailed description INSIDE the filter-slug directory + # Sanitize filter name for filename usage (replace / and other chars) + _safe_filter_name = re.sub(r'[^\w\s-]', '_', cli_args.filter_name) + _filter_file = _filter_slug_dir / f"{_safe_filter_name}.txt" + _filter_file.write_text('\n'.join(_filter_desc_lines)) + + # Append to summary index file at figures//filter_index.txt + _summary_file = S.fig_save_dir / "filter_index.txt" + _short_desc = "; ".join(_short_desc_parts) if _short_desc_parts else "All Respondents" + _summary_line = f"{_filter_slug} | {cli_args.filter_name} | {_short_desc}\n" + + # Append or create the summary file + if _summary_file.exists(): + _existing = _summary_file.read_text() + # Avoid duplicate entries for same slug + if _filter_slug not in _existing: + with _summary_file.open('a') as f: + f.write(_summary_line) + else: + _header = "Filter Index\n" + "=" * 80 + "\n\n" + _header += "Directory | Filter Name | Description\n" + _header += "-" * 80 + "\n" + _summary_file.write_text(_header + _summary_line) + +# Save to logical variable name for further analysis +data = _d +data.collect() + +# %% Voices per trait + + +ss_or, choice_map_or = S.get_ss_orange_red(data) +ss_gb, choice_map_gb = S.get_ss_green_blue(data) + +# Combine the data +ss_all = ss_or.join(ss_gb, on='_recordId') +_d = ss_all.collect() + +choice_map = {**choice_map_or, **choice_map_gb} +# print(_d.head()) +# print(choice_map) +ss_long = utils.process_speaking_style_data(ss_all, choice_map) + + +# %% Create plots + +for i, trait in enumerate(ss_long.select("Description").unique().to_series().to_list()): + trait_d = ss_long.filter(pl.col("Description") == trait) + + S.plot_speaking_style_trait_scores(trait_d, title=trait.replace(":", " ↔ "), height=550, color_gender=True) + + + + + +# %% Filter out straight-liner (PER TRAIT) and re-plot to see if any changes +# Save with different filename suffix so we can compare with/without straight-liners + +print("\n--- Straight-lining Checks on TRAITS ---") +sl_report_traits, sl_traits_df = check_straight_liners(ss_all, max_score=5) +sl_traits_df + +# %% + +if sl_traits_df is not None and not sl_traits_df.is_empty(): + sl_ids = sl_traits_df.select(pl.col("Record ID").unique()).to_series().to_list() + n_sl_groups = sl_traits_df.height + print(f"\nExcluding {n_sl_groups} straight-lined question blocks from {len(sl_ids)} respondents.") + + # Create key in ss_long to match sl_traits_df for anti-join + # Question Group key in sl_traits_df is like "SS_Orange_Red__V14" + # ss_long has "Style_Group" and "Voice" + ss_long_w_key = ss_long.with_columns( + (pl.col("Style_Group") + "__" + pl.col("Voice")).alias("Question Group") + ) + + # Prepare filter table: Record ID + Question Group + sl_filter = sl_traits_df.select([ + pl.col("Record ID").alias("_recordId"), + pl.col("Question Group") + ]) + + # Anti-join to remove specific question blocks that were straight-lined + ss_long_clean = ss_long_w_key.join(sl_filter, on=["_recordId", "Question Group"], how="anti").drop("Question Group") + + # Re-plot with suffix in title + print("Re-plotting traits (Cleaned)...") + for i, trait in enumerate(ss_long_clean.select("Description").unique().to_series().to_list()): + trait_d = ss_long_clean.filter(pl.col("Description") == trait) + + # Modify title to create unique filename (and display title) + title_clean = trait.replace(":", " ↔ ") + " (Excl. Straight-Liners)" + + S.plot_speaking_style_trait_scores(trait_d, title=title_clean, height=550, color_gender=True) +else: + print("No straight-liners found on traits.") + + + + +# %% Compare All vs Cleaned +if sl_traits_df is not None and not sl_traits_df.is_empty(): + print("Generating Comparison Plots (All vs Cleaned)...") + + # Always apply the per-question-group filtering here to ensure consistency + # (Matches the logic used in the re-plotting section above) + print("Applying filter to remove straight-lined question blocks...") + ss_long_w_key = ss_long.with_columns( + (pl.col("Style_Group") + "__" + pl.col("Voice")).alias("Question Group") + ) + sl_filter = sl_traits_df.select([ + pl.col("Record ID").alias("_recordId"), + pl.col("Question Group") + ]) + ss_long_clean = ss_long_w_key.join(sl_filter, on=["_recordId", "Question Group"], how="anti").drop("Question Group") + + sl_ids = sl_traits_df.select(pl.col("Record ID").unique()).to_series().to_list() + + # --- Verification Prints --- + print(f"\n--- Verification of Filter ---") + print(f"Original Row Count: {ss_long.height}") + print(f"Number of Straight-Liner Question Blocks: {sl_traits_df.height}") + print(f"Sample IDs affected: {sl_ids[:5]}") + print(f"Cleaned Row Count: {ss_long_clean.height}") + print(f"Rows Removed: {ss_long.height - ss_long_clean.height}") + + # Verify removal + # Re-construct key to verify + ss_long_check = ss_long.with_columns( + (pl.col("Style_Group") + "__" + pl.col("Voice")).alias("Question Group") + ) + sl_filter_check = sl_traits_df.select([ + pl.col("Record ID").alias("_recordId"), + pl.col("Question Group") + ]) + + should_be_removed = ss_long_check.join(sl_filter_check, on=["_recordId", "Question Group"], how="inner").height + print(f"Discrepancy Check (Should be 0): { (ss_long.height - ss_long_clean.height) - should_be_removed }") + + # Show what was removed (the straight lining behavior) + print("\nSample of Straight-Liner Data (Values that caused removal):") + print(sl_traits_df.head(5)) + print("-" * 30 + "\n") + # --------------------------- + + for i, trait in enumerate(ss_long.select("Description").unique().to_series().to_list()): + + # Get data for this trait from both datasets + trait_d_all = ss_long.filter(pl.col("Description") == trait) + trait_d_clean = ss_long_clean.filter(pl.col("Description") == trait) + + # Plot comparison + title_comp = trait.replace(":", " ↔ ") + " (Impact of Straight-Liners)" + + S.plot_speaking_style_trait_scores_comparison( + trait_d_all, + trait_d_clean, + title=title_comp, + height=600 # Slightly taller for grouped bars + ) + diff --git a/XX_straight_liners.py b/XX_straight_liners.py new file mode 100644 index 0000000..68f359f --- /dev/null +++ b/XX_straight_liners.py @@ -0,0 +1,265 @@ +"""Extra analyses of the straight-liners""" +# %% Imports + +import utils +import polars as pl +import argparse +import json +import re +from pathlib import Path +from validation import check_straight_liners + + +# %% Fixed Variables +RESULTS_FILE = 'data/exports/2-4-26/JPMC_Chase Brand Personality_Quant Round 1_February 4, 2026_Labels.csv' +QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf' + + +# %% CLI argument parsing for batch automation +# When run as script: uv run XX_statistical_significance.script.py --age '["18 +# Central filter configuration - add new filters here only +# Format: 'cli_arg_name': 'QualtricsSurvey.options_* attribute name' +FILTER_CONFIG = { + 'age': 'options_age', + 'gender': 'options_gender', + 'ethnicity': 'options_ethnicity', + 'income': 'options_income', + 'consumer': 'options_consumer', + 'business_owner': 'options_business_owner', + 'ai_user': 'options_ai_user', + 'investable_assets': 'options_investable_assets', + 'industry': 'options_industry', +} + +def parse_cli_args(): + parser = argparse.ArgumentParser(description='Generate quant report with optional filters') + + # Dynamically add filter arguments from config + for filter_name in FILTER_CONFIG: + parser.add_argument(f'--{filter_name}', type=str, default=None, help=f'JSON list of {filter_name} values') + + parser.add_argument('--filter-name', type=str, default=None, help='Name for this filter combination (used for .txt description file)') + parser.add_argument('--figures-dir', type=str, default=f'figures/straight-liner-analysis/{Path(RESULTS_FILE).parts[2]}', help='Override the default figures directory') + + # Only parse if running as script (not in Jupyter/interactive) + try: + # Check if running in Jupyter by looking for ipykernel + get_ipython() # noqa: F821 # type: ignore + # Return namespace with all filters set to None + no_filters = {f: None for f in FILTER_CONFIG} + # Use the same default as argparse + default_fig_dir = f'figures/straight-liner-analysis/{Path(RESULTS_FILE).parts[2]}' + return argparse.Namespace(**no_filters, filter_name=None, figures_dir=default_fig_dir) + except NameError: + args = parser.parse_args() + # Parse JSON strings to lists + for filter_name in FILTER_CONFIG: + val = getattr(args, filter_name) + setattr(args, filter_name, json.loads(val) if val else None) + return args + +cli_args = parse_cli_args() + + +# %% +S = utils.QualtricsSurvey(RESULTS_FILE, QSF_FILE, figures_dir=cli_args.figures_dir) +data_all = S.load_data() + + +# %% Build filtered dataset based on CLI args + +# CLI args: None means "no filter applied" - filter_data() will skip None filters + +# Build filter values dict dynamically from FILTER_CONFIG +_active_filters = {filter_name: getattr(cli_args, filter_name) for filter_name in FILTER_CONFIG} + +_d = S.filter_data(data_all, **_active_filters) + +# Write filter description file if filter-name is provided +if cli_args.filter_name and S.fig_save_dir: + # Get the filter slug (e.g., "All_Respondents", "Cons-Starter", etc.) + _filter_slug = S._get_filter_slug() + _filter_slug_dir = S.fig_save_dir / _filter_slug + _filter_slug_dir.mkdir(parents=True, exist_ok=True) + + # Build filter description + _filter_desc_lines = [ + f"Filter: {cli_args.filter_name}", + "", + "Applied Filters:", + ] + _short_desc_parts = [] + for filter_name, options_attr in FILTER_CONFIG.items(): + all_options = getattr(S, options_attr) + values = _active_filters[filter_name] + display_name = filter_name.replace('_', ' ').title() + # None means no filter applied (same as "All") + if values is not None and values != all_options: + _short_desc_parts.append(f"{display_name}: {', '.join(values)}") + _filter_desc_lines.append(f" {display_name}: {', '.join(values)}") + else: + _filter_desc_lines.append(f" {display_name}: All") + + # Write detailed description INSIDE the filter-slug directory + # Sanitize filter name for filename usage (replace / and other chars) + _safe_filter_name = re.sub(r'[^\w\s-]', '_', cli_args.filter_name) + _filter_file = _filter_slug_dir / f"{_safe_filter_name}.txt" + _filter_file.write_text('\n'.join(_filter_desc_lines)) + + # Append to summary index file at figures//filter_index.txt + _summary_file = S.fig_save_dir / "filter_index.txt" + _short_desc = "; ".join(_short_desc_parts) if _short_desc_parts else "All Respondents" + _summary_line = f"{_filter_slug} | {cli_args.filter_name} | {_short_desc}\n" + + # Append or create the summary file + if _summary_file.exists(): + _existing = _summary_file.read_text() + # Avoid duplicate entries for same slug + if _filter_slug not in _existing: + with _summary_file.open('a') as f: + f.write(_summary_line) + else: + _header = "Filter Index\n" + "=" * 80 + "\n\n" + _header += "Directory | Filter Name | Description\n" + _header += "-" * 80 + "\n" + _summary_file.write_text(_header + _summary_line) + +# Save to logical variable name for further analysis +data = _d +data.collect() + + +# %% Determine straight-liner repeat offenders +# Extract question groups with renamed columns that check_straight_liners expects. +# The raw `data` has QID-based column names; the getter methods rename them to +# patterns like SS_Green_Blue__V14__Choice_1, Voice_Scale_1_10__V48, etc. + +ss_or, _ = S.get_ss_orange_red(data) +ss_gb, _ = S.get_ss_green_blue(data) +vs, _ = S.get_voice_scale_1_10(data) + +# Combine all question groups into one wide LazyFrame (joined on _recordId) +all_questions = ss_or.join(ss_gb, on='_recordId').join(vs, on='_recordId') + +# Run straight-liner detection across all question groups +# max_score=5 catches all speaking-style straight-lining (1-5 scale) +# and voice-scale values ≤5 on the 1-10 scale +print("Running straight-liner detection across all question groups...") +sl_report, sl_df = check_straight_liners(all_questions, max_score=5) + +# %% Quantify repeat offenders +# sl_df has one row per (Record ID, Question Group) that was straight-lined. +# Group by Record ID to count how many question groups each person SL'd. + +if sl_df is not None and not sl_df.is_empty(): + total_respondents = data.select(pl.len()).collect().item() + + # Per-respondent count of straight-lined question groups + respondent_sl_counts = ( + sl_df + .group_by("Record ID") + .agg(pl.len().alias("sl_count")) + .sort("sl_count", descending=True) + ) + + max_sl = respondent_sl_counts["sl_count"].max() + print(f"\nTotal respondents: {total_respondents}") + print(f"Respondents who straight-lined at least 1 question group: " + f"{respondent_sl_counts.height}") + print(f"Maximum question groups straight-lined by one person: {max_sl}") + print() + + # Build cumulative distribution: for each threshold N, count respondents + # who straight-lined >= N question groups + cumulative_rows = [] + for threshold in range(1, max_sl + 1): + count = respondent_sl_counts.filter( + pl.col("sl_count") >= threshold + ).height + pct = (count / total_respondents) * 100 + cumulative_rows.append({ + "threshold": threshold, + "count": count, + "pct": pct, + }) + print( + f" ≥{threshold} question groups straight-lined: " + f"{count} respondents ({pct:.1f}%)" + ) + + cumulative_df = pl.DataFrame(cumulative_rows) + print(f"\n{cumulative_df}") + + # %% Save cumulative data to CSV + _filter_slug = S._get_filter_slug() + _csv_dir = Path(S.fig_save_dir) / _filter_slug + _csv_dir.mkdir(parents=True, exist_ok=True) + + _csv_path = _csv_dir / "straight_liner_repeat_offenders.csv" + cumulative_df.write_csv(_csv_path) + print(f"Saved cumulative data to {_csv_path}") + + # %% Plot the cumulative distribution + S.plot_straight_liner_repeat_offenders( + cumulative_df, + total_respondents=total_respondents, + ) + + # %% Per-question straight-lining frequency + # Build human-readable question group names from the raw keys + def _humanise_question_group(key: str) -> str: + """Convert internal question group key to a readable label. + + Examples: + SS_Green_Blue__V14 → Green/Blue – V14 + SS_Orange_Red__V48 → Orange/Red – V48 + Voice_Scale_1_10 → Voice Scale (1-10) + """ + if key.startswith("SS_Green_Blue__"): + voice = key.split("__")[1] + return f"Green/Blue – {voice}" + if key.startswith("SS_Orange_Red__"): + voice = key.split("__")[1] + return f"Orange/Red – {voice}" + if key == "Voice_Scale_1_10": + return "Voice Scale (1-10)" + # Fallback: replace underscores + return key.replace("_", " ") + + per_question_counts = ( + sl_df + .group_by("Question Group") + .agg(pl.col("Record ID").n_unique().alias("count")) + .sort("count", descending=True) + .with_columns( + (pl.col("count") / total_respondents * 100).alias("pct") + ) + ) + + # Add human-readable names + per_question_counts = per_question_counts.with_columns( + pl.col("Question Group").map_elements( + _humanise_question_group, return_dtype=pl.Utf8 + ).alias("question") + ) + + print("\n--- Per-Question Straight-Lining Frequency ---") + print(per_question_counts) + + # Save per-question data to CSV + _csv_path_pq = _csv_dir / "straight_liner_per_question.csv" + per_question_counts.write_csv(_csv_path_pq) + print(f"Saved per-question data to {_csv_path_pq}") + + # Plot + S.plot_straight_liner_per_question( + per_question_counts, + total_respondents=total_respondents, + ) + + # %% Show the top repeat offenders (respondents with most SL'd groups) + print("\n--- Top Repeat Offenders ---") + print(respondent_sl_counts.head(20)) + +else: + print("No straight-liners detected in the dataset.") \ No newline at end of file diff --git a/plots.py b/plots.py index c633ee7..c27c888 100644 --- a/plots.py +++ b/plots.py @@ -1115,6 +1115,7 @@ class QualtricsPlotsMixin: title: str = "Speaking Style Trait Analysis", height: int | None = None, width: int | str | None = None, + color_gender: bool = False, ) -> alt.Chart: """Plot scores for a single speaking style trait across multiple voices.""" df = self._ensure_dataframe(data) @@ -1156,36 +1157,71 @@ class QualtricsPlotsMixin: else: trait_description = "" - # Horizontal bar chart - use x2 to explicitly start bars at x=1 - bars = alt.Chart(stats).mark_bar(color=ColorPalette.PRIMARY).encode( - x=alt.X('mean_score:Q', title='Average Score (1-5)', scale=alt.Scale(domain=[1, 5]), axis=alt.Axis(grid=True)), - x2=alt.datum(1), # Bars start at x=1 (left edge of domain) - y=alt.Y('Voice:N', title='Voice', sort='-x', axis=alt.Axis(grid=False)), - tooltip=[ - alt.Tooltip('Voice:N'), - alt.Tooltip('mean_score:Q', title='Average', format='.2f'), - alt.Tooltip('count:Q', title='Count') - ] - ) + if color_gender: + stats['gender'] = stats['Voice'].apply(self._get_voice_gender) + + bars = alt.Chart(stats).mark_bar().encode( + x=alt.X('mean_score:Q', title='Average Score (1-5)', scale=alt.Scale(domain=[1, 5]), axis=alt.Axis(grid=True)), + x2=alt.datum(1), # Bars start at x=1 (left edge of domain) + y=alt.Y('Voice:N', title='Voice', sort='-x', axis=alt.Axis(grid=False)), + color=alt.Color('gender:N', + scale=alt.Scale(domain=['Male', 'Female'], + range=[ColorPalette.GENDER_MALE, ColorPalette.GENDER_FEMALE]), + legend=alt.Legend(orient='top', direction='horizontal', title='Gender')), + tooltip=[ + alt.Tooltip('Voice:N'), + alt.Tooltip('mean_score:Q', title='Average', format='.2f'), + alt.Tooltip('count:Q', title='Count'), + alt.Tooltip('gender:N', title='Gender') + ] + ) + + text = alt.Chart(stats).mark_text( + align='left', + baseline='middle', + dx=5, + fontSize=12 + ).encode( + x='mean_score:Q', + y=alt.Y('Voice:N', sort='-x'), + text='count:Q', + color=alt.condition( + alt.datum.gender == 'Female', + alt.value(ColorPalette.GENDER_FEMALE), + alt.value(ColorPalette.GENDER_MALE) + ) + ) + else: + # Horizontal bar chart - use x2 to explicitly start bars at x=1 + bars = alt.Chart(stats).mark_bar(color=ColorPalette.PRIMARY).encode( + x=alt.X('mean_score:Q', title='Average Score (1-5)', scale=alt.Scale(domain=[1, 5]), axis=alt.Axis(grid=True)), + x2=alt.datum(1), # Bars start at x=1 (left edge of domain) + y=alt.Y('Voice:N', title='Voice', sort='-x', axis=alt.Axis(grid=False)), + tooltip=[ + alt.Tooltip('Voice:N'), + alt.Tooltip('mean_score:Q', title='Average', format='.2f'), + alt.Tooltip('count:Q', title='Count') + ] + ) - # Count text at end of bars (right-aligned inside bar) - text = alt.Chart(stats).mark_text( - align='right', - baseline='middle', - color='white', - fontSize=12, - dx=-5 # Slight padding from bar end - ).encode( - x='mean_score:Q', - y=alt.Y('Voice:N', sort='-x'), - text='count:Q' - ) + # Count text at end of bars + text = alt.Chart(stats).mark_text( + align='left', + baseline='middle', + color='black', + fontSize=12, + dx=5 + ).encode( + x='mean_score:Q', + y=alt.Y('Voice:N', sort='-x'), + text='count:Q' + ) # Combine layers chart = (bars + text).properties( title={ "text": self._process_title(title), - "subtitle": [trait_description, "(Numbers on bars indicate respondent count)"] + "subtitle": [trait_description, "(Numbers near bars indicate respondent count)"] }, width=width or 800, height=height or getattr(self, 'plot_height', 400) @@ -1194,6 +1230,101 @@ class QualtricsPlotsMixin: chart = self._save_plot(chart, title) return chart + def plot_speaking_style_trait_scores_comparison( + self, + data_all: pl.LazyFrame | pl.DataFrame, + data_clean: pl.LazyFrame | pl.DataFrame, + trait_description: str = None, + title: str = "Speaking Style Trait Analysis (Comparison)", + height: int | None = None, + width: int | str | None = None, + ) -> alt.Chart: + """Plot scores comparing All Respondents vs Cleaned data (excl. straight-liners) in grouped bars.""" + + # Helper to process each dataframe + def get_stats(d, group_label): + df = self._ensure_dataframe(d) + if df.is_empty(): return None + + return ( + df.filter(pl.col("score").is_not_null()) + .group_by("Voice") + .agg([ + pl.col("score").mean().alias("mean_score"), + pl.col("score").count().alias("count") + ]) + .with_columns(pl.lit(group_label).alias("dataset")) + .to_pandas() + ) + + stats_all = get_stats(data_all, "All Respondents") + stats_clean = get_stats(data_clean, "Excl. Straight-Liners") + + if stats_all is None or stats_clean is None: + return alt.Chart(pd.DataFrame({'text': ['No data']})).mark_text().encode(text='text:N') + + # Combine + stats = pd.concat([stats_all, stats_clean]) + + # Determine sort order using "All Respondents" data (Desc) + sort_order = stats_all.sort_values('mean_score', ascending=False)['Voice'].tolist() + + # Add gender and combined category for color + stats['gender'] = stats['Voice'].apply(self._get_voice_gender) + stats['color_group'] = stats.apply( + lambda x: f"{x['gender']} - {x['dataset']}", axis=1 + ) + + # Define Color Scale + domain = [ + 'Male - All Respondents', 'Male - Excl. Straight-Liners', + 'Female - All Respondents', 'Female - Excl. Straight-Liners' + ] + range_colors = [ + ColorPalette.GENDER_MALE, ColorPalette.GENDER_MALE_RANK_3, + ColorPalette.GENDER_FEMALE, ColorPalette.GENDER_FEMALE_RANK_3 + ] + + # Base chart + base = alt.Chart(stats).encode( + y=alt.Y('Voice:N', title='Voice', sort=sort_order, axis=alt.Axis(grid=False)), + ) + + bars = base.mark_bar().encode( + x=alt.X('mean_score:Q', title='Average Score (1-5)', scale=alt.Scale(domain=[1, 5]), axis=alt.Axis(grid=True)), + x2=alt.datum(1), + yOffset=alt.YOffset('dataset:N', sort=['All Respondents', 'Excl. Straight-Liners']), + color=alt.Color('color_group:N', + scale=alt.Scale(domain=domain, range=range_colors), + legend=alt.Legend(title='Dataset', orient='top', columns=2)), + tooltip=[ + alt.Tooltip('Voice:N'), + alt.Tooltip('dataset:N', title='Dataset'), + alt.Tooltip('mean_score:Q', title='Average', format='.2f'), + alt.Tooltip('count:Q', title='Count'), + alt.Tooltip('gender:N', title='Gender') + ] + ) + + text = base.mark_text(align='left', baseline='middle', dx=5, fontSize=9).encode( + x=alt.X('mean_score:Q'), + yOffset=alt.YOffset('dataset:N', sort=['All Respondents', 'Excl. Straight-Liners']), + text=alt.Text('count:Q'), + color=alt.Color('color_group:N', scale=alt.Scale(domain=domain, range=range_colors), legend=None) + ) + + chart = (bars + text).properties( + title={ + "text": self._process_title(title), + "subtitle": [trait_description if trait_description else "", "(Lighter shade = Straight-liners removed)"] + }, + width=width or 800, + height=height or getattr(self, 'plot_height', 600) + ) + + chart = self._save_plot(chart, title) + return chart + def plot_speaking_style_scale_correlation( self, style_color: str, @@ -2495,5 +2626,214 @@ class QualtricsPlotsMixin: height=height or getattr(self, 'plot_height', 400), ) + chart = self._save_plot(chart, title) + return chart + + def plot_straight_liner_repeat_offenders( + self, + cumulative_df: pl.DataFrame | pd.DataFrame, + title: str = "Straight-Liner Repeat Offenders\n(Cumulative Distribution)", + height: int | None = None, + width: int | str | None = None, + total_respondents: int | None = None, + ) -> alt.Chart: + """Plot the cumulative distribution of straight-liner repeat offenders. + + Shows how many respondents straight-lined at N or more question + groups, for every observed threshold. + + Parameters: + cumulative_df: DataFrame with columns ``threshold`` (int), + ``count`` (int) and ``pct`` (float, 0-100). Each row + represents "≥ threshold question groups". + title: Chart title. + height: Chart height in pixels. + width: Chart width in pixels. + total_respondents: If provided, shown in the subtitle for + context. + + Returns: + The Altair chart object (already saved if ``fig_save_dir`` + is configured). + """ + if isinstance(cumulative_df, pl.DataFrame): + plot_df = cumulative_df.to_pandas() + else: + plot_df = cumulative_df.copy() + + # Build readable x-axis labels ("≥1", "≥2", …) + plot_df["label"] = plot_df["threshold"].apply(lambda t: f"≥{t}") + + # Explicit sort order so Altair keeps ascending threshold + sort_order = plot_df.sort_values("threshold")["label"].tolist() + + # --- Bars: respondent count --- + bars = alt.Chart(plot_df).mark_bar( + color=ColorPalette.PRIMARY + ).encode( + x=alt.X( + "label:N", + title="Number of Straight-Lined Question Groups", + sort=sort_order, + axis=alt.Axis(grid=False), + ), + y=alt.Y( + "count:Q", + title="Number of Respondents", + axis=alt.Axis(grid=True), + ), + tooltip=[ + alt.Tooltip("label:N", title="Threshold"), + alt.Tooltip("count:Q", title="Respondents"), + alt.Tooltip("pct:Q", title="% of Total", format=".1f"), + ], + ) + + # --- Text: count + percentage above each bar --- + text = alt.Chart(plot_df).mark_text( + dy=-10, color="black", fontSize=11 + ).encode( + x=alt.X("label:N", sort=sort_order), + y=alt.Y("count:Q"), + text=alt.Text("count_label:N"), + ) + + # Build a combined label column "N (xx.x%)" + plot_df["count_label"] = plot_df.apply( + lambda r: f"{int(r['count'])} ({r['pct']:.1f}%)", axis=1 + ) + + # Rebuild text layer with the updated df + text = alt.Chart(plot_df).mark_text( + dy=-10, color="black", fontSize=11 + ).encode( + x=alt.X("label:N", sort=sort_order), + y=alt.Y("count:Q"), + text=alt.Text("count_label:N"), + ) + + # --- Subtitle --- + subtitle_parts = [] + if total_respondents is not None: + subtitle_parts.append( + f"Total respondents: {total_respondents}" + ) + subtitle_parts.append( + "Each bar shows how many respondents straight-lined " + "at least that many question groups" + ) + subtitle = " | ".join(subtitle_parts) + + title_config = { + "text": self._process_title(title), + "subtitle": subtitle, + "subtitleColor": "gray", + "subtitleFontSize": 10, + "anchor": "start", + } + + chart = alt.layer(bars, text).properties( + title=title_config, + width=width or 800, + height=height or getattr(self, "plot_height", 400), + ) + + chart = self._save_plot(chart, title) + return chart + + def plot_straight_liner_per_question( + self, + per_question_df: pl.DataFrame | pd.DataFrame, + title: str = "Straight-Lining Frequency per Question Group", + height: int | None = None, + width: int | str | None = None, + total_respondents: int | None = None, + ) -> alt.Chart: + """Plot how often each question group is straight-lined. + + Parameters: + per_question_df: DataFrame with columns ``question`` (str, + human-readable name), ``count`` (int) and ``pct`` + (float, 0-100). Sorted descending by count. + title: Chart title. + height: Chart height in pixels. + width: Chart width in pixels. + total_respondents: Shown in subtitle for context. + + Returns: + The Altair chart (saved if ``fig_save_dir`` is set). + """ + if isinstance(per_question_df, pl.DataFrame): + plot_df = per_question_df.to_pandas() + else: + plot_df = per_question_df.copy() + + # Sort order: largest count at top. Altair y-axis nominal sort places + # the first list element at the top, so descending order is correct. + sort_order = plot_df.sort_values("count", ascending=False)["question"].tolist() + + # Combined label "N (xx.x%)" + plot_df["count_label"] = plot_df.apply( + lambda r: f"{int(r['count'])} ({r['pct']:.1f}%)", axis=1 + ) + + # --- Horizontal Bars --- + bars = alt.Chart(plot_df).mark_bar( + color=ColorPalette.PRIMARY, + ).encode( + y=alt.Y( + "question:N", + title=None, + sort=sort_order, + axis=alt.Axis(grid=False, labelLimit=250, labelAngle=0), + ), + x=alt.X( + "count:Q", + title="Number of Straight-Liners", + axis=alt.Axis(grid=True), + ), + tooltip=[ + alt.Tooltip("question:N", title="Question"), + alt.Tooltip("count:Q", title="Straight-Liners"), + alt.Tooltip("pct:Q", title="% of Respondents", format=".1f"), + ], + ) + + # --- Text labels to the right of bars --- + text = alt.Chart(plot_df).mark_text( + align="left", dx=4, color="black", fontSize=10, + ).encode( + y=alt.Y("question:N", sort=sort_order), + x=alt.X("count:Q"), + text=alt.Text("count_label:N"), + ) + + # --- Subtitle --- + subtitle_parts = [] + if total_respondents is not None: + subtitle_parts.append(f"Total respondents: {total_respondents}") + subtitle_parts.append( + "Count and share of respondents who straight-lined each question group" + ) + subtitle = " | ".join(subtitle_parts) + + title_config = { + "text": self._process_title(title), + "subtitle": subtitle, + "subtitleColor": "gray", + "subtitleFontSize": 10, + "anchor": "start", + } + + # Scale height with number of questions for readable bar spacing + n_questions = len(plot_df) + auto_height = max(400, n_questions * 22) + + chart = alt.layer(bars, text).properties( + title=title_config, + width=width or 700, + height=height or auto_height, + ) + chart = self._save_plot(chart, title) return chart \ No newline at end of file