straight-liner plot analysis
This commit is contained in:
5
.vscode/extensions.json
vendored
Normal file
5
.vscode/extensions.json
vendored
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
{
|
||||||
|
"recommendations": [
|
||||||
|
"wakatime.vscode-wakatime"
|
||||||
|
]
|
||||||
|
}
|
||||||
263
XX_detailed_trait_analysis.py
Normal file
263
XX_detailed_trait_analysis.py
Normal file
@@ -0,0 +1,263 @@
|
|||||||
|
"""Extra analyses of the traits"""
|
||||||
|
# %% Imports
|
||||||
|
|
||||||
|
import utils
|
||||||
|
import polars as pl
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
from pathlib import Path
|
||||||
|
from validation import check_straight_liners
|
||||||
|
|
||||||
|
|
||||||
|
# %% Fixed Variables
|
||||||
|
RESULTS_FILE = 'data/exports/2-4-26/JPMC_Chase Brand Personality_Quant Round 1_February 4, 2026_Labels.csv'
|
||||||
|
QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'
|
||||||
|
|
||||||
|
|
||||||
|
# %% CLI argument parsing for batch automation
|
||||||
|
# When run as script, e.g.: uv run XX_detailed_trait_analysis.py --age '["<age option>", ...]' --filter-name "<label>"
|
||||||
|
# Central filter configuration - add new filters here only
|
||||||
|
# Format: 'cli_arg_name': 'QualtricsSurvey.options_* attribute name'
|
||||||
|
FILTER_CONFIG = {
|
||||||
|
'age': 'options_age',
|
||||||
|
'gender': 'options_gender',
|
||||||
|
'ethnicity': 'options_ethnicity',
|
||||||
|
'income': 'options_income',
|
||||||
|
'consumer': 'options_consumer',
|
||||||
|
'business_owner': 'options_business_owner',
|
||||||
|
'ai_user': 'options_ai_user',
|
||||||
|
'investable_assets': 'options_investable_assets',
|
||||||
|
'industry': 'options_industry',
|
||||||
|
}
|
||||||
|
|
||||||
|
def parse_cli_args():
    """Parse the command-line filter arguments for this analysis script.

    One ``--<name>`` option is registered per key of ``FILTER_CONFIG``; each
    expects a JSON list of values. ``--filter-name`` and ``--figures-dir`` are
    registered as well. When ``get_ipython`` is defined (interactive
    IPython/Jupyter session) the command line is not parsed at all and a
    namespace with every filter set to ``None`` is returned instead.

    Returns:
        argparse.Namespace: one attribute per ``FILTER_CONFIG`` key (a list, or
        ``None`` when the option was not supplied or empty), plus
        ``filter_name`` and ``figures_dir``.

    Raises:
        SystemExit: raised by ``parser.error`` when a filter value is not
        valid JSON or does not decode to a list.
    """
    # Single source of truth for the default output directory, shared by the
    # argparse default and the interactive fallback. The third component of
    # RESULTS_FILE is used as the sub-directory name.
    default_fig_dir = f'figures/traits-likert-analysis/{Path(RESULTS_FILE).parts[2]}'

    parser = argparse.ArgumentParser(description='Generate quant report with optional filters')

    # Dynamically add filter arguments from config
    for filter_name in FILTER_CONFIG:
        parser.add_argument(f'--{filter_name}', type=str, default=None, help=f'JSON list of {filter_name} values')

    parser.add_argument('--filter-name', type=str, default=None, help='Name for this filter combination (used for .txt description file)')
    parser.add_argument('--figures-dir', type=str, default=default_fig_dir, help='Override the default figures directory')

    # Only parse if running as script (not in Jupyter/interactive):
    # get_ipython is only defined inside an IPython session.
    try:
        get_ipython()  # noqa: F821  # type: ignore
    except NameError:
        pass
    else:
        no_filters = {f: None for f in FILTER_CONFIG}
        return argparse.Namespace(**no_filters, filter_name=None, figures_dir=default_fig_dir)

    args = parser.parse_args()

    # Decode the JSON strings into lists, reporting bad input as a usage error
    # instead of a traceback or a silently mis-typed filter.
    for filter_name in FILTER_CONFIG:
        raw = getattr(args, filter_name)
        if not raw:
            setattr(args, filter_name, None)
            continue
        try:
            parsed = json.loads(raw)
        except json.JSONDecodeError as exc:
            parser.error(f'--{filter_name} must be a JSON list ({exc})')
        if not isinstance(parsed, list):
            parser.error(f'--{filter_name} must be a JSON list, got {type(parsed).__name__}')
        setattr(args, filter_name, parsed)
    return args
|
||||||
|
|
||||||
|
cli_args = parse_cli_args()
|
||||||
|
|
||||||
|
|
||||||
|
# %%
|
||||||
|
S = utils.QualtricsSurvey(RESULTS_FILE, QSF_FILE, figures_dir=cli_args.figures_dir)
|
||||||
|
data_all = S.load_data()
|
||||||
|
|
||||||
|
|
||||||
|
# %% Build filtered dataset based on CLI args
|
||||||
|
|
||||||
|
# CLI args: None means "no filter applied" - filter_data() will skip None filters
|
||||||
|
|
||||||
|
# Build filter values dict dynamically from FILTER_CONFIG
|
||||||
|
_active_filters = {filter_name: getattr(cli_args, filter_name) for filter_name in FILTER_CONFIG}
|
||||||
|
|
||||||
|
_d = S.filter_data(data_all, **_active_filters)
|
||||||
|
|
||||||
|
# Write filter description file if filter-name is provided
|
||||||
|
if cli_args.filter_name and S.fig_save_dir:
    # Each filter combination gets its own sub-directory named after the
    # survey object's filter slug (e.g., "All_Respondents", "Cons-Starter", etc.)
    _filter_slug = S._get_filter_slug()
    _filter_slug_dir = S.fig_save_dir / _filter_slug
    _filter_slug_dir.mkdir(parents=True, exist_ok=True)

    # Build filter description
    _filter_desc_lines = [
        f"Filter: {cli_args.filter_name}",
        "",
        "Applied Filters:",
    ]
    _short_desc_parts = []
    for filter_name, options_attr in FILTER_CONFIG.items():
        all_options = getattr(S, options_attr)
        values = _active_filters[filter_name]
        display_name = filter_name.replace('_', ' ').title()
        # None means no filter applied (same as "All")
        if values is not None and values != all_options:
            # str() each entry so numeric JSON values do not break the join
            _joined_values = ', '.join(str(v) for v in values)
            _short_desc_parts.append(f"{display_name}: {_joined_values}")
            _filter_desc_lines.append(f" {display_name}: {_joined_values}")
        else:
            _filter_desc_lines.append(f" {display_name}: All")

    # Write detailed description INSIDE the filter-slug directory.
    # Sanitize filter name for filename usage (replace / and other chars)
    _safe_filter_name = re.sub(r'[^\w\s-]', '_', cli_args.filter_name)
    _filter_file = _filter_slug_dir / f"{_safe_filter_name}.txt"
    _filter_file.write_text('\n'.join(_filter_desc_lines))

    # Append to the summary index file that sits next to the slug directories
    _summary_file = S.fig_save_dir / "filter_index.txt"
    _short_desc = "; ".join(_short_desc_parts) if _short_desc_parts else "All Respondents"
    _summary_line = f"{_filter_slug} | {cli_args.filter_name} | {_short_desc}\n"

    if _summary_file.exists():
        _existing = _summary_file.read_text()
        # Avoid duplicate entries for the same slug. Compare against the first
        # column of every index row: a plain substring test would treat slug
        # "Cons" as already present once "Cons-Starter" has been written.
        _indexed_slugs = {
            _row.split(" | ", 1)[0]
            for _row in _existing.splitlines()
            if " | " in _row
        }
        if _filter_slug not in _indexed_slugs:
            with _summary_file.open('a') as f:
                f.write(_summary_line)
    else:
        _header = "Filter Index\n" + "=" * 80 + "\n\n"
        _header += "Directory | Filter Name | Description\n"
        _header += "-" * 80 + "\n"
        _summary_file.write_text(_header + _summary_line)
|
||||||
|
|
||||||
|
# Save to logical variable name for further analysis
|
||||||
|
data = _d
|
||||||
|
data.collect()
|
||||||
|
|
||||||
|
# %% Voices per trait
|
||||||
|
|
||||||
|
|
||||||
|
ss_or, choice_map_or = S.get_ss_orange_red(data)
|
||||||
|
ss_gb, choice_map_gb = S.get_ss_green_blue(data)
|
||||||
|
|
||||||
|
# Combine the data
|
||||||
|
ss_all = ss_or.join(ss_gb, on='_recordId')
|
||||||
|
_d = ss_all.collect()
|
||||||
|
|
||||||
|
choice_map = {**choice_map_or, **choice_map_gb}
|
||||||
|
# print(_d.head())
|
||||||
|
# print(choice_map)
|
||||||
|
ss_long = utils.process_speaking_style_data(ss_all, choice_map)
|
||||||
|
|
||||||
|
|
||||||
|
# %% Create plots
|
||||||
|
|
||||||
|
# One chart per trait. sorted() pins the plotting order, because unique()
# on its own makes no promise about the order of the returned values.
for trait in sorted(ss_long.select("Description").unique().to_series().to_list()):
    trait_d = ss_long.filter(pl.col("Description") == trait)

    S.plot_speaking_style_trait_scores(trait_d, title=trait.replace(":", " ↔ "), height=550, color_gender=True)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# %% Filter out straight-liner (PER TRAIT) and re-plot to see if any changes
|
||||||
|
# Save with different filename suffix so we can compare with/without straight-liners
|
||||||
|
|
||||||
|
print("\n--- Straight-lining Checks on TRAITS ---")
|
||||||
|
sl_report_traits, sl_traits_df = check_straight_liners(ss_all, max_score=5)
|
||||||
|
sl_traits_df
|
||||||
|
|
||||||
|
# %%
|
||||||
|
|
||||||
|
if sl_traits_df is not None and not sl_traits_df.is_empty():
    sl_ids = sl_traits_df.select(pl.col("Record ID").unique()).to_series().to_list()
    n_sl_groups = sl_traits_df.height
    print(f"\nExcluding {n_sl_groups} straight-lined question blocks from {len(sl_ids)} respondents.")

    # Create key in ss_long to match sl_traits_df for anti-join.
    # Question Group key in sl_traits_df is like "SS_Orange_Red__V14";
    # ss_long has "Style_Group" and "Voice".
    ss_long_w_key = ss_long.with_columns(
        (pl.col("Style_Group") + "__" + pl.col("Voice")).alias("Question Group")
    )

    # Prepare filter table: Record ID + Question Group
    sl_filter = sl_traits_df.select([
        pl.col("Record ID").alias("_recordId"),
        pl.col("Question Group"),
    ])

    # Anti-join to remove specific question blocks that were straight-lined
    ss_long_clean = ss_long_w_key.join(sl_filter, on=["_recordId", "Question Group"], how="anti").drop("Question Group")

    # Re-plot with suffix in title
    print("Re-plotting traits (Cleaned)...")
    # sorted() pins the plotting order; unique() alone does not guarantee one
    for trait in sorted(ss_long_clean.select("Description").unique().to_series().to_list()):
        trait_d = ss_long_clean.filter(pl.col("Description") == trait)

        # Modify title to create unique filename (and display title)
        title_clean = trait.replace(":", " ↔ ") + " (Excl. Straight-Liners)"

        S.plot_speaking_style_trait_scores(trait_d, title=title_clean, height=550, color_gender=True)
else:
    print("No straight-liners found on traits.")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# %% Compare All vs Cleaned
|
||||||
|
if sl_traits_df is not None and not sl_traits_df.is_empty():
    print("Generating Comparison Plots (All vs Cleaned)...")

    # Always apply the per-question-group filtering here so this cell gives the
    # same cleaned frame even when the re-plotting cell has not been run.
    print("Applying filter to remove straight-lined question blocks...")
    _join_keys = ["_recordId", "Question Group"]
    ss_long_w_key = ss_long.with_columns(
        (pl.col("Style_Group") + "__" + pl.col("Voice")).alias("Question Group")
    )
    sl_filter = sl_traits_df.select([
        pl.col("Record ID").alias("_recordId"),
        pl.col("Question Group"),
    ])
    ss_long_clean = ss_long_w_key.join(sl_filter, on=_join_keys, how="anti").drop("Question Group")

    sl_ids = sl_traits_df.select(pl.col("Record ID").unique()).to_series().to_list()

    # --- Verification Prints ---
    rows_removed = ss_long.height - ss_long_clean.height
    print("\n--- Verification of Filter ---")
    print(f"Original Row Count: {ss_long.height}")
    print(f"Number of Straight-Liner Question Blocks: {sl_traits_df.height}")
    print(f"Sample IDs affected: {sl_ids[:5]}")
    print(f"Cleaned Row Count: {ss_long_clean.height}")
    print(f"Rows Removed: {rows_removed}")

    # Verify removal: the inner join counts the rows that match a filter entry,
    # which should equal the number of rows the anti-join dropped. The keyed
    # frame and filter table built above are reused rather than rebuilt.
    should_be_removed = ss_long_w_key.join(sl_filter, on=_join_keys, how="inner").height
    print(f"Discrepancy Check (Should be 0): {rows_removed - should_be_removed}")

    # Show what was removed (the straight lining behavior)
    print("\nSample of Straight-Liner Data (Values that caused removal):")
    print(sl_traits_df.head(5))
    print("-" * 30 + "\n")
    # ---------------------------

    # sorted() pins the plotting order; unique() alone does not guarantee one
    for trait in sorted(ss_long.select("Description").unique().to_series().to_list()):
        # Get data for this trait from both datasets
        trait_d_all = ss_long.filter(pl.col("Description") == trait)
        trait_d_clean = ss_long_clean.filter(pl.col("Description") == trait)

        # Plot comparison
        title_comp = trait.replace(":", " ↔ ") + " (Impact of Straight-Liners)"

        S.plot_speaking_style_trait_scores_comparison(
            trait_d_all,
            trait_d_clean,
            title=title_comp,
            height=600,  # Slightly taller for grouped bars
        )
|
||||||
|
|
||||||
265
XX_straight_liners.py
Normal file
265
XX_straight_liners.py
Normal file
@@ -0,0 +1,265 @@
|
|||||||
|
"""Extra analyses of the straight-liners"""
|
||||||
|
# %% Imports
|
||||||
|
|
||||||
|
import utils
|
||||||
|
import polars as pl
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
from pathlib import Path
|
||||||
|
from validation import check_straight_liners
|
||||||
|
|
||||||
|
|
||||||
|
# %% Fixed Variables
|
||||||
|
RESULTS_FILE = 'data/exports/2-4-26/JPMC_Chase Brand Personality_Quant Round 1_February 4, 2026_Labels.csv'
|
||||||
|
QSF_FILE = 'data/exports/OneDrive_2026-01-21/Soft Launch Data/JPMC_Chase_Brand_Personality_Quant_Round_1.qsf'
|
||||||
|
|
||||||
|
|
||||||
|
# %% CLI argument parsing for batch automation
|
||||||
|
# When run as script, e.g.: uv run XX_straight_liners.py --age '["<age option>", ...]' --filter-name "<label>"
|
||||||
|
# Central filter configuration - add new filters here only
|
||||||
|
# Format: 'cli_arg_name': 'QualtricsSurvey.options_* attribute name'
|
||||||
|
FILTER_CONFIG = {
|
||||||
|
'age': 'options_age',
|
||||||
|
'gender': 'options_gender',
|
||||||
|
'ethnicity': 'options_ethnicity',
|
||||||
|
'income': 'options_income',
|
||||||
|
'consumer': 'options_consumer',
|
||||||
|
'business_owner': 'options_business_owner',
|
||||||
|
'ai_user': 'options_ai_user',
|
||||||
|
'investable_assets': 'options_investable_assets',
|
||||||
|
'industry': 'options_industry',
|
||||||
|
}
|
||||||
|
|
||||||
|
def parse_cli_args():
    """Parse the command-line filter arguments for the straight-liner script.

    One ``--<name>`` option is registered per key of ``FILTER_CONFIG``; each
    expects a JSON list of values. ``--filter-name`` and ``--figures-dir`` are
    registered as well. When ``get_ipython`` is defined (interactive
    IPython/Jupyter session) the command line is not parsed at all and a
    namespace with every filter set to ``None`` is returned instead.

    Returns:
        argparse.Namespace: one attribute per ``FILTER_CONFIG`` key (a list, or
        ``None`` when the option was not supplied or empty), plus
        ``filter_name`` and ``figures_dir``.

    Raises:
        SystemExit: raised by ``parser.error`` when a filter value is not
        valid JSON or does not decode to a list.
    """
    # Single source of truth for the default output directory, shared by the
    # argparse default and the interactive fallback. The third component of
    # RESULTS_FILE is used as the sub-directory name.
    default_fig_dir = f'figures/straight-liner-analysis/{Path(RESULTS_FILE).parts[2]}'

    parser = argparse.ArgumentParser(description='Generate quant report with optional filters')

    # Dynamically add filter arguments from config
    for filter_name in FILTER_CONFIG:
        parser.add_argument(f'--{filter_name}', type=str, default=None, help=f'JSON list of {filter_name} values')

    parser.add_argument('--filter-name', type=str, default=None, help='Name for this filter combination (used for .txt description file)')
    parser.add_argument('--figures-dir', type=str, default=default_fig_dir, help='Override the default figures directory')

    # Only parse if running as script (not in Jupyter/interactive):
    # get_ipython is only defined inside an IPython session.
    try:
        get_ipython()  # noqa: F821  # type: ignore
    except NameError:
        pass
    else:
        no_filters = {f: None for f in FILTER_CONFIG}
        return argparse.Namespace(**no_filters, filter_name=None, figures_dir=default_fig_dir)

    args = parser.parse_args()

    # Decode the JSON strings into lists, reporting bad input as a usage error
    # instead of a traceback or a silently mis-typed filter.
    for filter_name in FILTER_CONFIG:
        raw = getattr(args, filter_name)
        if not raw:
            setattr(args, filter_name, None)
            continue
        try:
            parsed = json.loads(raw)
        except json.JSONDecodeError as exc:
            parser.error(f'--{filter_name} must be a JSON list ({exc})')
        if not isinstance(parsed, list):
            parser.error(f'--{filter_name} must be a JSON list, got {type(parsed).__name__}')
        setattr(args, filter_name, parsed)
    return args
|
||||||
|
|
||||||
|
cli_args = parse_cli_args()
|
||||||
|
|
||||||
|
|
||||||
|
# %%
|
||||||
|
S = utils.QualtricsSurvey(RESULTS_FILE, QSF_FILE, figures_dir=cli_args.figures_dir)
|
||||||
|
data_all = S.load_data()
|
||||||
|
|
||||||
|
|
||||||
|
# %% Build filtered dataset based on CLI args
|
||||||
|
|
||||||
|
# CLI args: None means "no filter applied" - filter_data() will skip None filters
|
||||||
|
|
||||||
|
# Build filter values dict dynamically from FILTER_CONFIG
|
||||||
|
_active_filters = {filter_name: getattr(cli_args, filter_name) for filter_name in FILTER_CONFIG}
|
||||||
|
|
||||||
|
_d = S.filter_data(data_all, **_active_filters)
|
||||||
|
|
||||||
|
# Write filter description file if filter-name is provided
|
||||||
|
if cli_args.filter_name and S.fig_save_dir:
    # Each filter combination gets its own sub-directory named after the
    # survey object's filter slug (e.g., "All_Respondents", "Cons-Starter", etc.)
    _filter_slug = S._get_filter_slug()
    _filter_slug_dir = S.fig_save_dir / _filter_slug
    _filter_slug_dir.mkdir(parents=True, exist_ok=True)

    # Build filter description
    _filter_desc_lines = [
        f"Filter: {cli_args.filter_name}",
        "",
        "Applied Filters:",
    ]
    _short_desc_parts = []
    for filter_name, options_attr in FILTER_CONFIG.items():
        all_options = getattr(S, options_attr)
        values = _active_filters[filter_name]
        display_name = filter_name.replace('_', ' ').title()
        # None means no filter applied (same as "All")
        if values is not None and values != all_options:
            # str() each entry so numeric JSON values do not break the join
            _joined_values = ', '.join(str(v) for v in values)
            _short_desc_parts.append(f"{display_name}: {_joined_values}")
            _filter_desc_lines.append(f" {display_name}: {_joined_values}")
        else:
            _filter_desc_lines.append(f" {display_name}: All")

    # Write detailed description INSIDE the filter-slug directory.
    # Sanitize filter name for filename usage (replace / and other chars)
    _safe_filter_name = re.sub(r'[^\w\s-]', '_', cli_args.filter_name)
    _filter_file = _filter_slug_dir / f"{_safe_filter_name}.txt"
    _filter_file.write_text('\n'.join(_filter_desc_lines))

    # Append to the summary index file that sits next to the slug directories
    _summary_file = S.fig_save_dir / "filter_index.txt"
    _short_desc = "; ".join(_short_desc_parts) if _short_desc_parts else "All Respondents"
    _summary_line = f"{_filter_slug} | {cli_args.filter_name} | {_short_desc}\n"

    if _summary_file.exists():
        _existing = _summary_file.read_text()
        # Avoid duplicate entries for the same slug. Compare against the first
        # column of every index row: a plain substring test would treat slug
        # "Cons" as already present once "Cons-Starter" has been written.
        _indexed_slugs = {
            _row.split(" | ", 1)[0]
            for _row in _existing.splitlines()
            if " | " in _row
        }
        if _filter_slug not in _indexed_slugs:
            with _summary_file.open('a') as f:
                f.write(_summary_line)
    else:
        _header = "Filter Index\n" + "=" * 80 + "\n\n"
        _header += "Directory | Filter Name | Description\n"
        _header += "-" * 80 + "\n"
        _summary_file.write_text(_header + _summary_line)
|
||||||
|
|
||||||
|
# Save to logical variable name for further analysis
|
||||||
|
data = _d
|
||||||
|
data.collect()
|
||||||
|
|
||||||
|
|
||||||
|
# %% Determine straight-liner repeat offenders
|
||||||
|
# Extract question groups with renamed columns that check_straight_liners expects.
|
||||||
|
# The raw `data` has QID-based column names; the getter methods rename them to
|
||||||
|
# patterns like SS_Green_Blue__V14__Choice_1, Voice_Scale_1_10__V48, etc.
|
||||||
|
|
||||||
|
ss_or, _ = S.get_ss_orange_red(data)
|
||||||
|
ss_gb, _ = S.get_ss_green_blue(data)
|
||||||
|
vs, _ = S.get_voice_scale_1_10(data)
|
||||||
|
|
||||||
|
# Combine all question groups into one wide LazyFrame (joined on _recordId)
|
||||||
|
all_questions = ss_or.join(ss_gb, on='_recordId').join(vs, on='_recordId')
|
||||||
|
|
||||||
|
# Run straight-liner detection across all question groups
|
||||||
|
# max_score=5 catches all speaking-style straight-lining (1-5 scale)
|
||||||
|
# and voice-scale values ≤5 on the 1-10 scale
|
||||||
|
print("Running straight-liner detection across all question groups...")
|
||||||
|
sl_report, sl_df = check_straight_liners(all_questions, max_score=5)
|
||||||
|
|
||||||
|
# %% Quantify repeat offenders
|
||||||
|
# sl_df has one row per (Record ID, Question Group) that was straight-lined.
|
||||||
|
# Group by Record ID to count how many question groups each person SL'd.
|
||||||
|
|
||||||
|
if sl_df is not None and not sl_df.is_empty():
|
||||||
|
total_respondents = data.select(pl.len()).collect().item()
|
||||||
|
|
||||||
|
# Per-respondent count of straight-lined question groups
|
||||||
|
respondent_sl_counts = (
|
||||||
|
sl_df
|
||||||
|
.group_by("Record ID")
|
||||||
|
.agg(pl.len().alias("sl_count"))
|
||||||
|
.sort("sl_count", descending=True)
|
||||||
|
)
|
||||||
|
|
||||||
|
max_sl = respondent_sl_counts["sl_count"].max()
|
||||||
|
print(f"\nTotal respondents: {total_respondents}")
|
||||||
|
print(f"Respondents who straight-lined at least 1 question group: "
|
||||||
|
f"{respondent_sl_counts.height}")
|
||||||
|
print(f"Maximum question groups straight-lined by one person: {max_sl}")
|
||||||
|
print()
|
||||||
|
|
||||||
|
# Build cumulative distribution: for each threshold N, count respondents
|
||||||
|
# who straight-lined >= N question groups
|
||||||
|
cumulative_rows = []
|
||||||
|
for threshold in range(1, max_sl + 1):
|
||||||
|
count = respondent_sl_counts.filter(
|
||||||
|
pl.col("sl_count") >= threshold
|
||||||
|
).height
|
||||||
|
pct = (count / total_respondents) * 100
|
||||||
|
cumulative_rows.append({
|
||||||
|
"threshold": threshold,
|
||||||
|
"count": count,
|
||||||
|
"pct": pct,
|
||||||
|
})
|
||||||
|
print(
|
||||||
|
f" ≥{threshold} question groups straight-lined: "
|
||||||
|
f"{count} respondents ({pct:.1f}%)"
|
||||||
|
)
|
||||||
|
|
||||||
|
cumulative_df = pl.DataFrame(cumulative_rows)
|
||||||
|
print(f"\n{cumulative_df}")
|
||||||
|
|
||||||
|
# %% Save cumulative data to CSV
|
||||||
|
_filter_slug = S._get_filter_slug()
|
||||||
|
_csv_dir = Path(S.fig_save_dir) / _filter_slug
|
||||||
|
_csv_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
_csv_path = _csv_dir / "straight_liner_repeat_offenders.csv"
|
||||||
|
cumulative_df.write_csv(_csv_path)
|
||||||
|
print(f"Saved cumulative data to {_csv_path}")
|
||||||
|
|
||||||
|
# %% Plot the cumulative distribution
|
||||||
|
S.plot_straight_liner_repeat_offenders(
|
||||||
|
cumulative_df,
|
||||||
|
total_respondents=total_respondents,
|
||||||
|
)
|
||||||
|
|
||||||
|
# %% Per-question straight-lining frequency
|
||||||
|
# Build human-readable question group names from the raw keys
|
||||||
|
def _humanise_question_group(key: str) -> str:
|
||||||
|
"""Convert internal question group key to a readable label.
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
SS_Green_Blue__V14 → Green/Blue – V14
|
||||||
|
SS_Orange_Red__V48 → Orange/Red – V48
|
||||||
|
Voice_Scale_1_10 → Voice Scale (1-10)
|
||||||
|
"""
|
||||||
|
if key.startswith("SS_Green_Blue__"):
|
||||||
|
voice = key.split("__")[1]
|
||||||
|
return f"Green/Blue – {voice}"
|
||||||
|
if key.startswith("SS_Orange_Red__"):
|
||||||
|
voice = key.split("__")[1]
|
||||||
|
return f"Orange/Red – {voice}"
|
||||||
|
if key == "Voice_Scale_1_10":
|
||||||
|
return "Voice Scale (1-10)"
|
||||||
|
# Fallback: replace underscores
|
||||||
|
return key.replace("_", " ")
|
||||||
|
|
||||||
|
per_question_counts = (
|
||||||
|
sl_df
|
||||||
|
.group_by("Question Group")
|
||||||
|
.agg(pl.col("Record ID").n_unique().alias("count"))
|
||||||
|
.sort("count", descending=True)
|
||||||
|
.with_columns(
|
||||||
|
(pl.col("count") / total_respondents * 100).alias("pct")
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
# Add human-readable names
|
||||||
|
per_question_counts = per_question_counts.with_columns(
|
||||||
|
pl.col("Question Group").map_elements(
|
||||||
|
_humanise_question_group, return_dtype=pl.Utf8
|
||||||
|
).alias("question")
|
||||||
|
)
|
||||||
|
|
||||||
|
print("\n--- Per-Question Straight-Lining Frequency ---")
|
||||||
|
print(per_question_counts)
|
||||||
|
|
||||||
|
# Save per-question data to CSV
|
||||||
|
_csv_path_pq = _csv_dir / "straight_liner_per_question.csv"
|
||||||
|
per_question_counts.write_csv(_csv_path_pq)
|
||||||
|
print(f"Saved per-question data to {_csv_path_pq}")
|
||||||
|
|
||||||
|
# Plot
|
||||||
|
S.plot_straight_liner_per_question(
|
||||||
|
per_question_counts,
|
||||||
|
total_respondents=total_respondents,
|
||||||
|
)
|
||||||
|
|
||||||
|
# %% Show the top repeat offenders (respondents with most SL'd groups)
|
||||||
|
print("\n--- Top Repeat Offenders ---")
|
||||||
|
print(respondent_sl_counts.head(20))
|
||||||
|
|
||||||
|
else:
|
||||||
|
print("No straight-liners detected in the dataset.")
|
||||||
388
plots.py
388
plots.py
@@ -1115,6 +1115,7 @@ class QualtricsPlotsMixin:
|
|||||||
title: str = "Speaking Style Trait Analysis",
|
title: str = "Speaking Style Trait Analysis",
|
||||||
height: int | None = None,
|
height: int | None = None,
|
||||||
width: int | str | None = None,
|
width: int | str | None = None,
|
||||||
|
color_gender: bool = False,
|
||||||
) -> alt.Chart:
|
) -> alt.Chart:
|
||||||
"""Plot scores for a single speaking style trait across multiple voices."""
|
"""Plot scores for a single speaking style trait across multiple voices."""
|
||||||
df = self._ensure_dataframe(data)
|
df = self._ensure_dataframe(data)
|
||||||
@@ -1156,36 +1157,71 @@ class QualtricsPlotsMixin:
|
|||||||
else:
|
else:
|
||||||
trait_description = ""
|
trait_description = ""
|
||||||
|
|
||||||
# Horizontal bar chart - use x2 to explicitly start bars at x=1
|
if color_gender:
|
||||||
bars = alt.Chart(stats).mark_bar(color=ColorPalette.PRIMARY).encode(
|
stats['gender'] = stats['Voice'].apply(self._get_voice_gender)
|
||||||
x=alt.X('mean_score:Q', title='Average Score (1-5)', scale=alt.Scale(domain=[1, 5]), axis=alt.Axis(grid=True)),
|
|
||||||
x2=alt.datum(1), # Bars start at x=1 (left edge of domain)
|
bars = alt.Chart(stats).mark_bar().encode(
|
||||||
y=alt.Y('Voice:N', title='Voice', sort='-x', axis=alt.Axis(grid=False)),
|
x=alt.X('mean_score:Q', title='Average Score (1-5)', scale=alt.Scale(domain=[1, 5]), axis=alt.Axis(grid=True)),
|
||||||
tooltip=[
|
x2=alt.datum(1), # Bars start at x=1 (left edge of domain)
|
||||||
alt.Tooltip('Voice:N'),
|
y=alt.Y('Voice:N', title='Voice', sort='-x', axis=alt.Axis(grid=False)),
|
||||||
alt.Tooltip('mean_score:Q', title='Average', format='.2f'),
|
color=alt.Color('gender:N',
|
||||||
alt.Tooltip('count:Q', title='Count')
|
scale=alt.Scale(domain=['Male', 'Female'],
|
||||||
]
|
range=[ColorPalette.GENDER_MALE, ColorPalette.GENDER_FEMALE]),
|
||||||
)
|
legend=alt.Legend(orient='top', direction='horizontal', title='Gender')),
|
||||||
|
tooltip=[
|
||||||
|
alt.Tooltip('Voice:N'),
|
||||||
|
alt.Tooltip('mean_score:Q', title='Average', format='.2f'),
|
||||||
|
alt.Tooltip('count:Q', title='Count'),
|
||||||
|
alt.Tooltip('gender:N', title='Gender')
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
text = alt.Chart(stats).mark_text(
|
||||||
|
align='left',
|
||||||
|
baseline='middle',
|
||||||
|
dx=5,
|
||||||
|
fontSize=12
|
||||||
|
).encode(
|
||||||
|
x='mean_score:Q',
|
||||||
|
y=alt.Y('Voice:N', sort='-x'),
|
||||||
|
text='count:Q',
|
||||||
|
color=alt.condition(
|
||||||
|
alt.datum.gender == 'Female',
|
||||||
|
alt.value(ColorPalette.GENDER_FEMALE),
|
||||||
|
alt.value(ColorPalette.GENDER_MALE)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Horizontal bar chart - use x2 to explicitly start bars at x=1
|
||||||
|
bars = alt.Chart(stats).mark_bar(color=ColorPalette.PRIMARY).encode(
|
||||||
|
x=alt.X('mean_score:Q', title='Average Score (1-5)', scale=alt.Scale(domain=[1, 5]), axis=alt.Axis(grid=True)),
|
||||||
|
x2=alt.datum(1), # Bars start at x=1 (left edge of domain)
|
||||||
|
y=alt.Y('Voice:N', title='Voice', sort='-x', axis=alt.Axis(grid=False)),
|
||||||
|
tooltip=[
|
||||||
|
alt.Tooltip('Voice:N'),
|
||||||
|
alt.Tooltip('mean_score:Q', title='Average', format='.2f'),
|
||||||
|
alt.Tooltip('count:Q', title='Count')
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
# Count text at end of bars (right-aligned inside bar)
|
# Count text at end of bars
|
||||||
text = alt.Chart(stats).mark_text(
|
text = alt.Chart(stats).mark_text(
|
||||||
align='right',
|
align='left',
|
||||||
baseline='middle',
|
baseline='middle',
|
||||||
color='white',
|
color='black',
|
||||||
fontSize=12,
|
fontSize=12,
|
||||||
dx=-5 # Slight padding from bar end
|
dx=5
|
||||||
).encode(
|
).encode(
|
||||||
x='mean_score:Q',
|
x='mean_score:Q',
|
||||||
y=alt.Y('Voice:N', sort='-x'),
|
y=alt.Y('Voice:N', sort='-x'),
|
||||||
text='count:Q'
|
text='count:Q'
|
||||||
)
|
)
|
||||||
|
|
||||||
# Combine layers
|
# Combine layers
|
||||||
chart = (bars + text).properties(
|
chart = (bars + text).properties(
|
||||||
title={
|
title={
|
||||||
"text": self._process_title(title),
|
"text": self._process_title(title),
|
||||||
"subtitle": [trait_description, "(Numbers on bars indicate respondent count)"]
|
"subtitle": [trait_description, "(Numbers near bars indicate respondent count)"]
|
||||||
},
|
},
|
||||||
width=width or 800,
|
width=width or 800,
|
||||||
height=height or getattr(self, 'plot_height', 400)
|
height=height or getattr(self, 'plot_height', 400)
|
||||||
@@ -1194,6 +1230,101 @@ class QualtricsPlotsMixin:
|
|||||||
chart = self._save_plot(chart, title)
|
chart = self._save_plot(chart, title)
|
||||||
return chart
|
return chart
|
||||||
|
|
||||||
|
def plot_speaking_style_trait_scores_comparison(
    self,
    data_all: pl.LazyFrame | pl.DataFrame,
    data_clean: pl.LazyFrame | pl.DataFrame,
    trait_description: str | None = None,
    title: str = "Speaking Style Trait Analysis (Comparison)",
    height: int | None = None,
    width: int | str | None = None,
) -> alt.Chart:
    """Plot scores comparing All Respondents vs Cleaned data (excl. straight-liners) in grouped bars.

    Both inputs are reduced to a per-``Voice`` mean and count of the non-null
    ``score`` column, then drawn as two horizontal bars per voice, offset by
    dataset and coloured by gender and dataset. Voices are ordered by the
    "All Respondents" mean, descending.

    Parameters:
        data_all: Trait rows for all respondents.
        data_clean: Trait rows with straight-lined blocks removed.
        trait_description: Optional first subtitle line.
        title: Chart title; also passed to ``self._save_plot``.
        height: Chart height; falls back to ``self.plot_height`` or 600.
        width: Chart width; falls back to 800.

    Returns:
        The chart returned by ``self._save_plot``, or a plain "No data" text
        chart (not saved) when either input has no non-null scores.
    """
    dataset_order = ['All Respondents', 'Excl. Straight-Liners']

    def get_stats(d, group_label):
        """Return per-voice mean/count as a pandas frame, or None without scores."""
        df = self._ensure_dataframe(d)
        if df.is_empty():
            return None

        scored = df.filter(pl.col("score").is_not_null())
        # A frame holding only null scores would aggregate to zero rows;
        # treat it the same as an empty input.
        if scored.is_empty():
            return None

        return (
            scored
            .group_by("Voice")
            .agg([
                pl.col("score").mean().alias("mean_score"),
                pl.col("score").count().alias("count"),
            ])
            .with_columns(pl.lit(group_label).alias("dataset"))
            .to_pandas()
        )

    stats_all = get_stats(data_all, dataset_order[0])
    stats_clean = get_stats(data_clean, dataset_order[1])

    if stats_all is None or stats_clean is None:
        return alt.Chart(pd.DataFrame({'text': ['No data']})).mark_text().encode(text='text:N')

    # Combine; ignore_index avoids duplicated row labels from the two frames
    stats = pd.concat([stats_all, stats_clean], ignore_index=True)

    # Determine sort order using "All Respondents" data (Desc)
    sort_order = stats_all.sort_values('mean_score', ascending=False)['Voice'].tolist()

    # Add gender and combined category for color. astype(str) mirrors the
    # f-string formatting used previously (missing gender -> "None"/"nan").
    stats['gender'] = stats['Voice'].apply(self._get_voice_gender)
    stats['color_group'] = stats['gender'].astype(str) + ' - ' + stats['dataset'].astype(str)

    # Define Color Scale: full shade = all respondents, lighter = cleaned
    domain = [
        'Male - All Respondents', 'Male - Excl. Straight-Liners',
        'Female - All Respondents', 'Female - Excl. Straight-Liners',
    ]
    range_colors = [
        ColorPalette.GENDER_MALE, ColorPalette.GENDER_MALE_RANK_3,
        ColorPalette.GENDER_FEMALE, ColorPalette.GENDER_FEMALE_RANK_3,
    ]
    color_scale = alt.Scale(domain=domain, range=range_colors)

    # Base chart
    base = alt.Chart(stats).encode(
        y=alt.Y('Voice:N', title='Voice', sort=sort_order, axis=alt.Axis(grid=False)),
    )

    bars = base.mark_bar().encode(
        x=alt.X('mean_score:Q', title='Average Score (1-5)', scale=alt.Scale(domain=[1, 5]), axis=alt.Axis(grid=True)),
        x2=alt.datum(1),  # Bars start at x=1 (left edge of domain)
        yOffset=alt.YOffset('dataset:N', sort=dataset_order),
        color=alt.Color('color_group:N',
                        scale=color_scale,
                        legend=alt.Legend(title='Dataset', orient='top', columns=2)),
        tooltip=[
            alt.Tooltip('Voice:N'),
            alt.Tooltip('dataset:N', title='Dataset'),
            alt.Tooltip('mean_score:Q', title='Average', format='.2f'),
            alt.Tooltip('count:Q', title='Count'),
            alt.Tooltip('gender:N', title='Gender'),
        ],
    )

    # Respondent count just past the end of each bar, in the bar's colour
    text = base.mark_text(align='left', baseline='middle', dx=5, fontSize=9).encode(
        x=alt.X('mean_score:Q'),
        yOffset=alt.YOffset('dataset:N', sort=dataset_order),
        text=alt.Text('count:Q'),
        color=alt.Color('color_group:N', scale=color_scale, legend=None),
    )

    chart = (bars + text).properties(
        title={
            "text": self._process_title(title),
            "subtitle": [trait_description if trait_description else "", "(Lighter shade = Straight-liners removed)"],
        },
        width=width or 800,
        height=height or getattr(self, 'plot_height', 600),
    )

    chart = self._save_plot(chart, title)
    return chart
|
||||||
|
|
||||||
def plot_speaking_style_scale_correlation(
|
def plot_speaking_style_scale_correlation(
|
||||||
self,
|
self,
|
||||||
style_color: str,
|
style_color: str,
|
||||||
@@ -2495,5 +2626,214 @@ class QualtricsPlotsMixin:
|
|||||||
height=height or getattr(self, 'plot_height', 400),
|
height=height or getattr(self, 'plot_height', 400),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
chart = self._save_plot(chart, title)
|
||||||
|
return chart
|
||||||
|
|
||||||
|
def plot_straight_liner_repeat_offenders(
    self,
    cumulative_df: pl.DataFrame | pd.DataFrame,
    title: str = "Straight-Liner Repeat Offenders\n(Cumulative Distribution)",
    height: int | None = None,
    width: int | str | None = None,
    total_respondents: int | None = None,
) -> alt.Chart:
    """Plot the cumulative distribution of straight-liner repeat offenders.

    Shows how many respondents straight-lined at N or more question
    groups, for every observed threshold.

    Parameters:
        cumulative_df: DataFrame with columns ``threshold`` (int),
            ``count`` (int) and ``pct`` (float, 0-100). Each row
            represents "≥ threshold question groups". A polars frame is
            converted to pandas; a pandas frame is copied, so the
            caller's frame is never mutated.
        title: Chart title (passed through ``self._process_title``).
        height: Chart height in pixels. Falls back to
            ``self.plot_height`` when set, otherwise 400.
        width: Chart width in pixels. Defaults to 800.
        total_respondents: If provided, shown in the subtitle for
            context.

    Returns:
        The layered Altair chart (bars + labels) as returned by
        ``self._save_plot``.
    """
    if isinstance(cumulative_df, pl.DataFrame):
        plot_df = cumulative_df.to_pandas()
    else:
        plot_df = cumulative_df.copy()

    # Build readable x-axis labels ("≥1", "≥2", …)
    plot_df["label"] = [f"≥{t}" for t in plot_df["threshold"]]

    # Combined bar label "N (xx.x%)". Built column-wise rather than with a
    # row-wise DataFrame.apply(axis=1): on an empty frame the row-wise
    # apply yields a DataFrame instead of a Series, which cannot be
    # assigned to a single column.
    plot_df["count_label"] = [
        f"{int(count)} ({pct:.1f}%)"
        for count, pct in zip(plot_df["count"], plot_df["pct"])
    ]

    # Explicit sort order so Altair keeps ascending threshold instead of
    # sorting the nominal labels alphabetically.
    sort_order = plot_df.sort_values("threshold")["label"].tolist()

    # --- Bars: respondent count ---
    bars = alt.Chart(plot_df).mark_bar(
        color=ColorPalette.PRIMARY
    ).encode(
        x=alt.X(
            "label:N",
            title="Number of Straight-Lined Question Groups",
            sort=sort_order,
            axis=alt.Axis(grid=False),
        ),
        y=alt.Y(
            "count:Q",
            title="Number of Respondents",
            axis=alt.Axis(grid=True),
        ),
        tooltip=[
            alt.Tooltip("label:N", title="Threshold"),
            alt.Tooltip("count:Q", title="Respondents"),
            alt.Tooltip("pct:Q", title="% of Total", format=".1f"),
        ],
    )

    # --- Text: count + percentage above each bar ---
    # Created only once, after ``count_label`` exists on the frame.
    text = alt.Chart(plot_df).mark_text(
        dy=-10, color="black", fontSize=11
    ).encode(
        x=alt.X("label:N", sort=sort_order),
        y=alt.Y("count:Q"),
        text=alt.Text("count_label:N"),
    )

    # --- Subtitle ---
    subtitle_parts = []
    if total_respondents is not None:
        subtitle_parts.append(
            f"Total respondents: {total_respondents}"
        )
    subtitle_parts.append(
        "Each bar shows how many respondents straight-lined "
        "at least that many question groups"
    )
    subtitle = " | ".join(subtitle_parts)

    title_config = {
        "text": self._process_title(title),
        "subtitle": subtitle,
        "subtitleColor": "gray",
        "subtitleFontSize": 10,
        "anchor": "start",
    }

    chart = alt.layer(bars, text).properties(
        title=title_config,
        width=width or 800,
        height=height or getattr(self, "plot_height", 400),
    )

    chart = self._save_plot(chart, title)
    return chart
|
||||||
|
|
||||||
|
def plot_straight_liner_per_question(
    self,
    per_question_df: pl.DataFrame | pd.DataFrame,
    title: str = "Straight-Lining Frequency per Question Group",
    height: int | None = None,
    width: int | str | None = None,
    total_respondents: int | None = None,
) -> alt.Chart:
    """Plot how often each question group is straight-lined.

    Draws one horizontal bar per question group, ordered with the
    largest count at the top, and labels each bar with "N (xx.x%)".

    Parameters:
        per_question_df: DataFrame with columns ``question`` (str,
            human-readable name), ``count`` (int) and ``pct``
            (float, 0-100). Input order does not matter; rows are
            re-sorted by ``count`` here. A polars frame is converted to
            pandas; a pandas frame is copied, so the caller's frame is
            never mutated.
        title: Chart title (passed through ``self._process_title``).
        height: Chart height in pixels. When omitted, the height scales
            with the number of rows (22 px each) with a 400 px minimum.
        width: Chart width in pixels. Defaults to 700.
        total_respondents: Shown in subtitle for context.

    Returns:
        The layered Altair chart (bars + labels) as returned by
        ``self._save_plot``.
    """
    if isinstance(per_question_df, pl.DataFrame):
        plot_df = per_question_df.to_pandas()
    else:
        plot_df = per_question_df.copy()

    # Sort order: largest count at top. Altair y-axis nominal sort places
    # the first list element at the top, so descending order is correct.
    sort_order = plot_df.sort_values("count", ascending=False)["question"].tolist()

    # Combined label "N (xx.x%)". Built column-wise rather than with a
    # row-wise DataFrame.apply(axis=1): on an empty frame the row-wise
    # apply yields a DataFrame instead of a Series, which cannot be
    # assigned to a single column.
    plot_df["count_label"] = [
        f"{int(count)} ({pct:.1f}%)"
        for count, pct in zip(plot_df["count"], plot_df["pct"])
    ]

    # --- Horizontal Bars ---
    bars = alt.Chart(plot_df).mark_bar(
        color=ColorPalette.PRIMARY,
    ).encode(
        y=alt.Y(
            "question:N",
            title=None,
            sort=sort_order,
            axis=alt.Axis(grid=False, labelLimit=250, labelAngle=0),
        ),
        x=alt.X(
            "count:Q",
            title="Number of Straight-Liners",
            axis=alt.Axis(grid=True),
        ),
        tooltip=[
            alt.Tooltip("question:N", title="Question"),
            alt.Tooltip("count:Q", title="Straight-Liners"),
            alt.Tooltip("pct:Q", title="% of Respondents", format=".1f"),
        ],
    )

    # --- Text labels to the right of bars ---
    text = alt.Chart(plot_df).mark_text(
        align="left", dx=4, color="black", fontSize=10,
    ).encode(
        y=alt.Y("question:N", sort=sort_order),
        x=alt.X("count:Q"),
        text=alt.Text("count_label:N"),
    )

    # --- Subtitle ---
    subtitle_parts = []
    if total_respondents is not None:
        subtitle_parts.append(f"Total respondents: {total_respondents}")
    subtitle_parts.append(
        "Count and share of respondents who straight-lined each question group"
    )
    subtitle = " | ".join(subtitle_parts)

    title_config = {
        "text": self._process_title(title),
        "subtitle": subtitle,
        "subtitleColor": "gray",
        "subtitleFontSize": 10,
        "anchor": "start",
    }

    # Scale height with number of questions for readable bar spacing
    n_questions = len(plot_df)
    auto_height = max(400, n_questions * 22)

    chart = alt.layer(bars, text).properties(
        title=title_config,
        width=width or 700,
        height=height or auto_height,
    )

    chart = self._save_plot(chart, title)
    return chart
|
||||||
Reference in New Issue
Block a user