straight-liner plot analysis

This commit is contained in:
2026-02-09 17:26:45 +01:00
parent 92c6fc03ab
commit 6c16993cb3
4 changed files with 897 additions and 24 deletions

388
plots.py
View File

@@ -1115,6 +1115,7 @@ class QualtricsPlotsMixin:
title: str = "Speaking Style Trait Analysis",
height: int | None = None,
width: int | str | None = None,
color_gender: bool = False,
) -> alt.Chart:
"""Plot scores for a single speaking style trait across multiple voices."""
df = self._ensure_dataframe(data)
@@ -1156,36 +1157,71 @@ class QualtricsPlotsMixin:
else:
trait_description = ""
# Horizontal bar chart - use x2 to explicitly start bars at x=1
bars = alt.Chart(stats).mark_bar(color=ColorPalette.PRIMARY).encode(
x=alt.X('mean_score:Q', title='Average Score (1-5)', scale=alt.Scale(domain=[1, 5]), axis=alt.Axis(grid=True)),
x2=alt.datum(1), # Bars start at x=1 (left edge of domain)
y=alt.Y('Voice:N', title='Voice', sort='-x', axis=alt.Axis(grid=False)),
tooltip=[
alt.Tooltip('Voice:N'),
alt.Tooltip('mean_score:Q', title='Average', format='.2f'),
alt.Tooltip('count:Q', title='Count')
]
)
if color_gender:
stats['gender'] = stats['Voice'].apply(self._get_voice_gender)
bars = alt.Chart(stats).mark_bar().encode(
x=alt.X('mean_score:Q', title='Average Score (1-5)', scale=alt.Scale(domain=[1, 5]), axis=alt.Axis(grid=True)),
x2=alt.datum(1), # Bars start at x=1 (left edge of domain)
y=alt.Y('Voice:N', title='Voice', sort='-x', axis=alt.Axis(grid=False)),
color=alt.Color('gender:N',
scale=alt.Scale(domain=['Male', 'Female'],
range=[ColorPalette.GENDER_MALE, ColorPalette.GENDER_FEMALE]),
legend=alt.Legend(orient='top', direction='horizontal', title='Gender')),
tooltip=[
alt.Tooltip('Voice:N'),
alt.Tooltip('mean_score:Q', title='Average', format='.2f'),
alt.Tooltip('count:Q', title='Count'),
alt.Tooltip('gender:N', title='Gender')
]
)
text = alt.Chart(stats).mark_text(
align='left',
baseline='middle',
dx=5,
fontSize=12
).encode(
x='mean_score:Q',
y=alt.Y('Voice:N', sort='-x'),
text='count:Q',
color=alt.condition(
alt.datum.gender == 'Female',
alt.value(ColorPalette.GENDER_FEMALE),
alt.value(ColorPalette.GENDER_MALE)
)
)
else:
# Horizontal bar chart - use x2 to explicitly start bars at x=1
bars = alt.Chart(stats).mark_bar(color=ColorPalette.PRIMARY).encode(
x=alt.X('mean_score:Q', title='Average Score (1-5)', scale=alt.Scale(domain=[1, 5]), axis=alt.Axis(grid=True)),
x2=alt.datum(1), # Bars start at x=1 (left edge of domain)
y=alt.Y('Voice:N', title='Voice', sort='-x', axis=alt.Axis(grid=False)),
tooltip=[
alt.Tooltip('Voice:N'),
alt.Tooltip('mean_score:Q', title='Average', format='.2f'),
alt.Tooltip('count:Q', title='Count')
]
)
# Count text at end of bars (right-aligned inside bar)
text = alt.Chart(stats).mark_text(
align='right',
baseline='middle',
color='white',
fontSize=12,
dx=-5 # Slight padding from bar end
).encode(
x='mean_score:Q',
y=alt.Y('Voice:N', sort='-x'),
text='count:Q'
)
# Count text at end of bars
text = alt.Chart(stats).mark_text(
align='left',
baseline='middle',
color='black',
fontSize=12,
dx=5
).encode(
x='mean_score:Q',
y=alt.Y('Voice:N', sort='-x'),
text='count:Q'
)
# Combine layers
chart = (bars + text).properties(
title={
"text": self._process_title(title),
"subtitle": [trait_description, "(Numbers on bars indicate respondent count)"]
"subtitle": [trait_description, "(Numbers near bars indicate respondent count)"]
},
width=width or 800,
height=height or getattr(self, 'plot_height', 400)
@@ -1194,6 +1230,101 @@ class QualtricsPlotsMixin:
chart = self._save_plot(chart, title)
return chart
def plot_speaking_style_trait_scores_comparison(
    self,
    data_all: pl.LazyFrame | pl.DataFrame,
    data_clean: pl.LazyFrame | pl.DataFrame,
    trait_description: str | None = None,
    title: str = "Speaking Style Trait Analysis (Comparison)",
    height: int | None = None,
    width: int | str | None = None,
) -> alt.Chart:
    """Plot per-voice mean scores for two datasets as grouped horizontal bars.

    For each voice, one bar is drawn for ``data_all`` (labelled
    "All Respondents") and one for ``data_clean`` (labelled
    "Excl. Straight-Liners"). Voices are ordered by descending mean score
    of ``data_all``. The respondent count is printed next to each bar.

    Parameters:
        data_all: Frame with at least ``Voice`` and ``score`` columns.
        data_clean: Same layout as ``data_all``.
        trait_description: Optional first subtitle line; an empty string
            is used when omitted.
        title: Chart title (passed through ``self._process_title``).
        height: Chart height in pixels; falls back to ``self.plot_height``
            and then 600.
        width: Chart width; falls back to 800.

    Returns:
        The layered chart returned by ``self._save_plot``, or a plain
        "No data" text chart when either dataset is empty or has no
        non-null scores.
    """
    label_all = "All Respondents"
    label_clean = "Excl. Straight-Liners"
    dataset_order = [label_all, label_clean]

    def get_stats(d, group_label):
        """Aggregate mean score and count per voice; None if nothing to plot."""
        df = self._ensure_dataframe(d)
        if df.is_empty():
            return None
        aggregated = (
            df.filter(pl.col("score").is_not_null())
            .group_by("Voice")
            .agg([
                pl.col("score").mean().alias("mean_score"),
                pl.col("score").count().alias("count"),
            ])
            .with_columns(pl.lit(group_label).alias("dataset"))
        )
        # Rows may exist while every score is null; treat that like an
        # empty input instead of handing an empty frame to the plotting code.
        if aggregated.is_empty():
            return None
        return aggregated.to_pandas()

    stats_all = get_stats(data_all, label_all)
    stats_clean = get_stats(data_clean, label_clean)

    if stats_all is None or stats_clean is None:
        return alt.Chart(pd.DataFrame({'text': ['No data']})).mark_text().encode(text='text:N')

    stats = pd.concat([stats_all, stats_clean], ignore_index=True)

    # Voice order is fixed by the "All Respondents" ranking (descending) so
    # both bars of a voice stay together regardless of the cleaned scores.
    sort_order = stats_all.sort_values('mean_score', ascending=False)['Voice'].tolist()

    # Gender plus dataset form one combined category that drives the colour.
    stats['gender'] = stats['Voice'].apply(self._get_voice_gender)
    stats['color_group'] = [
        f"{gender} - {dataset}"
        for gender, dataset in zip(stats['gender'], stats['dataset'])
    ]

    # NOTE(review): the colour domain only lists 'Male'/'Female' groups;
    # confirm _get_voice_gender never returns another value.
    domain = [
        'Male - All Respondents', 'Male - Excl. Straight-Liners',
        'Female - All Respondents', 'Female - Excl. Straight-Liners',
    ]
    range_colors = [
        ColorPalette.GENDER_MALE, ColorPalette.GENDER_MALE_RANK_3,
        ColorPalette.GENDER_FEMALE, ColorPalette.GENDER_FEMALE_RANK_3,
    ]
    color_scale = alt.Scale(domain=domain, range=range_colors)

    base = alt.Chart(stats).encode(
        y=alt.Y('Voice:N', title='Voice', sort=sort_order, axis=alt.Axis(grid=False)),
    )

    bars = base.mark_bar().encode(
        x=alt.X(
            'mean_score:Q',
            title='Average Score (1-5)',
            scale=alt.Scale(domain=[1, 5]),
            axis=alt.Axis(grid=True),
        ),
        x2=alt.datum(1),  # bars start at the left edge of the 1-5 domain
        yOffset=alt.YOffset('dataset:N', sort=dataset_order),
        color=alt.Color(
            'color_group:N',
            scale=color_scale,
            legend=alt.Legend(title='Dataset', orient='top', columns=2),
        ),
        tooltip=[
            alt.Tooltip('Voice:N'),
            alt.Tooltip('dataset:N', title='Dataset'),
            alt.Tooltip('mean_score:Q', title='Average', format='.2f'),
            alt.Tooltip('count:Q', title='Count'),
            alt.Tooltip('gender:N', title='Gender'),
        ],
    )

    # Respondent count just past the end of each bar, in the bar's colour.
    text = base.mark_text(align='left', baseline='middle', dx=5, fontSize=9).encode(
        x=alt.X('mean_score:Q'),
        yOffset=alt.YOffset('dataset:N', sort=dataset_order),
        text=alt.Text('count:Q'),
        color=alt.Color('color_group:N', scale=color_scale, legend=None),
    )

    chart = (bars + text).properties(
        title={
            "text": self._process_title(title),
            "subtitle": [
                trait_description if trait_description else "",
                "(Lighter shade = Straight-liners removed)",
            ],
        },
        width=width or 800,
        height=height or getattr(self, 'plot_height', 600),
    )

    chart = self._save_plot(chart, title)
    return chart
def plot_speaking_style_scale_correlation(
self,
style_color: str,
@@ -2495,5 +2626,214 @@ class QualtricsPlotsMixin:
height=height or getattr(self, 'plot_height', 400),
)
chart = self._save_plot(chart, title)
return chart
def plot_straight_liner_repeat_offenders(
    self,
    cumulative_df: pl.DataFrame | pd.DataFrame,
    title: str = "Straight-Liner Repeat Offenders\n(Cumulative Distribution)",
    height: int | None = None,
    width: int | str | None = None,
    total_respondents: int | None = None,
) -> alt.Chart:
    """Plot the cumulative distribution of straight-liner repeat offenders.

    Shows how many respondents straight-lined at N or more question
    groups, for every observed threshold.

    Parameters:
        cumulative_df: DataFrame with columns ``threshold`` (int),
            ``count`` (int) and ``pct`` (float, 0-100). Each row
            represents "≥ threshold question groups".
        title: Chart title.
        height: Chart height in pixels; falls back to ``self.plot_height``
            and then 400.
        width: Chart width in pixels; falls back to 800.
        total_respondents: If provided, shown in the subtitle for
            context.

    Returns:
        The layered Altair chart as returned by ``self._save_plot``.
    """
    if isinstance(cumulative_df, pl.DataFrame):
        plot_df = cumulative_df.to_pandas()
    else:
        # Copy so the helper columns added below never leak into the caller's frame.
        plot_df = cumulative_df.copy()

    # X-axis labels are the bare threshold values ("1", "2", ...); the
    # "at least" meaning is conveyed by the subtitle.
    plot_df["label"] = [f"{t}" for t in plot_df["threshold"]]

    # Combined "N (xx.x%)" label shown above each bar. Built column-wise so
    # an empty frame simply yields an empty column.
    plot_df["count_label"] = [
        f"{int(count)} ({pct:.1f}%)"
        for count, pct in zip(plot_df["count"], plot_df["pct"])
    ]

    # Explicit sort order so Altair keeps ascending threshold instead of
    # sorting the string labels alphabetically.
    sort_order = plot_df.sort_values("threshold")["label"].tolist()

    # --- Bars: respondent count ---
    bars = alt.Chart(plot_df).mark_bar(
        color=ColorPalette.PRIMARY
    ).encode(
        x=alt.X(
            "label:N",
            title="Number of Straight-Lined Question Groups",
            sort=sort_order,
            axis=alt.Axis(grid=False),
        ),
        y=alt.Y(
            "count:Q",
            title="Number of Respondents",
            axis=alt.Axis(grid=True),
        ),
        tooltip=[
            alt.Tooltip("label:N", title="Threshold"),
            alt.Tooltip("count:Q", title="Respondents"),
            alt.Tooltip("pct:Q", title="% of Total", format=".1f"),
        ],
    )

    # --- Text: count + percentage above each bar ---
    text = alt.Chart(plot_df).mark_text(
        dy=-10, color="black", fontSize=11
    ).encode(
        x=alt.X("label:N", sort=sort_order),
        y=alt.Y("count:Q"),
        text=alt.Text("count_label:N"),
    )

    # --- Subtitle ---
    subtitle_parts = []
    if total_respondents is not None:
        subtitle_parts.append(f"Total respondents: {total_respondents}")
    subtitle_parts.append(
        "Each bar shows how many respondents straight-lined "
        "at least that many question groups"
    )
    subtitle = " | ".join(subtitle_parts)

    title_config = {
        "text": self._process_title(title),
        "subtitle": subtitle,
        "subtitleColor": "gray",
        "subtitleFontSize": 10,
        "anchor": "start",
    }

    chart = alt.layer(bars, text).properties(
        title=title_config,
        width=width or 800,
        height=height or getattr(self, "plot_height", 400),
    )

    chart = self._save_plot(chart, title)
    return chart
def plot_straight_liner_per_question(
    self,
    per_question_df: pl.DataFrame | pd.DataFrame,
    title: str = "Straight-Lining Frequency per Question Group",
    height: int | None = None,
    width: int | str | None = None,
    total_respondents: int | None = None,
) -> alt.Chart:
    """Plot how often each question group is straight-lined.

    Draws one horizontal bar per question group, ordered by descending
    count, with a "N (xx.x%)" label to the right of each bar.

    Parameters:
        per_question_df: DataFrame with columns ``question`` (str,
            human-readable name), ``count`` (int) and ``pct``
            (float, 0-100). The rows are re-sorted by ``count`` here,
            so the input order does not matter.
        title: Chart title.
        height: Chart height in pixels; defaults to 22 px per question
            with a minimum of 400.
        width: Chart width in pixels; falls back to 700.
        total_respondents: Shown in subtitle for context.

    Returns:
        The layered Altair chart as returned by ``self._save_plot``.
    """
    if isinstance(per_question_df, pl.DataFrame):
        plot_df = per_question_df.to_pandas()
    else:
        # Copy so the helper column added below never leaks into the caller's frame.
        plot_df = per_question_df.copy()

    # Sort order: largest count at top. Altair y-axis nominal sort places
    # the first list element at the top, so descending order is correct.
    sort_order = plot_df.sort_values("count", ascending=False)["question"].tolist()

    # Combined label "N (xx.x%)". Built column-wise so an empty frame
    # simply yields an empty column.
    plot_df["count_label"] = [
        f"{int(count)} ({pct:.1f}%)"
        for count, pct in zip(plot_df["count"], plot_df["pct"])
    ]

    # --- Horizontal Bars ---
    bars = alt.Chart(plot_df).mark_bar(
        color=ColorPalette.PRIMARY,
    ).encode(
        y=alt.Y(
            "question:N",
            title=None,
            sort=sort_order,
            axis=alt.Axis(grid=False, labelLimit=250, labelAngle=0),
        ),
        x=alt.X(
            "count:Q",
            title="Number of Straight-Liners",
            axis=alt.Axis(grid=True),
        ),
        tooltip=[
            alt.Tooltip("question:N", title="Question"),
            alt.Tooltip("count:Q", title="Straight-Liners"),
            alt.Tooltip("pct:Q", title="% of Respondents", format=".1f"),
        ],
    )

    # --- Text labels to the right of bars ---
    text = alt.Chart(plot_df).mark_text(
        align="left", dx=4, color="black", fontSize=10,
    ).encode(
        y=alt.Y("question:N", sort=sort_order),
        x=alt.X("count:Q"),
        text=alt.Text("count_label:N"),
    )

    # --- Subtitle ---
    subtitle_parts = []
    if total_respondents is not None:
        subtitle_parts.append(f"Total respondents: {total_respondents}")
    subtitle_parts.append(
        "Count and share of respondents who straight-lined each question group"
    )
    subtitle = " | ".join(subtitle_parts)

    title_config = {
        "text": self._process_title(title),
        "subtitle": subtitle,
        "subtitleColor": "gray",
        "subtitleFontSize": 10,
        "anchor": "start",
    }

    # Scale height with number of questions for readable bar spacing
    n_questions = len(plot_df)
    auto_height = max(400, n_questions * 22)

    chart = alt.layer(bars, text).properties(
        title=title_config,
        width=width or 700,
        height=height or auto_height,
    )

    chart = self._save_plot(chart, title)
    return chart