Started on looping through all interviews

This commit is contained in:
mtorsij
2025-12-01 22:23:42 +01:00
parent 15d706b8c5
commit ae9563eba3

View File

@@ -213,7 +213,7 @@ def _(mo):
@app.cell
def _(themes_input):
    # Parse the themes text box into a clean Python list: one theme per line,
    # surrounding whitespace removed, blank lines dropped.
    # The widget value can be empty/None, so fall back to an empty string
    # once instead of testing the same `.value` twice.
    raw_lines = (themes_input.value or "").splitlines()
    theme_list = [t.strip() for t in raw_lines if t.strip()]
    return (theme_list,)
@@ -237,7 +237,7 @@ def _(Path, mo):
label_button = mo.ui.run_button(label="Run Theme Labelling for This Interview") label_button = mo.ui.run_button(label="Run Theme Labelling for This Interview")
label_button label_button
return OUTPUT_DIR, label_button return
@app.cell @app.cell
@@ -247,9 +247,9 @@ def _(
Path, Path,
client, client,
file_dropdown, file_dropdown,
theme_label_button,
labeled_transcript, labeled_transcript,
mo, mo,
theme_label_button,
theme_list, theme_list,
): ):
import json import json
@@ -360,7 +360,140 @@ def _(
{status} {status}
""") """)
return theme_label_results return
@app.cell
def _(Path, mo):
    # Step 3c: Load all labeled transcripts (assumed precomputed).
    # The directory is created when missing, so a fresh checkout yields an
    # empty file list rather than an error.
    LABELED_DIR = Path("data/labeled_transcripts")
    LABELED_DIR.mkdir(parents=True, exist_ok=True)
    # Sorted so downstream cells see the files in a stable order.
    labeled_files = sorted(LABELED_DIR.glob("*.json"))

    # Only the final expression of a cell is rendered. The summary used to be
    # an intermediate expression and was silently discarded, so it is stacked
    # together with the file list into a single output.
    # Underscore-prefixed names stay local to this cell.
    _summary = mo.md(f"""### Step 3c: Use Pre-Labeled Transcripts
    Found **{len(labeled_files)}** labeled transcript files in `{LABELED_DIR}`.
    These will be used to aggregate themes across all interviews.
    """)
    mo.vstack([_summary, labeled_files])
    return (labeled_files,)
@app.cell
def _(labeled_files):
    # Load every labeled-transcript JSON file into a list of normalized
    # records. All temporaries live inside a cell-local helper (underscore
    # prefix) so this cell defines exactly one notebook-level name,
    # `all_labeled_records`, and cannot clash with names such as `json` or
    # `interview_id` that other cells define.
    def _load_labeled_record(path):
        """Read one labeled-transcript file into a normalized record.

        Args:
            path: A pathlib.Path pointing at a JSON file.

        Returns:
            A dict with the keys ``interview_id``, ``theme``, ``present``,
            ``relevance`` and ``sections``, or ``None`` when the file cannot
            be read, is not valid JSON, or does not contain a JSON object.
        """
        # Function-scope import: keeps `json` out of the notebook namespace.
        import json

        try:
            data = json.loads(path.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # Best-effort load: unreadable, badly encoded or malformed files
            # are skipped (JSONDecodeError and UnicodeDecodeError are both
            # ValueError subclasses).
            return None
        if not isinstance(data, dict):
            # A top-level list/number/string has no fields to read.
            return None

        try:
            relevance = float(data.get("relevance", 0.0))
        except (TypeError, ValueError):
            relevance = 0.0

        sections = data.get("sections") or []
        if not isinstance(sections, list):
            sections = []

        # Fall back to the file-name prefix before "__" when the payload has
        # no interview id. Ids and themes are coerced to str so they can be
        # sorted and used as set members regardless of their JSON type.
        interview_id = data.get("interview_id") or path.stem.split("__", 1)[0]
        return {
            "interview_id": str(interview_id),
            "theme": str(data.get("theme") or ""),
            "present": bool(data.get("present", False)),
            "relevance": relevance,
            # Keep only object-shaped sections; consumers call `.get` on them.
            "sections": [s for s in sections if isinstance(s, dict)],
        }

    all_labeled_records = [
        record
        for record in map(_load_labeled_record, labeled_files)
        if record is not None
    ]
    return (all_labeled_records,)
@app.cell
def _(all_labeled_records, mo):
    # Derive the full theme and interview sets from the loaded records.
    # Records with an empty theme are ignored for the theme list but still
    # count towards the interview list.
    all_themes = sorted({r["theme"] for r in all_labeled_records if r["theme"]})
    all_interviews = sorted({r["interview_id"] for r in all_labeled_records})
    theme_selector = mo.ui.dropdown(
        options={t: t for t in all_themes},
        label="Select theme to explore across all interviews",
    )
    # Only the final expression of a cell is rendered. The heading used to be
    # a discarded intermediate expression, so it is stacked with the dropdown.
    mo.vstack(
        [
            mo.md("### Step 3d: Explore Themes Across All Labeled Transcripts"),
            theme_selector,
        ]
    )
    return all_interviews, theme_selector
@app.cell
def _(all_interviews, all_labeled_records, mo, theme_selector):
    # Summarize the selected theme across all labeled transcripts and list
    # every section tagged with it. Everything is computed in cell-local
    # (underscore-prefixed) names so the cell adds no notebook-level
    # definitions that could clash with other cells.
    def _build_theme_view(theme):
        """Aggregate all records of one theme.

        Args:
            theme: The theme name to filter `all_labeled_records` by.

        Returns:
            A ``(summary, rows)`` tuple: ``summary`` holds the counts, share
            and mean relevance for the theme; ``rows`` holds one table row
            per section found in the matching records.
        """
        # Function-scope import: keeps `statistics` out of the notebook
        # namespace.
        import statistics

        records = [r for r in all_labeled_records if r["theme"] == theme]
        # Count distinct interviews rather than records, so duplicate files
        # for the same interview cannot push the share above 1.0.
        interviews_with_theme = {
            r["interview_id"] for r in records if r["present"]
        }
        relevances = [r["relevance"] for r in records if r["present"]]
        summary = {
            "theme": theme,
            "num_interviews": len(all_interviews),
            "num_interviews_with_theme": len(interviews_with_theme),
            "share_of_interviews_with_theme": (
                len(interviews_with_theme) / len(all_interviews)
                if all_interviews
                else 0.0
            ),
            "avg_relevance_if_present": (
                statistics.mean(relevances) if relevances else 0.0
            ),
        }
        # The relevance shown per section is the record-level relevance.
        rows = [
            {
                "Interview": r["interview_id"],
                "Speaker": s.get("speaker", ""),
                "Relevance": f"{r['relevance']:.2f}",
                "Section": s.get("section_text", ""),
            }
            for r in records
            for s in r["sections"]
            if isinstance(s, dict)
        ]
        return summary, rows

    _selected = theme_selector.value
    # With no theme selected the overview falls back to zeros and no rows.
    _summary, _rows = _build_theme_view(_selected) if _selected else ({}, [])

    _overview = mo.md(
        f"""#### Theme Overview: `{_selected or "None selected"}`
    - Total interviews: **{len(all_interviews)}**
    - Interviews where theme is present: **{_summary.get("num_interviews_with_theme", 0)}**
    - Share of interviews with theme: **{_summary.get("share_of_interviews_with_theme", 0.0):.2f}**
    - Avg. relevance (when present): **{_summary.get("avg_relevance_if_present", 0.0):.2f}**
    """
    )
    _details = (
        mo.ui.table(_rows)
        if _rows
        else mo.md("_No sections for this theme yet._")
    )
    # Only the final expression of a cell is rendered. Ending on an
    # `if`/`else` statement produced no output at all, so both parts are
    # combined into one trailing expression.
    mo.vstack([_overview, _details])
    return
@app.cell @app.cell