Started on looping through all interviews
This commit is contained in:
@@ -213,7 +213,7 @@ def _(mo):
|
||||
@app.cell
def _(themes_input):
    # Parse themes into a clean Python list: one entry per non-blank input
    # line, with surrounding whitespace stripped.
    #
    # Only `themes_input.value` is tested for emptiness; the widget object
    # itself is never used in a boolean context. A missing or empty value
    # yields an empty list.
    raw_lines = themes_input.value.splitlines() if themes_input.value else []
    theme_list = [t.strip() for t in raw_lines if t.strip()]
    return (theme_list,)
|
||||
|
||||
@@ -237,7 +237,7 @@ def _(Path, mo):
|
||||
|
||||
label_button = mo.ui.run_button(label="Run Theme Labelling for This Interview")
|
||||
label_button
|
||||
return OUTPUT_DIR, label_button
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
@@ -247,9 +247,9 @@ def _(
|
||||
Path,
|
||||
client,
|
||||
file_dropdown,
|
||||
theme_label_button,
|
||||
labeled_transcript,
|
||||
mo,
|
||||
theme_label_button,
|
||||
theme_list,
|
||||
):
|
||||
import json
|
||||
@@ -360,7 +360,140 @@ def _(
|
||||
|
||||
{status}
|
||||
""")
|
||||
return theme_label_results
|
||||
return
|
||||
|
||||
|
||||
@app.cell
def _(Path, mo):
    # Step 3c: Load all labeled transcripts (assumed precomputed)
    #
    # Defines `labeled_files`: the sorted list of `*.json` paths found directly
    # inside the labeled-transcripts directory.
    LABELED_DIR = Path("data/labeled_transcripts")
    # Create the directory (and any missing parents) if it does not exist yet.
    LABELED_DIR.mkdir(parents=True, exist_ok=True)

    labeled_files = sorted(LABELED_DIR.glob("*.json"))

    _summary = mo.md(f"""### Step 3c: Use Pre-Labeled Transcripts

    Found **{len(labeled_files)}** labeled transcript files in `{LABELED_DIR}`.
    These will be used to aggregate themes across all interviews.
    """)

    # Only the final expression of a cell is rendered, so the summary and the
    # file list are stacked into one output rather than leaving the markdown
    # as a discarded statement.
    mo.vstack([_summary, labeled_files])
    return (labeled_files,)
|
||||
|
||||
|
||||
@app.cell
def _(labeled_files):
    # Load every labeled-transcript JSON file into a flat list of records.
    #
    # Each record is a dict with the keys "interview_id", "theme", "present",
    # "relevance" and "sections". Files that cannot be read or parsed, or
    # whose top-level JSON value is not an object, are skipped.
    #
    # Cell-level temporaries (including the json import) are underscore-
    # prefixed so they stay local to this cell: marimo treats every other
    # top-level name as a notebook global and rejects names defined in more
    # than one cell.
    import json as _json

    all_labeled_records = []
    for _path in labeled_files:
        try:
            _data = _json.loads(_path.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # Best effort: skip unreadable files. ValueError covers both
            # invalid JSON and undecodable bytes.
            continue
        if not isinstance(_data, dict):
            # Valid JSON, but not an object: there are no fields to read.
            continue

        try:
            _relevance = float(_data.get("relevance", 0.0))
        except (TypeError, ValueError):
            # Null or non-numeric relevance falls back to 0.0.
            _relevance = 0.0

        all_labeled_records.append(
            {
                # When the payload carries no id, use the part of the file
                # stem before the first "__".
                "interview_id": (
                    _data.get("interview_id") or _path.stem.split("__", 1)[0]
                ),
                "theme": _data.get("theme", ""),
                "present": bool(_data.get("present", False)),
                "relevance": _relevance,
                "sections": _data.get("sections") or [],
            }
        )
    return (all_labeled_records,)
|
||||
|
||||
|
||||
@app.cell
def _(all_labeled_records, mo):
    # Derive full theme and interview sets
    #
    # `all_themes` holds the distinct non-empty theme names and
    # `all_interviews` the distinct interview ids, both sorted.
    all_themes = sorted({r["theme"] for r in all_labeled_records if r["theme"]})
    all_interviews = sorted({r["interview_id"] for r in all_labeled_records})

    theme_selector = mo.ui.dropdown(
        options={t: t for t in all_themes},
        label="Select theme to explore across all interviews",
    )

    # Only the final expression of a cell is rendered, so the heading and the
    # dropdown are stacked into one output rather than leaving the heading as
    # a discarded statement.
    mo.vstack(
        [
            mo.md("### Step 3d: Explore Themes Across All Labeled Transcripts"),
            theme_selector,
        ]
    )
    return all_interviews, theme_selector
|
||||
|
||||
|
||||
@app.cell
def _(all_interviews, all_labeled_records, mo, theme_selector):
    # Summarise the theme picked in `theme_selector` across all labeled
    # records and list every section recorded for it.
    #
    # Displays a markdown overview followed by either a table of sections or
    # a placeholder message. With no theme selected, the overview shows zeros.
    import statistics

    selected_theme = theme_selector.value
    theme_summary = {}
    theme_sections = []

    if selected_theme:
        theme_records = [
            r for r in all_labeled_records if r["theme"] == selected_theme
        ]

        # Count distinct interviews rather than records, so several records
        # for the same interview cannot push the share above 1.
        _interviews_with_theme = {
            r["interview_id"] for r in theme_records if r["present"]
        }
        relevances = [r["relevance"] for r in theme_records if r["present"]]

        theme_summary = {
            "theme": selected_theme,
            "num_interviews": len(all_interviews),
            "num_interviews_with_theme": len(_interviews_with_theme),
            "share_of_interviews_with_theme": (
                len(_interviews_with_theme) / len(all_interviews)
                if all_interviews
                else 0.0
            ),
            "avg_relevance_if_present": (
                statistics.mean(relevances) if relevances else 0.0
            ),
        }

        # One row per section; each row carries its record's relevance.
        # A comprehension keeps the loop variables out of the notebook's
        # global namespace.
        theme_sections = [
            {
                "interview_id": r["interview_id"],
                "speaker": s.get("speaker", ""),
                "section_text": s.get("section_text", ""),
                "relevance": r["relevance"],
            }
            for r in theme_records
            for s in r["sections"]
        ]

    _overview = mo.md(
        f"""#### Theme Overview: `{selected_theme or "None selected"}`

        - Total interviews: **{len(all_interviews)}**
        - Interviews where theme is present: **{theme_summary.get("num_interviews_with_theme", 0)}**
        - Share of interviews with theme: **{theme_summary.get("share_of_interviews_with_theme", 0.0):.2f}**
        - Avg. relevance (when present): **{theme_summary.get("avg_relevance_if_present", 0.0):.2f}**
        """
    )

    if theme_sections:
        table_rows = [
            {
                "Interview": s["interview_id"],
                "Speaker": s["speaker"],
                "Relevance": f"{s['relevance']:.2f}",
                "Section": s["section_text"],
            }
            for s in theme_sections
        ]
        _details = mo.ui.table(table_rows)
    else:
        _details = mo.md("_No sections for this theme yet._")

    # Only the final expression of a cell is rendered. Ending on an `if`
    # statement would display nothing, so both parts are stacked here.
    mo.vstack([_overview, _details])
    return
|
||||
|
||||
|
||||
@app.cell
|
||||
|
||||
Reference in New Issue
Block a user