Started on looping through all interviews

2025-12-01 22:23:42 +01:00
parent 15d706b8c5
commit ae9563eba3
1 changed files with 139 additions and 6 deletions
--- a/Thematic_Analysis.py
+++ b/Thematic_Analysis.py
@@ -213,7 +213,7 @@ def _(mo):
@app.cell
 def _(themes_input):
    # Parse the themes text input into a clean Python list: one theme per
    # line, surrounding whitespace trimmed, blank lines dropped.
    # A missing or empty value yields an empty list. The value is checked
    # once; testing the same operand twice added nothing.
    raw_lines = themes_input.value.splitlines() if themes_input.value else []
    # Strip each line a single time, then keep only the non-empty results.
    theme_list = [t for t in map(str.strip, raw_lines) if t]
    return (theme_list,)
@@ -237,7 +237,7 @@ def _(Path, mo):
    label_button = mo.ui.run_button(label="Run Theme Labelling for This Interview")
    label_button
-    return OUTPUT_DIR, label_button
+    return
@app.cell
@@ -247,9 +247,9 @@ def _(
    Path,
    client,
    file_dropdown,
    theme_label_button,
    labeled_transcript,
    mo,
    theme_label_button,
    theme_list,
 ):
    import json
@@ -360,7 +360,140 @@ def _(
    {status}
    """)
-    return theme_label_results
+    return
@app.cell
 def _(Path, mo):
    # Step 3c: Load all labeled transcripts (assumed precomputed)
    LABELED_DIR = Path("data/labeled_transcripts")
    # Create the directory up front so the glob below also works on a fresh
    # checkout where no transcripts have been labeled yet.
    LABELED_DIR.mkdir(parents=True, exist_ok=True)
    # Sorted so downstream cells see the files in a stable order.
    labeled_files = sorted(LABELED_DIR.glob("*.json"))
    _summary = mo.md(f"""### Step 3c: Use Pre-Labeled Transcripts
    Found **{len(labeled_files)}** labeled transcript files in `{LABELED_DIR}`.
    These will be used to aggregate themes across all interviews.
    """)
    # Only the last expression of a cell is rendered, so a bare `mo.md(...)`
    # in the middle of the cell is silently discarded. Stack the summary and
    # the file list into one output instead.
    mo.vstack([_summary, labeled_files])
    return (labeled_files,)
@app.cell
 def _(labeled_files):
    # Parse every labeled-transcript JSON file into a normalized record with
    # the keys: interview_id, theme, present, relevance, sections.
    # Files that cannot be read or do not hold a JSON object are skipped.
    #
    # The per-file work lives in an underscore-prefixed helper so that its
    # temporaries (and the `json` import) do not become notebook-level
    # names; the only name this cell defines is `all_labeled_records`.
    def _load_record(path):
        # Return the normalized record for one file, or None to skip it.
        import json

        try:
            data = json.loads(path.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # Missing/unreadable file, undecodable bytes, or malformed JSON
            # (UnicodeDecodeError and JSONDecodeError are both ValueErrors).
            return None
        if not isinstance(data, dict):
            # Valid JSON but not an object (e.g. a top-level list): there is
            # nothing to read fields from, so skip instead of crashing.
            return None
        try:
            relevance = float(data.get("relevance", 0.0))
        except (TypeError, ValueError):
            # Null or non-numeric relevance falls back to 0.0.
            relevance = 0.0
        sections = data.get("sections")
        return {
            # Fall back to the file-name prefix before the first "__".
            "interview_id": data.get("interview_id") or path.stem.split("__", 1)[0],
            "theme": data.get("theme", ""),
            "present": bool(data.get("present", False)),
            "relevance": relevance,
            # Anything other than a list is treated as "no sections" so later
            # iteration is always safe.
            "sections": sections if isinstance(sections, list) else [],
        }

    all_labeled_records = [
        record for record in map(_load_record, labeled_files) if record is not None
    ]
    return (all_labeled_records,)
@app.cell
 def _(all_labeled_records, mo):
    # Derive full theme and interview sets
    # Records with an empty theme are left out of the selectable themes but
    # still count towards the set of interviews.
    all_themes = sorted({rec["theme"] for rec in all_labeled_records if rec["theme"]})
    all_interviews = sorted({rec["interview_id"] for rec in all_labeled_records})
    theme_selector = mo.ui.dropdown(
        options={t: t for t in all_themes},
        label="Select theme to explore across all interviews",
    )
    # Only the last expression of a cell is rendered, so the heading has to
    # be stacked with the dropdown; as a separate statement it was dropped.
    mo.vstack(
        [
            mo.md("### Step 3d: Explore Themes Across All Labeled Transcripts"),
            theme_selector,
        ]
    )
    return all_interviews, theme_selector
@app.cell
 def _(all_interviews, all_labeled_records, mo, theme_selector):
    # Summarize the theme chosen in `theme_selector` across all labeled
    # records and list every transcript section tagged with it.
    # With no theme selected the overview shows zeros and the "no sections"
    # message.
    import statistics

    selected_theme = theme_selector.value
    theme_summary = {}
    theme_sections = []
    if selected_theme:
        theme_records = [
            rec for rec in all_labeled_records if rec["theme"] == selected_theme
        ]
        present_flags = [rec["present"] for rec in theme_records]
        # Relevance is averaged only over records where the theme is present.
        relevances = [rec["relevance"] for rec in theme_records if rec["present"]]
        theme_summary = {
            "theme": selected_theme,
            "num_interviews": len(all_interviews),
            "num_interviews_with_theme": sum(present_flags),
            "share_of_interviews_with_theme": (
                sum(present_flags) / len(all_interviews) if all_interviews else 0.0
            ),
            "avg_relevance_if_present": (
                statistics.mean(relevances) if relevances else 0.0
            ),
        }
        # Built with a comprehension so no loop variables (in particular
        # `interview_id`) become notebook-level names that would collide
        # with other cells. Each section carries its record's relevance.
        theme_sections = [
            {
                "interview_id": rec["interview_id"],
                "speaker": sec.get("speaker", ""),
                "section_text": sec.get("section_text", ""),
                "relevance": rec["relevance"],
            }
            for rec in theme_records
            for sec in rec["sections"]
            if isinstance(sec, dict)  # ignore malformed section entries
        ]
    _overview = mo.md(
        f"""#### Theme Overview: `{selected_theme or "None selected"}`
    - Total interviews: **{len(all_interviews)}**
    - Interviews where theme is present: **{theme_summary.get("num_interviews_with_theme", 0)}**
    - Share of interviews with theme: **{theme_summary.get("share_of_interviews_with_theme", 0.0):.2f}**
    - Avg. relevance (when present): **{theme_summary.get("avg_relevance_if_present", 0.0):.2f}**
    """
    )
    if theme_sections:
        _details = mo.ui.table(
            [
                {
                    "Interview": sec["interview_id"],
                    "Speaker": sec["speaker"],
                    "Relevance": f"{sec['relevance']:.2f}",
                    "Section": sec["section_text"],
                }
                for sec in theme_sections
            ]
        )
    else:
        _details = mo.md("_No sections for this theme yet._")
    # Only the last top-level expression of a cell is rendered. The overview
    # and the table used to be bare statements (the table inside an `if`),
    # so nothing was shown; stack them into a single final output.
    mo.vstack([_overview, _details])
    return
@app.cell