Started on looping through all interviews

2025-12-01 22:23:42 +01:00
parent 15d706b8c5
commit ae9563eba3
1 changed files with 139 additions and 6 deletions
--- a/Thematic_Analysis.py
+++ b/Thematic_Analysis.py
@@ -213,7 +213,7 @@ def _(mo):
@app.cell
 def _(themes_input):
    # Parse themes into a clean Python list
-    raw_lines = themes_input.value.splitlines() if themes_input and themes_input.value else []
+    raw_lines = themes_input.value.splitlines() if themes_input.value else []  # empty/None -> []
    theme_list = [t.strip() for t in raw_lines if t.strip()]
    return (theme_list,)

@@ -237,7 +237,7 @@ def _(Path, mo):

    label_button = mo.ui.run_button(label="Run Theme Labelling for This Interview")
    label_button
-    return OUTPUT_DIR, label_button
+    return


@app.cell
@@ -247,9 +247,9 @@ def _(
    Path,
    client,
    file_dropdown,
-    theme_label_button,
    labeled_transcript,
    mo,
+    theme_label_button,
    theme_list,
 ):
    import json
@@ -360,7 +360,140 @@ def _(

    {status}
    """)
-    return theme_label_results
+    return
+
+
+@app.cell
+def _(Path, mo):
+    # Step 3c: collect the pre-labeled transcript JSON files for aggregation.
+    LABELED_DIR = Path("data/labeled_transcripts")
+    LABELED_DIR.mkdir(parents=True, exist_ok=True)  # a fresh dir just yields 0 files
+    labeled_files = sorted(LABELED_DIR.glob("*.json"))  # sorted for a stable order
+
+    _summary = mo.md(f"""### Step 3c: Use Pre-Labeled Transcripts
+
+    Found **{len(labeled_files)}** labeled transcript files in `{LABELED_DIR}`.
+    These will be used to aggregate themes across all interviews.
+    """)
+
+    # marimo renders only a cell's last expression, so stack summary and file list.
+    mo.vstack([_summary, labeled_files])
+    return (labeled_files,)
+
+
+@app.cell
+def _(labeled_files):
+    # Parse each labeled-transcript JSON file into one normalized record.
+    # Temporaries are underscore-prefixed (cell-local in marimo) so they cannot
+    # clash with names defined by other cells, e.g. the other `import json`.
+    import json as _json
+
+    all_labeled_records = []
+    for _path in labeled_files:
+        try:
+            _data = _json.loads(_path.read_text(encoding="utf-8"))
+        except (OSError, ValueError):
+            continue  # best effort: skip unreadable or malformed files
+        if not isinstance(_data, dict):
+            continue  # only a JSON object carries the fields read below
+
+        try:
+            _relevance = float(_data.get("relevance", 0.0))
+        except (TypeError, ValueError, OverflowError):
+            _relevance = 0.0  # null, non-numeric or out-of-range relevance
+        all_labeled_records.append(
+            {
+                # Fall back to the file stem up to the first "__" separator.
+                "interview_id": _data.get("interview_id") or _path.stem.split("__", 1)[0],
+                "theme": _data.get("theme", ""),
+                "present": bool(_data.get("present", False)),
+                "relevance": _relevance,
+                "sections": _data.get("sections") or [],
+            }
+        )
+    return (all_labeled_records,)
+
+
+@app.cell
+def _(all_labeled_records, mo):
+    # Step 3d: build the theme picker from every labeled record.
+    # Records with an empty theme are left out of the dropdown options.
+    all_themes = sorted({r["theme"] for r in all_labeled_records if r["theme"]})
+    all_interviews = sorted({r["interview_id"] for r in all_labeled_records})
+    theme_selector = mo.ui.dropdown(
+        options={t: t for t in all_themes},
+        label="Select theme to explore across all interviews",
+    )
+    _heading = mo.md("### Step 3d: Explore Themes Across All Labeled Transcripts")
+    # marimo renders only a cell's last expression, so stack heading and dropdown.
+    mo.vstack([_heading, theme_selector])
+    return all_interviews, theme_selector
+
+
+@app.cell
+def _(all_interviews, all_labeled_records, mo, theme_selector):
+    # Step 3d (continued): summarize the selected theme across all interviews
+    # and list every labeled section recorded for it.
+    import statistics
+
+    selected_theme = theme_selector.value
+    theme_summary = {}
+    theme_sections = []
+
+    if selected_theme:
+        theme_records = [r for r in all_labeled_records if r["theme"] == selected_theme]
+        # Count distinct interviews rather than records, so duplicate records
+        # for one interview cannot push the share above 1.0.
+        interviews_with_theme = {r["interview_id"] for r in theme_records if r["present"]}
+        relevances = [r["relevance"] for r in theme_records if r["present"]]
+        theme_summary = {
+            "theme": selected_theme,
+            "num_interviews": len(all_interviews),
+            "num_interviews_with_theme": len(interviews_with_theme),
+            "share_of_interviews_with_theme": (
+                len(interviews_with_theme) / len(all_interviews) if all_interviews else 0.0
+            ),
+            "avg_relevance_if_present": statistics.mean(relevances) if relevances else 0.0,
+        }
+
+        # Comprehension variables stay local, so no loop names leak into the
+        # notebook's global namespace. Non-dict section entries are skipped.
+        theme_sections = [
+            {
+                "interview_id": r["interview_id"],
+                "speaker": s.get("speaker", ""),
+                "section_text": s.get("section_text", ""),
+                "relevance": r["relevance"],
+            }
+            for r in theme_records
+            for s in r["sections"]
+            if isinstance(s, dict)
+        ]
+
+    _overview = mo.md(
+        f"""#### Theme Overview: `{selected_theme or "None selected"}`
+
+    - Total interviews: **{len(all_interviews)}**
+    - Interviews where theme is present: **{theme_summary.get("num_interviews_with_theme", 0)}**
+    - Share of interviews with theme: **{theme_summary.get("share_of_interviews_with_theme", 0.0):.2f}**
+    - Avg. relevance (when present): **{theme_summary.get("avg_relevance_if_present", 0.0):.2f}**
+    """
+    )
+    if theme_sections:
+        _details = mo.ui.table([
+            {
+                "Interview": s["interview_id"],
+                "Speaker": s["speaker"],
+                "Relevance": f"{s['relevance']:.2f}",
+                "Section": s["section_text"],
+            }
+            for s in theme_sections
+        ])
+    else:
+        _details = mo.md("_No sections for this theme yet._")
+    # marimo renders only a cell's last expression, so stack overview and details.
+    mo.vstack([_overview, _details])
+    return


@app.cell