Compare commits
3 Commits
b21f402e1e
...
ae9563eba3
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ae9563eba3 | ||
|
|
15d706b8c5 | ||
|
|
82bd2b7c3b |
@@ -82,7 +82,6 @@ def _(mo):
|
|||||||
2. **Process:**
|
2. **Process:**
|
||||||
* The LLM analyzes each transcript segment-by-segment.
|
* The LLM analyzes each transcript segment-by-segment.
|
||||||
* It extracts specific quotes that match a Theme Definition.
|
* It extracts specific quotes that match a Theme Definition.
|
||||||
* **Multi-Theme Handling:** If a quote applies to multiple themes, it is tagged with *all* relevant themes. In the dataset, this creates multiple entries (one per theme) so the quote informs the synthesis of each relevant topic.
|
|
||||||
* **Granular Sentiment Analysis:** For each quote, the model identifies:
|
* **Granular Sentiment Analysis:** For each quote, the model identifies:
|
||||||
* **Subject:** The specific topic/object being discussed (e.g., "Login Flow", "Brand Tone").
|
* **Subject:** The specific topic/object being discussed (e.g., "Login Flow", "Brand Tone").
|
||||||
* **Sentiment:** Positive / Neutral / Negative.
|
* **Sentiment:** Positive / Neutral / Negative.
|
||||||
|
|||||||
@@ -13,8 +13,8 @@ def _():
|
|||||||
VM_NAME = 'hiperf-gpu'
|
VM_NAME = 'hiperf-gpu'
|
||||||
MODEL = 'llama3.3:70b'
|
MODEL = 'llama3.3:70b'
|
||||||
|
|
||||||
client = connect_qumo_ollama(VM_NAME)
|
#client = connect_qumo_ollama(VM_NAME)
|
||||||
return MODEL, Path, client, load_srt, mo
|
return MODEL, Path, load_srt, mo
|
||||||
|
|
||||||
|
|
||||||
@app.cell(hide_code=True)
|
@app.cell(hide_code=True)
|
||||||
@@ -186,6 +186,316 @@ def _(mo):
|
|||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _(mo):
    # Step 3a: collect the themes used for labelling, one per line.
    # NOTE(review): the mo.md(...) object below is discarded — marimo renders
    # only the cell's final expression (themes_input); confirm whether the
    # heading was meant to be shown as well.
    mo.md("""### Step 3a: Define Themes

Enter one theme per line. These will be used to
label each interview transcript. Themes may overlap; the
same section can relate to multiple themes.
""")

    themes_input = mo.ui.text_area(
        value="""brand voice and tone
customer experience priorities
design system and consistency
AI and conversational interfaces""",
        label="Themes (one per line)",
        rows=6,
        full_width=True,
    )

    themes_input
    return (themes_input,)
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _(themes_input):
    # Parse the raw textarea contents into a clean list of theme names,
    # dropping blank lines and surrounding whitespace.
    # Fix: the original guard `themes_input.value and themes_input.value`
    # tested the same expression twice; `or ""` handles None/empty uniformly.
    theme_list = [
        line.strip()
        for line in (themes_input.value or "").splitlines()
        if line.strip()
    ]
    return (theme_list,)
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _(Path, mo):
    # Step 3b configuration: directory that receives one JSON file per
    # (interview, theme) pair, plus the button that triggers labelling.
    OUTPUT_DIR = Path("data/labels")
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    mo.md(f"""### Step 3b: LLM-based Theme Labelling

This step runs an LLM over the current interview transcript
for each defined theme and saves one JSON file per theme
for this interview in `{OUTPUT_DIR}`.

For each theme, the model will return full sections of the
conversation (multi-sentence chunks, not just short quotes)
that are about that theme.
""")

    label_button = mo.ui.run_button(label="Run Theme Labelling for This Interview")
    label_button

    # Fix: the original ended with a bare `return`, so neither OUTPUT_DIR nor
    # label_button was exported, even though a later cell consumes an output
    # directory and a run button.
    # NOTE(review): that later cell names them OUTPUT_THEME_DIR and
    # theme_label_button — confirm which names the notebook intends and align.
    return OUTPUT_DIR, label_button
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _(
    MODEL,
    OUTPUT_THEME_DIR,
    Path,
    client,
    file_dropdown,
    labeled_transcript,
    mo,
    theme_label_button,
    theme_list,
):
    # Step 3b: run the LLM theme-labelling pass for the selected interview.
    # For each theme, ask the model for presence / relevance / sections and
    # write one JSON file per (interview, theme) pair to OUTPUT_THEME_DIR.
    # NOTE(review): no visible cell exports OUTPUT_THEME_DIR or
    # theme_label_button — confirm the Step 3b cell returns them under
    # these names.
    import json
    from datetime import datetime, timezone

    theme_label_results = {}

    if theme_label_button.value and file_dropdown.value and theme_list:
        interview_id = Path(file_dropdown.value).stem

        for theme in theme_list:
            # Fix: in the original, the prompt string was terminated right
            # after "Transcript:", leaving {labeled_transcript} and two stray
            # string literals as dead statements — the transcript was never
            # sent to the model. It is now interpolated inside the prompt.
            prompt = f"""You are an expert qualitative researcher.

You will analyse a single interview transcript for ONE specific theme.

Theme: "{theme}"

Tasks:
1. Decide if the theme is present in this interview.
2. If present, estimate how relevant it is on a 0–1 scale
   where 0 = not mentioned, 0.5 = moderately important,
   1 = central theme of the interview.
3. Identify all sections of the conversation that are
   primarily about this theme. A section can span multiple
   consecutive utterances and should form a coherent piece
   of the dialogue about the theme, not just a single
   sentence.

Each section should include:
- the dominant speaker label (or "mixed" if multiple)
- the full section text (one or more sentences)

Return your answer ONLY as a JSON object with this schema:
{{
  "theme": string,       // the theme name
  "present": bool,       // whether the theme appears
  "relevance": float,    // 0.0–1.0
  "sections": [
    {{
      "speaker": string,       // main speaker label for the section
      "section_text": string   // full section text about the theme
    }}
  ]
}}

Transcript:
{labeled_transcript}
"""

            response = client.generate(model=MODEL, prompt=prompt)
            raw_text = response.response.strip()

            try:
                parsed = json.loads(raw_text)
            except json.JSONDecodeError:
                # Fallback: models often wrap JSON in prose; retry on the span
                # between the first "{" and the last "}".
                try:
                    start = raw_text.index("{")
                    end = raw_text.rindex("}") + 1
                    parsed = json.loads(raw_text[start:end])
                except Exception:
                    # Still unparseable: record an empty result and keep the
                    # raw reply for debugging.
                    parsed = {
                        "theme": theme,
                        "present": False,
                        "relevance": 0.0,
                        "sections": [],
                        "_parse_error": True,
                        "_raw": raw_text,
                    }

            # Normalise fields so downstream cells can rely on the schema.
            parsed["theme"] = parsed.get("theme", theme)
            parsed["present"] = bool(parsed.get("present", False))
            try:
                parsed["relevance"] = float(parsed.get("relevance", 0.0))
            except (TypeError, ValueError):
                parsed["relevance"] = 0.0
            if not isinstance(parsed.get("sections"), list):
                parsed["sections"] = []

            theme_label_results[theme] = parsed

            # Write the per-interview-per-theme JSON file.
            out_path = OUTPUT_THEME_DIR / f"{interview_id}__{theme.replace(' ', '_')}.json"
            out_data = {
                "interview_id": interview_id,
                "theme": parsed["theme"],
                "present": parsed["present"],
                "relevance": parsed["relevance"],
                "sections": parsed["sections"],
                # Fix: datetime.utcnow() is deprecated (3.12+) and naive;
                # use an aware UTC timestamp instead.
                "generated_at": datetime.now(timezone.utc).isoformat(),
            }
            out_path.write_text(json.dumps(out_data, ensure_ascii=False, indent=2), encoding="utf-8")

    if theme_label_button.value:
        if not file_dropdown.value:
            status = "No transcript selected."
        elif not theme_list:
            status = "No themes defined. Please add at least one theme."
        else:
            status = f"Labelled {len(theme_label_results)} themes for current interview. JSON files written to '{OUTPUT_THEME_DIR}'."
    else:
        status = "Click 'Run Theme Labelling for This Interview' to start."

    mo.md(f"""### Theme Labelling Status

{status}
""")
    return
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _(Path, mo):
    # Step 3c: discover pre-labeled transcript JSON files on disk.
    LABELED_DIR = Path("data/labeled_transcripts")
    LABELED_DIR.mkdir(parents=True, exist_ok=True)

    labeled_files = sorted(LABELED_DIR.glob("*.json"))
    num_found = len(labeled_files)

    mo.md(f"""### Step 3c: Use Pre-Labeled Transcripts

Found **{num_found}** labeled transcript files in `{LABELED_DIR}`.
These will be used to aggregate themes across all interviews.
""")

    labeled_files
    return (labeled_files,)
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _(labeled_files):
    # Aggregate every per-(interview, theme) JSON file into a uniform list of
    # records for cross-interview analysis.
    # Fix: `import json` here redefined the global `json`, which is also
    # imported in the labelling cell — marimo forbids defining the same name
    # in two cells. An underscore-prefixed alias is cell-local in marimo,
    # avoiding the clash.
    import json as _json

    all_labeled_records = []
    for f in labeled_files:
        try:
            data = _json.loads(f.read_text(encoding="utf-8"))
        except Exception:
            # Skip unreadable files rather than aborting the aggregation.
            continue

        # Fall back to the "<interview>__<theme>.json" naming convention when
        # the payload lacks an explicit interview_id.
        interview_id = data.get("interview_id") or f.stem.split("__", 1)[0]
        theme = data.get("theme", "")
        present = bool(data.get("present", False))
        try:
            relevance = float(data.get("relevance", 0.0))
        except (TypeError, ValueError):
            relevance = 0.0
        sections = data.get("sections") or []

        all_labeled_records.append(
            {
                "interview_id": interview_id,
                "theme": theme,
                "present": present,
                "relevance": relevance,
                "sections": sections,
            }
        )
    return (all_labeled_records,)
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _(all_labeled_records, mo):
    # Step 3d: derive the full theme / interview sets and build the picker.
    theme_names = {r["theme"] for r in all_labeled_records if r["theme"]}
    interview_ids = {r["interview_id"] for r in all_labeled_records}
    all_themes = sorted(theme_names)
    all_interviews = sorted(interview_ids)

    theme_selector = mo.ui.dropdown(
        options=dict(zip(all_themes, all_themes)),
        label="Select theme to explore across all interviews",
    )

    mo.md("### Step 3d: Explore Themes Across All Labeled Transcripts")
    theme_selector
    return all_interviews, theme_selector
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _(all_interviews, all_labeled_records, mo, theme_selector):
    # Summarise the selected theme across all interviews (coverage and mean
    # relevance) and list every labelled section for it.
    import statistics

    selected_theme = theme_selector.value
    theme_summary = {}
    theme_sections = []

    if selected_theme:
        theme_records = [
            r for r in all_labeled_records if r["theme"] == selected_theme
        ]

        present_flags = [r["present"] for r in theme_records]
        # Relevance is only meaningful where the theme is actually present.
        relevances = [r["relevance"] for r in theme_records if r["present"]]

        theme_summary = {
            "theme": selected_theme,
            "num_interviews": len(all_interviews),
            "num_interviews_with_theme": sum(present_flags),
            "share_of_interviews_with_theme": (
                sum(present_flags) / len(all_interviews) if all_interviews else 0.0
            ),
            "avg_relevance_if_present": (
                statistics.mean(relevances) if relevances else 0.0
            ),
        }

        # Flatten sections, carrying the interview id and record relevance.
        for r in theme_records:
            interview_id = r["interview_id"]
            for s in r["sections"]:
                theme_sections.append(
                    {
                        "interview_id": interview_id,
                        "speaker": s.get("speaker", ""),
                        "section_text": s.get("section_text", ""),
                        "relevance": r["relevance"],
                    }
                )

    overview = mo.md(
        f"""#### Theme Overview: `{selected_theme or "None selected"}`

- Total interviews: **{len(all_interviews)}**
- Interviews where theme is present: **{theme_summary.get("num_interviews_with_theme", 0)}**
- Share of interviews with theme: **{theme_summary.get("share_of_interviews_with_theme", 0.0):.2f}**
- Avg. relevance (when present): **{theme_summary.get("avg_relevance_if_present", 0.0):.2f}**
"""
    )

    # Fix: the original called mo.md(...)/mo.ui.table(...) as bare statements
    # inside branches, so the cell rendered nothing — marimo displays only the
    # cell's final expression. Compose one output object and end the cell with
    # it so both the overview and the sections actually render.
    if theme_sections:
        table_rows = [
            {
                "Interview": s["interview_id"],
                "Speaker": s["speaker"],
                "Relevance": f"{s['relevance']:.2f}",
                "Section": s["section_text"],
            }
            for s in theme_sections
        ]
        detail = mo.ui.table(table_rows)
    else:
        detail = mo.md("_No sections for this theme yet._")

    mo.vstack([overview, detail])
    return
|
||||||
|
|
||||||
|
|
||||||
@app.cell
|
@app.cell
|
||||||
def _(mo):
|
def _(mo):
|
||||||
# Editable analysis task prompt
|
# Editable analysis task prompt
|
||||||
|
|||||||
Reference in New Issue
Block a user