New prompt for thematic analysis added

2025-12-01 22:17:57 +01:00
parent 82bd2b7c3b
commit 15d706b8c5
1 changed files with 179 additions and 2 deletions
--- a/Thematic_Analysis.py
+++ b/Thematic_Analysis.py
@@ -13,8 +13,8 @@ def _():
    VM_NAME = 'hiperf-gpu'
    MODEL = 'llama3.3:70b'

-    client = connect_qumo_ollama(VM_NAME)
-    return MODEL, Path, client, load_srt, mo
+    #client = connect_qumo_ollama(VM_NAME)
+    return MODEL, Path, load_srt, mo


@app.cell(hide_code=True)
@@ -186,6 +186,183 @@ def _(mo):
    return


+@app.cell
+def _(mo):
+    # Step 3a: Define themes for labelling.
+    #
+    # Defines `themes_input`, a text area holding one theme per line, and
+    # renders it below a short explanation.
+
+    # The defaults are joined from a list so that no source-code indentation
+    # leaks into the text shown in the text area.
+    _default_themes = [
+        "brand voice and tone",
+        "customer experience priorities",
+        "design system and consistency",
+        "AI and conversational interfaces",
+    ]
+    themes_input = mo.ui.text_area(
+        value="\n".join(_default_themes),
+        label="Themes (one per line)",
+        full_width=True,
+        rows=6,
+    )
+
+    _intro = mo.md("""### Step 3a: Define Themes
+
+    Enter one theme per line. These will be used to
+    label each interview transcript. Themes may overlap; the
+    same section can relate to multiple themes.
+    """)
+
+    # Only the last expression of a cell is rendered, so the heading and the
+    # widget are stacked into one output instead of being two separate
+    # expressions (the first of which would be discarded).
+    mo.vstack([_intro, themes_input])
+    return (themes_input,)
+
+
+@app.cell
+def _(themes_input):
+    # Build `theme_list`: the trimmed, non-empty lines of the themes text area.
+    # An unset widget or an empty value yields an empty list.
+    theme_list = []
+    if themes_input and themes_input.value:
+        for _line in themes_input.value.splitlines():
+            _name = _line.strip()
+            if _name:
+                theme_list.append(_name)
+    return (theme_list,)
+
+
+@app.cell
+def _(Path, mo):
+    # Step 3b setup: defines the JSON output directory (`OUTPUT_DIR`) and the
+    # run button (`label_button`) that triggers theme labelling.
+
+    # Create the directory up front so writing label files cannot fail on a
+    # missing folder.
+    OUTPUT_DIR = Path("data/labels")
+    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+
+    _intro = mo.md(f"""### Step 3b: LLM-based Theme Labelling
+
+    This step runs an LLM over the current interview transcript
+    for each defined theme and saves one JSON file per theme
+    for this interview in `{OUTPUT_DIR}`.
+
+    For each theme, the model will return full sections of the
+    conversation (multi-sentence chunks, not just short quotes)
+    that are about that theme.
+    """)
+
+    label_button = mo.ui.run_button(label="Run Theme Labelling for This Interview")
+
+    # Only the last expression of a cell is rendered, so the explanation and
+    # the button are stacked into one output; as two separate expressions the
+    # explanation would be discarded.
+    mo.vstack([_intro, label_button])
+    return OUTPUT_DIR, label_button
+
+
+@app.cell
+def _(
+    MODEL,
+    OUTPUT_DIR,
+    Path,
+    client,
+    file_dropdown,
+    label_button,
+    labeled_transcript,
+    mo,
+    theme_list,
+):
+    # Step 3b: label the selected transcript once per theme.
+    #
+    # When the run button has been clicked, a transcript is selected and at
+    # least one theme is defined, the model is prompted once per theme, its
+    # JSON reply is parsed and normalised, and one JSON file per theme is
+    # written to OUTPUT_DIR. Defines `theme_label_results`, a dict mapping
+    # each theme to its normalised result, and renders a status message.
+    #
+    # NOTE(review): `client` has to be defined by another cell. In this
+    # revision the setup cell's `connect_qumo_ollama(VM_NAME)` call is
+    # commented out, so this cell cannot run until a client is provided again.
+    #
+    # Underscore-prefixed names are cell-local in marimo; they are used for
+    # all temporaries so common names (prompt, response, status, ...) cannot
+    # clash with definitions in other cells.
+    import json
+    from datetime import datetime, timezone as _timezone
+
+    theme_label_results = {}
+    _unparsed = []  # themes whose model reply could not be read as a JSON object
+
+    if label_button.value and file_dropdown.value and theme_list:
+        _interview_id = Path(file_dropdown.value).stem
+
+        for _theme in theme_list:
+            # The outer literal uses ''' so the transcript can be fenced with
+            # """ inside the prompt without terminating the f-string.
+            _prompt = f'''You are an expert qualitative researcher.
+
+            You will analyse a single interview transcript for ONE specific theme.
+
+            Theme: "{_theme}"
+
+            Tasks:
+            1. Decide if the theme is present in this interview.
+            2. If present, estimate how relevant it is on a 0–1 scale
+            where 0 = not mentioned, 0.5 = moderately important,
+            1 = central theme of the interview.
+            3. Identify all sections of the conversation that are
+            primarily about this theme. A section can span multiple
+            consecutive utterances and should form a coherent piece
+            of the dialogue about the theme, not just a single
+            sentence.
+
+            Each section should include:
+            - the dominant speaker label (or "mixed" if multiple)
+            - the full section text (one or more sentences)
+
+            Return your answer ONLY as a JSON object with this schema:
+            {{
+            "theme": string,                  // the theme name
+            "present": bool,                  // whether the theme appears
+            "relevance": float,               // 0.0–1.0
+                "sections": [
+                    {{
+                        "speaker": string,           // main speaker label for the section
+                        "section_text": string       // full section text about the theme
+                    }}
+                ]
+            }}
+
+            Transcript:
+            """
+            {labeled_transcript}
+            """
+            '''
+
+            _raw = client.generate(model=MODEL, prompt=_prompt).response.strip()
+
+            try:
+                _parsed = json.loads(_raw)
+            except json.JSONDecodeError:
+                # Fallback: the model may wrap the JSON in prose or code
+                # fences, so try the outermost {...} span. index/rindex and
+                # json.loads all raise ValueError (or a subclass) on failure.
+                try:
+                    _parsed = json.loads(_raw[_raw.index("{"):_raw.rindex("}") + 1])
+                except ValueError:
+                    _parsed = None
+
+            # Valid JSON that is not an object (e.g. a list or a bare string)
+            # is treated like a parse failure instead of crashing on .get().
+            if not isinstance(_parsed, dict):
+                _unparsed.append(_theme)
+                _parsed = {
+                    "theme": _theme,
+                    "present": False,
+                    "relevance": 0.0,
+                    "sections": [],
+                    "_parse_error": True,
+                    "_raw": _raw,
+                }
+
+            # Normalise fields
+            _parsed["theme"] = _parsed.get("theme", _theme)
+            _parsed["present"] = bool(_parsed.get("present", False))
+            try:
+                _relevance = float(_parsed.get("relevance", 0.0))
+            except (TypeError, ValueError):
+                _relevance = 0.0
+            # Clamp to the 0-1 scale requested in the prompt; this also maps
+            # NaN to 0.0 so the output file stays valid JSON.
+            _parsed["relevance"] = min(1.0, max(0.0, _relevance))
+            if not isinstance(_parsed.get("sections"), list):
+                _parsed["sections"] = []
+
+            theme_label_results[_theme] = _parsed
+
+            # Write per-interview-per-theme JSON file. The theme is free text
+            # from the UI, so every character that is not alphanumeric, '-'
+            # or '_' (spaces, path separators, ...) becomes '_' to keep the
+            # file name inside OUTPUT_DIR.
+            _slug = "".join(ch if ch.isalnum() or ch in "-_" else "_" for ch in _theme)
+            _out_path = OUTPUT_DIR / f"{_interview_id}__{_slug}.json"
+            _out_data = {
+                "interview_id": _interview_id,
+                "theme": _parsed["theme"],
+                "present": _parsed["present"],
+                "relevance": _parsed["relevance"],
+                "sections": _parsed["sections"],
+                # Timezone-aware replacement for the deprecated utcnow();
+                # the "+00:00" suffix is rewritten to keep the "...Z" format.
+                "generated_at": datetime.now(_timezone.utc).isoformat().replace("+00:00", "Z"),
+            }
+            _out_path.write_text(json.dumps(_out_data, ensure_ascii=False, indent=2), encoding="utf-8")
+
+    if not label_button.value:
+        _status = "Click 'Run Theme Labelling for This Interview' to start."
+    elif not file_dropdown.value:
+        _status = "No transcript selected."
+    elif not theme_list:
+        _status = "No themes defined. Please add at least one theme."
+    else:
+        _status = f"Labelled {len(theme_label_results)} themes for current interview. JSON files written to '{OUTPUT_DIR}'."
+        if _unparsed:
+            # Surface parse failures instead of silently writing empty results
+            _status += f" Could not parse the model response for: {', '.join(_unparsed)}."
+
+    mo.md(f"""### Theme Labelling Status
+
+{_status}
+""")
+    return (theme_label_results,)
+
+
@app.cell
 def _(mo):
    # Editable analysis task prompt