diff --git a/Thematic_Analysis.py b/Thematic_Analysis.py
index c4e9f42..8e502d3 100644
--- a/Thematic_Analysis.py
+++ b/Thematic_Analysis.py
@@ -13,8 +13,8 @@ def _():
 
     VM_NAME = 'hiperf-gpu'
     MODEL = 'llama3.3:70b'
-    client = connect_qumo_ollama(VM_NAME)
-    return MODEL, Path, client, load_srt, mo
+    #client = connect_qumo_ollama(VM_NAME)
+    return MODEL, Path, load_srt, mo
 
 
 @app.cell(hide_code=True)
@@ -186,6 +186,239 @@ def _(mo):
     return
 
 
+@app.cell
+def _(mo):
+    # Step 3a: theme editor. The default value is assembled from explicit
+    # lines so it carries no source-code indentation into the widget.
+    themes_input = mo.ui.text_area(
+        value=(
+            "brand voice and tone\n"
+            "customer experience priorities\n"
+            "design system and consistency\n"
+            "AI and conversational interfaces"
+        ),
+        label="Themes (one per line)",
+        full_width=True,
+        rows=6,
+    )
+
+    _intro = mo.md("""### Step 3a: Define Themes
+
+    Enter one theme per line. These will be used to
+    label each interview transcript. Themes may overlap; the
+    same section can relate to multiple themes.
+    """)
+
+    # marimo shows only a cell's last expression, so the heading and the
+    # widget are stacked into one output instead of discarding the heading.
+    mo.vstack([_intro, themes_input])
+    return (themes_input,)
+
+
+@app.cell
+def _(themes_input):
+    # Parse the text area into a list of themes: trimmed, non-empty and
+    # de-duplicated in first-seen order. A repeated theme would otherwise
+    # be sent to the model twice and overwrite its own JSON file.
+    _raw_lines = (themes_input.value or "").splitlines()
+    _trimmed = (line.strip() for line in _raw_lines)
+    theme_list = list(dict.fromkeys(t for t in _trimmed if t))
+    return (theme_list,)
+
+
+@app.cell
+def _(Path, mo):
+    # Output directory for the per-interview, per-theme JSON files. The
+    # names defined here are the ones the labelling cell takes as inputs
+    # (OUTPUT_THEME_DIR, theme_label_button).
+    OUTPUT_THEME_DIR = Path("data/labels")
+    OUTPUT_THEME_DIR.mkdir(parents=True, exist_ok=True)
+
+    _intro = mo.md(f"""### Step 3b: LLM-based Theme Labelling
+
+    This step runs an LLM over the current interview transcript
+    for each defined theme and saves one JSON file per theme
+    for this interview in `{OUTPUT_THEME_DIR}`.
+
+    For each theme, the model will return full sections of the
+    conversation (multi-sentence chunks, not just short quotes)
+    that are about that theme.
+    """)
+
+    theme_label_button = mo.ui.run_button(
+        label="Run Theme Labelling for This Interview"
+    )
+    # Stack text and button so both are rendered (last expression only).
+    mo.vstack([_intro, theme_label_button])
+    return OUTPUT_THEME_DIR, theme_label_button
+
+
+@app.cell
+def _(
+    MODEL,
+    OUTPUT_THEME_DIR,
+    Path,
+    client,
+    file_dropdown,
+    labeled_transcript,
+    mo,
+    theme_label_button,
+    theme_list,
+):
+    import json
+    from datetime import datetime, timezone
+
+    # NOTE(review): `client` has to be defined by another cell. The setup
+    # cell changed earlier in this patch no longer returns it, so this cell
+    # cannot call the model until a client is exposed again -- confirm.
+
+    # Prompt skeleton, filled per theme with str.format (literal braces of
+    # the JSON schema are doubled). The transcript is fenced with tags: a
+    # triple double-quote fence would terminate this string literal.
+    _prompt_template = """You are an expert qualitative researcher.
+
+    You will analyse a single interview transcript for ONE specific theme.
+
+    Theme: "{theme}"
+
+    Tasks:
+    1. Decide if the theme is present in this interview.
+    2. If present, estimate how relevant it is on a 0–1 scale
+       where 0 = not mentioned, 0.5 = moderately important,
+       1 = central theme of the interview.
+    3. Identify all sections of the conversation that are
+       primarily about this theme. A section can span multiple
+       consecutive utterances and should form a coherent piece
+       of the dialogue about the theme, not just a single
+       sentence.
+
+    Each section should include:
+    - the dominant speaker label (or "mixed" if multiple)
+    - the full section text (one or more sentences)
+
+    Return your answer ONLY as a JSON object with this schema:
+    {{
+      "theme": string,          // the theme name
+      "present": bool,          // whether the theme appears
+      "relevance": float,       // 0.0–1.0
+      "sections": [
+        {{
+          "speaker": string,       // main speaker label for the section
+          "section_text": string   // full section text about the theme
+        }}
+      ]
+    }}
+
+    Transcript:
+    <transcript>
+    {transcript}
+    </transcript>
+    """
+
+    def _parse_label_response(raw_text, theme):
+        """Turn the model's reply into a normalised result dict for `theme`."""
+        # Try the whole reply first, then the outermost {...} span, in case
+        # the JSON object is wrapped in extra text.
+        candidates = [raw_text]
+        first, last = raw_text.find("{"), raw_text.rfind("}")
+        if 0 <= first < last:
+            candidates.append(raw_text[first : last + 1])
+
+        parsed = None
+        for candidate in candidates:
+            try:
+                decoded = json.loads(candidate)
+            except json.JSONDecodeError:
+                continue
+            # Valid JSON that is not an object (list, number, ...) cannot be
+            # normalised below, so it is treated like a parse failure.
+            if isinstance(decoded, dict):
+                parsed = decoded
+                break
+
+        if parsed is None:
+            return {
+                "theme": theme,
+                "present": False,
+                "relevance": 0.0,
+                "sections": [],
+                "_parse_error": True,
+                "_raw": raw_text,
+            }
+
+        parsed["theme"] = parsed.get("theme", theme)
+        parsed["present"] = bool(parsed.get("present", False))
+        try:
+            relevance = float(parsed.get("relevance", 0.0))
+        except (TypeError, ValueError):
+            relevance = 0.0
+        # Keep the score inside the 0-1 range requested in the prompt.
+        parsed["relevance"] = min(1.0, max(0.0, relevance))
+        if not isinstance(parsed.get("sections"), list):
+            parsed["sections"] = []
+        return parsed
+
+    def _file_slug(text):
+        """Return `text` with every character unsafe in a file name replaced."""
+        return "".join(c if c.isalnum() or c in "-_" else "_" for c in text)
+
+    theme_label_results = {}
+
+    if not theme_label_button.value:
+        _status = "Click 'Run Theme Labelling for This Interview' to start."
+    elif not file_dropdown.value:
+        _status = "No transcript selected."
+    elif not theme_list:
+        _status = "No themes defined. Please add at least one theme."
+    else:
+        _interview_id = Path(file_dropdown.value).stem
+
+        for _theme in theme_list:
+            _prompt = _prompt_template.format(
+                theme=_theme, transcript=labeled_transcript
+            )
+            _response = client.generate(model=MODEL, prompt=_prompt)
+            _result = _parse_label_response(_response.response.strip(), _theme)
+            theme_label_results[_theme] = _result
+
+            # One JSON file per interview and theme.
+            _out_path = (
+                OUTPUT_THEME_DIR / f"{_interview_id}__{_file_slug(_theme)}.json"
+            )
+            # Timezone-aware replacement for the deprecated utcnow(); the
+            # "Z"-suffixed format of the original timestamp is kept.
+            _generated_at = (
+                datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
+            )
+            _out_data = {
+                "interview_id": _interview_id,
+                "theme": _result["theme"],
+                "present": _result["present"],
+                "relevance": _result["relevance"],
+                "sections": _result["sections"],
+                "generated_at": _generated_at,
+            }
+            _out_path.write_text(
+                json.dumps(_out_data, ensure_ascii=False, indent=2),
+                encoding="utf-8",
+            )
+
+        _status = (
+            f"Labelled {len(theme_label_results)} themes for current interview. "
+            f"JSON files written to '{OUTPUT_THEME_DIR}'."
+        )
+        _failed = sum(
+            1 for r in theme_label_results.values() if r.get("_parse_error")
+        )
+        if _failed:
+            # Parse failures are stored as empty results; say so instead of
+            # reporting them as successfully labelled.
+            _status += f" {_failed} response(s) could not be parsed as JSON."
+
+    mo.md(f"### Theme Labelling Status\n\n{_status}\n")
+    return (theme_label_results,)
+
+
 @app.cell
 def _(mo):
     # Editable analysis task prompt