import marimo

__generated_with = "0.18.0"
app = marimo.App(width="medium")


@app.cell
def _():
    # Shared imports and configuration for the whole notebook.
    # `json` and `re` are imported here (once) because marimo forbids the same
    # global name being defined in more than one cell.
    import json
    import re
    from pathlib import Path

    import marimo as mo

    from utils import connect_qumo_ollama, load_srt

    VM_NAME = "hiperf-gpu"
    MODEL = "llama3.3:70b"

    # The LLM cells below all depend on `client`; without this assignment they
    # cannot run because the name is undefined.
    client = connect_qumo_ollama(VM_NAME)
    return MODEL, Path, client, json, load_srt, mo, re


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    # Interview Transcript Thematic Analysis

    This notebook loads interview transcripts (SRT files) and runs thematic analysis using LLMs.
    """)
    return


@app.cell
def _(Path, mo):
    # Let the user pick one of the SRT transcripts found on disk.
    TRANSCRIPT_DIR = Path("data/transcripts")
    # Sorted so the dropdown order is stable between runs.
    srt_files = sorted(TRANSCRIPT_DIR.glob("*.srt"))

    file_dropdown = mo.ui.dropdown(
        options={f.name: str(f) for f in srt_files},
        label="Select transcript file",
    )
    file_dropdown
    return (file_dropdown,)


@app.cell
def _(file_dropdown, load_srt, mo):
    # Load the selected transcript and show a short preview.
    transcript_raw = ""
    if file_dropdown.value:
        transcript_raw = load_srt(file_dropdown.value)

    _preview = transcript_raw[:2000]
    # Only signal truncation when something was actually cut off.
    _ellipsis = "..." if len(transcript_raw) > len(_preview) else ""

    # Built line by line (instead of an indented triple-quoted block) because the
    # interpolated transcript is multi-line and would defeat markdown dedenting.
    mo.md(
        "\n".join(
            [
                "## Transcript Preview",
                "",
                f"**File:** `{file_dropdown.value or 'None selected'}`",
                "",
                f"**Length:** {len(transcript_raw)} characters, "
                f"~{len(transcript_raw.split())} words",
                "",
                "Show first 2000 characters",
                "",
                "```",
                f"{_preview}{_ellipsis}",
                "```",
            ]
        )
    )
    return (transcript_raw,)


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    ## Step 1: Infer Speaker Roles

    The model will analyze the transcript to identify who is the interviewer and who is the interviewee.
    """)
    return


@app.cell
def _(mo, transcript_raw):
    # Prompt asking the model to map SPEAKER_XX ids to roles.
    # Only the first 4000 characters of the transcript are sent.
    role_inference_prompt = f"""Analyze this interview transcript and identify the role of each speaker.

Based on the conversation context, determine who is:
- The interviewer(s) - asking questions, guiding the conversation
- The interviewee(s) - providing answers, sharing expertise/opinions

Return ONLY a simple mapping in this exact format (one per line):
SPEAKER_XX: Role - Brief description

For example:
SPEAKER_00: Interviewer - Michael from the voice branding team
SPEAKER_01: Interviewee - Head of Digital Design

{transcript_raw[:4000]}
"""

    infer_roles_button = mo.ui.run_button(label="Infer Speaker Roles")
    infer_roles_button
    return infer_roles_button, role_inference_prompt


@app.cell
def _(MODEL, client, infer_roles_button, mo, role_inference_prompt):
    # Run role inference only when the button was clicked.
    inferred_roles_text = ""
    if infer_roles_button.value:
        # `_response` is cell-local so it does not clash with other LLM cells.
        _response = client.generate(model=MODEL, prompt=role_inference_prompt)
        inferred_roles_text = _response.response

    _body = (
        inferred_roles_text
        if inferred_roles_text
        else "_Click 'Infer Speaker Roles' to analyze the transcript_"
    )
    mo.md("\n".join(["### Inferred Roles", "", _body]))
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    ## Step 2: Confirm or Edit Speaker Roles

    Review the inferred roles below and make corrections if needed.
    """)
    return


@app.cell
def _(mo, re, transcript_raw):
    # Collect the distinct SPEAKER_NN ids that appear in the transcript.
    speakers = sorted(set(re.findall(r"(SPEAKER_\d+):", transcript_raw)))

    # One editable text box per speaker. The widgets are wrapped in
    # `mo.ui.dictionary` so that edits are reactive; UI elements stored in a
    # plain dict do not trigger re-execution of dependent cells.
    role_inputs = mo.ui.dictionary(
        {
            speaker: mo.ui.text(value=speaker, label=speaker, full_width=True)
            for speaker in speakers
        }
    )

    mo.md("### Edit Speaker Labels\n\nEnter the name/role for each speaker:")
    return (role_inputs,)


@app.cell
def _(role_inputs):
    # Display the role inputs as a form.
    role_inputs
    return


@app.cell
def _(mo, re, role_inputs, transcript_raw):
    # Apply the user-supplied labels to the transcript.
    _labels = role_inputs.value  # {speaker_id: label text}

    # Single-pass substitution: each "SPEAKER_NN:" is replaced at most once, so a
    # label that itself looks like another speaker id cannot be re-replaced.
    # An empty label keeps the original speaker id.
    labeled_transcript = re.sub(
        r"(SPEAKER_\d+):",
        lambda m: f"{_labels.get(m.group(1)) or m.group(1)}:",
        transcript_raw,
    )

    # Human-readable summary of the mapping, also fed into the analysis prompt.
    role_mapping = "\n".join(
        f"- {speaker_id} → {label}" for speaker_id, label in sorted(_labels.items())
    )

    mo.md("\n".join(["### Role Mapping Applied", "", role_mapping]))
    return labeled_transcript, role_mapping


@app.cell(hide_code=True)
def _(mo):
    mo.md(r"""
    ## Step 3: Thematic Analysis

    Configure your analysis task and run the thematic analysis.
    """)
    return


@app.cell
def _(mo):
    # Step 3a: Define themes for labelling
    themes_input = mo.ui.text_area(
        value="""brand voice and tone
customer experience priorities
design system and consistency
AI and conversational interfaces""",
        label="Themes (one per line)",
        full_width=True,
        rows=6,
    )

    # vstack so that both the instructions and the widget are rendered; only the
    # last expression of a cell is displayed.
    mo.vstack(
        [
            mo.md("""
            ### Step 3a: Define Themes

            Enter one theme per line. These will be used to label each interview transcript.
            Themes may overlap; the same section can relate to multiple themes.
            """),
            themes_input,
        ]
    )
    return (themes_input,)


@app.cell
def _(themes_input):
    # Parse themes into a clean list: strip whitespace, drop blank lines, and
    # drop duplicates (order preserved) so no theme is labelled twice.
    theme_list = list(
        dict.fromkeys(
            line.strip()
            for line in (themes_input.value or "").splitlines()
            if line.strip()
        )
    )
    return (theme_list,)


@app.cell
def _(Path, mo):
    # Output directory for the per-interview, per-theme JSON label files.
    OUTPUT_THEME_DIR = Path("data/labels")
    OUTPUT_THEME_DIR.mkdir(parents=True, exist_ok=True)

    theme_label_button = mo.ui.run_button(
        label="Run Theme Labelling for This Interview"
    )

    mo.vstack(
        [
            mo.md(f"""
            ### Step 3b: LLM-based Theme Labelling

            This step runs an LLM over the current interview transcript for each defined theme
            and saves one JSON file per theme for this interview in `{OUTPUT_THEME_DIR}`.

            For each theme, the model will return full sections of the conversation (multi-sentence chunks,
            not just short quotes) that are about that theme.
            """),
            theme_label_button,
        ]
    )
    return OUTPUT_THEME_DIR, theme_label_button


@app.cell
def _(
    MODEL,
    OUTPUT_THEME_DIR,
    Path,
    client,
    file_dropdown,
    json,
    labeled_transcript,
    mo,
    re,
    theme_label_button,
    theme_list,
):
    # Step 3b (execution): ask the LLM, once per theme, which sections of the
    # current transcript are about that theme and write one JSON file per theme.
    from datetime import datetime, timezone

    theme_label_results = {}

    if theme_label_button.value and file_dropdown.value and theme_list:
        _interview_id = Path(file_dropdown.value).stem

        for _theme in theme_list:
            # The transcript is fenced with ''' because a triple double quote
            # would terminate this f-string early.
            _prompt = f"""You are an expert qualitative researcher.
You will analyse a single interview transcript for ONE specific theme.

Theme: "{_theme}"

Tasks:
1. Decide if the theme is present in this interview.
2. If present, estimate how relevant it is on a 0–1 scale where 0 = not mentioned, 0.5 = moderately important, 1 = central theme of the interview.
3. Identify all sections of the conversation that are primarily about this theme. A section can span multiple consecutive utterances and should form a coherent piece of the dialogue about the theme, not just a single sentence.

Each section should include:
- the dominant speaker label (or "mixed" if multiple)
- the full section text (one or more sentences)

Return your answer ONLY as a JSON object with this schema:
{{
  "theme": string,            // the theme name
  "present": bool,            // whether the theme appears
  "relevance": float,         // 0.0–1.0
  "sections": [
    {{
      "speaker": string,      // main speaker label for the section
      "section_text": string  // full section text about the theme
    }}
  ]
}}

Transcript:
'''
{labeled_transcript}
'''
"""

            _response = client.generate(model=MODEL, prompt=_prompt)
            _raw_text = _response.response.strip()

            # First try the reply as-is; if that fails, fall back to the text
            # between the outermost braces (models often wrap JSON in prose).
            _parsed = None
            try:
                _parsed = json.loads(_raw_text)
            except json.JSONDecodeError:
                _start, _end = _raw_text.find("{"), _raw_text.rfind("}")
                if 0 <= _start < _end:
                    try:
                        _parsed = json.loads(_raw_text[_start : _end + 1])
                    except json.JSONDecodeError:
                        _parsed = None

            # Anything that is not a JSON object (parse failure, bare list, ...)
            # becomes an explicit "not present" record that keeps the raw reply.
            if not isinstance(_parsed, dict):
                _parsed = {
                    "theme": _theme,
                    "present": False,
                    "relevance": 0.0,
                    "sections": [],
                    "_parse_error": True,
                    "_raw": _raw_text,
                }

            # Normalise fields so downstream code can rely on their types.
            _parsed["theme"] = _parsed.get("theme", _theme)
            _parsed["present"] = bool(_parsed.get("present", False))
            try:
                _parsed["relevance"] = float(_parsed.get("relevance", 0.0))
            except (TypeError, ValueError):
                _parsed["relevance"] = 0.0
            if not isinstance(_parsed.get("sections"), list):
                _parsed["sections"] = []

            theme_label_results[_theme] = _parsed

            # File name: "<interview>__<theme>.json". Whitespace and characters
            # that are unsafe in file names (e.g. "/") collapse to "_".
            _slug = re.sub(r"[^\w.-]+", "_", _theme).strip("_") or "theme"
            _out_path = OUTPUT_THEME_DIR / f"{_interview_id}__{_slug}.json"
            _out_data = {
                "interview_id": _interview_id,
                "theme": _parsed["theme"],
                "present": _parsed["present"],
                "relevance": _parsed["relevance"],
                "sections": _parsed["sections"],
                # Timezone-aware UTC timestamp, rendered with a trailing "Z".
                "generated_at": datetime.now(timezone.utc)
                .isoformat()
                .replace("+00:00", "Z"),
            }
            _out_path.write_text(
                json.dumps(_out_data, ensure_ascii=False, indent=2),
                encoding="utf-8",
            )

    if not theme_label_button.value:
        _status = "Click 'Run Theme Labelling for This Interview' to start."
    elif not file_dropdown.value:
        _status = "No transcript selected."
    elif not theme_list:
        _status = "No themes defined. Please add at least one theme."
    else:
        _status = (
            f"Labelled {len(theme_label_results)} themes for current interview. "
            f"JSON files written to '{OUTPUT_THEME_DIR}'."
        )

    mo.md("\n".join(["### Theme Labelling Status", "", _status]))
    return


@app.cell
def _(Path, mo):
    # Step 3c: Load all labeled transcripts (assumed precomputed)
    # NOTE(review): Step 3b writes its JSON files to "data/labels", while this
    # step reads "data/labeled_transcripts" - confirm whether both are meant to
    # be the same directory.
    LABELED_DIR = Path("data/labeled_transcripts")
    LABELED_DIR.mkdir(parents=True, exist_ok=True)

    labeled_files = sorted(LABELED_DIR.glob("*.json"))

    mo.vstack(
        [
            mo.md(f"""
            ### Step 3c: Use Pre-Labeled Transcripts

            Found **{len(labeled_files)}** labeled transcript files in `{LABELED_DIR}`.
            These will be used to aggregate themes across all interviews.
            """),
            labeled_files,
        ]
    )
    return (labeled_files,)


@app.cell
def _(json, labeled_files):
    # Read every label file into a flat list of normalised records.
    all_labeled_records = []
    for _file in labeled_files:
        try:
            _data = json.loads(_file.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # Unreadable, badly encoded or invalid JSON: skip this file.
            continue
        if not isinstance(_data, dict):
            # A label file must contain a JSON object.
            continue

        try:
            _relevance = float(_data.get("relevance", 0.0))
        except (TypeError, ValueError):
            _relevance = 0.0

        _sections = _data.get("sections")
        all_labeled_records.append(
            {
                # Fall back to the "<interview>__<theme>" file-name prefix.
                "interview_id": _data.get("interview_id")
                or _file.stem.split("__", 1)[0],
                "theme": _data.get("theme", ""),
                "present": bool(_data.get("present", False)),
                "relevance": _relevance,
                "sections": _sections if isinstance(_sections, list) else [],
            }
        )
    return (all_labeled_records,)


@app.cell
def _(all_labeled_records, mo):
    # Derive full theme and interview sets
    all_themes = sorted({r["theme"] for r in all_labeled_records if r["theme"]})
    all_interviews = sorted({r["interview_id"] for r in all_labeled_records})

    theme_selector = mo.ui.dropdown(
        options={t: t for t in all_themes},
        label="Select theme to explore across all interviews",
    )

    mo.vstack(
        [
            mo.md("### Step 3d: Explore Themes Across All Labeled Transcripts"),
            theme_selector,
        ]
    )
    return all_interviews, theme_selector


@app.cell
def _(all_interviews, all_labeled_records, mo, theme_selector):
    # Summarise the selected theme across all interviews and list its sections.
    import statistics

    selected_theme = theme_selector.value

    theme_summary = {}
    theme_sections = []

    if selected_theme:
        _records = [r for r in all_labeled_records if r["theme"] == selected_theme]
        # Count distinct interviews so duplicate label files for the same
        # interview cannot push the share above 1.
        _with_theme = {r["interview_id"] for r in _records if r["present"]}
        _relevances = [r["relevance"] for r in _records if r["present"]]

        theme_summary = {
            "theme": selected_theme,
            "num_interviews": len(all_interviews),
            "num_interviews_with_theme": len(_with_theme),
            "share_of_interviews_with_theme": (
                len(_with_theme) / len(all_interviews) if all_interviews else 0.0
            ),
            "avg_relevance_if_present": (
                statistics.mean(_relevances) if _relevances else 0.0
            ),
        }

        theme_sections = [
            {
                "interview_id": _rec["interview_id"],
                "speaker": _sec.get("speaker", ""),
                "section_text": _sec.get("section_text", ""),
                "relevance": _rec["relevance"],
            }
            for _rec in _records
            for _sec in _rec["sections"]
            if isinstance(_sec, dict)  # ignore malformed section entries
        ]

    _num_with_theme = theme_summary.get("num_interviews_with_theme", 0)
    _share = theme_summary.get("share_of_interviews_with_theme", 0.0)
    _avg_relevance = theme_summary.get("avg_relevance_if_present", 0.0)

    _overview = mo.md(f"""
    #### Theme Overview: `{selected_theme or "None selected"}`

    - Total interviews: **{len(all_interviews)}**
    - Interviews where theme is present: **{_num_with_theme}**
    - Share of interviews with theme: **{_share:.2f}**
    - Avg. relevance (when present): **{_avg_relevance:.2f}**
    """)

    if theme_sections:
        _details = mo.ui.table(
            [
                {
                    "Interview": s["interview_id"],
                    "Speaker": s["speaker"],
                    "Relevance": f"{s['relevance']:.2f}",
                    "Section": s["section_text"],
                }
                for s in theme_sections
            ]
        )
    else:
        _details = mo.md("_No sections for this theme yet._")

    # A cell only renders its final expression, so stack overview and details.
    mo.vstack([_overview, _details])
    return


@app.cell
def _(mo):
    # Editable analysis task prompt
    analysis_task_input = mo.ui.text_area(
        value="""Perform a thematic analysis of this interview transcript. Identify and describe:

1. **Key Themes** - Major topics and ideas that emerge from the conversation
2. **Supporting Quotes** - Direct quotes that exemplify each theme (include speaker attribution)
3. **Insights** - Notable observations or implications from the discussion

Focus on themes related to:
- Brand voice and tone strategy
- Customer experience priorities
- Design system and consistency
- AI/conversational interface considerations""",
        label="Analysis Task",
        full_width=True,
        rows=12,
    )
    analysis_task_input
    return (analysis_task_input,)


@app.cell
def _(analysis_task_input, labeled_transcript, mo, role_mapping):
    # Build full analysis prompt from the role mapping, the editable task
    # description and the relabelled transcript.
    full_analysis_prompt = f"""You are an expert qualitative researcher specializing in thematic analysis of interview data.

## Speaker Roles
{role_mapping}

## Task
{analysis_task_input.value}

## Interview Transcript
'''
{labeled_transcript}
'''

Provide your analysis in well-structured markdown format."""

    run_analysis_button = mo.ui.run_button(label="Run Thematic Analysis")

    mo.vstack(
        [
            mo.md(f"**Prompt length:** ~{len(full_analysis_prompt.split())} words"),
            run_analysis_button,
        ]
    )
    return full_analysis_prompt, run_analysis_button


@app.cell
def _(full_analysis_prompt, mo):
    # Show the exact prompt that will be sent to the model.
    mo.md("\n".join(["# Full Analysis Prompt", "", "---", "", full_analysis_prompt]))
    return


@app.cell
def _(MODEL, client, full_analysis_prompt, mo, run_analysis_button):
    # Run the full thematic analysis only when the button was clicked.
    analysis_response = ""
    if run_analysis_button.value:
        _response = client.generate(model=MODEL, prompt=full_analysis_prompt)
        analysis_response = _response.response

    _body = (
        analysis_response
        if analysis_response
        else "_Click 'Run Thematic Analysis' to generate analysis_"
    )
    mo.md("\n".join(["## Analysis Results", "", _body]))
    return


if __name__ == "__main__":
    app.run()