diff --git a/Thematic_Analysis.py b/Thematic_Analysis.py new file mode 100644 index 0000000..c4e9f42 --- /dev/null +++ b/Thematic_Analysis.py @@ -0,0 +1,273 @@ +import marimo + +__generated_with = "0.18.0" +app = marimo.App(width="medium") + + +@app.cell +def _(): + import marimo as mo + from pathlib import Path + from utils import connect_qumo_ollama, load_srt + + VM_NAME = 'hiperf-gpu' + MODEL = 'llama3.3:70b' + + client = connect_qumo_ollama(VM_NAME) + return MODEL, Path, client, load_srt, mo + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + # Interview Transcript Thematic Analysis + + This notebook loads interview transcripts (SRT files) and runs thematic analysis using LLMs. + """) + return + + +@app.cell +def _(Path, mo): + # Load transcript from SRT file + TRANSCRIPT_DIR = Path("data/transcripts") + srt_files = list(TRANSCRIPT_DIR.glob("*.srt")) + + # File selector + file_dropdown = mo.ui.dropdown( + options={f.name: str(f) for f in srt_files}, + label="Select transcript file" + ) + file_dropdown + return (file_dropdown,) + + +@app.cell +def _(file_dropdown, load_srt, mo): + # Load and display transcript preview + transcript_raw = "" + if file_dropdown.value: + transcript_raw = load_srt(file_dropdown.value) + + mo.md(f""" + ## Transcript Preview + + **File:** `{file_dropdown.value or 'None selected'}` + **Length:** {len(transcript_raw)} characters, ~{len(transcript_raw.split())} words + +
+ Show first 2000 characters + + ``` + {transcript_raw[:2000]}... + ``` +
+ """) + return (transcript_raw,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## Step 1: Infer Speaker Roles + + The model will analyze the transcript to identify who is the interviewer and who is the interviewee. + """) + return + + +@app.cell +def _(mo, transcript_raw): + # Infer speaker roles from transcript context + role_inference_prompt = f"""Analyze this interview transcript and identify the role of each speaker. + + Based on the conversation context, determine who is: + - The interviewer(s) - asking questions, guiding the conversation + - The interviewee(s) - providing answers, sharing expertise/opinions + + Return ONLY a simple mapping in this exact format (one per line): + SPEAKER_XX: Role - Brief description + + For example: + SPEAKER_00: Interviewer - Michael from the voice branding team + SPEAKER_01: Interviewee - Head of Digital Design + + + {transcript_raw[:4000]} + + """ + + infer_roles_button = mo.ui.run_button(label="Infer Speaker Roles") + infer_roles_button + return infer_roles_button, role_inference_prompt + + +@app.cell +def _(MODEL, client, infer_roles_button, mo, role_inference_prompt): + inferred_roles_text = "" + if infer_roles_button.value: + response = client.generate(model=MODEL, prompt=role_inference_prompt) + inferred_roles_text = response.response + + mo.md(f""" + ### Inferred Roles + + {inferred_roles_text if inferred_roles_text else "_Click 'Infer Speaker Roles' to analyze the transcript_"} + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## Step 2: Confirm or Edit Speaker Roles + + Review the inferred roles below and make corrections if needed. + """) + return + + +@app.cell +def _(mo, transcript_raw): + import re + # Extract unique speakers from transcript + speakers = sorted(set(re.findall(r'(SPEAKER_\d+):', transcript_raw))) + + # Create editable text inputs for each speaker + role_inputs = { + speaker: mo.ui.text( + value=f"{speaker}", + label=speaker, + full_width=True + ) + for speaker in speakers + } + + mo.md("### Edit Speaker Labels\n\nEnter the name/role for each speaker:") + return (role_inputs,) + + +@app.cell +def _(mo, role_inputs): + # Display role inputs as a form + mo.vstack([role_inputs[k] for k in sorted(role_inputs.keys())]) + return + + +@app.cell +def _(mo, role_inputs, transcript_raw): + # Apply role labels to transcript + labeled_transcript = transcript_raw + for speaker_id, input_widget in role_inputs.items(): + if input_widget.value and input_widget.value != speaker_id: + labeled_transcript = labeled_transcript.replace(f"{speaker_id}:", f"{input_widget.value}:") + + # Build role mapping summary + role_mapping = "\n".join([ + f"- {speaker_id} → {input_widget.value}" + for speaker_id, input_widget in sorted(role_inputs.items()) + ]) + + mo.md(f""" + ### Role Mapping Applied + + {role_mapping} + """) + return labeled_transcript, role_mapping + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## Step 3: Thematic Analysis + + Configure your analysis task and run the thematic analysis. + """) + return + + +@app.cell +def _(mo): + # Editable analysis task prompt + analysis_task_input = mo.ui.text_area( + value="""Perform a thematic analysis of this interview transcript. + + Identify and describe: + 1. **Key Themes** - Major topics and ideas that emerge from the conversation + 2. **Supporting Quotes** - Direct quotes that exemplify each theme (include speaker attribution) + 3. **Insights** - Notable observations or implications from the discussion + + Focus on themes related to: + - Brand voice and tone strategy + - Customer experience priorities + - Design system and consistency + - AI/conversational interface considerations""", + label="Analysis Task", + full_width=True, + rows=12 + ) + analysis_task_input + return (analysis_task_input,) + + +@app.cell +def _(analysis_task_input, labeled_transcript, mo, role_mapping): + # Build full analysis prompt + full_analysis_prompt = f"""You are an expert qualitative researcher specializing in thematic analysis of interview data. + + ## Speaker Roles + {role_mapping} + + ## Task + {analysis_task_input.value} + + ## Interview Transcript + + ''' + + + {labeled_transcript} + + + ''' + + Provide your analysis in well-structured markdown format.""" + + run_analysis_button = mo.ui.run_button(label="Run Thematic Analysis") + + mo.vstack([ + mo.md(f"**Prompt length:** ~{len(full_analysis_prompt.split())} words"), + run_analysis_button + ]) + return full_analysis_prompt, run_analysis_button + + +@app.cell +def _(full_analysis_prompt, mo): + mo.md(rf""" + # Full Analysis Prompt + + --- + + {full_analysis_prompt} + """) + return + + +@app.cell +def _(MODEL, client, full_analysis_prompt, mo, run_analysis_button): + analysis_response = "" + if run_analysis_button.value: + response_2 = client.generate(model=MODEL, prompt=full_analysis_prompt) + analysis_response = response_2.response + + mo.md(f""" + ## Analysis Results + + {analysis_response if analysis_response else "_Click 'Run Thematic Analysis' to generate analysis_"} + """) + return + + +if __name__ == "__main__": + app.run() diff --git a/VB_interviews_sandbox.py b/VB_interviews_sandbox.py index 1ba8a7a..29c0827 100644 --- a/VB_interviews_sandbox.py +++ b/VB_interviews_sandbox.py @@ -9,8 +9,8 @@ def _(): import marimo as mo from utils import connect_qumo_ollama - # VM_NAME = 'hiperf-gpu' - VM_NAME = 'ollama-lite' + VM_NAME = 'hiperf-gpu' + # VM_NAME = 'ollama-lite' client = connect_qumo_ollama(VM_NAME) return VM_NAME, client, mo diff --git a/layouts/VB_interviews_sandbox.slides.json b/layouts/VB_interviews_sandbox.slides.json deleted file mode 100644 index af4970a..0000000 --- a/layouts/VB_interviews_sandbox.slides.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "type": "slides", - "data": {} -} \ No newline at end of file diff --git a/utils.py b/utils.py index 4ab1c1d..9b6495d 100644 --- a/utils.py +++ b/utils.py @@ -2,11 +2,66 @@ Standard utils for this repository """ +import re +from pathlib import Path + import requests import ollama from ollama import Client +def load_srt(path: str | Path) -> str: + """Load and parse an SRT file, returning clean transcript with speaker labels. + + Args: + path: Path to the SRT file + + Returns: + Clean transcript string with format "SPEAKER_XX: text" per line, + timestamps stripped, consecutive lines from same speaker merged. + """ + path = Path(path) + content = path.read_text(encoding='utf-8') + + # Parse SRT blocks: sequence number, timestamp, speaker|text + # Pattern matches: number, timestamp line, content line(s) + blocks = re.split(r'\n\n+', content.strip()) + + turns = [] + for block in blocks: + lines = block.strip().split('\n') + if len(lines) < 3: + continue + + # Skip sequence number (line 0) and timestamp (line 1) + # Content is line 2 onwards + text_lines = lines[2:] + text = ' '.join(text_lines) + + # Parse speaker|text format + if '|' in text: + speaker, utterance = text.split('|', 1) + speaker = speaker.strip() + utterance = utterance.strip() + else: + speaker = "UNKNOWN" + utterance = text.strip() + + turns.append((speaker, utterance)) + + # Merge consecutive turns from same speaker + merged = [] + for speaker, utterance in turns: + if merged and merged[-1][0] == speaker: + merged[-1] = (speaker, merged[-1][1] + ' ' + utterance) + else: + merged.append((speaker, utterance)) + + # Format as "SPEAKER_XX: text" + transcript_lines = [f"{speaker}: {utterance}" for speaker, utterance in merged] + return '\n\n'.join(transcript_lines) + + def connect_qumo_ollama(vm_name: str ='ollama-lite') -> Client: """Establish connection to Qumo Ollama instance @@ -25,7 +80,7 @@ def connect_qumo_ollama(vm_name: str ='ollama-lite') -> Client: except requests.ConnectionError: print(f"Failed to reach {QUMO_OLLAMA_URL}. Check that the VM is running and Tailscale is up") - print("Connection succesful.\nAvailable models:") + print(f"Connection succesful. WebUI available at: http://{vm_name}.tail44fa00.ts.net:3000\nAvailable models:") for m in client.list().models: print(f" - '{m.model}' ") return client