diff --git a/Thematic_Analysis.py b/Thematic_Analysis.py
new file mode 100644
index 0000000..c4e9f42
--- /dev/null
+++ b/Thematic_Analysis.py
@@ -0,0 +1,273 @@
import marimo

# Marimo notebook boilerplate: records the generating version and creates the app.
__generated_with = "0.18.0"
app = marimo.App(width="medium")
+
+
@app.cell
def _():
    # Setup cell: imports, VM/model configuration, and the Ollama client
    # shared by all later cells.
    import marimo as mo
    from pathlib import Path
    from utils import connect_qumo_ollama, load_srt

    # NOTE(review): the 70B model assumes the high-performance GPU VM is the
    # one running — confirm before use.
    VM_NAME = 'hiperf-gpu'
    MODEL = 'llama3.3:70b'

    client = connect_qumo_ollama(VM_NAME)
    return MODEL, Path, client, load_srt, mo
+
+
@app.cell(hide_code=True)
def _(mo):
    # Notebook title and introduction.
    mo.md(r"""
    # Interview Transcript Thematic Analysis

    This notebook loads interview transcripts (SRT files) and runs thematic analysis using LLMs.
    """)
    return
+
+
@app.cell
def _(Path, mo):
    # Load transcript from SRT file
    # Discover all .srt transcripts on disk and offer them in a dropdown.
    TRANSCRIPT_DIR = Path("data/transcripts")
    srt_files = list(TRANSCRIPT_DIR.glob("*.srt"))

    # File selector: maps display name -> full path string (dropdown.value is the path).
    file_dropdown = mo.ui.dropdown(
        options={f.name: str(f) for f in srt_files},
        label="Select transcript file"
    )
    file_dropdown
    return (file_dropdown,)
+
+
@app.cell
def _(file_dropdown, load_srt, mo):
    # Load and display transcript preview
    # transcript_raw stays "" until a file is selected, so downstream cells
    # always receive a string.
    transcript_raw = ""
    if file_dropdown.value:
        # load_srt returns "SPEAKER_XX: text" paragraphs with timestamps stripped.
        transcript_raw = load_srt(file_dropdown.value)

    # NOTE(review): the bare "Show first 2000 characters" line looks like a
    # leftover from a collapsible element — confirm intended. A transcript
    # containing ``` would also break the code fence below.
    mo.md(f"""
    ## Transcript Preview

    **File:** `{file_dropdown.value or 'None selected'}`
    **Length:** {len(transcript_raw)} characters, ~{len(transcript_raw.split())} words


    Show first 2000 characters

    ```
    {transcript_raw[:2000]}...
    ```

    """)
    return (transcript_raw,)
+
+
@app.cell(hide_code=True)
def _(mo):
    # Section header for Step 1.
    mo.md(r"""
    ## Step 1: Infer Speaker Roles

    The model will analyze the transcript to identify who is the interviewer and who is the interviewee.
    """)
    return
+
+
@app.cell
def _(mo, transcript_raw):
    # Infer speaker roles from transcript context
    # Only the first 4000 characters are sent — enough context to identify
    # roles without paying for the full transcript.
    role_inference_prompt = f"""Analyze this interview transcript and identify the role of each speaker.

Based on the conversation context, determine who is:
- The interviewer(s) - asking questions, guiding the conversation
- The interviewee(s) - providing answers, sharing expertise/opinions

Return ONLY a simple mapping in this exact format (one per line):
SPEAKER_XX: Role - Brief description

For example:
SPEAKER_00: Interviewer - Michael from the voice branding team
SPEAKER_01: Interviewee - Head of Digital Design


{transcript_raw[:4000]}

"""

    # Gate the LLM call behind an explicit button press.
    infer_roles_button = mo.ui.run_button(label="Infer Speaker Roles")
    infer_roles_button
    return infer_roles_button, role_inference_prompt
+
+
@app.cell
def _(MODEL, client, infer_roles_button, mo, role_inference_prompt):
    # Role-inference call: runs only after the button has been pressed
    # (run_button.value is falsy until then), so reruns are user-controlled.
    inferred_roles_text = ""
    if infer_roles_button.value:
        response = client.generate(model=MODEL, prompt=role_inference_prompt)
        inferred_roles_text = response.response

    mo.md(f"""
    ### Inferred Roles

    {inferred_roles_text if inferred_roles_text else "_Click 'Infer Speaker Roles' to analyze the transcript_"}
    """)
    return
+
+
@app.cell(hide_code=True)
def _(mo):
    # Section header for Step 2.
    mo.md(r"""
    ## Step 2: Confirm or Edit Speaker Roles

    Review the inferred roles below and make corrections if needed.
    """)
    return
+
+
@app.cell
def _(mo, transcript_raw):
    import re
    # Extract unique speakers from transcript (labels produced by load_srt).
    speakers = sorted(set(re.findall(r'(SPEAKER_\d+):', transcript_raw)))

    # Create editable text inputs for each speaker.
    # Each field defaults to the raw speaker id, so an unedited field leaves
    # the transcript unchanged downstream.
    role_inputs = {
        speaker: mo.ui.text(
            value=f"{speaker}",
            label=speaker,
            full_width=True
        )
        for speaker in speakers
    }

    mo.md("### Edit Speaker Labels\n\nEnter the name/role for each speaker:")
    return (role_inputs,)
+
+
@app.cell
def _(mo, role_inputs):
    # Display role inputs as a form, in stable (sorted) speaker order.
    mo.vstack([role_inputs[k] for k in sorted(role_inputs.keys())])
    return
+
@app.cell
def _(mo, role_inputs, transcript_raw):
    # Apply role labels to transcript by rewriting "SPEAKER_XX:" prefixes.
    # NOTE(review): sequential replace() could collide if a user label equals
    # another speaker id (e.g. labelling SPEAKER_00 as "SPEAKER_01") — confirm
    # this is acceptable for the expected inputs.
    labeled_transcript = transcript_raw
    for speaker_id, input_widget in role_inputs.items():
        if input_widget.value and input_widget.value != speaker_id:
            labeled_transcript = labeled_transcript.replace(f"{speaker_id}:", f"{input_widget.value}:")

    # Build role mapping summary for display and for the analysis prompt.
    role_mapping = "\n".join([
        f"- {speaker_id} → {input_widget.value}"
        for speaker_id, input_widget in sorted(role_inputs.items())
    ])

    mo.md(f"""
    ### Role Mapping Applied

    {role_mapping}
    """)
    return labeled_transcript, role_mapping
+
+
@app.cell(hide_code=True)
def _(mo):
    # Section header for Step 3.
    mo.md(r"""
    ## Step 3: Thematic Analysis

    Configure your analysis task and run the thematic analysis.
    """)
    return
+
+
@app.cell
def _(mo):
    # Editable analysis task prompt — the default text below can be changed
    # in the UI before running the analysis.
    analysis_task_input = mo.ui.text_area(
        value="""Perform a thematic analysis of this interview transcript.

Identify and describe:
1. **Key Themes** - Major topics and ideas that emerge from the conversation
2. **Supporting Quotes** - Direct quotes that exemplify each theme (include speaker attribution)
3. **Insights** - Notable observations or implications from the discussion

Focus on themes related to:
- Brand voice and tone strategy
- Customer experience priorities
- Design system and consistency
- AI/conversational interface considerations""",
        label="Analysis Task",
        full_width=True,
        rows=12
    )
    analysis_task_input
    return (analysis_task_input,)
+
+
@app.cell
def _(analysis_task_input, labeled_transcript, mo, role_mapping):
    # Build full analysis prompt: system persona + role mapping + user-edited
    # task + the full labeled transcript delimited by ''' markers.
    full_analysis_prompt = f"""You are an expert qualitative researcher specializing in thematic analysis of interview data.

## Speaker Roles
{role_mapping}

## Task
{analysis_task_input.value}

## Interview Transcript

'''


{labeled_transcript}


'''

Provide your analysis in well-structured markdown format."""

    run_analysis_button = mo.ui.run_button(label="Run Thematic Analysis")

    # Show an approximate prompt size so the user can sanity-check context length.
    mo.vstack([
        mo.md(f"**Prompt length:** ~{len(full_analysis_prompt.split())} words"),
        run_analysis_button
    ])
    return full_analysis_prompt, run_analysis_button
+
+
@app.cell
def _(full_analysis_prompt, mo):
    # Debug view: render the exact prompt that will be sent to the model.
    mo.md(rf"""
    # Full Analysis Prompt

    ---

    {full_analysis_prompt}
    """)
    return
+
+
@app.cell
def _(MODEL, client, full_analysis_prompt, mo, run_analysis_button):
    # Thematic-analysis call: gated behind the run button, mirroring the
    # role-inference cell above.
    analysis_response = ""
    if run_analysis_button.value:
        response_2 = client.generate(model=MODEL, prompt=full_analysis_prompt)
        analysis_response = response_2.response

    mo.md(f"""
    ## Analysis Results

    {analysis_response if analysis_response else "_Click 'Run Thematic Analysis' to generate analysis_"}
    """)
    return
+
+
if __name__ == "__main__":
    # Run the marimo app when executed as a script.
    app.run()
diff --git a/VB_interviews_sandbox.py b/VB_interviews_sandbox.py
index 1ba8a7a..29c0827 100644
--- a/VB_interviews_sandbox.py
+++ b/VB_interviews_sandbox.py
@@ -9,8 +9,8 @@ def _():
import marimo as mo
from utils import connect_qumo_ollama
- # VM_NAME = 'hiperf-gpu'
- VM_NAME = 'ollama-lite'
+ VM_NAME = 'hiperf-gpu'
+ # VM_NAME = 'ollama-lite'
client = connect_qumo_ollama(VM_NAME)
return VM_NAME, client, mo
diff --git a/layouts/VB_interviews_sandbox.slides.json b/layouts/VB_interviews_sandbox.slides.json
deleted file mode 100644
index af4970a..0000000
--- a/layouts/VB_interviews_sandbox.slides.json
+++ /dev/null
@@ -1,4 +0,0 @@
-{
- "type": "slides",
- "data": {}
-}
\ No newline at end of file
diff --git a/utils.py b/utils.py
index 4ab1c1d..9b6495d 100644
--- a/utils.py
+++ b/utils.py
@@ -2,11 +2,66 @@
Standard utils for this repository
"""
+import re
+from pathlib import Path
+
import requests
import ollama
from ollama import Client
+def load_srt(path: str | Path) -> str:
+ """Load and parse an SRT file, returning clean transcript with speaker labels.
+
+ Args:
+ path: Path to the SRT file
+
+ Returns:
+ Clean transcript string with format "SPEAKER_XX: text" per line,
+ timestamps stripped, consecutive lines from same speaker merged.
+ """
+ path = Path(path)
+ content = path.read_text(encoding='utf-8')
+
+ # Parse SRT blocks: sequence number, timestamp, speaker|text
+ # Pattern matches: number, timestamp line, content line(s)
+ blocks = re.split(r'\n\n+', content.strip())
+
+ turns = []
+ for block in blocks:
+ lines = block.strip().split('\n')
+ if len(lines) < 3:
+ continue
+
+ # Skip sequence number (line 0) and timestamp (line 1)
+ # Content is line 2 onwards
+ text_lines = lines[2:]
+ text = ' '.join(text_lines)
+
+ # Parse speaker|text format
+ if '|' in text:
+ speaker, utterance = text.split('|', 1)
+ speaker = speaker.strip()
+ utterance = utterance.strip()
+ else:
+ speaker = "UNKNOWN"
+ utterance = text.strip()
+
+ turns.append((speaker, utterance))
+
+ # Merge consecutive turns from same speaker
+ merged = []
+ for speaker, utterance in turns:
+ if merged and merged[-1][0] == speaker:
+ merged[-1] = (speaker, merged[-1][1] + ' ' + utterance)
+ else:
+ merged.append((speaker, utterance))
+
+ # Format as "SPEAKER_XX: text"
+ transcript_lines = [f"{speaker}: {utterance}" for speaker, utterance in merged]
+ return '\n\n'.join(transcript_lines)
+
+
def connect_qumo_ollama(vm_name: str ='ollama-lite') -> Client:
"""Establish connection to Qumo Ollama instance
@@ -25,7 +80,7 @@ def connect_qumo_ollama(vm_name: str ='ollama-lite') -> Client:
except requests.ConnectionError:
print(f"Failed to reach {QUMO_OLLAMA_URL}. Check that the VM is running and Tailscale is up")
- print("Connection succesful.\nAvailable models:")
+    print(f"Connection successful. WebUI available at: http://{vm_name}.tail44fa00.ts.net:3000\nAvailable models:")
for m in client.list().models:
print(f" - '{m.model}' ")
return client