thematic analysis opzetje

This commit is contained in:
2025-12-01 15:09:16 +01:00
parent 74aecff2bd
commit 9499d6c068
4 changed files with 331 additions and 7 deletions

273
Thematic_Analysis.py Normal file
View File

@@ -0,0 +1,273 @@
import marimo

# marimo version string recorded in this notebook file.
__generated_with = "0.18.0"
# The notebook application object; every @app.cell function below registers one cell on it.
app = marimo.App(width="medium")
@app.cell
def _():
    # Setup cell: imports, VM / model selection, and the Ollama client that the
    # LLM-calling cells further down receive as `client`.
    import marimo as mo
    from pathlib import Path
    from utils import connect_qumo_ollama, load_srt

    # Name of the VM passed to connect_qumo_ollama, and the model name sent
    # with every generate() call.
    VM_NAME = 'hiperf-gpu'
    MODEL = 'llama3.3:70b'
    # The connection is attempted when this cell runs.
    client = connect_qumo_ollama(VM_NAME)
    return MODEL, Path, client, load_srt, mo
@app.cell(hide_code=True)
def _(mo):
    # Markdown-only cell: notebook title and a one-line description.
    mo.md(r"""
# Interview Transcript Thematic Analysis
This notebook loads interview transcripts (SRT files) and runs thematic analysis using LLMs.
""")
    return
@app.cell
def _(Path, mo):
    # Transcript picker: offer every *.srt file found in the transcript directory.
    TRANSCRIPT_DIR = Path("data/transcripts")
    # Sort by file name so the dropdown order is stable; glob returns entries
    # in directory-listing order, which is not guaranteed to be sorted.
    srt_files = sorted(TRANSCRIPT_DIR.glob("*.srt"), key=lambda f: f.name)
    # File selector: the displayed option is the file name, the selected value
    # is the path string that the preview cell hands to load_srt.
    file_dropdown = mo.ui.dropdown(
        options={f.name: str(f) for f in srt_files},
        label="Select transcript file"
    )
    file_dropdown
    return (file_dropdown,)
@app.cell
def _(file_dropdown, load_srt, mo):
    # Load the selected transcript and display a collapsible preview.
    # transcript_raw stays "" until a file has been chosen in the dropdown.
    transcript_raw = ""
    if file_dropdown.value:
        transcript_raw = load_srt(file_dropdown.value)
    # Only show the trailing ellipsis when the preview really cuts the
    # transcript short; an empty or short transcript is shown as-is.
    # (Underscore-prefixed so the helper name stays local to this cell.)
    _preview_ellipsis = "..." if len(transcript_raw) > 2000 else ""
    mo.md(f"""
## Transcript Preview
**File:** `{file_dropdown.value or 'None selected'}`
**Length:** {len(transcript_raw)} characters, ~{len(transcript_raw.split())} words
<details>
<summary>Show first 2000 characters</summary>
```
{transcript_raw[:2000]}{_preview_ellipsis}
```
</details>
""")
    return (transcript_raw,)
@app.cell(hide_code=True)
def _(mo):
    # Markdown-only cell: heading and explanation for the role inference step.
    mo.md(r"""
## Step 1: Infer Speaker Roles
The model will analyze the transcript to identify who is the interviewer and who is the interviewee.
""")
    return
@app.cell
def _(mo, transcript_raw):
    # Build the prompt that asks the model to infer speaker roles from
    # transcript context. Only the first 4000 characters of the transcript are
    # embedded in the prompt; the rest is not sent for this step.
    role_inference_prompt = f"""Analyze this interview transcript and identify the role of each speaker.
Based on the conversation context, determine who is:
- The interviewer(s) - asking questions, guiding the conversation
- The interviewee(s) - providing answers, sharing expertise/opinions
Return ONLY a simple mapping in this exact format (one per line):
SPEAKER_XX: Role - Brief description
For example:
SPEAKER_00: Interviewer - Michael from the voice branding team
SPEAKER_01: Interviewee - Head of Digital Design
<transcript>
{transcript_raw[:4000]}
</transcript>
"""
    # The button's value is checked by the next cell before the model is called.
    infer_roles_button = mo.ui.run_button(label="Infer Speaker Roles")
    infer_roles_button
    return infer_roles_button, role_inference_prompt
@app.cell
def _(MODEL, client, infer_roles_button, mo, role_inference_prompt):
    # Query the model only while the run button's value is truthy; otherwise
    # keep the result empty so the placeholder hint is rendered instead.
    if infer_roles_button.value:
        response = client.generate(model=MODEL, prompt=role_inference_prompt)
        inferred_roles_text = response.response
    else:
        inferred_roles_text = ""
    # Fall back to the hint whenever there is no model output to show.
    _roles_display = inferred_roles_text or "_Click 'Infer Speaker Roles' to analyze the transcript_"
    mo.md(f"""
### Inferred Roles
{_roles_display}
""")
    return
@app.cell(hide_code=True)
def _(mo):
    # Markdown-only cell: heading and explanation for the role confirmation step.
    mo.md(r"""
## Step 2: Confirm or Edit Speaker Roles
Review the inferred roles below and make corrections if needed.
""")
    return
@app.cell
def _(mo, transcript_raw):
    import re
    # Extract unique speakers from transcript: every "SPEAKER_<digits>" that is
    # directly followed by a colon. Labels that do not match this pattern are
    # not offered for editing. Sorting is lexicographic on the id string.
    speakers = sorted(set(re.findall(r'(SPEAKER_\d+):', transcript_raw)))
    # Create editable text inputs for each speaker, pre-filled with the speaker
    # id itself (the label-applying cell treats an unchanged value as "no rename").
    # NOTE(review): the widgets are kept in a plain dict. marimo's docs recommend
    # mo.ui.dictionary / mo.ui.array for dynamically created elements so that
    # edits trigger dependent cells -- confirm that label edits actually
    # propagate to the cells that read role_inputs.
    role_inputs = {
        speaker: mo.ui.text(
            value=f"{speaker}",
            label=speaker,
            full_width=True
        )
        for speaker in speakers
    }
    mo.md("### Edit Speaker Labels\n\nEnter the name/role for each speaker:")
    return (role_inputs,)
@app.cell
def _(mo, role_inputs):
    # Render the per-speaker text inputs stacked vertically, ordered by speaker id.
    mo.vstack([role_inputs[sid] for sid in sorted(role_inputs)])
    return
@app.cell
def _(mo, role_inputs, transcript_raw):
    # Apply role labels to transcript: every "<speaker id>:" occurrence is
    # replaced by "<label>:" for speakers whose label was filled in and differs
    # from the id. Speakers with an empty or unchanged label are left untouched.
    labeled_transcript = transcript_raw
    for speaker_id, input_widget in role_inputs.items():
        if input_widget.value and input_widget.value != speaker_id:
            labeled_transcript = labeled_transcript.replace(f"{speaker_id}:", f"{input_widget.value}:")
    # Build role mapping summary, one bullet per speaker. The id and the label
    # are separated by an arrow so they do not run together (this text is shown
    # below and also embedded in the analysis prompt).
    role_mapping = "\n".join([
        f"- {speaker_id} → {input_widget.value}"
        for speaker_id, input_widget in sorted(role_inputs.items())
    ])
    mo.md(f"""
### Role Mapping Applied
{role_mapping}
""")
    return labeled_transcript, role_mapping
@app.cell(hide_code=True)
def _(mo):
    # Markdown-only cell: heading and explanation for the thematic analysis step.
    mo.md(r"""
## Step 3: Thematic Analysis
Configure your analysis task and run the thematic analysis.
""")
    return
@app.cell
def _(mo):
    # Editable analysis task prompt. The default text below is only a starting
    # point; whatever the text area contains is inserted into the "## Task"
    # section of the full analysis prompt.
    analysis_task_input = mo.ui.text_area(
        value="""Perform a thematic analysis of this interview transcript.
Identify and describe:
1. **Key Themes** - Major topics and ideas that emerge from the conversation
2. **Supporting Quotes** - Direct quotes that exemplify each theme (include speaker attribution)
3. **Insights** - Notable observations or implications from the discussion
Focus on themes related to:
- Brand voice and tone strategy
- Customer experience priorities
- Design system and consistency
- AI/conversational interface considerations""",
        label="Analysis Task",
        full_width=True,
        rows=12
    )
    analysis_task_input
    return (analysis_task_input,)
@app.cell
def _(analysis_task_input, labeled_transcript, mo, role_mapping):
    # Build full analysis prompt: researcher preamble, the speaker role mapping,
    # the user-editable task text, and the complete labeled transcript
    # (unlike the role inference prompt, the transcript is not truncated here).
    full_analysis_prompt = f"""You are an expert qualitative researcher specializing in thematic analysis of interview data.
## Speaker Roles
{role_mapping}
## Task
{analysis_task_input.value}
## Interview Transcript
'''
<transcript>
{labeled_transcript}
</transcript>
'''
Provide your analysis in well-structured markdown format."""
    # The button's value is checked by the results cell before the model is called.
    run_analysis_button = mo.ui.run_button(label="Run Thematic Analysis")
    # Show a rough (whitespace-split) word count of the prompt above the button.
    mo.vstack([
        mo.md(f"**Prompt length:** ~{len(full_analysis_prompt.split())} words"),
        run_analysis_button
    ])
    return full_analysis_prompt, run_analysis_button
@app.cell
def _(full_analysis_prompt, mo):
    # Display the exact prompt that will be sent to the model, for inspection.
    # The prompt text is interpolated into markdown, so it is rendered as markdown.
    mo.md(rf"""
# Full Analysis Prompt
---
{full_analysis_prompt}
""")
    return
@app.cell
def _(MODEL, client, full_analysis_prompt, mo, run_analysis_button):
    # Query the model only while the run button's value is truthy; otherwise
    # keep the result empty so the placeholder hint is rendered instead.
    if run_analysis_button.value:
        response_2 = client.generate(model=MODEL, prompt=full_analysis_prompt)
        analysis_response = response_2.response
    else:
        analysis_response = ""
    # Fall back to the hint whenever there is no model output to show.
    _analysis_display = analysis_response or "_Click 'Run Thematic Analysis' to generate analysis_"
    mo.md(f"""
## Analysis Results
{_analysis_display}
""")
    return
# Run the notebook app when this file is executed directly as a script.
if __name__ == "__main__":
    app.run()

View File

@@ -9,8 +9,8 @@ def _():
import marimo as mo import marimo as mo
from utils import connect_qumo_ollama from utils import connect_qumo_ollama
# VM_NAME = 'hiperf-gpu' VM_NAME = 'hiperf-gpu'
VM_NAME = 'ollama-lite' # VM_NAME = 'ollama-lite'
client = connect_qumo_ollama(VM_NAME) client = connect_qumo_ollama(VM_NAME)
return VM_NAME, client, mo return VM_NAME, client, mo

View File

@@ -1,4 +0,0 @@
{
"type": "slides",
"data": {}
}

View File

@@ -2,11 +2,66 @@
Standard utils for this repository Standard utils for this repository
""" """
import re
from pathlib import Path
import requests import requests
import ollama import ollama
from ollama import Client from ollama import Client
def load_srt(path: str | Path) -> str:
"""Load and parse an SRT file, returning clean transcript with speaker labels.
Args:
path: Path to the SRT file
Returns:
Clean transcript string with format "SPEAKER_XX: text" per line,
timestamps stripped, consecutive lines from same speaker merged.
"""
path = Path(path)
content = path.read_text(encoding='utf-8')
# Parse SRT blocks: sequence number, timestamp, speaker|text
# Pattern matches: number, timestamp line, content line(s)
blocks = re.split(r'\n\n+', content.strip())
turns = []
for block in blocks:
lines = block.strip().split('\n')
if len(lines) < 3:
continue
# Skip sequence number (line 0) and timestamp (line 1)
# Content is line 2 onwards
text_lines = lines[2:]
text = ' '.join(text_lines)
# Parse speaker|text format
if '|' in text:
speaker, utterance = text.split('|', 1)
speaker = speaker.strip()
utterance = utterance.strip()
else:
speaker = "UNKNOWN"
utterance = text.strip()
turns.append((speaker, utterance))
# Merge consecutive turns from same speaker
merged = []
for speaker, utterance in turns:
if merged and merged[-1][0] == speaker:
merged[-1] = (speaker, merged[-1][1] + ' ' + utterance)
else:
merged.append((speaker, utterance))
# Format as "SPEAKER_XX: text"
transcript_lines = [f"{speaker}: {utterance}" for speaker, utterance in merged]
return '\n\n'.join(transcript_lines)
def connect_qumo_ollama(vm_name: str ='ollama-lite') -> Client: def connect_qumo_ollama(vm_name: str ='ollama-lite') -> Client:
"""Establish connection to Qumo Ollama instance """Establish connection to Qumo Ollama instance
@@ -25,7 +80,7 @@ def connect_qumo_ollama(vm_name: str ='ollama-lite') -> Client:
except requests.ConnectionError: except requests.ConnectionError:
print(f"Failed to reach {QUMO_OLLAMA_URL}. Check that the VM is running and Tailscale is up") print(f"Failed to reach {QUMO_OLLAMA_URL}. Check that the VM is running and Tailscale is up")
print("Connection succesful.\nAvailable models:") print(f"Connection succesful. WebUI available at: http://{vm_name}.tail44fa00.ts.net:3000\nAvailable models:")
for m in client.list().models: for m in client.list().models:
print(f" - '{m.model}' ") print(f" - '{m.model}' ")
return client return client