Thematic analysis draft setup ("opzetje")
This commit is contained in:
273
Thematic_Analysis.py
Normal file
273
Thematic_Analysis.py
Normal file
@@ -0,0 +1,273 @@
|
|||||||
|
import marimo

# Marimo version that serialized this notebook file (written by marimo itself).
__generated_with = "0.18.0"

# Notebook application object; the cells below register themselves on it
# via the @app.cell decorator.
app = marimo.App(width="medium")
||||||
|
@app.cell
def _():
    # Notebook setup: imports, connection config, and the Ollama client.
    import marimo as mo
    from pathlib import Path
    from utils import connect_qumo_ollama, load_srt

    # Name of the VM hosting the Ollama server (resolved by connect_qumo_ollama).
    VM_NAME = 'hiperf-gpu'
    # Ollama model tag used for every LLM call in this notebook.
    MODEL = 'llama3.3:70b'

    # NOTE: connecting here means the notebook fails fast if the VM is down.
    client = connect_qumo_ollama(VM_NAME)
    return MODEL, Path, client, load_srt, mo
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell(hide_code=True)
def _(mo):
    # Title / introduction cell — markdown only, produces no data.
    mo.md(r"""
    # Interview Transcript Thematic Analysis

    This notebook loads interview transcripts (SRT files) and runs thematic analysis using LLMs.
    """)
    return
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _(Path, mo):
    # Discover available transcripts and let the user pick one.
    transcripts_dir = Path("data/transcripts")

    # Map display name -> full path for every SRT file found.
    options = {}
    for srt_path in transcripts_dir.glob("*.srt"):
        options[srt_path.name] = str(srt_path)

    # File selector
    file_dropdown = mo.ui.dropdown(
        options=options,
        label="Select transcript file"
    )
    file_dropdown
    return (file_dropdown,)
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _(file_dropdown, load_srt, mo):
    # Load and display transcript preview
    # Empty string until the user selects a file in the dropdown.
    transcript_raw = ""
    if file_dropdown.value:
        transcript_raw = load_srt(file_dropdown.value)

    mo.md(f"""
    ## Transcript Preview

    **File:** `{file_dropdown.value or 'None selected'}`
    **Length:** {len(transcript_raw)} characters, ~{len(transcript_raw.split())} words

    <details>
    <summary>Show first 2000 characters</summary>

    ```
    {transcript_raw[:2000]}...
    ```

    </details>
    """)
    return (transcript_raw,)
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell(hide_code=True)
def _(mo):
    # Section header for the role-inference step — markdown only.
    mo.md(r"""
    ## Step 1: Infer Speaker Roles

    The model will analyze the transcript to identify who is the interviewer and who is the interviewee.
    """)
    return
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _(mo, transcript_raw):
    # Infer speaker roles from transcript context.
    # Only the first 4000 chars are sent — enough context to identify
    # roles without paying for the full transcript.
    role_inference_prompt = f"""Analyze this interview transcript and identify the role of each speaker.

Based on the conversation context, determine who is:
- The interviewer(s) - asking questions, guiding the conversation
- The interviewee(s) - providing answers, sharing expertise/opinions

Return ONLY a simple mapping in this exact format (one per line):
SPEAKER_XX: Role - Brief description

For example:
SPEAKER_00: Interviewer - Michael from the voice branding team
SPEAKER_01: Interviewee - Head of Digital Design

<transcript>
{transcript_raw[:4000]}
</transcript>
"""

    # Explicit run button so the LLM call only happens on demand.
    infer_roles_button = mo.ui.run_button(label="Infer Speaker Roles")
    infer_roles_button
    return infer_roles_button, role_inference_prompt
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _(MODEL, client, infer_roles_button, mo, role_inference_prompt):
    # Call the model only after the button was pressed; until then the
    # markdown below shows a usage hint instead of model output.
    inferred_roles_text = ""
    if infer_roles_button.value:
        inferred_roles_text = client.generate(
            model=MODEL, prompt=role_inference_prompt
        ).response

    mo.md(f"""
    ### Inferred Roles

    {inferred_roles_text if inferred_roles_text else "_Click 'Infer Speaker Roles' to analyze the transcript_"}
    """)
    return
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell(hide_code=True)
def _(mo):
    # Section header for the role-confirmation step — markdown only.
    mo.md(r"""
    ## Step 2: Confirm or Edit Speaker Roles

    Review the inferred roles below and make corrections if needed.
    """)
    return
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _(mo, transcript_raw):
    import re

    # Extract unique speakers from transcript (labels like "SPEAKER_00:").
    speakers = sorted(set(re.findall(r'(SPEAKER_\d+):', transcript_raw)))

    # One editable text input per speaker, pre-filled with the raw speaker
    # id so an unedited entry leaves the transcript unchanged downstream.
    role_inputs = {
        speaker: mo.ui.text(
            value=speaker,  # fixed: was f"{speaker}", a redundant f-string
            label=speaker,
            full_width=True
        )
        for speaker in speakers
    }

    mo.md("### Edit Speaker Labels\n\nEnter the name/role for each speaker:")
    return (role_inputs,)
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _(mo, role_inputs):
    # Render the speaker-label inputs stacked vertically, ordered by
    # speaker id so the form is stable across reruns.
    mo.vstack([widget for _speaker, widget in sorted(role_inputs.items())])
    return
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _(mo, role_inputs, transcript_raw):
    # Apply role labels to transcript: rewrite "SPEAKER_XX:" prefixes with
    # the user-provided names from the Step 2 form.
    labeled_transcript = transcript_raw
    for speaker_id, input_widget in role_inputs.items():
        if input_widget.value and input_widget.value != speaker_id:
            # NOTE(review): plain string replace — if a user types a label
            # that itself matches another "SPEAKER_XX:" prefix, replacements
            # could collide. Fine for interactive use; verify before automating.
            labeled_transcript = labeled_transcript.replace(f"{speaker_id}:", f"{input_widget.value}:")

    # Build role mapping summary (one "- id → label" bullet per speaker).
    role_mapping = "\n".join([
        f"- {speaker_id} → {input_widget.value}"
        for speaker_id, input_widget in sorted(role_inputs.items())
    ])

    mo.md(f"""
    ### Role Mapping Applied

    {role_mapping}
    """)
    return labeled_transcript, role_mapping
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell(hide_code=True)
def _(mo):
    # Section header for the analysis step — markdown only.
    mo.md(r"""
    ## Step 3: Thematic Analysis

    Configure your analysis task and run the thematic analysis.
    """)
    return
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _(mo):
    # Editable analysis task prompt. The default below is the instruction
    # block inserted under "## Task" in the final prompt; users can tailor
    # the focus themes per interview.
    analysis_task_input = mo.ui.text_area(
        value="""Perform a thematic analysis of this interview transcript.

Identify and describe:
1. **Key Themes** - Major topics and ideas that emerge from the conversation
2. **Supporting Quotes** - Direct quotes that exemplify each theme (include speaker attribution)
3. **Insights** - Notable observations or implications from the discussion

Focus on themes related to:
- Brand voice and tone strategy
- Customer experience priorities
- Design system and consistency
- AI/conversational interface considerations""",
        label="Analysis Task",
        full_width=True,
        rows=12
    )
    analysis_task_input
    return (analysis_task_input,)
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _(analysis_task_input, labeled_transcript, mo, role_mapping):
    # Build full analysis prompt: system framing + role mapping + the
    # user-edited task + the relabeled transcript.
    # NOTE(review): the ''' markers appear intended as extra fencing around
    # the transcript section — confirm the model treats them as delimiters.
    full_analysis_prompt = f"""You are an expert qualitative researcher specializing in thematic analysis of interview data.

## Speaker Roles
{role_mapping}

## Task
{analysis_task_input.value}

## Interview Transcript

'''

<transcript>
{labeled_transcript}
</transcript>

'''

Provide your analysis in well-structured markdown format."""

    # Explicit run button so the (long) LLM call only happens on demand.
    run_analysis_button = mo.ui.run_button(label="Run Thematic Analysis")

    # Show a rough prompt size (word count) next to the button as a sanity
    # check against context-window limits.
    mo.vstack([
        mo.md(f"**Prompt length:** ~{len(full_analysis_prompt.split())} words"),
        run_analysis_button
    ])
    return full_analysis_prompt, run_analysis_button
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _(full_analysis_prompt, mo):
    # Debug view: render the exact prompt that will be sent to the model.
    mo.md(rf"""
    # Full Analysis Prompt

    ---

    {full_analysis_prompt}
    """)
    return
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _(MODEL, client, full_analysis_prompt, mo, run_analysis_button):
    # Only hit the LLM after an explicit button press; otherwise show a
    # usage hint in place of the results.
    analysis_response = ""
    if run_analysis_button.value:
        analysis_response = client.generate(
            model=MODEL, prompt=full_analysis_prompt
        ).response

    mo.md(f"""
    ## Analysis Results

    {analysis_response if analysis_response else "_Click 'Run Thematic Analysis' to generate analysis_"}
    """)
    return
|
||||||
|
|
||||||
|
|
||||||
|
# Run the notebook as a standalone marimo app when executed directly.
if __name__ == "__main__":
    app.run()
|
||||||
@@ -9,8 +9,8 @@ def _():
|
|||||||
import marimo as mo
|
import marimo as mo
|
||||||
from utils import connect_qumo_ollama
|
from utils import connect_qumo_ollama
|
||||||
|
|
||||||
# VM_NAME = 'hiperf-gpu'
|
VM_NAME = 'hiperf-gpu'
|
||||||
VM_NAME = 'ollama-lite'
|
# VM_NAME = 'ollama-lite'
|
||||||
|
|
||||||
client = connect_qumo_ollama(VM_NAME)
|
client = connect_qumo_ollama(VM_NAME)
|
||||||
return VM_NAME, client, mo
|
return VM_NAME, client, mo
|
||||||
|
|||||||
@@ -1,4 +0,0 @@
|
|||||||
{
|
|
||||||
"type": "slides",
|
|
||||||
"data": {}
|
|
||||||
}
|
|
||||||
57
utils.py
57
utils.py
@@ -2,11 +2,66 @@
|
|||||||
Standard utils for this repository
|
Standard utils for this repository
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
import ollama
|
import ollama
|
||||||
from ollama import Client
|
from ollama import Client
|
||||||
|
|
||||||
|
|
||||||
|
def load_srt(path: str | Path) -> str:
|
||||||
|
"""Load and parse an SRT file, returning clean transcript with speaker labels.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
path: Path to the SRT file
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Clean transcript string with format "SPEAKER_XX: text" per line,
|
||||||
|
timestamps stripped, consecutive lines from same speaker merged.
|
||||||
|
"""
|
||||||
|
path = Path(path)
|
||||||
|
content = path.read_text(encoding='utf-8')
|
||||||
|
|
||||||
|
# Parse SRT blocks: sequence number, timestamp, speaker|text
|
||||||
|
# Pattern matches: number, timestamp line, content line(s)
|
||||||
|
blocks = re.split(r'\n\n+', content.strip())
|
||||||
|
|
||||||
|
turns = []
|
||||||
|
for block in blocks:
|
||||||
|
lines = block.strip().split('\n')
|
||||||
|
if len(lines) < 3:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Skip sequence number (line 0) and timestamp (line 1)
|
||||||
|
# Content is line 2 onwards
|
||||||
|
text_lines = lines[2:]
|
||||||
|
text = ' '.join(text_lines)
|
||||||
|
|
||||||
|
# Parse speaker|text format
|
||||||
|
if '|' in text:
|
||||||
|
speaker, utterance = text.split('|', 1)
|
||||||
|
speaker = speaker.strip()
|
||||||
|
utterance = utterance.strip()
|
||||||
|
else:
|
||||||
|
speaker = "UNKNOWN"
|
||||||
|
utterance = text.strip()
|
||||||
|
|
||||||
|
turns.append((speaker, utterance))
|
||||||
|
|
||||||
|
# Merge consecutive turns from same speaker
|
||||||
|
merged = []
|
||||||
|
for speaker, utterance in turns:
|
||||||
|
if merged and merged[-1][0] == speaker:
|
||||||
|
merged[-1] = (speaker, merged[-1][1] + ' ' + utterance)
|
||||||
|
else:
|
||||||
|
merged.append((speaker, utterance))
|
||||||
|
|
||||||
|
# Format as "SPEAKER_XX: text"
|
||||||
|
transcript_lines = [f"{speaker}: {utterance}" for speaker, utterance in merged]
|
||||||
|
return '\n\n'.join(transcript_lines)
|
||||||
|
|
||||||
|
|
||||||
def connect_qumo_ollama(vm_name: str ='ollama-lite') -> Client:
|
def connect_qumo_ollama(vm_name: str ='ollama-lite') -> Client:
|
||||||
"""Establish connection to Qumo Ollama instance
|
"""Establish connection to Qumo Ollama instance
|
||||||
|
|
||||||
@@ -25,7 +80,7 @@ def connect_qumo_ollama(vm_name: str ='ollama-lite') -> Client:
|
|||||||
except requests.ConnectionError:
|
except requests.ConnectionError:
|
||||||
print(f"Failed to reach {QUMO_OLLAMA_URL}. Check that the VM is running and Tailscale is up")
|
print(f"Failed to reach {QUMO_OLLAMA_URL}. Check that the VM is running and Tailscale is up")
|
||||||
|
|
||||||
print("Connection succesful.\nAvailable models:")
|
print(f"Connection succesful. WebUI available at: http://{vm_name}.tail44fa00.ts.net:3000\nAvailable models:")
|
||||||
for m in client.list().models:
|
for m in client.list().models:
|
||||||
print(f" - '{m.model}' ")
|
print(f" - '{m.model}' ")
|
||||||
return client
|
return client
|
||||||
|
|||||||
Reference in New Issue
Block a user