# Marimo notebook: LLM-based thematic analysis of interview transcripts.
import marimo

# marimo version this notebook file was generated with.
__generated_with = "0.18.0"
# Application object; each @app.cell below registers one reactive cell.
app = marimo.App(width="medium")
@app.cell
def _():
    # Shared imports, configuration, and the Ollama client for all cells.
    import marimo as mo
    from pathlib import Path
    from utils import connect_qumo_ollama, load_srt

    # Target VM and model used for every LLM call in this notebook.
    VM_NAME = 'hiperf-gpu'
    MODEL = 'llama3.3:70b'

    # Downstream cells call `client.generate(...)`; this connection was
    # previously commented out and never returned, leaving `client`
    # undefined for those cells.
    client = connect_qumo_ollama(VM_NAME)
    return MODEL, Path, client, load_srt, mo
@app.cell(hide_code=True)
def _(mo):
    # Notebook title and one-line description (rendered as markdown).
    intro_text = r"""
# Interview Transcript Thematic Analysis
This notebook loads interview transcripts (SRT files) and runs thematic analysis using LLMs.
"""
    mo.md(intro_text)
    return
@app.cell
def _(Path, mo):
    # Discover SRT transcripts and offer them in a dropdown.
    TRANSCRIPT_DIR = Path("data/transcripts")
    srt_files = list(TRANSCRIPT_DIR.glob("*.srt"))

    # Map each file's display name to its full path string.
    dropdown_options = {srt_path.name: str(srt_path) for srt_path in srt_files}
    file_dropdown = mo.ui.dropdown(
        options=dropdown_options,
        label="Select transcript file",
    )
    file_dropdown
    return (file_dropdown,)
@app.cell
def _(file_dropdown, load_srt, mo):
    # Load the selected transcript and render a short preview.
    transcript_raw = ""
    if file_dropdown.value:
        transcript_raw = load_srt(file_dropdown.value)

    # Preview window size; the "..." marker is appended only when the
    # transcript is actually longer than the preview (previously it was
    # shown unconditionally, even for short or empty transcripts).
    PREVIEW_CHARS = 2000
    preview = transcript_raw[:PREVIEW_CHARS]
    if len(transcript_raw) > PREVIEW_CHARS:
        preview += "..."

    mo.md(f"""
## Transcript Preview
**File:** `{file_dropdown.value or 'None selected'}`
**Length:** {len(transcript_raw)} characters, ~{len(transcript_raw.split())} words
Show first {PREVIEW_CHARS} characters
```
{preview}
```
""")
    return (transcript_raw,)
@app.cell(hide_code=True)
def _(mo):
    # Section header for the role-inference step.
    step1_text = r"""
## Step 1: Infer Speaker Roles
The model will analyze the transcript to identify who is the interviewer and who is the interviewee.
"""
    mo.md(step1_text)
    return
@app.cell
def _(mo, transcript_raw):
    # Only the opening portion of the transcript is sent for role inference.
    CONTEXT_CHARS = 4000
    role_inference_prompt = f"""Analyze this interview transcript and identify the role of each speaker.
Based on the conversation context, determine who is:
- The interviewer(s) - asking questions, guiding the conversation
- The interviewee(s) - providing answers, sharing expertise/opinions
Return ONLY a simple mapping in this exact format (one per line):
SPEAKER_XX: Role - Brief description
For example:
SPEAKER_00: Interviewer - Michael from the voice branding team
SPEAKER_01: Interviewee - Head of Digital Design
{transcript_raw[:CONTEXT_CHARS]}
"""
    infer_roles_button = mo.ui.run_button(label="Infer Speaker Roles")
    infer_roles_button
    return infer_roles_button, role_inference_prompt
@app.cell
def _(MODEL, client, infer_roles_button, mo, role_inference_prompt):
    # Call the LLM only after the button has been clicked.
    inferred_roles_text = ""
    if infer_roles_button.value:
        llm_reply = client.generate(model=MODEL, prompt=role_inference_prompt)
        inferred_roles_text = llm_reply.response

    placeholder = "_Click 'Infer Speaker Roles' to analyze the transcript_"
    mo.md(f"""
### Inferred Roles
{inferred_roles_text if inferred_roles_text else placeholder}
""")
    return
@app.cell(hide_code=True)
def _(mo):
    # Section header for the role-confirmation step.
    step2_text = r"""
## Step 2: Confirm or Edit Speaker Roles
Review the inferred roles below and make corrections if needed.
"""
    mo.md(step2_text)
    return
@app.cell
def _(mo, transcript_raw):
    import re

    # Speaker ids appear in the transcript as "SPEAKER_00:", "SPEAKER_01:", ...
    speaker_ids = sorted(set(re.findall(r'(SPEAKER_\d+):', transcript_raw)))

    # One editable text field per speaker, pre-filled with the raw id.
    role_inputs = {}
    for speaker in speaker_ids:
        role_inputs[speaker] = mo.ui.text(
            value=speaker,
            label=speaker,
            full_width=True,
        )

    mo.md("### Edit Speaker Labels\n\nEnter the name/role for each speaker:")
    return (role_inputs,)
@app.cell
def _(mo, role_inputs):
    # Render the per-speaker text inputs stacked vertically, in id order.
    widgets = [role_inputs[speaker] for speaker in sorted(role_inputs)]
    mo.vstack(widgets)
    return
@app.cell
def _(mo, role_inputs, transcript_raw):
    # Substitute each "SPEAKER_XX:" prefix with the user-provided label.
    labeled_transcript = transcript_raw
    for speaker_id, widget in role_inputs.items():
        new_label = widget.value
        if new_label and new_label != speaker_id:
            labeled_transcript = labeled_transcript.replace(
                f"{speaker_id}:", f"{new_label}:"
            )

    # Human-readable summary of the mapping that was applied.
    mapping_lines = []
    for speaker_id, widget in sorted(role_inputs.items()):
        mapping_lines.append(f"- {speaker_id} → {widget.value}")
    role_mapping = "\n".join(mapping_lines)

    mo.md(f"""
### Role Mapping Applied
{role_mapping}
""")
    return labeled_transcript, role_mapping
@app.cell(hide_code=True)
def _(mo):
    # Section header for the thematic-analysis step.
    step3_text = r"""
## Step 3: Thematic Analysis
Configure your analysis task and run the thematic analysis.
"""
    mo.md(step3_text)
    return
@app.cell
def _(mo):
    # Step 3a: free-text list of themes, one per line.
    themes_input = mo.ui.text_area(
        value="""brand voice and tone
customer experience priorities
design system and consistency
AI and conversational interfaces""",
        label="Themes (one per line)",
        full_width=True,
        rows=6,
    )

    # A marimo cell renders only its final expression, so the instructions
    # and the widget are stacked together; previously the markdown call was
    # not the last expression and was silently discarded.
    mo.vstack([
        mo.md("""### Step 3a: Define Themes
Enter one theme per line. These will be used to
label each interview transcript. Themes may overlap; the
same section can relate to multiple themes.
"""),
        themes_input,
    ])
    return (themes_input,)
@app.cell
def _(themes_input):
    # Parse the text area into a clean list of non-empty theme names.
    # (The original guard tested `themes_input.value` twice — redundant.)
    raw_lines = themes_input.value.splitlines() if themes_input.value else []
    theme_list = [line.strip() for line in raw_lines if line.strip()]
    return (theme_list,)
@app.cell
def _(Path, mo):
    # Directory where per-interview-per-theme JSON labels are written.
    # Named OUTPUT_THEME_DIR because the labelling cell below consumes it
    # under that name: previously this cell defined OUTPUT_DIR/label_button
    # and returned nothing, so OUTPUT_THEME_DIR and theme_label_button were
    # undefined for the labelling cell.
    OUTPUT_THEME_DIR = Path("data/labels")
    OUTPUT_THEME_DIR.mkdir(parents=True, exist_ok=True)

    theme_label_button = mo.ui.run_button(label="Run Theme Labelling for This Interview")

    # Stack the explanatory markdown with the button so both render
    # (a cell displays only its last expression).
    mo.vstack([
        mo.md(f"""### Step 3b: LLM-based Theme Labelling
This step runs an LLM over the current interview transcript
for each defined theme and saves one JSON file per theme
for this interview in `{OUTPUT_THEME_DIR}`.
For each theme, the model will return full sections of the
conversation (multi-sentence chunks, not just short quotes)
that are about that theme.
"""),
        theme_label_button,
    ])
    return OUTPUT_THEME_DIR, theme_label_button
@app.cell
def _(
    MODEL,
    OUTPUT_THEME_DIR,
    Path,
    client,
    file_dropdown,
    labeled_transcript,
    mo,
    theme_label_button,
    theme_list,
):
    import json
    from datetime import datetime, timezone

    def _parse_theme_json(raw_text, theme):
        """Parse the model's JSON reply; fall back to extracting the outer
        {...} span, then to an explicit error record."""
        try:
            return json.loads(raw_text)
        except json.JSONDecodeError:
            pass
        try:
            start = raw_text.index("{")
            end = raw_text.rindex("}") + 1
            return json.loads(raw_text[start:end])
        except Exception:
            return {
                "theme": theme,
                "present": False,
                "relevance": 0.0,
                "sections": [],
                "_parse_error": True,
                "_raw": raw_text,
            }

    theme_label_results = {}
    if theme_label_button.value and file_dropdown.value and theme_list:
        interview_id = Path(file_dropdown.value).stem
        for theme in theme_list:
            # NOTE: the transcript is wrapped in ''' (not """) so it cannot
            # terminate the enclosing f-string. The original prompt used
            # inner triple double-quotes, which closed the f-string at
            # "Transcript:" — the transcript was never sent to the model and
            # the trailing lines were stray expression statements.
            prompt = f"""You are an expert qualitative researcher.
You will analyse a single interview transcript for ONE specific theme.
Theme: "{theme}"
Tasks:
1. Decide if the theme is present in this interview.
2. If present, estimate how relevant it is on a 0–1 scale
where 0 = not mentioned, 0.5 = moderately important,
1 = central theme of the interview.
3. Identify all sections of the conversation that are
primarily about this theme. A section can span multiple
consecutive utterances and should form a coherent piece
of the dialogue about the theme, not just a single
sentence.
Each section should include:
- the dominant speaker label (or "mixed" if multiple)
- the full section text (one or more sentences)
Return your answer ONLY as a JSON object with this schema:
{{
"theme": string, // the theme name
"present": bool, // whether the theme appears
"relevance": float, // 0.0–1.0
"sections": [
{{
"speaker": string, // main speaker label for the section
"section_text": string // full section text about the theme
}}
]
}}
Transcript:
'''
{labeled_transcript}
'''
"""
            response = client.generate(model=MODEL, prompt=prompt)
            parsed = _parse_theme_json(response.response.strip(), theme)

            # Normalise fields so downstream code can rely on the schema.
            parsed["theme"] = parsed.get("theme", theme)
            parsed["present"] = bool(parsed.get("present", False))
            try:
                parsed["relevance"] = float(parsed.get("relevance", 0.0))
            except (TypeError, ValueError):
                parsed["relevance"] = 0.0
            if not isinstance(parsed.get("sections"), list):
                parsed["sections"] = []
            theme_label_results[theme] = parsed

            # One JSON file per (interview, theme) pair.
            out_path = OUTPUT_THEME_DIR / f"{interview_id}__{theme.replace(' ', '_')}.json"
            out_data = {
                "interview_id": interview_id,
                "theme": parsed["theme"],
                "present": parsed["present"],
                "relevance": parsed["relevance"],
                "sections": parsed["sections"],
                # Timezone-aware replacement for deprecated datetime.utcnow().
                "generated_at": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
            }
            out_path.write_text(
                json.dumps(out_data, ensure_ascii=False, indent=2), encoding="utf-8"
            )

    # Status message reflecting what (if anything) was done.
    if theme_label_button.value:
        if not file_dropdown.value:
            status = "No transcript selected."
        elif not theme_list:
            status = "No themes defined. Please add at least one theme."
        else:
            status = (
                f"Labelled {len(theme_label_results)} themes for current interview. "
                f"JSON files written to '{OUTPUT_THEME_DIR}'."
            )
    else:
        status = "Click 'Run Theme Labelling for This Interview' to start."

    mo.md(f"""### Theme Labelling Status
{status}
""")
    return
@app.cell
def _(Path, mo):
    # Step 3c: discover precomputed per-theme label files on disk.
    LABELED_DIR = Path("data/labeled_transcripts")
    LABELED_DIR.mkdir(parents=True, exist_ok=True)
    labeled_files = sorted(LABELED_DIR.glob("*.json"))

    # Stack markdown and the file list so both render; previously only the
    # raw list (the cell's last expression) was displayed and the markdown
    # was discarded.
    mo.vstack([
        mo.md(f"""### Step 3c: Use Pre-Labeled Transcripts
Found **{len(labeled_files)}** labeled transcript files in `{LABELED_DIR}`.
These will be used to aggregate themes across all interviews.
"""),
        labeled_files,
    ])
    return (labeled_files,)
@app.cell
def _(labeled_files):
    import json

    def _coerce_record(path, data):
        """Normalise one raw JSON payload into a uniform record dict."""
        try:
            relevance = float(data.get("relevance", 0.0))
        except (TypeError, ValueError):
            relevance = 0.0
        return {
            # Fall back to the "<interview>__<theme>.json" filename convention.
            "interview_id": data.get("interview_id") or path.stem.split("__", 1)[0],
            "theme": data.get("theme", ""),
            "present": bool(data.get("present", False)),
            "relevance": relevance,
            "sections": data.get("sections") or [],
        }

    all_labeled_records = []
    for path in labeled_files:
        try:
            payload = json.loads(path.read_text(encoding="utf-8"))
        except Exception:
            # Skip unreadable files
            continue
        all_labeled_records.append(_coerce_record(path, payload))
    return (all_labeled_records,)
@app.cell
def _(all_labeled_records, mo):
    # Full sets of themes and interviews present in the labeled data.
    all_themes = sorted({r["theme"] for r in all_labeled_records if r["theme"]})
    all_interviews = sorted({r["interview_id"] for r in all_labeled_records})

    theme_selector = mo.ui.dropdown(
        options={t: t for t in all_themes},
        label="Select theme to explore across all interviews",
    )

    # Stack the heading with the dropdown so both render; previously the
    # markdown call was discarded because only the cell's last expression
    # is displayed.
    mo.vstack([
        mo.md("### Step 3d: Explore Themes Across All Labeled Transcripts"),
        theme_selector,
    ])
    return all_interviews, theme_selector
@app.cell
def _(all_interviews, all_labeled_records, mo, theme_selector):
    import statistics

    selected_theme = theme_selector.value
    theme_summary = {}
    theme_sections = []
    if selected_theme:
        # All per-interview records for the chosen theme.
        theme_records = [
            r for r in all_labeled_records if r["theme"] == selected_theme
        ]
        present_flags = [r["present"] for r in theme_records]
        relevances = [r["relevance"] for r in theme_records if r["present"]]
        theme_summary = {
            "theme": selected_theme,
            "num_interviews": len(all_interviews),
            "num_interviews_with_theme": sum(present_flags),
            "share_of_interviews_with_theme": (
                sum(present_flags) / len(all_interviews) if all_interviews else 0.0
            ),
            "avg_relevance_if_present": (
                statistics.mean(relevances) if relevances else 0.0
            ),
        }
        # Flatten sections across interviews for tabular display.
        for r in theme_records:
            interview_id = r["interview_id"]
            for s in r["sections"]:
                theme_sections.append(
                    {
                        "interview_id": interview_id,
                        "speaker": s.get("speaker", ""),
                        "section_text": s.get("section_text", ""),
                        "relevance": r["relevance"],
                    }
                )

    overview = mo.md(
        f"""#### Theme Overview: `{selected_theme or "None selected"}`
- Total interviews: **{len(all_interviews)}**
- Interviews where theme is present: **{theme_summary.get("num_interviews_with_theme", 0)}**
- Share of interviews with theme: **{theme_summary.get("share_of_interviews_with_theme", 0.0):.2f}**
- Avg. relevance (when present): **{theme_summary.get("avg_relevance_if_present", 0.0):.2f}**
"""
    )

    if theme_sections:
        table_rows = [
            {
                "Interview": s["interview_id"],
                "Speaker": s["speaker"],
                "Relevance": f"{s['relevance']:.2f}",
                "Section": s["section_text"],
            }
            for s in theme_sections
        ]
        sections_view = mo.ui.table(table_rows)
    else:
        sections_view = mo.md("_No sections for this theme yet._")

    # The cell previously ended on an `if` statement, so marimo displayed no
    # output at all (both the overview and the table were discarded);
    # stacking them as the final expression makes both visible.
    mo.vstack([overview, sections_view])
    return
@app.cell
def _(mo):
    # Editable description of the thematic-analysis task sent to the LLM.
    default_task = """Perform a thematic analysis of this interview transcript.
Identify and describe:
1. **Key Themes** - Major topics and ideas that emerge from the conversation
2. **Supporting Quotes** - Direct quotes that exemplify each theme (include speaker attribution)
3. **Insights** - Notable observations or implications from the discussion
Focus on themes related to:
- Brand voice and tone strategy
- Customer experience priorities
- Design system and consistency
- AI/conversational interface considerations"""

    analysis_task_input = mo.ui.text_area(
        value=default_task,
        label="Analysis Task",
        full_width=True,
        rows=12,
    )
    analysis_task_input
    return (analysis_task_input,)
@app.cell
def _(analysis_task_input, labeled_transcript, mo, role_mapping):
    # Assemble the final prompt: roles, task description, then transcript.
    full_analysis_prompt = f"""You are an expert qualitative researcher specializing in thematic analysis of interview data.
## Speaker Roles
{role_mapping}
## Task
{analysis_task_input.value}
## Interview Transcript
'''
{labeled_transcript}
'''
Provide your analysis in well-structured markdown format."""

    run_analysis_button = mo.ui.run_button(label="Run Thematic Analysis")

    word_count = len(full_analysis_prompt.split())
    mo.vstack([
        mo.md(f"**Prompt length:** ~{word_count} words"),
        run_analysis_button,
    ])
    return full_analysis_prompt, run_analysis_button
@app.cell
def _(full_analysis_prompt, mo):
    # Show the exact prompt that will be sent to the model.
    prompt_preview = rf"""
# Full Analysis Prompt
---
{full_analysis_prompt}
"""
    mo.md(prompt_preview)
    return
@app.cell
def _(MODEL, client, full_analysis_prompt, mo, run_analysis_button):
    # Run the thematic analysis once the button has been clicked.
    analysis_response = ""
    if run_analysis_button.value:
        llm_result = client.generate(model=MODEL, prompt=full_analysis_prompt)
        analysis_response = llm_result.response

    fallback = "_Click 'Run Thematic Analysis' to generate analysis_"
    mo.md(f"""
## Analysis Results
{analysis_response if analysis_response else fallback}
""")
    return
# Standard marimo entry point: run the notebook as an app when executed
# directly (e.g. `python notebook.py`).
if __name__ == "__main__":
    app.run()