Files
Interview-Analysis/Thematic_Analysis.py
2025-12-01 22:17:57 +01:00

451 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import marimo

# marimo version that generated this notebook file.
__generated_with = "0.18.0"
# The notebook app object; each @app.cell below registers one reactive cell.
app = marimo.App(width="medium")
@app.cell
def _():
    # Shared imports and LLM configuration for the whole notebook.
    import marimo as mo
    from pathlib import Path
    from utils import connect_qumo_ollama, load_srt

    VM_NAME = 'hiperf-gpu'
    MODEL = 'llama3.3:70b'
    # Fix: `client` was commented out but later cells take it as a parameter
    # (role inference, theme labelling, final analysis); without it the
    # notebook fails with an undefined-name error. Connect once and share.
    client = connect_qumo_ollama(VM_NAME)
    return MODEL, Path, client, load_srt, mo
@app.cell(hide_code=True)
def _(mo):
    # Display-only cell: notebook title and short description.
    mo.md(r"""
# Interview Transcript Thematic Analysis
This notebook loads interview transcripts (SRT files) and runs thematic analysis using LLMs.
""")
    return
@app.cell
def _(Path, mo):
    # Discover the available SRT transcripts and present them in a dropdown.
    transcript_dir = Path("data/transcripts")
    name_to_path = {p.name: str(p) for p in transcript_dir.glob("*.srt")}
    file_dropdown = mo.ui.dropdown(
        options=name_to_path,
        label="Select transcript file"
    )
    file_dropdown
    return (file_dropdown,)
@app.cell
def _(file_dropdown, load_srt, mo):
    # Load the selected transcript; stays empty until a file is chosen.
    transcript_raw = load_srt(file_dropdown.value) if file_dropdown.value else ""
    # Render a collapsible preview with basic size statistics.
    mo.md(f"""
## Transcript Preview
**File:** `{file_dropdown.value or 'None selected'}`
**Length:** {len(transcript_raw)} characters, ~{len(transcript_raw.split())} words
<details>
<summary>Show first 2000 characters</summary>
```
{transcript_raw[:2000]}...
```
</details>
""")
    return (transcript_raw,)
@app.cell(hide_code=True)
def _(mo):
    # Display-only cell: heading for the role-inference step.
    mo.md(r"""
## Step 1: Infer Speaker Roles
The model will analyze the transcript to identify who is the interviewer and who is the interviewee.
""")
    return
@app.cell
def _(mo, transcript_raw):
    # Build the role-inference prompt from the first 4000 characters of the
    # transcript — enough context to identify speakers while bounding prompt
    # size. The LLM call itself happens in the next cell.
    role_inference_prompt = f"""Analyze this interview transcript and identify the role of each speaker.
Based on the conversation context, determine who is:
- The interviewer(s) - asking questions, guiding the conversation
- The interviewee(s) - providing answers, sharing expertise/opinions
Return ONLY a simple mapping in this exact format (one per line):
SPEAKER_XX: Role - Brief description
For example:
SPEAKER_00: Interviewer - Michael from the voice branding team
SPEAKER_01: Interviewee - Head of Digital Design
<transcript>
{transcript_raw[:4000]}
</transcript>
"""
    # Button that triggers the inference cell below.
    infer_roles_button = mo.ui.run_button(label="Infer Speaker Roles")
    infer_roles_button
    return infer_roles_button, role_inference_prompt
@app.cell
def _(MODEL, client, infer_roles_button, mo, role_inference_prompt):
    # Call the LLM only after the button has been clicked; otherwise show a
    # hint instead of results.
    if infer_roles_button.value:
        inferred_roles_text = client.generate(
            model=MODEL, prompt=role_inference_prompt
        ).response
    else:
        inferred_roles_text = ""
    mo.md(f"""
### Inferred Roles
{inferred_roles_text if inferred_roles_text else "_Click 'Infer Speaker Roles' to analyze the transcript_"}
""")
    return
@app.cell(hide_code=True)
def _(mo):
    # Display-only cell: heading for the role-confirmation step.
    mo.md(r"""
## Step 2: Confirm or Edit Speaker Roles
Review the inferred roles below and make corrections if needed.
""")
    return
@app.cell
def _(mo, transcript_raw):
    import re

    # Collect the unique speaker tags (e.g. "SPEAKER_00") in sorted order.
    speaker_ids = sorted(set(re.findall(r'(SPEAKER_\d+):', transcript_raw)))
    # One editable text field per speaker, pre-filled with the raw tag.
    role_inputs = {}
    for sid in speaker_ids:
        role_inputs[sid] = mo.ui.text(
            value=sid,
            label=sid,
            full_width=True
        )
    mo.md("### Edit Speaker Labels\n\nEnter the name/role for each speaker:")
    return (role_inputs,)
@app.cell
def _(mo, role_inputs):
    # Stack the per-speaker text inputs vertically, ordered by speaker id.
    mo.vstack([widget for _sid, widget in sorted(role_inputs.items())])
    return
@app.cell
def _(mo, role_inputs, transcript_raw):
    # Substitute the user-provided labels for the raw SPEAKER_XX tags.
    labeled_transcript = transcript_raw
    for speaker_id, input_widget in role_inputs.items():
        # Only rewrite tags the user actually changed.
        if input_widget.value and input_widget.value != speaker_id:
            labeled_transcript = labeled_transcript.replace(
                f"{speaker_id}:", f"{input_widget.value}:"
            )
    # Human-readable summary of the mapping.
    # Fix: the original line had no separator between the speaker id and its
    # label (a Unicode arrow was lost); use an explicit ": " separator.
    role_mapping = "\n".join([
        f"- {speaker_id}: {input_widget.value}"
        for speaker_id, input_widget in sorted(role_inputs.items())
    ])
    mo.md(f"""
### Role Mapping Applied
{role_mapping}
""")
    return labeled_transcript, role_mapping
@app.cell(hide_code=True)
def _(mo):
    # Display-only cell: heading for the thematic-analysis step.
    mo.md(r"""
## Step 3: Thematic Analysis
Configure your analysis task and run the thematic analysis.
""")
    return
@app.cell
def _(mo):
    # Step 3a: free-text area defining the themes (one per line) used to
    # label the transcript.
    themes_input = mo.ui.text_area(
        value="""brand voice and tone
customer experience priorities
design system and consistency
AI and conversational interfaces""",
        label="Themes (one per line)",
        full_width=True,
        rows=6,
    )
    # Fix: only a cell's last expression is rendered by marimo, so the
    # instructional markdown was silently discarded; stack it with the
    # widget so both appear.
    mo.vstack([
        mo.md("""### Step 3a: Define Themes
Enter one theme per line. These will be used to
label each interview transcript. Themes may overlap; the
same section can relate to multiple themes.
"""),
        themes_input,
    ])
    return (themes_input,)
@app.cell
def _(themes_input):
    # Normalise the textarea contents into a clean list of theme names,
    # skipping blank lines and surrounding whitespace.
    theme_list = []
    if themes_input and themes_input.value:
        for line in themes_input.value.splitlines():
            theme = line.strip()
            if theme:
                theme_list.append(theme)
    return (theme_list,)
@app.cell
def _(Path, mo):
    # Directory that receives one JSON file per (interview, theme) pair.
    OUTPUT_DIR = Path("data/labels")
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    label_button = mo.ui.run_button(label="Run Theme Labelling for This Interview")
    # Fix: only a cell's last expression is rendered by marimo, so the
    # explanatory markdown was silently discarded; stack it with the button.
    mo.vstack([
        mo.md(f"""### Step 3b: LLM-based Theme Labelling
This step runs an LLM over the current interview transcript
for each defined theme and saves one JSON file per theme
for this interview in `{OUTPUT_DIR}`.
For each theme, the model will return full sections of the
conversation (multi-sentence chunks, not just short quotes)
that are about that theme.
"""),
        label_button,
    ])
    return OUTPUT_DIR, label_button
@app.cell
def _(
    MODEL,
    OUTPUT_DIR,
    Path,
    client,
    file_dropdown,
    label_button,
    labeled_transcript,
    mo,
    theme_list,
):
    # Fix: the parameter list previously named `OUTPUT_THEME_DIR` and
    # `theme_label_button`, but the producing cell returns `OUTPUT_DIR` and
    # `label_button` — marimo would report undefined names.
    import json
    from datetime import datetime, timezone

    # One parsed/normalised result dict per theme, keyed by theme name.
    theme_label_results = {}
    if label_button.value and file_dropdown.value and theme_list:
        interview_id = Path(file_dropdown.value).stem
        for theme in theme_list:
            # NOTE: the transcript is wrapped in <transcript> tags (matching
            # the role-inference prompt) — the bare triple quotes previously
            # used here terminated this f-string early and broke the cell.
            prompt = f"""You are an expert qualitative researcher.
You will analyse a single interview transcript for ONE specific theme.
Theme: "{theme}"
Tasks:
1. Decide if the theme is present in this interview.
2. If present, estimate how relevant it is on a 0-1 scale
where 0 = not mentioned, 0.5 = moderately important,
1 = central theme of the interview.
3. Identify all sections of the conversation that are
primarily about this theme. A section can span multiple
consecutive utterances and should form a coherent piece
of the dialogue about the theme, not just a single
sentence.
Each section should include:
- the dominant speaker label (or "mixed" if multiple)
- the full section text (one or more sentences)
Return your answer ONLY as a JSON object with this schema:
{{
"theme": string, // the theme name
"present": bool, // whether the theme appears
"relevance": float, // 0.0-1.0
"sections": [
{{
"speaker": string, // main speaker label for the section
"section_text": string // full section text about the theme
}}
]
}}
Transcript:
<transcript>
{labeled_transcript}
</transcript>
"""
            response = client.generate(model=MODEL, prompt=prompt)
            raw_text = response.response.strip()
            try:
                parsed = json.loads(raw_text)
            except json.JSONDecodeError:
                # Fallback: try to extract JSON between the outermost braces
                # (models often wrap JSON in prose or code fences).
                try:
                    start = raw_text.index("{")
                    end = raw_text.rindex("}") + 1
                    parsed = json.loads(raw_text[start:end])
                except Exception:
                    # Keep a schema-shaped placeholder so downstream code
                    # never crashes; preserve the raw output for debugging.
                    parsed = {
                        "theme": theme,
                        "present": False,
                        "relevance": 0.0,
                        "sections": [],
                        "_parse_error": True,
                        "_raw": raw_text,
                    }
            # Normalise fields so consumers can rely on the schema.
            parsed["theme"] = parsed.get("theme", theme)
            parsed["present"] = bool(parsed.get("present", False))
            try:
                parsed["relevance"] = float(parsed.get("relevance", 0.0))
            except (TypeError, ValueError):
                parsed["relevance"] = 0.0
            if not isinstance(parsed.get("sections"), list):
                parsed["sections"] = []
            theme_label_results[theme] = parsed
            # Write one JSON file per (interview, theme) pair.
            out_path = OUTPUT_DIR / f"{interview_id}__{theme.replace(' ', '_')}.json"
            out_data = {
                "interview_id": interview_id,
                "theme": parsed["theme"],
                "present": parsed["present"],
                "relevance": parsed["relevance"],
                "sections": parsed["sections"],
                # Timezone-aware replacement for the deprecated
                # datetime.utcnow(); keeps the trailing "Z" format.
                "generated_at": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
            }
            out_path.write_text(
                json.dumps(out_data, ensure_ascii=False, indent=2),
                encoding="utf-8",
            )
    # Status message reflecting what (if anything) was labelled.
    if label_button.value:
        if not file_dropdown.value:
            status = "No transcript selected."
        elif not theme_list:
            status = "No themes defined. Please add at least one theme."
        else:
            status = f"Labelled {len(theme_label_results)} themes for current interview. JSON files written to '{OUTPUT_DIR}'."
    else:
        status = "Click 'Run Theme Labelling for This Interview' to start."
    mo.md(f"""### Theme Labelling Status
{status}
""")
    # Fix: marimo cells return a tuple of exported names.
    return (theme_label_results,)
@app.cell
def _(mo):
    # Editable analysis-task prompt; the default instructs the model to
    # extract themes, supporting quotes, and insights for four focus areas.
    analysis_task_input = mo.ui.text_area(
        value="""Perform a thematic analysis of this interview transcript.
Identify and describe:
1. **Key Themes** - Major topics and ideas that emerge from the conversation
2. **Supporting Quotes** - Direct quotes that exemplify each theme (include speaker attribution)
3. **Insights** - Notable observations or implications from the discussion
Focus on themes related to:
- Brand voice and tone strategy
- Customer experience priorities
- Design system and consistency
- AI/conversational interface considerations""",
        label="Analysis Task",
        full_width=True,
        rows=12
    )
    analysis_task_input
    return (analysis_task_input,)
@app.cell
def _(analysis_task_input, labeled_transcript, mo, role_mapping):
    # Assemble the final prompt: researcher framing + confirmed speaker
    # roles + the user-editable task + the fully labeled transcript.
    full_analysis_prompt = f"""You are an expert qualitative researcher specializing in thematic analysis of interview data.
## Speaker Roles
{role_mapping}
## Task
{analysis_task_input.value}
## Interview Transcript
'''
<transcript>
{labeled_transcript}
</transcript>
'''
Provide your analysis in well-structured markdown format."""
    run_analysis_button = mo.ui.run_button(label="Run Thematic Analysis")
    # Show the approximate prompt size next to the run button.
    mo.vstack([
        mo.md(f"**Prompt length:** ~{len(full_analysis_prompt.split())} words"),
        run_analysis_button
    ])
    return full_analysis_prompt, run_analysis_button
@app.cell
def _(full_analysis_prompt, mo):
    # Render the assembled prompt so it can be reviewed before running.
    mo.md(rf"""
# Full Analysis Prompt
---
{full_analysis_prompt}
""")
    return
@app.cell
def _(MODEL, client, full_analysis_prompt, mo, run_analysis_button):
    # Execute the thematic analysis once the run button is pressed;
    # otherwise display a hint in place of results.
    if run_analysis_button.value:
        analysis_response = client.generate(
            model=MODEL, prompt=full_analysis_prompt
        ).response
    else:
        analysis_response = ""
    mo.md(f"""
## Analysis Results
{analysis_response if analysis_response else "_Click 'Run Thematic Analysis' to generate analysis_"}
""")
    return
if __name__ == "__main__":
    # Launch the marimo app when this file is executed as a script.
    app.run()