# NOTE(review): removed page-scrape artifacts ("451 lines / 12 KiB / Python")
# that preceded the code and would break parsing.
import marimo

# marimo version that generated this notebook file.
__generated_with = "0.18.0"

# Application object; every cell below registers itself on it.
app = marimo.App(width="medium")
@app.cell
def _():
    import marimo as mo
    from pathlib import Path

    from utils import connect_qumo_ollama, load_srt

    # Remote VM hosting the Ollama server, and the model used for all calls.
    VM_NAME = 'hiperf-gpu'
    MODEL = 'llama3.3:70b'

    # FIX: this line was commented out, so `client` was undefined in every
    # downstream cell that declares it as a dependency. It must be created
    # here and included in the cell's return tuple.
    client = connect_qumo_ollama(VM_NAME)
    return MODEL, Path, client, load_srt, mo
@app.cell(hide_code=True)
def _(mo):
    # Markdown-only title cell; marimo renders the last expression.
    mo.md(r"""
    # Interview Transcript Thematic Analysis

    This notebook loads interview transcripts (SRT files) and runs thematic analysis using LLMs.
    """)
    return
@app.cell
def _(Path, mo):
    # Discover the SRT transcripts on disk and offer them in a dropdown
    # (display name -> path string).
    transcripts_root = Path("data/transcripts")

    options = {}
    for srt_path in transcripts_root.glob("*.srt"):
        options[srt_path.name] = str(srt_path)

    file_dropdown = mo.ui.dropdown(
        options=options,
        label="Select transcript file",
    )
    file_dropdown
    return (file_dropdown,)
@app.cell
def _(file_dropdown, load_srt, mo):
    # Load the selected transcript; stays an empty string until a file
    # has been chosen in the dropdown.
    selected = file_dropdown.value
    transcript_raw = load_srt(selected) if selected else ""

    mo.md(f"""
    ## Transcript Preview

    **File:** `{file_dropdown.value or 'None selected'}`
    **Length:** {len(transcript_raw)} characters, ~{len(transcript_raw.split())} words

    <details>
    <summary>Show first 2000 characters</summary>

    ```
    {transcript_raw[:2000]}...
    ```

    </details>
    """)
    return (transcript_raw,)
@app.cell(hide_code=True)
def _(mo):
    # Markdown-only cell introducing Step 1 (speaker-role inference).
    mo.md(r"""
    ## Step 1: Infer Speaker Roles

    The model will analyze the transcript to identify who is the interviewer and who is the interviewee.
    """)
    return
@app.cell
def _(mo, transcript_raw):
    # Only the opening portion of the transcript is needed for the model to
    # work out who is interviewing whom.
    excerpt = transcript_raw[:4000]

    role_inference_prompt = f"""Analyze this interview transcript and identify the role of each speaker.

Based on the conversation context, determine who is:
- The interviewer(s) - asking questions, guiding the conversation
- The interviewee(s) - providing answers, sharing expertise/opinions

Return ONLY a simple mapping in this exact format (one per line):
SPEAKER_XX: Role - Brief description

For example:
SPEAKER_00: Interviewer - Michael from the voice branding team
SPEAKER_01: Interviewee - Head of Digital Design

<transcript>
{excerpt}
</transcript>
"""

    infer_roles_button = mo.ui.run_button(label="Infer Speaker Roles")
    infer_roles_button
    return infer_roles_button, role_inference_prompt
@app.cell
def _(MODEL, client, infer_roles_button, mo, role_inference_prompt):
    # Call the model only once the button has been pressed; until then the
    # cell renders a hint instead.
    inferred_roles_text = ""
    if infer_roles_button.value:
        inferred_roles_text = client.generate(
            model=MODEL, prompt=role_inference_prompt
        ).response

    mo.md(f"""
    ### Inferred Roles

    {inferred_roles_text if inferred_roles_text else "_Click 'Infer Speaker Roles' to analyze the transcript_"}
    """)
    return
@app.cell(hide_code=True)
def _(mo):
    # Markdown-only cell introducing Step 2 (role confirmation/editing).
    mo.md(r"""
    ## Step 2: Confirm or Edit Speaker Roles

    Review the inferred roles below and make corrections if needed.
    """)
    return
@app.cell
def _(mo, transcript_raw):
    import re

    # Unique SPEAKER_XX labels that appear in the transcript, sorted.
    speakers = sorted(set(re.findall(r'(SPEAKER_\d+):', transcript_raw)))

    # One editable text field per speaker, pre-filled with the raw label.
    role_inputs = {}
    for speaker in speakers:
        role_inputs[speaker] = mo.ui.text(
            value=speaker,
            label=speaker,
            full_width=True,
        )

    mo.md("### Edit Speaker Labels\n\nEnter the name/role for each speaker:")
    return (role_inputs,)
@app.cell
def _(mo, role_inputs):
    # Stack the per-speaker text inputs vertically, ordered by speaker label.
    widgets = [role_inputs[speaker] for speaker in sorted(role_inputs)]
    mo.vstack(widgets)
    return
@app.cell
def _(mo, role_inputs, transcript_raw):
    # Substitute each SPEAKER_XX prefix with the user-supplied name/role.
    labeled_transcript = transcript_raw
    for speaker_id, widget in role_inputs.items():
        new_label = widget.value
        if new_label and new_label != speaker_id:
            labeled_transcript = labeled_transcript.replace(
                f"{speaker_id}:", f"{new_label}:"
            )

    # Human-readable summary of the mapping that was applied.
    role_mapping = "\n".join(
        f"- {speaker_id} → {widget.value}"
        for speaker_id, widget in sorted(role_inputs.items())
    )

    mo.md(f"""
    ### Role Mapping Applied

    {role_mapping}
    """)
    return labeled_transcript, role_mapping
@app.cell(hide_code=True)
def _(mo):
    # Markdown-only cell introducing Step 3 (thematic analysis).
    mo.md(r"""
    ## Step 3: Thematic Analysis

    Configure your analysis task and run the thematic analysis.
    """)
    return
@app.cell
def _(mo):
    # Step 3a: free-text list of themes, one per line, used for labelling.
    themes_input = mo.ui.text_area(
        value="""brand voice and tone
customer experience priorities
design system and consistency
AI and conversational interfaces""",
        label="Themes (one per line)",
        full_width=True,
        rows=6,
    )

    # FIX: a marimo cell renders only its LAST expression, so the original
    # standalone mo.md(...) instructions were silently discarded. Stack the
    # instructions and the input into one output (matching the vstack style
    # used by the analysis-prompt cell below).
    mo.vstack([
        mo.md("""### Step 3a: Define Themes

Enter one theme per line. These will be used to
label each interview transcript. Themes may overlap; the
same section can relate to multiple themes.
"""),
        themes_input,
    ])
    return (themes_input,)
@app.cell
def _(themes_input):
    # Normalise the text-area contents into a list of non-empty theme names.
    raw = themes_input.value if themes_input and themes_input.value else ""
    theme_list = [line.strip() for line in raw.splitlines() if line.strip()]
    return (theme_list,)
@app.cell
def _(Path, mo):
    # Directory that receives one JSON file per (interview, theme) pair.
    OUTPUT_DIR = Path("data/labels")
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    label_button = mo.ui.run_button(label="Run Theme Labelling for This Interview")

    mo.md(f"""### Step 3b: LLM-based Theme Labelling

This step runs an LLM over the current interview transcript
for each defined theme and saves one JSON file per theme
for this interview in `{OUTPUT_DIR}`.

For each theme, the model will return full sections of the
conversation (multi-sentence chunks, not just short quotes)
that are about that theme.
""")

    label_button
    return OUTPUT_DIR, label_button
@app.cell
def _(
    MODEL,
    OUTPUT_DIR,
    Path,
    client,
    file_dropdown,
    label_button,
    labeled_transcript,
    mo,
    theme_list,
):
    # FIX: the cell previously depended on `OUTPUT_THEME_DIR` and
    # `theme_label_button`, which are never defined anywhere — the defining
    # cell returns `OUTPUT_DIR` and `label_button`. Renamed accordingly.
    import json
    from datetime import datetime, timezone

    def _parse_theme_json(raw_text, fallback_theme):
        """Parse model output as JSON; fall back to the outermost {...} span,
        then to an explicit error record for this theme."""
        try:
            return json.loads(raw_text)
        except json.JSONDecodeError:
            try:
                start = raw_text.index("{")
                end = raw_text.rindex("}") + 1
                return json.loads(raw_text[start:end])
            except Exception:
                return {
                    "theme": fallback_theme,
                    "present": False,
                    "relevance": 0.0,
                    "sections": [],
                    "_parse_error": True,
                    "_raw": raw_text,
                }

    theme_label_results = {}

    if label_button.value and file_dropdown.value and theme_list:
        interview_id = Path(file_dropdown.value).stem

        for theme in theme_list:
            # FIX: the transcript is now interpolated INSIDE the prompt.
            # Previously the f-string closed before {labeled_transcript},
            # leaving it as a stray set-literal statement — the model never
            # received the transcript at all.
            prompt = f"""You are an expert qualitative researcher.

You will analyse a single interview transcript for ONE specific theme.

Theme: "{theme}"

Tasks:
1. Decide if the theme is present in this interview.
2. If present, estimate how relevant it is on a 0–1 scale
   where 0 = not mentioned, 0.5 = moderately important,
   1 = central theme of the interview.
3. Identify all sections of the conversation that are
   primarily about this theme. A section can span multiple
   consecutive utterances and should form a coherent piece
   of the dialogue about the theme, not just a single
   sentence.

Each section should include:
- the dominant speaker label (or "mixed" if multiple)
- the full section text (one or more sentences)

Return your answer ONLY as a JSON object with this schema:
{{
  "theme": string,        // the theme name
  "present": bool,        // whether the theme appears
  "relevance": float,     // 0.0–1.0
  "sections": [
    {{
      "speaker": string,       // main speaker label for the section
      "section_text": string   // full section text about the theme
    }}
  ]
}}

Transcript:
{labeled_transcript}
"""

            response = client.generate(model=MODEL, prompt=prompt)
            raw_text = response.response.strip()
            parsed = _parse_theme_json(raw_text, theme)

            # Normalise fields so downstream consumers can rely on the types.
            parsed["theme"] = parsed.get("theme", theme)
            parsed["present"] = bool(parsed.get("present", False))
            try:
                parsed["relevance"] = float(parsed.get("relevance", 0.0))
            except (TypeError, ValueError):
                parsed["relevance"] = 0.0
            if not isinstance(parsed.get("sections"), list):
                parsed["sections"] = []

            theme_label_results[theme] = parsed

            # Write one JSON file per (interview, theme) pair.
            out_path = OUTPUT_DIR / f"{interview_id}__{theme.replace(' ', '_')}.json"
            out_data = {
                "interview_id": interview_id,
                "theme": parsed["theme"],
                "present": parsed["present"],
                "relevance": parsed["relevance"],
                "sections": parsed["sections"],
                # Timezone-aware replacement for deprecated datetime.utcnow().
                "generated_at": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
            }
            out_path.write_text(
                json.dumps(out_data, ensure_ascii=False, indent=2), encoding="utf-8"
            )

    if label_button.value:
        if not file_dropdown.value:
            status = "No transcript selected."
        elif not theme_list:
            status = "No themes defined. Please add at least one theme."
        else:
            status = f"Labelled {len(theme_label_results)} themes for current interview. JSON files written to '{OUTPUT_DIR}'."
    else:
        status = "Click 'Run Theme Labelling for This Interview' to start."

    mo.md(f"""### Theme Labelling Status

{status}
""")
    # FIX: marimo cells return a tuple of definitions (was a bare name).
    return (theme_label_results,)
@app.cell
def _(mo):
    # Free-text description of the thematic-analysis task; the default can
    # be edited in the UI before running the analysis.
    analysis_task_input = mo.ui.text_area(
        label="Analysis Task",
        full_width=True,
        rows=12,
        value="""Perform a thematic analysis of this interview transcript.

Identify and describe:
1. **Key Themes** - Major topics and ideas that emerge from the conversation
2. **Supporting Quotes** - Direct quotes that exemplify each theme (include speaker attribution)
3. **Insights** - Notable observations or implications from the discussion

Focus on themes related to:
- Brand voice and tone strategy
- Customer experience priorities
- Design system and consistency
- AI/conversational interface considerations""",
    )
    analysis_task_input
    return (analysis_task_input,)
@app.cell
def _(analysis_task_input, labeled_transcript, mo, role_mapping):
    run_analysis_button = mo.ui.run_button(label="Run Thematic Analysis")

    # Assemble the final prompt: role mapping + task + fenced transcript.
    full_analysis_prompt = f"""You are an expert qualitative researcher specializing in thematic analysis of interview data.

## Speaker Roles
{role_mapping}

## Task
{analysis_task_input.value}

## Interview Transcript

'''

<transcript>
{labeled_transcript}
</transcript>

'''

Provide your analysis in well-structured markdown format."""

    word_count = len(full_analysis_prompt.split())
    mo.vstack([
        mo.md(f"**Prompt length:** ~{word_count} words"),
        run_analysis_button,
    ])
    return full_analysis_prompt, run_analysis_button
@app.cell
def _(full_analysis_prompt, mo):
    # Echo the assembled prompt so it can be reviewed before running.
    # NOTE(review): the prompt is interpolated into markdown, so transcript
    # content may itself be rendered as markdown here — confirm acceptable.
    mo.md(rf"""
    # Full Analysis Prompt

    ---

    {full_analysis_prompt}
    """)
    return
@app.cell
def _(MODEL, client, full_analysis_prompt, mo, run_analysis_button):
    # Only call the model after the run button has been pressed.
    analysis_response = ""
    if run_analysis_button.value:
        analysis_response = client.generate(
            model=MODEL, prompt=full_analysis_prompt
        ).response

    mo.md(f"""
    ## Analysis Results

    {analysis_response if analysis_response else "_Click 'Run Thematic Analysis' to generate analysis_"}
    """)
    return
# Allow running the notebook as a plain script (marimo executes the cells).
if __name__ == "__main__":
    app.run()