New prompt for thematic analysis added

This commit is contained in:
mtorsij
2025-12-01 22:17:57 +01:00
parent 82bd2b7c3b
commit 15d706b8c5

View File

@@ -13,8 +13,8 @@ def _():
VM_NAME = 'hiperf-gpu'
MODEL = 'llama3.3:70b'
client = connect_qumo_ollama(VM_NAME)
return MODEL, Path, client, load_srt, mo
#client = connect_qumo_ollama(VM_NAME)
return MODEL, Path, load_srt, mo
@app.cell(hide_code=True)
@@ -186,6 +186,183 @@ def _(mo):
return
@app.cell
def _(mo):
    # Step 3a: Define themes for labelling.
    # Builds a multi-line text area pre-filled with four example themes and
    # exposes it as `themes_input` for the cells that parse the theme list.
    themes_input = mo.ui.text_area(
        value="""brand voice and tone
customer experience priorities
design system and consistency
AI and conversational interfaces""",
        label="Themes (one per line)",
        full_width=True,
        rows=6,
    )

    # marimo renders only the LAST expression of a cell. A bare `mo.md(...)`
    # followed by `themes_input` would silently drop the heading and help
    # text, so both are stacked into a single output element.
    mo.vstack(
        [
            mo.md("""### Step 3a: Define Themes
            Enter one theme per line. These will be used to
            label each interview transcript. Themes may overlap; the
            same section can relate to multiple themes.
            """),
            themes_input,
        ]
    )
    return (themes_input,)
@app.cell
def _(themes_input):
    # Convert the text-area content into `theme_list`: one entry per
    # non-blank line, with surrounding whitespace removed. A missing widget
    # or an empty value yields an empty list.
    if themes_input and themes_input.value:
        raw_lines = themes_input.value.splitlines()
    else:
        raw_lines = []

    # Loop variables are underscore-prefixed so they stay private to this
    # cell instead of becoming notebook-wide definitions.
    theme_list = []
    for _line in raw_lines:
        _name = _line.strip()
        if _name:
            theme_list.append(_name)
    return (theme_list,)
@app.cell
def _(Path, mo):
    # Step 3b (setup): output location and trigger button for theme labelling.
    # Exports `OUTPUT_DIR` (where the per-theme JSON files go) and
    # `label_button` (a run button that the labelling cell reacts to).
    OUTPUT_DIR = Path("data/labels")
    # Create the directory up front so the labelling cell can write into it.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    label_button = mo.ui.run_button(label="Run Theme Labelling for This Interview")

    # marimo renders only the LAST expression of a cell. A bare `mo.md(...)`
    # followed by `label_button` would discard the explanation, so the text
    # and the button are stacked into a single output element.
    mo.vstack(
        [
            mo.md(f"""### Step 3b: LLM-based Theme Labelling
            This step runs an LLM over the current interview transcript
            for each defined theme and saves one JSON file per theme
            for this interview in `{OUTPUT_DIR}`.
            For each theme, the model will return full sections of the
            conversation (multi-sentence chunks, not just short quotes)
            that are about that theme.
            """),
            label_button,
        ]
    )
    return OUTPUT_DIR, label_button
@app.cell
def _(
    MODEL,
    OUTPUT_DIR,
    Path,
    client,
    file_dropdown,
    label_button,
    labeled_transcript,
    mo,
    theme_list,
):
    # Step 3b (execution): label the selected interview for every theme.
    #
    # When the run button has been clicked, a transcript is selected and at
    # least one theme is defined, this cell sends one prompt per theme to the
    # LLM, parses the JSON reply, normalises its fields and writes one JSON
    # file per (interview, theme) pair into OUTPUT_DIR. It then renders a
    # status message and exports `theme_label_results` (theme -> parsed dict).
    #
    # Inputs use the names exported by the Step 3b setup cell (`OUTPUT_DIR`,
    # `label_button`).
    # NOTE(review): `client` must be provided by another cell; the setup cell
    # visible in this change has the connection line commented out - confirm
    # it is still defined somewhere before running this cell.
    import json
    from datetime import datetime

    theme_label_results = {}
    # Themes whose model reply could not be parsed as a JSON object.
    _unparsed_themes = []

    if label_button.value and file_dropdown.value and theme_list:
        interview_id = Path(file_dropdown.value).stem

        for theme in theme_list:
            # The prompt is delimited with ''' so that the literal """ fence
            # around the transcript does not terminate the string early.
            prompt = f'''You are an expert qualitative researcher.
You will analyse a single interview transcript for ONE specific theme.
Theme: "{theme}"
Tasks:
1. Decide if the theme is present in this interview.
2. If present, estimate how relevant it is on a 0-1 scale
where 0 = not mentioned, 0.5 = moderately important,
1 = central theme of the interview.
3. Identify all sections of the conversation that are
primarily about this theme. A section can span multiple
consecutive utterances and should form a coherent piece
of the dialogue about the theme, not just a single
sentence.
Each section should include:
- the dominant speaker label (or "mixed" if multiple)
- the full section text (one or more sentences)
Return your answer ONLY as a JSON object with this schema:
{{
"theme": string, // the theme name
"present": bool, // whether the theme appears
"relevance": float, // 0.0-1.0
"sections": [
{{
"speaker": string, // main speaker label for the section
"section_text": string // full section text about the theme
}}
]
}}
Transcript:
"""
{labeled_transcript}
"""
'''
            response = client.generate(model=MODEL, prompt=prompt)
            raw_text = response.response.strip()

            # First try the reply as-is; if that fails, fall back to the
            # outermost {...} span (models often wrap JSON in extra prose).
            # Both json.loads and str.index signal failure with ValueError.
            try:
                parsed = json.loads(raw_text)
            except ValueError:
                try:
                    start = raw_text.index("{")
                    end = raw_text.rindex("}") + 1
                    parsed = json.loads(raw_text[start:end])
                except ValueError:
                    parsed = None

            # Valid JSON that is not an object (e.g. a bare list) is treated
            # as a parse failure too, since the code below needs a dict.
            _parse_failed = not isinstance(parsed, dict)
            if _parse_failed:
                _unparsed_themes.append(theme)
                parsed = {
                    "theme": theme,
                    "present": False,
                    "relevance": 0.0,
                    "sections": [],
                    "_parse_error": True,
                    "_raw": raw_text,
                }

            # Normalise fields so downstream code can rely on their types.
            parsed["theme"] = parsed.get("theme", theme)
            parsed["present"] = bool(parsed.get("present", False))
            try:
                parsed["relevance"] = float(parsed.get("relevance", 0.0))
            except (TypeError, ValueError):
                parsed["relevance"] = 0.0
            if not isinstance(parsed.get("sections"), list):
                parsed["sections"] = []

            theme_label_results[theme] = parsed

            # Write per-interview-per-theme JSON file. Path separators in the
            # theme name are replaced as well as spaces, so a theme such as
            # "UI/UX" does not point into a non-existent sub-directory.
            _theme_slug = theme.replace(" ", "_").replace("/", "_").replace("\\", "_")
            out_path = OUTPUT_DIR / f"{interview_id}__{_theme_slug}.json"
            out_data = {
                "interview_id": interview_id,
                "theme": parsed["theme"],
                "present": parsed["present"],
                "relevance": parsed["relevance"],
                "sections": parsed["sections"],
                # NOTE: datetime.utcnow() is deprecated since Python 3.12; it
                # is kept so the timestamp format ("...Z") stays unchanged.
                "generated_at": datetime.utcnow().isoformat() + "Z",
            }
            if _parse_failed:
                # Persist the failure so an unparseable reply cannot be
                # mistaken for "theme not present" when reading the file.
                out_data["_parse_error"] = True
                out_data["_raw"] = raw_text
            out_path.write_text(
                json.dumps(out_data, ensure_ascii=False, indent=2),
                encoding="utf-8",
            )

    if label_button.value:
        if not file_dropdown.value:
            status = "No transcript selected."
        elif not theme_list:
            status = "No themes defined. Please add at least one theme."
        else:
            status = f"Labelled {len(theme_label_results)} themes for current interview. JSON files written to '{OUTPUT_DIR}'."
            if _unparsed_themes:
                status += (
                    " The model response could not be parsed for: "
                    + ", ".join(_unparsed_themes)
                    + "."
                )
    else:
        status = "Click 'Run Theme Labelling for This Interview' to start."

    # Last expression of the cell: rendered as the cell's output.
    mo.md(f"""### Theme Labelling Status
    {status}
    """)
    return (theme_label_results,)
@app.cell
def _(mo):
# Editable analysis task prompt