Compare commits
3 Commits
b21f402e1e
...
ae9563eba3
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ae9563eba3 | ||
|
|
15d706b8c5 | ||
|
|
82bd2b7c3b |
@@ -82,7 +82,6 @@ def _(mo):
|
|||||||
2. **Process:**
|
2. **Process:**
|
||||||
* The LLM analyzes each transcript segment-by-segment.
|
* The LLM analyzes each transcript segment-by-segment.
|
||||||
* It extracts specific quotes that match a Theme Definition.
|
* It extracts specific quotes that match a Theme Definition.
|
||||||
* **Multi-Theme Handling:** If a quote applies to multiple themes, it is tagged with *all* relevant themes. In the dataset, this creates multiple entries (one per theme) so the quote informs the synthesis of each relevant topic.
|
|
||||||
* **Granular Sentiment Analysis:** For each quote, the model identifies:
|
* **Granular Sentiment Analysis:** For each quote, the model identifies:
|
||||||
* **Subject:** The specific topic/object being discussed (e.g., "Login Flow", "Brand Tone").
|
* **Subject:** The specific topic/object being discussed (e.g., "Login Flow", "Brand Tone").
|
||||||
* **Sentiment:** Positive / Neutral / Negative.
|
* **Sentiment:** Positive / Neutral / Negative.
|
||||||
|
|||||||
@@ -13,8 +13,8 @@ def _():
|
|||||||
VM_NAME = 'hiperf-gpu'
|
VM_NAME = 'hiperf-gpu'
|
||||||
MODEL = 'llama3.3:70b'
|
MODEL = 'llama3.3:70b'
|
||||||
|
|
||||||
client = connect_qumo_ollama(VM_NAME)
|
#client = connect_qumo_ollama(VM_NAME)
|
||||||
return MODEL, Path, client, load_srt, mo
|
return MODEL, Path, load_srt, mo
|
||||||
|
|
||||||
|
|
||||||
@app.cell(hide_code=True)
|
@app.cell(hide_code=True)
|
||||||
@@ -186,6 +186,316 @@ def _(mo):
|
|||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _(mo):
    # Step 3a: collect the themes used for labelling, one per line.
    # NOTE(review): the mo.md(...) object below is discarded — marimo renders
    # only the cell's final expression (themes_input); confirm whether the
    # heading was meant to be shown as well.
    mo.md("""### Step 3a: Define Themes

Enter one theme per line. These will be used to
label each interview transcript. Themes may overlap; the
same section can relate to multiple themes.
""")

    themes_input = mo.ui.text_area(
        value="""brand voice and tone
customer experience priorities
design system and consistency
AI and conversational interfaces""",
        label="Themes (one per line)",
        rows=6,
        full_width=True,
    )

    themes_input
    return (themes_input,)
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _(themes_input):
    # Parse the raw textarea contents into a clean list of theme names,
    # dropping blank lines and surrounding whitespace.
    # Fix: the original guard `themes_input.value and themes_input.value`
    # tested the same expression twice; `or ""` handles None/empty uniformly.
    theme_list = [
        line.strip()
        for line in (themes_input.value or "").splitlines()
        if line.strip()
    ]
    return (theme_list,)
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _(Path, mo):
    # Step 3b configuration: directory that receives one JSON file per
    # (interview, theme) pair, plus the button that triggers labelling.
    OUTPUT_DIR = Path("data/labels")
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    mo.md(f"""### Step 3b: LLM-based Theme Labelling

This step runs an LLM over the current interview transcript
for each defined theme and saves one JSON file per theme
for this interview in `{OUTPUT_DIR}`.

For each theme, the model will return full sections of the
conversation (multi-sentence chunks, not just short quotes)
that are about that theme.
""")

    label_button = mo.ui.run_button(label="Run Theme Labelling for This Interview")
    label_button

    # Fix: the original ended with a bare `return`, so neither OUTPUT_DIR nor
    # label_button was exported, even though a later cell consumes an output
    # directory and a run button.
    # NOTE(review): that later cell names them OUTPUT_THEME_DIR and
    # theme_label_button — confirm which names the notebook intends and align.
    return OUTPUT_DIR, label_button
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _(
    MODEL,
    OUTPUT_THEME_DIR,
    Path,
    client,
    file_dropdown,
    labeled_transcript,
    mo,
    theme_label_button,
    theme_list,
):
    # Step 3b: run the LLM theme-labelling pass for the selected interview.
    # For each theme, ask the model for presence / relevance / sections and
    # write one JSON file per (interview, theme) pair to OUTPUT_THEME_DIR.
    # NOTE(review): no visible cell exports OUTPUT_THEME_DIR or
    # theme_label_button — confirm the Step 3b cell returns them under
    # these names.
    import json
    from datetime import datetime, timezone

    theme_label_results = {}

    if theme_label_button.value and file_dropdown.value and theme_list:
        interview_id = Path(file_dropdown.value).stem

        for theme in theme_list:
            # Fix: in the original, the prompt string was terminated right
            # after "Transcript:", leaving {labeled_transcript} and two stray
            # string literals as dead statements — the transcript was never
            # sent to the model. It is now interpolated inside the prompt.
            prompt = f"""You are an expert qualitative researcher.

You will analyse a single interview transcript for ONE specific theme.

Theme: "{theme}"

Tasks:
1. Decide if the theme is present in this interview.
2. If present, estimate how relevant it is on a 0–1 scale
   where 0 = not mentioned, 0.5 = moderately important,
   1 = central theme of the interview.
3. Identify all sections of the conversation that are
   primarily about this theme. A section can span multiple
   consecutive utterances and should form a coherent piece
   of the dialogue about the theme, not just a single
   sentence.

Each section should include:
- the dominant speaker label (or "mixed" if multiple)
- the full section text (one or more sentences)

Return your answer ONLY as a JSON object with this schema:
{{
  "theme": string,       // the theme name
  "present": bool,       // whether the theme appears
  "relevance": float,    // 0.0–1.0
  "sections": [
    {{
      "speaker": string,       // main speaker label for the section
      "section_text": string   // full section text about the theme
    }}
  ]
}}

Transcript:
{labeled_transcript}
"""

            response = client.generate(model=MODEL, prompt=prompt)
            raw_text = response.response.strip()

            try:
                parsed = json.loads(raw_text)
            except json.JSONDecodeError:
                # Fallback: models often wrap JSON in prose; retry on the span
                # between the first "{" and the last "}".
                try:
                    start = raw_text.index("{")
                    end = raw_text.rindex("}") + 1
                    parsed = json.loads(raw_text[start:end])
                except Exception:
                    # Still unparseable: record an empty result and keep the
                    # raw reply for debugging.
                    parsed = {
                        "theme": theme,
                        "present": False,
                        "relevance": 0.0,
                        "sections": [],
                        "_parse_error": True,
                        "_raw": raw_text,
                    }

            # Normalise fields so downstream cells can rely on the schema.
            parsed["theme"] = parsed.get("theme", theme)
            parsed["present"] = bool(parsed.get("present", False))
            try:
                parsed["relevance"] = float(parsed.get("relevance", 0.0))
            except (TypeError, ValueError):
                parsed["relevance"] = 0.0
            if not isinstance(parsed.get("sections"), list):
                parsed["sections"] = []

            theme_label_results[theme] = parsed

            # Write the per-interview-per-theme JSON file.
            out_path = OUTPUT_THEME_DIR / f"{interview_id}__{theme.replace(' ', '_')}.json"
            out_data = {
                "interview_id": interview_id,
                "theme": parsed["theme"],
                "present": parsed["present"],
                "relevance": parsed["relevance"],
                "sections": parsed["sections"],
                # Fix: datetime.utcnow() is deprecated (3.12+) and naive;
                # use an aware UTC timestamp instead.
                "generated_at": datetime.now(timezone.utc).isoformat(),
            }
            out_path.write_text(json.dumps(out_data, ensure_ascii=False, indent=2), encoding="utf-8")

    if theme_label_button.value:
        if not file_dropdown.value:
            status = "No transcript selected."
        elif not theme_list:
            status = "No themes defined. Please add at least one theme."
        else:
            status = f"Labelled {len(theme_label_results)} themes for current interview. JSON files written to '{OUTPUT_THEME_DIR}'."
    else:
        status = "Click 'Run Theme Labelling for This Interview' to start."

    mo.md(f"""### Theme Labelling Status

{status}
""")
    return
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _(Path, mo):
    # Step 3c: discover pre-labeled transcript JSON files on disk.
    LABELED_DIR = Path("data/labeled_transcripts")
    LABELED_DIR.mkdir(parents=True, exist_ok=True)

    labeled_files = sorted(LABELED_DIR.glob("*.json"))
    num_found = len(labeled_files)

    mo.md(f"""### Step 3c: Use Pre-Labeled Transcripts

Found **{num_found}** labeled transcript files in `{LABELED_DIR}`.
These will be used to aggregate themes across all interviews.
""")

    labeled_files
    return (labeled_files,)
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _(labeled_files):
    # Aggregate every per-(interview, theme) JSON file into a uniform list of
    # records for cross-interview analysis.
    # Fix: `import json` here redefined the global `json`, which is also
    # imported in the labelling cell — marimo forbids defining the same name
    # in two cells. An underscore-prefixed alias is cell-local in marimo,
    # avoiding the clash.
    import json as _json

    all_labeled_records = []
    for f in labeled_files:
        try:
            data = _json.loads(f.read_text(encoding="utf-8"))
        except Exception:
            # Skip unreadable files rather than aborting the aggregation.
            continue

        # Fall back to the "<interview>__<theme>.json" naming convention when
        # the payload lacks an explicit interview_id.
        interview_id = data.get("interview_id") or f.stem.split("__", 1)[0]
        theme = data.get("theme", "")
        present = bool(data.get("present", False))
        try:
            relevance = float(data.get("relevance", 0.0))
        except (TypeError, ValueError):
            relevance = 0.0
        sections = data.get("sections") or []

        all_labeled_records.append(
            {
                "interview_id": interview_id,
                "theme": theme,
                "present": present,
                "relevance": relevance,
                "sections": sections,
            }
        )
    return (all_labeled_records,)
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _(all_labeled_records, mo):
    # Step 3d: derive the full theme / interview sets and build the picker.
    theme_names = {r["theme"] for r in all_labeled_records if r["theme"]}
    interview_ids = {r["interview_id"] for r in all_labeled_records}
    all_themes = sorted(theme_names)
    all_interviews = sorted(interview_ids)

    theme_selector = mo.ui.dropdown(
        options=dict(zip(all_themes, all_themes)),
        label="Select theme to explore across all interviews",
    )

    mo.md("### Step 3d: Explore Themes Across All Labeled Transcripts")
    theme_selector
    return all_interviews, theme_selector
|
||||||
|
|
||||||
|
|
||||||
|
@app.cell
def _(all_interviews, all_labeled_records, mo, theme_selector):
    # Summarise the selected theme across all interviews (coverage and mean
    # relevance) and list every labelled section for it.
    import statistics

    selected_theme = theme_selector.value
    theme_summary = {}
    theme_sections = []

    if selected_theme:
        theme_records = [
            r for r in all_labeled_records if r["theme"] == selected_theme
        ]

        present_flags = [r["present"] for r in theme_records]
        # Relevance is only meaningful where the theme is actually present.
        relevances = [r["relevance"] for r in theme_records if r["present"]]

        theme_summary = {
            "theme": selected_theme,
            "num_interviews": len(all_interviews),
            "num_interviews_with_theme": sum(present_flags),
            "share_of_interviews_with_theme": (
                sum(present_flags) / len(all_interviews) if all_interviews else 0.0
            ),
            "avg_relevance_if_present": (
                statistics.mean(relevances) if relevances else 0.0
            ),
        }

        # Flatten sections, carrying the interview id and record relevance.
        for r in theme_records:
            interview_id = r["interview_id"]
            for s in r["sections"]:
                theme_sections.append(
                    {
                        "interview_id": interview_id,
                        "speaker": s.get("speaker", ""),
                        "section_text": s.get("section_text", ""),
                        "relevance": r["relevance"],
                    }
                )

    overview = mo.md(
        f"""#### Theme Overview: `{selected_theme or "None selected"}`

- Total interviews: **{len(all_interviews)}**
- Interviews where theme is present: **{theme_summary.get("num_interviews_with_theme", 0)}**
- Share of interviews with theme: **{theme_summary.get("share_of_interviews_with_theme", 0.0):.2f}**
- Avg. relevance (when present): **{theme_summary.get("avg_relevance_if_present", 0.0):.2f}**
"""
    )

    # Fix: the original called mo.md(...)/mo.ui.table(...) as bare statements
    # inside branches, so the cell rendered nothing — marimo displays only the
    # cell's final expression. Compose one output object and end the cell with
    # it so both the overview and the sections actually render.
    if theme_sections:
        table_rows = [
            {
                "Interview": s["interview_id"],
                "Speaker": s["speaker"],
                "Relevance": f"{s['relevance']:.2f}",
                "Section": s["section_text"],
            }
            for s in theme_sections
        ]
        detail = mo.ui.table(table_rows)
    else:
        detail = mo.md("_No sections for this theme yet._")

    mo.vstack([overview, detail])
    return
|
||||||
|
|
||||||
|
|
||||||
@app.cell
|
@app.cell
|
||||||
def _(mo):
|
def _(mo):
|
||||||
# Editable analysis task prompt
|
# Editable analysis task prompt
|
||||||
|
|||||||
Reference in New Issue
Block a user