taguette pre-process

This commit is contained in:
2025-12-07 21:37:42 +01:00
parent 98202ac3f2
commit 8cc2bc9087
5 changed files with 445 additions and 6 deletions

226
Stage1_Theme_Discovery.py Normal file
View File

@@ -0,0 +1,226 @@
import marimo
# Version string stored with the notebook (presumably the marimo release that
# generated this file -- confirm against marimo's file-format docs).
__generated_with = "0.18.1"
# Notebook application object; every cell below registers itself via @app.cell.
app = marimo.App(width="medium")
@app.cell
def _():
    # Setup cell: imports, run configuration and the client object that the
    # later cells receive as parameters.
    import json
    import re
    from pathlib import Path

    import marimo as mo
    import pandas as pd

    from utils import connect_qumo_ollama, load_srt

    # Configuration
    VM_NAME = "hiperf-gpu"
    MODEL = "llama3.3:70b"
    TRANSCRIPT_DIR = Path("data/transcripts")
    OUTPUT_FILE = Path("master_codebook.json")

    # Client returned by the project helper for the configured VM.
    client = connect_qumo_ollama(VM_NAME)

    return (
        MODEL,
        OUTPUT_FILE,
        TRANSCRIPT_DIR,
        client,
        json,
        load_srt,
        mo,
        pd,
        re,
    )
@app.cell
def _(mo):
    # Introductory text: states the goal and the four steps of this notebook.
    _intro_markdown = r"""
# Stage 1: Theme Discovery
**Goal:** Identify recurring themes across a sample of interviews.
1. **Select Transcripts:** Choose 4-5 representative interviews.
2. **Extract Topics:** The AI will analyze each transcript to find key topics.
3. **Synthesize Themes:** Topics are grouped into a Master Codebook.
4. **Refine & Save:** Edit the definitions and save the `master_codebook.json`.
"""
    mo.md(_intro_markdown)
    return
@app.cell
def _(TRANSCRIPT_DIR, mo):
    # File Selection
    # Every *.srt file directly inside TRANSCRIPT_DIR becomes one option:
    # the file name is the label, the path string is the selected value.
    srt_files = [*TRANSCRIPT_DIR.glob("*.srt")]
    file_options = dict((path.name, str(path)) for path in srt_files)

    file_selector = mo.ui.multiselect(
        label="Select Transcripts (Recommended: 4-5)",
        options=file_options,
        full_width=True,
    )
    file_selector
    return (file_selector,)
@app.cell
def _(file_selector, mo):
    # Display how many transcripts are currently selected.
    _selected_count = len(file_selector.value)
    mo.md(f"**Selected:** {_selected_count} files")
    return
@app.cell
def _(mo):
    # Button whose value the map and reduce cells check before doing any work.
    _button_label = "Start Discovery Process"
    start_discovery_btn = mo.ui.run_button(label=_button_label)
    start_discovery_btn
    return (start_discovery_btn,)
@app.cell
def _(
    MODEL,
    client,
    file_selector,
    json,
    load_srt,
    mo,
    re,
    start_discovery_btn,
):
    # Map Phase: Extract Topics per Transcript
    #
    # For each selected transcript the model is asked for a JSON list of
    # topics; the string entries are collected into `extracted_topics`.
    # A transcript that cannot be loaded, generated for, or parsed is recorded
    # in `_failures` and reported in the callout instead of being hidden
    # behind an unconditional success message.
    extracted_topics = []
    status_callout = mo.md("")
    _failures = []

    # Truncate for discovery if too long (optional, but good for speed).
    # Using first 15k chars usually gives enough context for high-level themes.
    _max_context_chars = 15000

    if start_discovery_btn.value and file_selector.value:
        with mo.status.spinner("Analyzing transcripts..."):
            for _filepath in file_selector.value:
                try:
                    _transcript = load_srt(_filepath)
                    _context = _transcript[:_max_context_chars]
                    _prompt = f"""
Analyze this interview transcript and list the top 5-7 key topics or themes discussed.
Focus on: Brand voice, Customer experience, Design systems, and AI.
Return ONLY a JSON list of strings. Example: ["Inconsistent Tone", "Mobile Latency", "AI Trust"]
Transcript:
{_context}...
"""
                    _response = client.generate(model=MODEL, prompt=_prompt)
                    # Find JSON list in response
                    _match = re.search(r'\[.*\]', _response.response, re.DOTALL)
                    if _match is None:
                        _failures.append(
                            f"{_filepath}: no JSON list found in the model response"
                        )
                        continue
                    _topics = json.loads(_match.group(0))
                except Exception as _err:
                    # Boundary with the model client and the transcript loader:
                    # their exception types are not visible here, so record the
                    # failure and keep going with the remaining files.
                    _failures.append(f"{_filepath}: {_err}")
                    continue

                # Keep only non-empty strings; the reduce phase joins the
                # topics with ", " and would fail on dicts, numbers or None.
                extracted_topics.extend(
                    _topic.strip()
                    for _topic in _topics
                    if isinstance(_topic, str) and _topic.strip()
                )

        if _failures:
            _details = "\n".join(f"- {_msg}" for _msg in _failures)
            status_callout = mo.callout(
                mo.md(
                    f"⚠️ Extracted {len(extracted_topics)} raw topics from "
                    f"{len(file_selector.value)} files; "
                    f"{len(_failures)} file(s) could not be processed:\n\n"
                    f"{_details}"
                ),
                kind="warn",
            )
        else:
            status_callout = mo.callout(
                f"✅ Extracted {len(extracted_topics)} raw topics from {len(file_selector.value)} files.",
                kind="success",
            )
    elif start_discovery_btn.value:
        status_callout = mo.callout("Please select at least one file.", kind="warn")

    status_callout
    return (extracted_topics,)
@app.cell
def _(MODEL, client, extracted_topics, json, mo, re, start_discovery_btn):
    # Reduce Phase: Synthesize Themes
    #
    # Sends all raw topics to the model in one prompt and expects back a JSON
    # array of {"Theme", "Definition", "Color"} objects. If the response has
    # no JSON array or the array does not parse, a single "Error parsing JSON"
    # row carrying the raw response is produced so the problem is visible in
    # the editor.
    suggested_themes = []

    if start_discovery_btn.value and extracted_topics:
        with mo.status.spinner("Synthesizing Master Codebook..."):
            # str() guards the join against non-string items in the topic list.
            _topics_str = ", ".join(map(str, extracted_topics))
            _synthesis_prompt = f"""
You are a qualitative data architect.
I have a list of raw topics extracted from multiple interviews:
[{_topics_str}]
Task:
1. Group these into 5-8 distinct, high-level Themes.
2. Create a definition for each theme.
3. Assign a hex color code to each.
4. ALWAYS include a theme named "Other" for miscellaneous insights.
Return a JSON object with this structure:
[
{{"Theme": "Theme Name", "Definition": "Description...", "Color": "#HEXCODE"}},
...
]
"""
            _response = client.generate(model=MODEL, prompt=_synthesis_prompt)
            _raw_text = _response.response
            _parse_error_row = [
                {"Theme": "Error parsing JSON", "Definition": _raw_text, "Color": "#000000"}
            ]

            _match = re.search(r'\[.*\]', _raw_text, re.DOTALL)
            if _match is None:
                # No JSON array at all: surface it the same way as a parse error.
                suggested_themes = _parse_error_row
            else:
                try:
                    suggested_themes = json.loads(_match.group(0))
                except json.JSONDecodeError:
                    # Only malformed JSON is expected here; anything else
                    # (including KeyboardInterrupt) should propagate.
                    suggested_themes = _parse_error_row

    return (suggested_themes,)
@app.cell
def _(mo, pd, suggested_themes):
    # Interactive Editor
    #
    # Shows the suggested themes (or one placeholder row when nothing has been
    # generated yet) in an editable table. `theme_editor` is returned so the
    # save cell can read the edited data.
    _initial_data = suggested_themes or [
        {"Theme": "Example Theme", "Definition": "Description here...", "Color": "#CCCCCC"}
    ]
    df_themes = pd.DataFrame(_initial_data)

    # NOTE(review): the previous call passed `column_config=` with
    # `mo.ui.column.color_picker(...)` and `num_rows="dynamic"`. Those look
    # like Streamlit's data_editor options; marimo's `mo.ui.data_editor` is
    # not documented to accept them, so they are dropped here. The Color
    # column is edited as a plain hex string. Confirm against the marimo
    # version in use.
    theme_editor = mo.ui.data_editor(
        data=df_themes,
        label="Master Codebook Editor",
    )

    mo.vstack([
        mo.md("### Review & Refine Codebook"),
        mo.md("Edit the themes below. You can add rows, change colors, or refine definitions."),
        theme_editor,
    ])
    return (theme_editor,)
@app.cell
def _(mo):
    # The save button is created in its own cell. marimo re-runs the cells
    # that *reference* a UI element when it is interacted with, not the cell
    # that created it, so the button's value has to be read in a separate cell.
    save_btn = mo.ui.run_button(label="Save Master Codebook")
    save_btn
    return (save_btn,)


@app.cell
def _(OUTPUT_FILE, json, mo, save_btn, theme_editor):
    # Write the edited codebook to OUTPUT_FILE when the save button is clicked.
    save_message = mo.md("")

    if save_btn.value:
        # Convert the edited table to a list of dicts, one per theme row.
        _codebook = theme_editor.value.to_dict(orient="records")
        with open(OUTPUT_FILE, "w", encoding="utf-8") as _file:
            json.dump(_codebook, _file, indent=2)
        save_message = mo.callout(
            f"✅ Saved {len(_codebook)} themes to `{OUTPUT_FILE}`",
            kind="success",
        )

    save_message
    return
# Run the notebook app when this file is executed directly as a script.
if __name__ == "__main__":
    app.run()