taguette pre-process
This commit is contained in:
226
Stage1_Theme_Discovery.py
Normal file
226
Stage1_Theme_Discovery.py
Normal file
@@ -0,0 +1,226 @@
|
||||
import marimo

# Version of marimo recorded in this notebook file.
__generated_with = "0.18.1"
# Notebook application object; every cell below registers itself via @app.cell.
app = marimo.App(width="medium")
|
||||
|
||||
|
||||
@app.cell
def _():
    # Setup cell: imports the shared modules, declares the notebook-wide
    # configuration constants and opens the LLM client handed to later cells.

    # Standard library
    import json
    import re
    from pathlib import Path

    # Third-party
    import marimo as mo
    import pandas as pd

    # Project helpers (defined in the local `utils` module, not visible here)
    from utils import connect_qumo_ollama, load_srt

    # Configuration
    VM_NAME = 'hiperf-gpu'                      # passed to connect_qumo_ollama below
    MODEL = 'llama3.3:70b'                      # model name passed to client.generate
    TRANSCRIPT_DIR = Path("data/transcripts")   # directory scanned for *.srt files
    OUTPUT_FILE = Path("master_codebook.json")  # destination of the saved codebook

    # NOTE(review): presumably returns an Ollama-style client for the named
    # VM; confirm against utils.connect_qumo_ollama.
    client = connect_qumo_ollama(VM_NAME)

    return (
        MODEL,
        OUTPUT_FILE,
        TRANSCRIPT_DIR,
        client,
        json,
        load_srt,
        mo,
        pd,
        re,
    )
|
||||
|
||||
|
||||
@app.cell
def _(mo):
    # Introductory cell: renders the notebook title and the four-step
    # workflow as markdown. The text is bound to a cell-local name first so
    # the rendering call stays a one-liner.
    _intro_text = r"""
    # Stage 1: Theme Discovery

    **Goal:** Identify recurring themes across a sample of interviews.

    1. **Select Transcripts:** Choose 4-5 representative interviews.
    2. **Extract Topics:** The AI will analyze each transcript to find key topics.
    3. **Synthesize Themes:** Topics are grouped into a Master Codebook.
    4. **Refine & Save:** Edit the definitions and save the `master_codebook.json`.
    """
    mo.md(_intro_text)
    return
|
||||
|
||||
|
||||
@app.cell
def _(TRANSCRIPT_DIR, mo):
    # File Selection
    #
    # Builds a multiselect listing every *.srt file found directly inside
    # TRANSCRIPT_DIR. Option labels are the bare file names; option values
    # are the full paths as strings.

    # glob() yields entries in filesystem order, which differs between
    # machines and runs; sort so the option list is stable.
    srt_files = sorted(TRANSCRIPT_DIR.glob("*.srt"))
    file_options = {f.name: str(f) for f in srt_files}

    file_selector = mo.ui.multiselect(
        options=file_options,
        label="Select Transcripts (Recommended: 4-5)",
        full_width=True
    )
    file_selector
    return (file_selector,)
|
||||
|
||||
|
||||
@app.cell
def _(file_selector, mo):
    # Shows how many transcripts are currently picked in the multiselect.
    _selected_count = len(file_selector.value)
    mo.md(f"**Selected:** {_selected_count} files")
    return
|
||||
|
||||
|
||||
@app.cell
def _(mo):
    # Run button whose value gates the topic-extraction and theme-synthesis
    # cells: both check `start_discovery_btn.value` before doing any work.
    _button_label = "Start Discovery Process"
    start_discovery_btn = mo.ui.run_button(label=_button_label)
    start_discovery_btn
    return (start_discovery_btn,)
|
||||
|
||||
|
||||
@app.cell
def _(
    MODEL,
    client,
    file_selector,
    json,
    load_srt,
    mo,
    re,
    start_discovery_btn,
):
    # Map Phase: Extract Topics per Transcript
    #
    # For every selected transcript, asks the model for a JSON list of topic
    # strings and accumulates them in `extracted_topics`. A file that cannot
    # be loaded, queried or parsed is recorded in `_failed` and reported in
    # the callout instead of being silently counted as processed.
    extracted_topics = []
    status_callout = mo.md("")
    _failed = []

    if start_discovery_btn.value and file_selector.value:
        with mo.status.spinner("Analyzing transcripts...") as _spinner:
            for filepath in file_selector.value:
                try:
                    _transcript = load_srt(filepath)

                    # Truncate for discovery if too long (optional, but good for speed)
                    # Using first 15k chars usually gives enough context for high-level themes
                    _context = _transcript[:15000]

                    _prompt = f"""
                Analyze this interview transcript and list the top 5-7 key topics or themes discussed.
                Focus on: Brand voice, Customer experience, Design systems, and AI.

                Return ONLY a JSON list of strings. Example: ["Inconsistent Tone", "Mobile Latency", "AI Trust"]

                Transcript:
                {_context}...
                """

                    _response = client.generate(model=MODEL, prompt=_prompt)
                    # Find JSON list in response (the model may wrap it in prose)
                    _match = re.search(r'\[.*\]', _response.response, re.DOTALL)
                    if _match is None:
                        raise ValueError("no JSON list found in model response")

                    _topics = json.loads(_match.group(0))
                    # Keep only non-empty strings: the synthesis step joins the
                    # topics with ", ".join(...), which fails on nested lists,
                    # dicts or numbers.
                    extracted_topics.extend(
                        _t for _t in _topics if isinstance(_t, str) and _t.strip()
                    )
                except Exception as _err:
                    # Per-file boundary: one bad transcript must not abort the
                    # batch, but it must be visible to the user afterwards.
                    print(f"Error processing {filepath}: {_err}")
                    _failed.append(str(filepath))

        _total = len(file_selector.value)
        if _failed:
            status_callout = mo.callout(
                f"⚠️ Extracted {len(extracted_topics)} raw topics from "
                f"{_total - len(_failed)} of {_total} files. "
                f"Failed: {', '.join(_failed)}",
                kind="warn"
            )
        else:
            status_callout = mo.callout(
                f"✅ Extracted {len(extracted_topics)} raw topics from {len(file_selector.value)} files.",
                kind="success"
            )
    elif start_discovery_btn.value:
        status_callout = mo.callout("Please select at least one file.", kind="warn")

    status_callout
    return (extracted_topics,)
|
||||
|
||||
|
||||
@app.cell
def _(MODEL, client, extracted_topics, json, mo, re, start_discovery_btn):
    # Reduce Phase: Synthesize Themes
    #
    # Sends all raw topics to the model in one prompt and parses the reply
    # into `suggested_themes`, a list of {"Theme", "Definition", "Color"}
    # records. If the reply contains no parseable JSON list, a single
    # "Error parsing JSON" record carrying the raw reply is produced so the
    # failure is visible in the editor.
    suggested_themes = []

    if start_discovery_btn.value and extracted_topics:
        with mo.status.spinner("Synthesizing Master Codebook...") as _spinner:
            # str() keeps the join from raising if a non-string topic slipped through.
            _topics_str = ", ".join(str(_t) for _t in extracted_topics)

            _synthesis_prompt = f"""
            You are a qualitative data architect.

            I have a list of raw topics extracted from multiple interviews:
            [{_topics_str}]

            Task:
            1. Group these into 5-8 distinct, high-level Themes.
            2. Create a definition for each theme.
            3. Assign a hex color code to each.
            4. ALWAYS include a theme named "Other" for miscellaneous insights.

            Return a JSON object with this structure:
            [
                {{"Theme": "Theme Name", "Definition": "Description...", "Color": "#HEXCODE"}},
                ...
            ]
            """

            _response = client.generate(model=MODEL, prompt=_synthesis_prompt)
            _raw_reply = _response.response

            # Shown in the editor whenever the reply cannot be turned into themes.
            _parse_error_row = [
                {"Theme": "Error parsing JSON", "Definition": _raw_reply, "Color": "#000000"}
            ]

            # Greedy match from the first "[" to the last "]" in the reply.
            _match = re.search(r'\[.*\]', _raw_reply, re.DOTALL)
            if _match:
                try:
                    suggested_themes = json.loads(_match.group(0))
                except json.JSONDecodeError:
                    suggested_themes = _parse_error_row
            else:
                # No list at all in the reply: surface it rather than leaving
                # the editor on its placeholder with no explanation.
                suggested_themes = _parse_error_row

    return (suggested_themes,)
|
||||
|
||||
|
||||
@app.cell
def _(mo, pd, suggested_themes):
    # Interactive Editor
    #
    # Loads the suggested themes (or a one-row placeholder when nothing has
    # been generated yet) into an editable table and exposes it as
    # `theme_editor`.

    # Default empty structure if nothing generated yet
    _placeholder_rows = [
        {"Theme": "Example Theme", "Definition": "Description here...", "Color": "#CCCCCC"}
    ]
    _initial_data = suggested_themes if suggested_themes else _placeholder_rows

    df_themes = pd.DataFrame(_initial_data)

    # `column_config`, `num_rows` and `mo.ui.column.color_picker` belong to
    # Streamlit's data editor, not marimo's; passing them makes this cell
    # fail. Colors are edited as plain hex strings in the "Color" column.
    # NOTE(review): row add/remove support depends on the installed marimo
    # version; confirm against the mo.ui.data_editor documentation.
    theme_editor = mo.ui.data_editor(
        df_themes,
        label="Master Codebook Editor",
    )

    mo.vstack([
        mo.md("### Review & Refine Codebook"),
        mo.md("Edit the themes below. You can add rows, change colors, or refine definitions."),
        theme_editor
    ])
    return (theme_editor,)
|
||||
|
||||
|
||||
@app.cell
def _(mo):
    # The save button is defined in its own cell on purpose: marimo re-runs
    # the cells that *reference* a UI element when it is clicked, not the
    # cell that created it. Reading `save_btn.value` in the defining cell
    # would therefore only ever see the initial (unclicked) value and the
    # codebook would never be written.
    save_btn = mo.ui.run_button(label="Save Master Codebook")
    return (save_btn,)


@app.cell
def _(OUTPUT_FILE, json, mo, save_btn, theme_editor):
    # Writes the edited codebook to OUTPUT_FILE as a JSON list of row
    # records when the save button is clicked, and shows the button together
    # with a confirmation message.
    save_message = mo.md("")

    if save_btn.value:
        _final_df = theme_editor.value
        # Convert to list of dicts (one per table row)
        _codebook = _final_df.to_dict(orient="records")

        with open(OUTPUT_FILE, "w", encoding="utf-8") as _fh:
            json.dump(_codebook, _fh, indent=2)

        save_message = mo.callout(f"✅ Saved {len(_codebook)} themes to `{OUTPUT_FILE}`", kind="success")

    mo.vstack([
        save_btn,
        save_message
    ])
    return
|
||||
|
||||
|
||||
# Script entry point: run the notebook app when this file is executed directly.
if __name__ == "__main__":
    app.run()
|
||||
Reference in New Issue
Block a user