taguette pre-process

2025-12-07 21:37:42 +01:00
parent 98202ac3f2
commit 8cc2bc9087
5 changed files with 445 additions and 6 deletions
--- a/Stage1_Theme_Discovery.py
+++ b/Stage1_Theme_Discovery.py
@@ -0,0 +1,226 @@
+import marimo
+
+# Presumably the marimo version that wrote this notebook file (inferred from
+# the name; verify against marimo's file-format documentation).
+__generated_with = "0.18.1"
+# Notebook application object; every cell below registers on it via @app.cell.
+app = marimo.App(width="medium")
+
+
+@app.cell
+def _():
+    import marimo as mo
+    import json
+    import pandas as pd
+    import re
+    from pathlib import Path
+    from utils import connect_qumo_ollama, load_srt
+
+    # Configuration
+    VM_NAME = 'hiperf-gpu'
+    MODEL = 'llama3.3:70b'
+    TRANSCRIPT_DIR = Path("data/transcripts")
+    OUTPUT_FILE = Path("master_codebook.json")
+
+    client = connect_qumo_ollama(VM_NAME)
+    return (
+        MODEL,
+        OUTPUT_FILE,
+        TRANSCRIPT_DIR,
+        client,
+        json,
+        load_srt,
+        mo,
+        pd,
+        re,
+    )
+
+
+@app.cell
+def _(mo):
+    mo.md(r"""
+    # Stage 1: Theme Discovery
+
+    **Goal:** Identify recurring themes across a sample of interviews.
+
+    1.  **Select Transcripts:** Choose 4-5 representative interviews.
+    2.  **Extract Topics:** The AI will analyze each transcript to find key topics.
+    3.  **Synthesize Themes:** Topics are grouped into a Master Codebook.
+    4.  **Refine & Save:** Edit the definitions and save the `master_codebook.json`.
+    """)
+    return
+
+
+@app.cell
+def _(TRANSCRIPT_DIR, mo):
+    # File Selection
+    srt_files = list(TRANSCRIPT_DIR.glob("*.srt"))
+    file_options = {f.name: str(f) for f in srt_files}
+
+    file_selector = mo.ui.multiselect(
+        options=file_options,
+        label="Select Transcripts (Recommended: 4-5)",
+        full_width=True
+    )
+    file_selector
+    return (file_selector,)
+
+
+@app.cell
+def _(file_selector, mo):
+    mo.md(f"**Selected:** {len(file_selector.value)} files")
+    return
+
+
+@app.cell
+def _(mo):
+    start_discovery_btn = mo.ui.run_button(label="Start Discovery Process")
+    start_discovery_btn
+    return (start_discovery_btn,)
+
+
+@app.cell
+def _(
+    MODEL,
+    client,
+    file_selector,
+    json,
+    load_srt,
+    mo,
+    re,
+    start_discovery_btn,
+):
+    # Map Phase: Extract Topics per Transcript
+    extracted_topics = []
+    status_callout = mo.md("")
+
+    if start_discovery_btn.value and file_selector.value:
+        with mo.status.spinner("Analyzing transcripts...") as _spinner:
+            for filepath in file_selector.value:
+                _transcript = load_srt(filepath)
+
+                # Truncate for discovery if too long (optional, but good for speed)
+                # Using first 15k chars usually gives enough context for high-level themes
+                _context = _transcript[:15000] 
+
+                _prompt = f"""
+                Analyze this interview transcript and list the top 5-7 key topics or themes discussed.
+                Focus on: Brand voice, Customer experience, Design systems, and AI.
+
+                Return ONLY a JSON list of strings. Example: ["Inconsistent Tone", "Mobile Latency", "AI Trust"]
+
+                Transcript:
+                {_context}...
+                """
+
+                try:
+                    _response = client.generate(model=MODEL, prompt=_prompt)
+                    # Find JSON list in response
+                    _match = re.search(r'\[.*\]', _response.response, re.DOTALL)
+                    if _match:
+                        _topics = json.loads(_match.group(0))
+                        extracted_topics.extend(_topics)
+                except Exception as e:
+                    print(f"Error processing {filepath}: {e}")
+
+        status_callout = mo.callout(
+            f"✅ Extracted {len(extracted_topics)} raw topics from {len(file_selector.value)} files.", 
+            kind="success"
+        )
+    elif start_discovery_btn.value:
+        status_callout = mo.callout("Please select at least one file.", kind="warn")
+
+    status_callout
+    return (extracted_topics,)
+
+
+@app.cell
+def _(MODEL, client, extracted_topics, json, mo, re, start_discovery_btn):
+    # Reduce Phase: Synthesize Themes
+    suggested_themes = []
+
+    if start_discovery_btn.value and extracted_topics:
+        with mo.status.spinner("Synthesizing Master Codebook...") as _spinner:
+            _topics_str = ", ".join(extracted_topics)
+
+            _synthesis_prompt = f"""
+            You are a qualitative data architect. 
+
+            I have a list of raw topics extracted from multiple interviews:
+            [{_topics_str}]
+
+            Task:
+            1. Group these into 5-8 distinct, high-level Themes.
+            2. Create a definition for each theme.
+            3. Assign a hex color code to each.
+            4. ALWAYS include a theme named "Other" for miscellaneous insights.
+
+            Return a JSON object with this structure:
+            [
+                {{"Theme": "Theme Name", "Definition": "Description...", "Color": "#HEXCODE"}},
+                ...
+            ]
+            """
+
+            _response = client.generate(model=MODEL, prompt=_synthesis_prompt)
+
+            _match = re.search(r'\[.*\]', _response.response, re.DOTALL)
+            if _match:
+                try:
+                    suggested_themes = json.loads(_match.group(0))
+                except:
+                    suggested_themes = [{"Theme": "Error parsing JSON", "Definition": _response.response, "Color": "#000000"}]
+
+    return (suggested_themes,)
+
+
+@app.cell
+def _(mo, pd, suggested_themes):
+    # Interactive Editor
+
+    # Default empty structure if nothing generated yet
+    _initial_data = suggested_themes if suggested_themes else [
+        {"Theme": "Example Theme", "Definition": "Description here...", "Color": "#CCCCCC"}
+    ]
+
+    df_themes = pd.DataFrame(_initial_data)
+
+    theme_editor = mo.ui.data_editor(
+        df_themes,
+        label="Master Codebook Editor",
+        column_config={
+            "Color": mo.ui.column.color_picker(label="Color")
+        },
+        num_rows="dynamic" # Allow adding/removing rows
+    )
+
+    mo.vstack([
+        mo.md("### Review & Refine Codebook"),
+        mo.md("Edit the themes below. You can add rows, change colors, or refine definitions."),
+        theme_editor
+    ])
+    return (theme_editor,)
+
+
+@app.cell
+def _(OUTPUT_FILE, json, mo, theme_editor):
+    save_btn = mo.ui.run_button(label="Save Master Codebook")
+
+    save_message = mo.md("")
+
+    if save_btn.value:
+        _final_df = theme_editor.value
+        # Convert to list of dicts
+        _codebook = _final_df.to_dict(orient="records")
+
+        with open(OUTPUT_FILE, "w") as f:
+            json.dump(_codebook, f, indent=2)
+
+        save_message = mo.callout(f"✅ Saved {len(_codebook)} themes to `{OUTPUT_FILE}`", kind="success")
+
+    mo.vstack([
+        save_btn,
+        save_message
+    ])
+    return
+
+
+# Run the notebook app when this file is executed directly as a script.
+if __name__ == "__main__":
+    app.run()