import marimo

__generated_with = "0.18.1"
app = marimo.App(width="medium")


@app.cell
def _():
    # Setup cell: imports, configuration constants, and the LLM client that
    # every downstream cell receives as arguments.
    import marimo as mo
    import json
    import pandas as pd
    import re
    from pathlib import Path
    from utils import connect_qumo_ollama, load_srt

    # Configuration
    VM_NAME = 'hiperf-gpu'
    MODEL = 'llama3.3:70b'
    TRANSCRIPT_DIR = Path("data/transcripts")
    OUTPUT_FILE = Path("master_codebook.json")

    client = connect_qumo_ollama(VM_NAME)
    return (
        MODEL,
        OUTPUT_FILE,
        TRANSCRIPT_DIR,
        client,
        json,
        load_srt,
        mo,
        pd,
        re,
    )


@app.cell
def _(mo):
    mo.md(r"""
    # Stage 1: Theme Discovery
    **Goal:** Identify recurring themes across a sample of interviews.

    1. **Select Transcripts:** Choose 4-5 representative interviews.
    2. **Extract Topics:** The AI will analyze each transcript to find key topics.
    3. **Synthesize Themes:** Topics are grouped into a Master Codebook.
    4. **Refine & Save:** Edit the definitions and save the `master_codebook.json`.
    """)
    return


@app.cell
def _(TRANSCRIPT_DIR, mo):
    # File Selection
    # Sorted so the option order is stable between runs (glob order is not).
    srt_files = sorted(TRANSCRIPT_DIR.glob("*.srt"))
    # Display name -> path string handed to load_srt in the map phase.
    file_options = {f.name: str(f) for f in srt_files}

    file_selector = mo.ui.multiselect(
        options=file_options,
        label="Select Transcripts (Recommended: 4-5)",
        full_width=True
    )
    file_selector
    return (file_selector,)


@app.cell
def _(file_selector, mo):
    mo.md(f"**Selected:** {len(file_selector.value)} files")
    return


@app.cell
def _(mo):
    start_discovery_btn = mo.ui.run_button(label="Start Discovery Process")
    start_discovery_btn
    return (start_discovery_btn,)


@app.cell
def _(
    MODEL,
    client,
    file_selector,
    json,
    load_srt,
    mo,
    re,
    start_discovery_btn,
):
    # Map Phase: Extract Topics per Transcript
    # Each selected transcript is sent to the model on its own; the topic
    # lists that come back are concatenated into `extracted_topics`.
    extracted_topics = []
    status_callout = mo.md("")

    if start_discovery_btn.value and file_selector.value:
        # Files that raised or produced no parseable topic list.
        _failed = []
        with mo.status.spinner("Analyzing transcripts..."):
            for _filepath in file_selector.value:
                try:
                    # Loading happens inside the try so that one unreadable
                    # file does not abort the whole run.
                    _transcript = load_srt(_filepath)

                    # Truncate for discovery if too long (optional, but good for speed)
                    # Using first 15k chars usually gives enough context for high-level themes
                    _context = _transcript[:15000]

                    _prompt = f"""
                    Analyze this interview transcript and list the top 5-7 key topics or themes discussed.
                    Focus on: Brand voice, Customer experience, Design systems, and AI.

                    Return ONLY a JSON list of strings. Example: ["Inconsistent Tone", "Mobile Latency", "AI Trust"]

                    Transcript:
                    {_context}...
                    """

                    _response = client.generate(model=MODEL, prompt=_prompt)
                    # Find JSON list in response
                    _match = re.search(r'\[.*\]', _response.response, re.DOTALL)
                    if not _match:
                        _failed.append(str(_filepath))
                        continue
                    _topics = json.loads(_match.group(0))
                    # Keep only non-empty strings: the reduce phase joins the
                    # topics with ", ".join(), which raises on other types.
                    extracted_topics.extend(
                        _t.strip()
                        for _t in _topics
                        if isinstance(_t, str) and _t.strip()
                    )
                except Exception as _err:
                    # Deliberately best-effort: report and move on to the next file.
                    print(f"Error processing {_filepath}: {_err}")
                    _failed.append(str(_filepath))

        if _failed:
            status_callout = mo.callout(
                f"Extracted {len(extracted_topics)} raw topics from "
                f"{len(file_selector.value)} files; "
                f"{len(_failed)} could not be processed: {', '.join(_failed)}",
                kind="warn"
            )
        else:
            status_callout = mo.callout(
                f"✅ Extracted {len(extracted_topics)} raw topics from {len(file_selector.value)} files.",
                kind="success"
            )
    elif start_discovery_btn.value:
        status_callout = mo.callout("Please select at least one file.", kind="warn")

    status_callout
    return (extracted_topics,)


@app.cell
def _(MODEL, client, extracted_topics, json, mo, re, start_discovery_btn):
    # Reduce Phase: Synthesize Themes
    # All raw topics are sent to the model in one prompt, which is asked to
    # group them into themes. The result feeds the editor cell below.
    suggested_themes = []

    if start_discovery_btn.value and extracted_topics:
        with mo.status.spinner("Synthesizing Master Codebook..."):
            _topics_str = ", ".join(extracted_topics)

            _synthesis_prompt = f"""
            You are a qualitative data architect.
            I have a list of raw topics extracted from multiple interviews:
            [{_topics_str}]

            Task:
            1. Group these into 5-8 distinct, high-level Themes.
            2. Create a definition for each theme.
            3. Assign a hex color code to each.
            4. ALWAYS include a theme named "Other" for miscellaneous insights.

            Return a JSON object with this structure:
            [
                {{"Theme": "Theme Name", "Definition": "Description...", "Color": "#HEXCODE"}},
                ...
            ]
            """

            _response = client.generate(model=MODEL, prompt=_synthesis_prompt)

        # Fallback row that surfaces the raw model output in the editor, so
        # a bad response is visible instead of silently yielding no themes.
        _parse_error_rows = [
            {"Theme": "Error parsing JSON", "Definition": _response.response, "Color": "#000000"}
        ]

        _match = re.search(r'\[.*\]', _response.response, re.DOTALL)
        if _match:
            try:
                _parsed = json.loads(_match.group(0))
            except json.JSONDecodeError:
                _parsed = None
            # Accept only a list of objects; anything else would build a
            # DataFrame without the Theme/Definition/Color columns.
            if isinstance(_parsed, list) and all(isinstance(_row, dict) for _row in _parsed):
                suggested_themes = _parsed
            else:
                suggested_themes = _parse_error_rows
        else:
            suggested_themes = _parse_error_rows
    return (suggested_themes,)


@app.cell
def _(mo, pd, suggested_themes):
    # Interactive Editor

    # Default empty structure if nothing generated yet
    _initial_data = suggested_themes if suggested_themes else [
        {"Theme": "Example Theme", "Definition": "Description here...", "Color": "#CCCCCC"}
    ]

    df_themes = pd.DataFrame(_initial_data)

    # NOTE(review): `column_config`, `num_rows` and `mo.ui.column.color_picker`
    # look like Streamlit-style arguments -- confirm that the installed marimo
    # version's `mo.ui.data_editor` accepts them.
    theme_editor = mo.ui.data_editor(
        df_themes,
        label="Master Codebook Editor",
        column_config={
            "Color": mo.ui.column.color_picker(label="Color")
        },
        num_rows="dynamic"  # Allow adding/removing rows
    )

    mo.vstack([
        mo.md("### Review & Refine Codebook"),
        mo.md("Edit the themes below. You can add rows, change colors, or refine definitions."),
        theme_editor
    ])
    return (theme_editor,)


@app.cell
def _(mo):
    # The button is defined in its own cell: marimo re-runs the cells that
    # *reference* a UI element when it changes, not the cell that creates it,
    # so reading `save_btn.value` here would never see a click.
    save_btn = mo.ui.run_button(label="Save Master Codebook")
    save_btn
    return (save_btn,)


@app.cell
def _(OUTPUT_FILE, json, mo, save_btn, theme_editor):
    # Write the edited codebook to OUTPUT_FILE when the save button is clicked.
    save_message = mo.md("")

    if save_btn.value:
        _final_df = theme_editor.value
        # Convert to list of dicts
        _codebook = _final_df.to_dict(orient="records")

        with open(OUTPUT_FILE, "w", encoding="utf-8") as _f:
            json.dump(_codebook, _f, indent=2)

        save_message = mo.callout(f"✅ Saved {len(_codebook)} themes to `{OUTPUT_FILE}`", kind="success")

    save_message
    return


if __name__ == "__main__":
    app.run()