From 21e4ab366ae925854d250ad7a47f72289e335bc4 Mon Sep 17 00:00:00 2001 From: Luigi Maiorano Date: Mon, 1 Dec 2025 16:58:11 +0100 Subject: [PATCH] architecture overview for afstemming + reference --- Architecture_Overview.py | 127 +++++++++++++++++++++++++++++ Interviews_Transcription.py | 2 +- Model_Selection_Reference.py | 153 +++++++++++++++++++++++++++++++++++ utils.py | 1 - 4 files changed, 281 insertions(+), 2 deletions(-) create mode 100644 Architecture_Overview.py create mode 100644 Model_Selection_Reference.py diff --git a/Architecture_Overview.py b/Architecture_Overview.py new file mode 100644 index 0000000..90d18fb --- /dev/null +++ b/Architecture_Overview.py @@ -0,0 +1,127 @@ +import marimo + +__generated_with = "0.18.0" +app = marimo.App(width="medium") + + +@app.cell +def _(): + import marimo as mo + return (mo,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + # Interview Analysis Pipeline Architecture + + **Project Goal:** Synthesize insights from 26 stakeholder interviews into a unified report. + + **Input:** 26 Interview Transcripts (`.srt`) + **Output:** Comprehensive Qualitative Analysis Report + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## High-Level Workflow + + The analysis follows a structured **3-Stage Pipeline** to ensure consistency across all interviews while leveraging the reasoning capabilities of Large Language Models (LLMs). + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## Stage 1: Discovery (Theme Definition) + + **Goal:** Establish the "Rules of the Game" to ensure consistent analysis. + + 1. **Input:** A representative sample of 4-5 interviews. + 2. **Process:** + * Exploratory analysis to identify recurring topics. + * Grouping topics into **Themes**. + * Defining the **"Other"** category for emerging insights that don't fit established themes. + 3. **Output:** `master_codebook.json` + * Contains Theme Names, Definitions, and Color Codes. 
+ * Serves as the strict instruction set for the AI in Stage 2. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.mermaid(""" + graph TD + A[Raw Transcripts] -->|Sample 4-5| B(Stage 1: Discovery) + B -->|Generate| C[Master Codebook] + C -->|Input| D(Stage 2: Theme Coding) + A -->|All 26 Files| D + D -->|Extract| E[Structured Dataset] + E -->|Aggregate| F(Stage 3: Synthesis) + F -->|Generate| G[Final Report] + + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## Stage 2: Structured Theme Coding (Extraction) + + **Goal:** Convert unstructured text into a structured dataset. + + 1. **Input:** All 26 Transcripts + `master_codebook.json`. + 2. **Process:** + * The LLM analyzes each transcript segment-by-segment. + * It extracts specific quotes that match a Theme Definition. + * **Granular Sentiment Analysis:** For each quote, the model identifies: + * **Subject:** The specific topic/object being discussed (e.g., "Login Flow", "Brand Tone"). + * **Sentiment:** Positive / Neutral / Negative. + 3. **Output:** `coded_segments.csv` + * Columns: `Source_File`, `Speaker`, `Theme`, `Quote`, `Subject`, `Sentiment`, `Context`. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## Stage 3: Synthesis & Reporting + + **Goal:** Derive conclusions from the aggregated data. + + 1. **Input:** `coded_segments.csv` (The consolidated dataset). + 2. **Process:** + * **Theme Synthesis:** All quotes for "Theme A" are analyzed together to find patterns, contradictions, and consensus. + * **"Other" Review:** The "Other" category is manually or computationally reviewed to identify missed signals. + * **Global Synthesis:** Cross-theme analysis to build the final narrative. + 3. 
**Output:** Final Report + * Executive Summary + * Theme-by-Theme Deep Dives (with supporting quotes) + * Strategic Recommendations + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## Technical Infrastructure + + | Component | Specification | Role | + |-----------|---------------|------| + | **Model** | `llama3.3:70b` | Primary reasoning engine (128k context) | + | **Compute** | NVIDIA H100 (80GB) | High-performance inference | + | **Orchestration** | Python + Marimo | Pipeline management and UI | + | **Storage** | Local JSON/CSV | Data persistence | + """) + return + + +if __name__ == "__main__": + app.run() diff --git a/Interviews_Transcription.py b/Interviews_Transcription.py index 1f0dc46..b8c015b 100644 --- a/Interviews_Transcription.py +++ b/Interviews_Transcription.py @@ -15,7 +15,7 @@ def _(mo): mo.md(r""" # Interview Audio Transcription - Use Whisper-Webui: http://whisper-webui.tail44fa00.ts.net:7860 + Use Whisper-Webui: http://whisper-webui-h100.tail44fa00.ts.net:7860 """) return diff --git a/Model_Selection_Reference.py b/Model_Selection_Reference.py new file mode 100644 index 0000000..df0b0ab --- /dev/null +++ b/Model_Selection_Reference.py @@ -0,0 +1,153 @@ +import marimo + +__generated_with = "0.18.0" +app = marimo.App(width="medium") + + +@app.cell +def _(): + import marimo as mo + return (mo,) + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + # LLM Model Selection Reference + + A reference guide for choosing models for interview transcript thematic analysis. 
+ + --- + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## Infrastructure + + | Resource | Specification | + |----------|---------------| + | **GPU** | NVIDIA H100 (80GB VRAM) | + | **VM** | `hiperf-gpu` via Tailscale | + | **API** | Ollama Python client | + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## Recommended Models for Thematic Analysis + + ### Primary Recommendation: `llama3.3:70b` + + | Aspect | Value | + |--------|-------| + | **Context Window** | 128K tokens | + | **VRAM Usage** | ~45GB | + | **Architecture** | Dense (70B always active) | + | **Strengths** | Excellent instruction following, proven reliability, great for long documents | + + ### Alternatives + + | Model | Context | VRAM | Best For | + |-------|---------|------|----------| + | `qwen3:30b` | 256K | ~19GB | Fast iteration, huge context window | + | `qwen3:32b` | 40K | ~20GB | Balance of speed and quality | + | `qwen3:235b` | 256K | ~142GB (needs quantization) | Maximum quality (MoE: 22B active) | + | `deepseek-r1:70b` | 64K | ~45GB | Reasoning transparency (shows thinking) | + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## Context Window Considerations + + ### For 1-Hour Interview Transcripts + + - **Estimated size**: ~8,000-10,000 tokens + - **Requirement**: Any model with 32K+ context is sufficient + - **Recommendation**: `llama3.3:70b` (128K) handles full transcripts easily + + ### When Larger Context Helps ✅ + + - Full document fits without chunking + - Model can connect themes across entire transcript + - Simpler preprocessing pipeline + + ### When Larger Context Can Hurt ⚠️ + + | Issue | Explanation | + |-------|-------------| + | **"Lost in the middle"** | LLMs focus on beginning/end, lose attention to middle | + | **Slower inference** | Attention scales quadratically with length | + | **Diluted attention** | Key info gets drowned by less relevant content | + + ### Key Insight + + Research shows models 
often perform **worse** with very long contexts vs. strategically selected shorter contexts. For ~10K token transcripts, **context window size doesn't matter** — choose based on model quality and speed. + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## Document Chunking + + ### When You Need Chunking + + | Model Context | 30-min Transcript (~5K tokens) | 1-hour Transcript (~10K tokens) | + |---------------|-------------------------------|--------------------------------| + | 4K-8K (7B models) | ⚠️ May need chunking | ❌ Needs chunking | + | 32K-40K | ✅ Fits | ✅ Fits | + | 128K+ | ✅ Fits easily | ✅ Fits easily | + + ### Chunking Strategies (if needed) + + 1. **By speaker turns** — Split at natural conversation boundaries + 2. **By time segments** — 10-15 minute chunks + 3. **By token count** — Fixed size with overlap + 4. **Hierarchical** — Summarize chunks, then analyze summaries + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + ## Model Comparison Summary + + ``` + Quality: qwen3:235b > llama3.3:70b ≈ qwen3:30b > qwen3:32b + Speed: qwen3:30b > qwen3:32b > llama3.3:70b > qwen3:235b + Context: qwen3:235b (256K) = qwen3:30b (256K) > llama3.3:70b (128K) > qwen3:32b (40K) + ``` + + ### Final Recommendation + + **Use `llama3.3:70b`** for this project: + - 128K context is more than sufficient for 1-hour transcripts + - Excellent quality for thematic analysis + - Well-tested and reliable + - Good balance of speed and quality on H100 + """) + return + + +@app.cell(hide_code=True) +def _(mo): + mo.md(r""" + --- + + *Last updated: December 2025* + """) + return + + +if __name__ == "__main__": + app.run() diff --git a/utils.py b/utils.py index 9b6495d..789c269 100644 --- a/utils.py +++ b/utils.py @@ -6,7 +6,6 @@ import re from pathlib import Path import requests -import ollama from ollama import Client