architecture overview for afstemming + reference

2025-12-01 16:58:11 +01:00
parent 9499d6c068
commit 21e4ab366a
4 changed files with 281 additions and 2 deletions
--- a/Architecture_Overview.py
+++ b/Architecture_Overview.py
@@ -0,0 +1,127 @@
 import marimo
 __generated_with = "0.18.0"
 app = marimo.App(width="medium")
@app.cell
 def _():
    import marimo as mo
    return (mo,)
@app.cell(hide_code=True)
 def _(mo):
    mo.md(r"""
    # Interview Analysis Pipeline Architecture
    **Project Goal:** Synthesize insights from 26 stakeholder interviews into a unified report.
    **Input:** 26 Interview Transcripts (`.srt`)
    **Output:** Comprehensive Qualitative Analysis Report
    """)
    return
@app.cell(hide_code=True)
 def _(mo):
    mo.md(r"""
    ## High-Level Workflow
    The analysis follows a structured **3-Stage Pipeline** to ensure consistency across all interviews while leveraging the reasoning capabilities of Large Language Models (LLMs).
    """)
    return
@app.cell(hide_code=True)
 def _(mo):
    mo.md(r"""
    ## Stage 1: Discovery (Theme Definition)
    **Goal:** Establish the "Rules of the Game" to ensure consistent analysis.
    1.  **Input:** A representative sample of 4-5 interviews.
    2.  **Process:**
        *   Exploratory analysis to identify recurring topics.
        *   Grouping topics into **Themes**.
        *   Defining the **"Other"** category for emerging insights that don't fit established themes.
    3.  **Output:** `master_codebook.json`
        *   Contains Theme Names, Definitions, and Color Codes.
        *   Serves as the strict instruction set for the AI in Stage 2.
    """)
    return
@app.cell(hide_code=True)
 def _(mo):
    mo.mermaid("""
    graph TD
        A[Raw Transcripts] -->|Sample 4-5| B(Stage 1: Discovery)
        B -->|Generate| C[Master Codebook]
        C -->|Input| D(Stage 2: Theme Coding)
        A -->|All 26 Files| D
        D -->|Extract| E[Structured Dataset]
        E -->|Aggregate| F(Stage 3: Synthesis)
        F -->|Generate| G[Final Report]
    """)
    return
@app.cell(hide_code=True)
 def _(mo):
    mo.md(r"""
    ## Stage 2: Structured Theme Coding (Extraction)
    **Goal:** Convert unstructured text into a structured dataset.
    1.  **Input:** All 26 Transcripts + `master_codebook.json`.
    2.  **Process:**
        *   The LLM analyzes each transcript segment-by-segment.
        *   It extracts specific quotes that match a Theme Definition.
        *   **Granular Sentiment Analysis:** For each quote, the model identifies:
            *   **Subject:** The specific topic/object being discussed (e.g., "Login Flow", "Brand Tone").
            *   **Sentiment:** Positive / Neutral / Negative.
    3.  **Output:** `coded_segments.csv`
        *   Columns: `Source_File`, `Speaker`, `Theme`, `Quote`, `Subject`, `Sentiment`, `Context`.
    """)
    return
@app.cell(hide_code=True)
 def _(mo):
    mo.md(r"""
    ## Stage 3: Synthesis & Reporting
    **Goal:** Derive conclusions from the aggregated data.
    1.  **Input:** `coded_segments.csv` (The consolidated dataset).
    2.  **Process:**
        *   **Theme Synthesis:** All quotes for "Theme A" are analyzed together to find patterns, contradictions, and consensus.
        *   **"Other" Review:** The "Other" category is manually or computationally reviewed to identify missed signals.
        *   **Global Synthesis:** Cross-theme analysis to build the final narrative.
    3.  **Output:** Final Report
        *   Executive Summary
        *   Theme-by-Theme Deep Dives (with supporting quotes)
        *   Strategic Recommendations
    """)
    return
@app.cell(hide_code=True)
 def _(mo):
    mo.md(r"""
    ## Technical Infrastructure
    | Component | Specification | Role |
    |-----------|---------------|------|
    | **Model** | `llama3.3:70b` | Primary reasoning engine (128k context) |
    | **Compute** | NVIDIA H100 (80GB) | High-performance inference |
    | **Orchestration** | Python + Marimo | Pipeline management and UI |
    | **Storage** | Local JSON/CSV | Data persistence |
    """)
    return
 if __name__ == "__main__":
    app.run()
--- a/Interviews_Transcription.py
+++ b/Interviews_Transcription.py
@@ -15,7 +15,7 @@ def _(mo):
    mo.md(r"""
    # Interview Audio Transcription
-    Use Whisper-Webui: http://whisper-webui.tail44fa00.ts.net:7860
+    Use Whisper-Webui: http://whisper-webui-h100.tail44fa00.ts.net:7860
    """)
    return
--- a/Model_Selection_Reference.py
+++ b/Model_Selection_Reference.py
@@ -0,0 +1,153 @@
 import marimo
 __generated_with = "0.18.0"
 app = marimo.App(width="medium")
@app.cell
 def _():
    import marimo as mo
    return (mo,)
@app.cell(hide_code=True)
 def _(mo):
    mo.md(r"""
    # LLM Model Selection Reference
    A reference guide for choosing models for interview transcript thematic analysis.
    ---
    """)
    return
@app.cell(hide_code=True)
 def _(mo):
    mo.md(r"""
    ## Infrastructure
    | Resource | Specification |
    |----------|---------------|
    | **GPU** | NVIDIA H100 (80GB VRAM) |
    | **VM** | `hiperf-gpu` via Tailscale |
    | **API** | Ollama Python client |
    """)
    return
@app.cell(hide_code=True)
 def _(mo):
    mo.md(r"""
    ## Recommended Models for Thematic Analysis
    ### Primary Recommendation: `llama3.3:70b`
    | Aspect | Value |
    |--------|-------|
    | **Context Window** | 128K tokens |
    | **VRAM Usage** | ~45GB |
    | **Architecture** | Dense (70B always active) |
    | **Strengths** | Excellent instruction following, proven reliability, great for long documents |
    ### Alternatives
    | Model | Context | VRAM | Best For |
    |-------|---------|------|----------|
    | `qwen3:30b` | 256K | ~19GB | Fast iteration, huge context window |
    | `qwen3:32b` | 40K | ~20GB | Balance of speed and quality |
    | `qwen3:235b` | 256K | ~142GB (needs quantization or offloading to fit the 80GB H100) | Maximum quality (MoE: 22B active) |
    | `deepseek-r1:70b` | 64K | ~45GB | Reasoning transparency (shows thinking) |
    """)
    return
@app.cell(hide_code=True)
 def _(mo):
    mo.md(r"""
    ## Context Window Considerations
    ### For 1-Hour Interview Transcripts
    - **Estimated size**: ~8,000-10,000 tokens
    - **Requirement**: Any model with 32K+ context is sufficient
    - **Recommendation**: `llama3.3:70b` (128K) handles full transcripts easily
    ### When Larger Context Helps ✅
    - Full document fits without chunking
    - Model can connect themes across entire transcript
    - Simpler preprocessing pipeline
    ### When Larger Context Can Hurt ⚠️
    | Issue | Explanation |
    |-------|-------------|
    | **"Lost in the middle"** | LLMs attend most to the beginning and end of the context and tend to miss information in the middle |
    | **Slower inference** | Attention scales quadratically with length |
    | **Diluted attention** | Key info gets drowned by less relevant content |
    ### Key Insight
    Research shows models often perform **worse** with very long contexts than with strategically selected shorter contexts. For ~10K-token transcripts, **context window size doesn't matter once the model offers 32K+** — beyond that threshold, choose based on model quality and speed.
    """)
    return
@app.cell(hide_code=True)
 def _(mo):
    mo.md(r"""
    ## Document Chunking
    ### When You Need Chunking
    | Model Context | 30-min Transcript (~5K tokens) | 1-hour Transcript (~10K tokens) |
    |---------------|-------------------------------|--------------------------------|
    | 4K-8K (7B models) | ⚠️ May need chunking | ❌ Needs chunking |
    | 32K-40K | ✅ Fits | ✅ Fits |
    | 128K+ | ✅ Fits easily | ✅ Fits easily |
    ### Chunking Strategies (if needed)
    1. **By speaker turns** — Split at natural conversation boundaries
    2. **By time segments** — 10-15 minute chunks
    3. **By token count** — Fixed size with overlap
    4. **Hierarchical** — Summarize chunks, then analyze summaries
    """)
    return
@app.cell(hide_code=True)
 def _(mo):
    mo.md(r"""
    ## Model Comparison Summary
    ```
    Quality:     qwen3:235b > llama3.3:70b ≈ qwen3:30b > qwen3:32b
    Speed:       qwen3:30b > qwen3:32b > llama3.3:70b > qwen3:235b
    Context:     qwen3:235b (256K) = qwen3:30b (256K) > llama3.3:70b (128K) > qwen3:32b (40K)
    ```
    ### Final Recommendation
    **Use `llama3.3:70b`** for this project:
    - 128K context is more than sufficient for 1-hour transcripts
    - Excellent quality for thematic analysis
    - Well-tested and reliable
    - Good balance of speed and quality on H100
    """)
    return
@app.cell(hide_code=True)
 def _(mo):
    mo.md(r"""
    ---
    *Last updated: December 2025*
    """)
    return
 if __name__ == "__main__":
    app.run()
--- a/utils.py
+++ b/utils.py
@@ -6,7 +6,6 @@ import re
 from pathlib import Path
 import requests
 import ollama
 from ollama import Client