import marimo

__generated_with = "0.18.0"
app = marimo.App(width="medium")


@app.cell
def _():
    import marimo as mo
    return (mo,)


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
    # LLM Model Selection Reference

    A reference guide for choosing models for interview transcript thematic analysis.

    ---
    """
    )
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
    ## Infrastructure

    | Resource | Specification |
    |----------|---------------|
    | **GPU** | NVIDIA H100 (80GB VRAM) |
    | **VM** | `hiperf-gpu` via Tailscale |
    | **API** | Ollama Python client |
    """
    )
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
    ## Recommended Models for Thematic Analysis

    ### Primary Recommendation: `llama3.3:70b`

    | Aspect | Value |
    |--------|-------|
    | **Context Window** | 128K tokens |
    | **VRAM Usage** | ~45GB |
    | **Architecture** | Dense (70B always active) |
    | **Strengths** | Excellent instruction following, proven reliability, great for long documents |

    ### Alternatives

    | Model | Context | VRAM | Best For |
    |-------|---------|------|----------|
    | `qwen3:30b` | 256K | ~19GB | Fast iteration, huge context window |
    | `qwen3:32b` | 40K | ~20GB | Balance of speed and quality |
    | `qwen3:235b` | 256K | ~142GB (needs quantization) | Maximum quality (MoE: 22B active) |
    | `deepseek-r1:70b` | 64K | ~45GB | Reasoning transparency (shows thinking) |
    """
    )
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
    ## Context Window Considerations

    ### For 1-Hour Interview Transcripts

    - **Estimated size**: ~8,000-10,000 tokens
    - **Requirement**: any model with 32K+ context is sufficient
    - **Recommendation**: `llama3.3:70b` (128K) handles full transcripts easily

    ### When Larger Context Helps ✅

    - Full document fits without chunking
    - Model can connect themes across the entire transcript
    - Simpler preprocessing pipeline

    ### When Larger Context Can Hurt ⚠️

    | Issue | Explanation |
    |-------|-------------|
    | **"Lost in the middle"** | LLMs focus on the beginning and end of the context, losing attention to the middle |
    | **Slower inference** | Attention scales quadratically with sequence length |
    | **Diluted attention** | Key information gets drowned out by less relevant content |

    ### Key Insight

    Research shows models often perform **worse** with very long contexts than with strategically selected shorter ones. For ~10K-token transcripts, **context window size doesn't matter** — choose based on model quality and speed.
    """
    )
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
    ## Document Chunking

    ### When You Need Chunking

    | Model Context | 30-min Transcript (~5K tokens) | 1-hour Transcript (~10K tokens) |
    |---------------|-------------------------------|--------------------------------|
    | 4K-8K (7B models) | ⚠️ May need chunking | ❌ Needs chunking |
    | 32K-40K | ✅ Fits | ✅ Fits |
    | 128K+ | ✅ Fits easily | ✅ Fits easily |

    ### Chunking Strategies (if needed)

    1. **By speaker turns** — Split at natural conversation boundaries
    2. **By time segments** — 10-15 minute chunks
    3. **By token count** — Fixed size with overlap
    4. **Hierarchical** — Summarize chunks, then analyze the summaries
    """
    )
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
    ## Model Comparison Summary

    ```
    Quality:  qwen3:235b > llama3.3:70b ≈ qwen3:30b > qwen3:32b
    Speed:    qwen3:30b > qwen3:32b > llama3.3:70b > qwen3:235b
    Context:  qwen3:235b (256K) > qwen3:30b (256K) > llama3.3:70b (128K) > qwen3:32b (40K)
    ```

    ### Final Recommendation

    **Use `llama3.3:70b`** for this project:

    - 128K context is more than sufficient for 1-hour transcripts
    - Excellent quality for thematic analysis
    - Well-tested and reliable
    - Good balance of speed and quality on the H100
    """
    )
    return


@app.cell(hide_code=True)
def _(mo):
    mo.md(
        r"""
    ---

    *Last updated: December 2025*
    """
    )
    return


if __name__ == "__main__":
    app.run()
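

# A minimal sketch of the "by token count with overlap" chunking strategy
# listed under Chunking Strategies, in case a transcript ever exceeds the
# model's context window. It approximates tokens with whitespace-separated
# words (an assumption; a real pipeline would count tokens with the model's
# own tokenizer). The function name and defaults are illustrative only.
def chunk_by_tokens(text: str, chunk_size: int = 2000, overlap: int = 200) -> list[str]:
    """Split `text` into word-based chunks, sharing `overlap` words between
    neighbouring chunks so themes spanning a boundary are not lost."""
    if chunk_size <= overlap:
        raise ValueError("chunk_size must be larger than overlap")
    words = text.split()
    chunks = []
    start = 0
    while start < len(words):
        chunks.append(" ".join(words[start:start + chunk_size]))
        if start + chunk_size >= len(words):
            break  # final chunk emitted; stop before producing an empty tail
        start += chunk_size - overlap
    return chunks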