thematic analysis opzetje

2025-12-01 15:09:16 +01:00
parent 74aecff2bd
commit 9499d6c068
4 changed files with 331 additions and 7 deletions
--- a/Thematic_Analysis.py
+++ b/Thematic_Analysis.py
@@ -0,0 +1,273 @@
 import marimo
 # Version stamp stored alongside the notebook (presumably the marimo release that generated this file - confirm).
 __generated_with = "0.18.0"
 # Notebook application object; every cell below is registered through its `cell` decorator.
 app = marimo.App(width="medium")
@app.cell
 def _():
    # Shared setup: import the notebook dependencies and open the Ollama client once.
    import marimo as mo
    from pathlib import Path
    from utils import connect_qumo_ollama, load_srt
    # Name of the VM handed to connect_qumo_ollama.
    VM_NAME = 'hiperf-gpu'
    # Model identifier passed to client.generate() by the inference cells.
    MODEL = 'llama3.3:70b'
    client = connect_qumo_ollama(VM_NAME)
    # VM_NAME is not returned; only the names below are exposed to other cells.
    return MODEL, Path, client, load_srt, mo
@app.cell(hide_code=True)
 def _(mo):
    # Notebook title and one-line description, rendered as markdown.
    mo.md(r"""
    # Interview Transcript Thematic Analysis
    This notebook loads interview transcripts (SRT files) and runs thematic analysis using LLMs.
    """)
    return
@app.cell
def _(Path, mo):
    # Directory (relative to the working directory) that holds the SRT transcripts.
    TRANSCRIPT_DIR = Path("data/transcripts")
    # Path.glob yields entries in a filesystem-dependent order; sort them so the
    # dropdown lists the transcripts in the same, predictable order on every run.
    srt_files = sorted(TRANSCRIPT_DIR.glob("*.srt"))
    # File selector: shows the bare file name, carries the path string as its value.
    file_dropdown = mo.ui.dropdown(
        options={f.name: str(f) for f in srt_files},
        label="Select transcript file"
    )
    file_dropdown
    return (file_dropdown,)
@app.cell
 def _(file_dropdown, load_srt, mo):
    # Load and display transcript preview
    # Stays an empty string until a file is selected, so the counts below render as 0.
    transcript_raw = ""
    if file_dropdown.value:
        transcript_raw = load_srt(file_dropdown.value)
    # NOTE(review): the "..." suffix is emitted even when the transcript is shorter than 2000 characters.
    mo.md(f"""
    ## Transcript Preview
    **File:** `{file_dropdown.value or 'None selected'}`  
    **Length:** {len(transcript_raw)} characters, ~{len(transcript_raw.split())} words
    <details>
    <summary>Show first 2000 characters</summary>
    ```
    {transcript_raw[:2000]}...
    ```
    </details>
    """)
    return (transcript_raw,)
@app.cell(hide_code=True)
 def _(mo):
    # Section heading for the speaker-role inference step.
    mo.md(r"""
    ## Step 1: Infer Speaker Roles
    The model will analyze the transcript to identify who is the interviewer and who is the interviewee.
    """)
    return
@app.cell
 def _(mo, transcript_raw):
    # Infer speaker roles from transcript context
    # Only the first 4000 characters of the transcript are embedded in the prompt.
    role_inference_prompt = f"""Analyze this interview transcript and identify the role of each speaker.
    Based on the conversation context, determine who is:
    - The interviewer(s) - asking questions, guiding the conversation
    - The interviewee(s) - providing answers, sharing expertise/opinions
    Return ONLY a simple mapping in this exact format (one per line):
    SPEAKER_XX: Role - Brief description
    For example:
    SPEAKER_00: Interviewer - Michael from the voice branding team
    SPEAKER_01: Interviewee - Head of Digital Design
    <transcript>
    {transcript_raw[:4000]}
    </transcript>
    """
    # Button whose value gates the model call in the cell that consumes it.
    infer_roles_button = mo.ui.run_button(label="Infer Speaker Roles")
    infer_roles_button
    return infer_roles_button, role_inference_prompt
@app.cell
 def _(MODEL, client, infer_roles_button, mo, role_inference_prompt):
    # Query the model for speaker roles, but only when the button's value is truthy.
    inferred_roles_text = ""
    if infer_roles_button.value:
        response = client.generate(model=MODEL, prompt=role_inference_prompt)
        # The generated text is read from the `response` attribute of the result.
        inferred_roles_text = response.response
    # Show the model output, or a hint to click the button while there is none yet.
    mo.md(f"""
    ### Inferred Roles
    {inferred_roles_text if inferred_roles_text else "_Click 'Infer Speaker Roles' to analyze the transcript_"}
    """)
    return
@app.cell(hide_code=True)
 def _(mo):
    # Section heading for the manual role confirmation step.
    mo.md(r"""
    ## Step 2: Confirm or Edit Speaker Roles
    Review the inferred roles below and make corrections if needed.
    """)
    return
@app.cell
 def _(mo, transcript_raw):
    import re
    # Extract unique speakers from transcript
    # Only labels of the form SPEAKER_<digits> followed by a colon are picked up.
    # NOTE(review): load_srt labels untagged blocks "UNKNOWN"; that label does not match this pattern.
    speakers = sorted(set(re.findall(r'(SPEAKER_\d+):', transcript_raw)))
    # Create editable text inputs for each speaker
    # Each input is pre-filled with the speaker ID itself.
    # NOTE(review): the widgets live in a plain dict - confirm marimo still tracks their
    # value changes in dependent cells (a mo.ui.dictionary wrapper may be required).
    role_inputs = {
        speaker: mo.ui.text(
            value=f"{speaker}",
            label=speaker,
            full_width=True
        )
        for speaker in speakers
    }
    mo.md("### Edit Speaker Labels\n\nEnter the name/role for each speaker:")
    return (role_inputs,)
@app.cell
 def _(mo, role_inputs):
    # Display role inputs as a form
    # Widgets are stacked vertically in sorted speaker-ID order.
    mo.vstack([role_inputs[k] for k in sorted(role_inputs.keys())])
    return
@app.cell
def _(mo, role_inputs, transcript_raw):
    # Apply role labels to transcript.
    # Cell-level helper names are underscore-prefixed so they stay private to this cell.
    import re as _re

    # Only speakers whose label was actually edited need to be substituted.
    _renames = {
        speaker_id: widget.value
        for speaker_id, widget in role_inputs.items()
        if widget.value and widget.value != speaker_id
    }

    labeled_transcript = transcript_raw
    if _renames:
        # Substitute every "<speaker_id>:" tag in ONE pass over the original text.
        # Chained str.replace calls would re-replace text produced by an earlier
        # replacement, e.g. when two speakers' labels are swapped
        # (SPEAKER_00 -> SPEAKER_01 and SPEAKER_01 -> SPEAKER_00).
        _speaker_tag = _re.compile(
            "(" + "|".join(_re.escape(speaker_id) for speaker_id in _renames) + "):"
        )
        labeled_transcript = _speaker_tag.sub(
            lambda match: f"{_renames[match.group(1)]}:",
            transcript_raw,
        )

    # Build role mapping summary: one "- id → label" bullet per speaker, sorted by ID.
    role_mapping = "\n".join(
        f"- {speaker_id} → {widget.value}"
        for speaker_id, widget in sorted(role_inputs.items())
    )
    mo.md(f"""
    ### Role Mapping Applied
    {role_mapping}
    """)
    return labeled_transcript, role_mapping
@app.cell(hide_code=True)
 def _(mo):
    # Section heading for the thematic analysis step.
    mo.md(r"""
    ## Step 3: Thematic Analysis
    Configure your analysis task and run the thematic analysis.
    """)
    return
@app.cell
 def _(mo):
    # Editable analysis task prompt
    # The default text below is the starting value; its current value is inserted
    # into the full analysis prompt by the cell that consumes analysis_task_input.
    analysis_task_input = mo.ui.text_area(
        value="""Perform a thematic analysis of this interview transcript.
    Identify and describe:
    1. **Key Themes** - Major topics and ideas that emerge from the conversation
    2. **Supporting Quotes** - Direct quotes that exemplify each theme (include speaker attribution)
    3. **Insights** - Notable observations or implications from the discussion
    Focus on themes related to:
    - Brand voice and tone strategy
    - Customer experience priorities
    - Design system and consistency
    - AI/conversational interface considerations""",
        label="Analysis Task",
        full_width=True,
        rows=12
    )
    analysis_task_input
    return (analysis_task_input,)
@app.cell
 def _(analysis_task_input, labeled_transcript, mo, role_mapping):
    # Build full analysis prompt
    # Combines the role mapping, the editable task text and the complete labeled
    # transcript (no truncation is applied here) into a single prompt string.
    full_analysis_prompt = f"""You are an expert qualitative researcher specializing in thematic analysis of interview data.
    ## Speaker Roles
    {role_mapping}
    ## Task
    {analysis_task_input.value}
    ## Interview Transcript
    '''
    <transcript>
    {labeled_transcript}
    </transcript>
    '''
    Provide your analysis in well-structured markdown format."""
    # Button whose value gates the model call in the cell that consumes it.
    run_analysis_button = mo.ui.run_button(label="Run Thematic Analysis")
    # Show a whitespace-split word count of the prompt above the button.
    mo.vstack([
        mo.md(f"**Prompt length:** ~{len(full_analysis_prompt.split())} words"),
        run_analysis_button
    ])
    return full_analysis_prompt, run_analysis_button
@app.cell
 def _(full_analysis_prompt, mo):
    # Display the assembled prompt as markdown so it can be inspected.
    mo.md(rf"""
    # Full Analysis Prompt
    ---
    {full_analysis_prompt}
    """)
    return
@app.cell
 def _(MODEL, client, full_analysis_prompt, mo, run_analysis_button):
    # Run the thematic analysis, but only when the button's value is truthy.
    analysis_response = ""
    if run_analysis_button.value:
        response_2 = client.generate(model=MODEL, prompt=full_analysis_prompt)
        # The generated text is read from the `response` attribute of the result.
        analysis_response = response_2.response
    # Show the model output, or a hint to click the button while there is none yet.
    mo.md(f"""
    ## Analysis Results
    {analysis_response if analysis_response else "_Click 'Run Thematic Analysis' to generate analysis_"}
    """)
    return
 # Run the notebook app when this file is executed directly as a script.
 if __name__ == "__main__":
    app.run()
--- a/VB_interviews_sandbox.py
+++ b/VB_interviews_sandbox.py
@@ -9,8 +9,8 @@ def _():
    import marimo as mo
    from utils import connect_qumo_ollama
-    # VM_NAME = 'hiperf-gpu'
+    VM_NAME = 'hiperf-gpu'
-    VM_NAME = 'ollama-lite'
+    # VM_NAME = 'ollama-lite'
    client = connect_qumo_ollama(VM_NAME)
    return VM_NAME, client, mo
--- a/layouts/VB_interviews_sandbox.slides.json
+++ b/layouts/VB_interviews_sandbox.slides.json
@@ -1,4 +0,0 @@
 {
  "type": "slides",
  "data": {}
 }
--- a/utils.py
+++ b/utils.py
@@ -2,11 +2,66 @@
 Standard utils for this repository
 """
 import re
 from pathlib import Path
 import requests
 import ollama
 from ollama import Client
 def load_srt(path: str | Path) -> str:
    """Load and parse an SRT file, returning clean transcript with speaker labels.
    Args:
        path: Path to the SRT file
    Returns:
        Clean transcript string with format "SPEAKER_XX: text" per line,
        timestamps stripped, consecutive lines from same speaker merged.
    """
    path = Path(path)
    content = path.read_text(encoding='utf-8')
    # Parse SRT blocks: sequence number, timestamp, speaker|text
    # Pattern matches: number, timestamp line, content line(s)
    blocks = re.split(r'\n\n+', content.strip())
    turns = []
    for block in blocks:
        lines = block.strip().split('\n')
        if len(lines) < 3:
            continue
        # Skip sequence number (line 0) and timestamp (line 1)
        # Content is line 2 onwards
        text_lines = lines[2:]
        text = ' '.join(text_lines)
        # Parse speaker|text format
        if '|' in text:
            speaker, utterance = text.split('|', 1)
            speaker = speaker.strip()
            utterance = utterance.strip()
        else:
            speaker = "UNKNOWN"
            utterance = text.strip()
        turns.append((speaker, utterance))
    # Merge consecutive turns from same speaker
    merged = []
    for speaker, utterance in turns:
        if merged and merged[-1][0] == speaker:
            merged[-1] = (speaker, merged[-1][1] + ' ' + utterance)
        else:
            merged.append((speaker, utterance))
    # Format as "SPEAKER_XX: text"
    transcript_lines = [f"{speaker}: {utterance}" for speaker, utterance in merged]
    return '\n\n'.join(transcript_lines)
 def connect_qumo_ollama(vm_name: str ='ollama-lite') -> Client:
    """Establish connection to Qumo Ollama instance
@@ -25,7 +80,7 @@ def connect_qumo_ollama(vm_name: str ='ollama-lite') -> Client:
    except requests.ConnectionError:
        print(f"Failed to reach {QUMO_OLLAMA_URL}. Check that the VM is running and Tailscale is up")
-    print("Connection succesful.\nAvailable models:")
+    print(f"Connection succesful. WebUI available at: http://{vm_name}.tail44fa00.ts.net:3000\nAvailable models:")
    for m in client.list().models:
        print(f"  - '{m.model}' ")
    return client