thematic analysis opzetje

2025-12-01 15:09:16 +01:00
parent 74aecff2bd
commit 9499d6c068
4 changed files with 331 additions and 7 deletions
--- a/Thematic_Analysis.py
+++ b/Thematic_Analysis.py
@@ -0,0 +1,273 @@
+import marimo
+
+# Presumably the marimo version that wrote this notebook file (tool-generated value) — confirm.
+__generated_with = "0.18.0"
+# Notebook application object; every cell below is attached to it via `@app.cell`.
+app = marimo.App(width="medium")
+
+
+@app.cell
+def _():
+    # Setup cell: imports, model configuration and the shared LLM client.
+    import marimo as mo
+    from pathlib import Path
+    from utils import connect_qumo_ollama, load_srt
+
+    # VM_NAME is handed to connect_qumo_ollama below; MODEL is the model id
+    # passed to every client.generate() call in later cells.
+    VM_NAME = 'hiperf-gpu'
+    MODEL = 'llama3.3:70b'
+
+    # NOTE(review): connect_qumo_ollama is a project helper not shown here;
+    # presumably it returns an Ollama-style client (later cells call
+    # client.generate and read `.response`) — confirm.
+    client = connect_qumo_ollama(VM_NAME)
+    # The returned names are the ones other cells receive as parameters.
+    return MODEL, Path, client, load_srt, mo
+
+
+# Title cell: static markdown with the notebook heading and a one-line description.
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""
+    # Interview Transcript Thematic Analysis
+
+    This notebook loads interview transcripts (SRT files) and runs thematic analysis using LLMs.
+    """)
+    return
+
+
+@app.cell
+def _(Path, mo):
+    # Load transcript from SRT file
+    TRANSCRIPT_DIR = Path("data/transcripts")
+    srt_files = list(TRANSCRIPT_DIR.glob("*.srt"))
+
+    # File selector
+    file_dropdown = mo.ui.dropdown(
+        options={f.name: str(f) for f in srt_files},
+        label="Select transcript file"
+    )
+    file_dropdown
+    return (file_dropdown,)
+
+
+@app.cell
+def _(file_dropdown, load_srt, mo):
+    # Load and display transcript preview
+    transcript_raw = ""
+    if file_dropdown.value:
+        transcript_raw = load_srt(file_dropdown.value)
+
+    mo.md(f"""
+    ## Transcript Preview
+
+    **File:** `{file_dropdown.value or 'None selected'}`  
+    **Length:** {len(transcript_raw)} characters, ~{len(transcript_raw.split())} words
+
+    <details>
+    <summary>Show first 2000 characters</summary>
+
+    ```
+    {transcript_raw[:2000]}...
+    ```
+    </details>
+    """)
+    return (transcript_raw,)
+
+
+# Section heading cell for step 1 (speaker-role inference); static markdown only.
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""
+    ## Step 1: Infer Speaker Roles
+
+    The model will analyze the transcript to identify who is the interviewer and who is the interviewee.
+    """)
+    return
+
+
+@app.cell
+def _(mo, transcript_raw):
+    # Build the speaker-role inference prompt and the button that triggers it.
+    # Only the first 4000 characters of the transcript are embedded.
+    # NOTE: no dedent is applied in this cell, so the leading indentation of the
+    # literal's continuation lines is part of the prompt text as written.
+    role_inference_prompt = f"""Analyze this interview transcript and identify the role of each speaker.
+
+    Based on the conversation context, determine who is:
+    - The interviewer(s) - asking questions, guiding the conversation
+    - The interviewee(s) - providing answers, sharing expertise/opinions
+
+    Return ONLY a simple mapping in this exact format (one per line):
+    SPEAKER_XX: Role - Brief description
+
+    For example:
+    SPEAKER_00: Interviewer - Michael from the voice branding team
+    SPEAKER_01: Interviewee - Head of Digital Design
+
+    <transcript>
+    {transcript_raw[:4000]}
+    </transcript>
+    """
+
+    # The cell that calls the model checks this button's `.value` before running.
+    infer_roles_button = mo.ui.run_button(label="Infer Speaker Roles")
+    # Last expression of the cell: the button is what gets displayed.
+    infer_roles_button
+    return infer_roles_button, role_inference_prompt
+
+
+@app.cell
+def _(MODEL, client, infer_roles_button, mo, role_inference_prompt):
+    inferred_roles_text = ""
+    if infer_roles_button.value:
+        response = client.generate(model=MODEL, prompt=role_inference_prompt)
+        inferred_roles_text = response.response
+
+    mo.md(f"""
+    ### Inferred Roles
+
+    {inferred_roles_text if inferred_roles_text else "_Click 'Infer Speaker Roles' to analyze the transcript_"}
+    """)
+    return
+
+
+# Section heading cell for step 2 (confirming/editing speaker roles); static markdown only.
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""
+    ## Step 2: Confirm or Edit Speaker Roles
+
+    Review the inferred roles below and make corrections if needed.
+    """)
+    return
+
+
+@app.cell
+def _(mo, transcript_raw):
+    import re
+    # Extract unique speakers from transcript
+    speakers = sorted(set(re.findall(r'(SPEAKER_\d+):', transcript_raw)))
+
+    # Create editable text inputs for each speaker
+    role_inputs = {
+        speaker: mo.ui.text(
+            value=f"{speaker}",
+            label=speaker,
+            full_width=True
+        )
+        for speaker in speakers
+    }
+
+    mo.md("### Edit Speaker Labels\n\nEnter the name/role for each speaker:")
+    return (role_inputs,)
+
+
+@app.cell
+def _(mo, role_inputs):
+    # Display role inputs as a form
+    mo.vstack([role_inputs[k] for k in sorted(role_inputs.keys())])
+    return
+
+
+@app.cell
+def _(mo, role_inputs, transcript_raw):
+    # Apply role labels to transcript
+    labeled_transcript = transcript_raw
+    for speaker_id, input_widget in role_inputs.items():
+        if input_widget.value and input_widget.value != speaker_id:
+            labeled_transcript = labeled_transcript.replace(f"{speaker_id}:", f"{input_widget.value}:")
+
+    # Build role mapping summary
+    role_mapping = "\n".join([
+        f"- {speaker_id} → {input_widget.value}" 
+        for speaker_id, input_widget in sorted(role_inputs.items())
+    ])
+
+    mo.md(f"""
+    ### Role Mapping Applied
+
+    {role_mapping}
+    """)
+    return labeled_transcript, role_mapping
+
+
+# Section heading cell for step 3 (thematic analysis); static markdown only.
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""
+    ## Step 3: Thematic Analysis
+
+    Configure your analysis task and run the thematic analysis.
+    """)
+    return
+
+
+@app.cell
+def _(mo):
+    # Editable analysis task prompt, pre-filled with a default task description.
+    # The default text is used verbatim (no dedent here), so the leading
+    # indentation of its continuation lines is part of the initial value.
+    analysis_task_input = mo.ui.text_area(
+        value="""Perform a thematic analysis of this interview transcript.
+
+    Identify and describe:
+    1. **Key Themes** - Major topics and ideas that emerge from the conversation
+    2. **Supporting Quotes** - Direct quotes that exemplify each theme (include speaker attribution)
+    3. **Insights** - Notable observations or implications from the discussion
+
+    Focus on themes related to:
+    - Brand voice and tone strategy
+    - Customer experience priorities
+    - Design system and consistency
+    - AI/conversational interface considerations""",
+        label="Analysis Task",
+        full_width=True,
+        rows=12
+    )
+    # Last expression of the cell: the text area is what gets displayed; its
+    # `.value` is read by the cell that assembles the full analysis prompt.
+    analysis_task_input
+    return (analysis_task_input,)
+
+
+@app.cell
+def _(analysis_task_input, labeled_transcript, mo, role_mapping):
+    # Build full analysis prompt
+    full_analysis_prompt = f"""You are an expert qualitative researcher specializing in thematic analysis of interview data.
+
+    ## Speaker Roles
+    {role_mapping}
+
+    ## Task
+    {analysis_task_input.value}
+
+    ## Interview Transcript
+
+    '''
+
+    <transcript>
+    {labeled_transcript}
+    </transcript>
+
+    '''
+
+    Provide your analysis in well-structured markdown format."""
+
+    run_analysis_button = mo.ui.run_button(label="Run Thematic Analysis")
+
+    mo.vstack([
+        mo.md(f"**Prompt length:** ~{len(full_analysis_prompt.split())} words"),
+        run_analysis_button
+    ])
+    return full_analysis_prompt, run_analysis_button
+
+
+@app.cell
+def _(full_analysis_prompt, mo):
+    mo.md(rf"""
+    # Full Analysis Prompt
+
+    ---
+
+    {full_analysis_prompt}
+    """)
+    return
+
+
+@app.cell
+def _(MODEL, client, full_analysis_prompt, mo, run_analysis_button):
+    analysis_response = ""
+    if run_analysis_button.value:
+        response_2 = client.generate(model=MODEL, prompt=full_analysis_prompt)
+        analysis_response = response_2.response
+
+    mo.md(f"""
+    ## Analysis Results
+
+    {analysis_response if analysis_response else "_Click 'Run Thematic Analysis' to generate analysis_"}
+    """)
+    return
+
+
+# Script entry point: call app.run() only when this file is executed directly.
+if __name__ == "__main__":
+    app.run()