Files
Interview-Analysis/Thematic_Analysis.py
2025-12-01 22:17:57 +01:00

451 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import marimo

# marimo version that generated this notebook file.
__generated_with = "0.18.0"
# The notebook app object; each @app.cell below registers one reactive cell.
app = marimo.App(width="medium")
@app.cell
def _():
    # Shared imports and LLM configuration for the whole notebook.
    import marimo as mo
    from pathlib import Path
    from utils import connect_qumo_ollama, load_srt

    VM_NAME = 'hiperf-gpu'
    MODEL = 'llama3.3:70b'
    # Fix: `client` was commented out but later cells take it as a parameter
    # (role inference, theme labelling, final analysis); without it the
    # notebook fails with an undefined-name error. Connect once and share.
    client = connect_qumo_ollama(VM_NAME)
    return MODEL, Path, client, load_srt, mo
@app.cell(hide_code=True)
def _(mo):
    # Display-only cell: notebook title and short description.
    mo.md(r"""
# Interview Transcript Thematic Analysis
This notebook loads interview transcripts (SRT files) and runs thematic analysis using LLMs.
""")
    return
@app.cell
def _(Path, mo):
    # Discover the available SRT transcripts and present them in a dropdown.
    transcript_dir = Path("data/transcripts")
    name_to_path = {p.name: str(p) for p in transcript_dir.glob("*.srt")}
    file_dropdown = mo.ui.dropdown(
        options=name_to_path,
        label="Select transcript file"
    )
    file_dropdown
    return (file_dropdown,)
@app.cell
def _(file_dropdown, load_srt, mo):
    # Load the selected transcript; stays empty until a file is chosen.
    transcript_raw = load_srt(file_dropdown.value) if file_dropdown.value else ""
    # Render a collapsible preview with basic size statistics.
    mo.md(f"""
## Transcript Preview
**File:** `{file_dropdown.value or 'None selected'}`
**Length:** {len(transcript_raw)} characters, ~{len(transcript_raw.split())} words
<details>
<summary>Show first 2000 characters</summary>
```
{transcript_raw[:2000]}...
```
</details>
""")
    return (transcript_raw,)
@app.cell(hide_code=True)
def _(mo):
    # Display-only cell: heading for the role-inference step.
    mo.md(r"""
## Step 1: Infer Speaker Roles
The model will analyze the transcript to identify who is the interviewer and who is the interviewee.
""")
    return
@app.cell
def _(mo, transcript_raw):
    # Build the role-inference prompt from the first 4000 characters of the
    # transcript — enough context to identify speakers while bounding prompt
    # size. The LLM call itself happens in the next cell.
    role_inference_prompt = f"""Analyze this interview transcript and identify the role of each speaker.
Based on the conversation context, determine who is:
- The interviewer(s) - asking questions, guiding the conversation
- The interviewee(s) - providing answers, sharing expertise/opinions
Return ONLY a simple mapping in this exact format (one per line):
SPEAKER_XX: Role - Brief description
For example:
SPEAKER_00: Interviewer - Michael from the voice branding team
SPEAKER_01: Interviewee - Head of Digital Design
<transcript>
{transcript_raw[:4000]}
</transcript>
"""
    # Button that triggers the inference cell below.
    infer_roles_button = mo.ui.run_button(label="Infer Speaker Roles")
    infer_roles_button
    return infer_roles_button, role_inference_prompt
@app.cell
def _(MODEL, client, infer_roles_button, mo, role_inference_prompt):
    # Call the LLM only after the button has been clicked; otherwise show a
    # hint instead of results.
    if infer_roles_button.value:
        inferred_roles_text = client.generate(
            model=MODEL, prompt=role_inference_prompt
        ).response
    else:
        inferred_roles_text = ""
    mo.md(f"""
### Inferred Roles
{inferred_roles_text if inferred_roles_text else "_Click 'Infer Speaker Roles' to analyze the transcript_"}
""")
    return
@app.cell(hide_code=True)
def _(mo):
    # Display-only cell: heading for the role-confirmation step.
    mo.md(r"""
## Step 2: Confirm or Edit Speaker Roles
Review the inferred roles below and make corrections if needed.
""")
    return
@app.cell
def _(mo, transcript_raw):
    import re

    # Collect the unique speaker tags (e.g. "SPEAKER_00") in sorted order.
    speaker_ids = sorted(set(re.findall(r'(SPEAKER_\d+):', transcript_raw)))
    # One editable text field per speaker, pre-filled with the raw tag.
    role_inputs = {}
    for sid in speaker_ids:
        role_inputs[sid] = mo.ui.text(
            value=sid,
            label=sid,
            full_width=True
        )
    mo.md("### Edit Speaker Labels\n\nEnter the name/role for each speaker:")
    return (role_inputs,)
@app.cell
def _(mo, role_inputs):
    # Stack the per-speaker text inputs vertically, ordered by speaker id.
    mo.vstack([widget for _sid, widget in sorted(role_inputs.items())])
    return
@app.cell
def _(mo, role_inputs, transcript_raw):
    # Substitute the user-provided labels for the raw SPEAKER_XX tags.
    labeled_transcript = transcript_raw
    for speaker_id, input_widget in role_inputs.items():
        # Only rewrite tags the user actually changed.
        if input_widget.value and input_widget.value != speaker_id:
            labeled_transcript = labeled_transcript.replace(
                f"{speaker_id}:", f"{input_widget.value}:"
            )
    # Human-readable summary of the mapping.
    # Fix: the original line had no separator between the speaker id and its
    # label (a Unicode arrow was lost); use an explicit ": " separator.
    role_mapping = "\n".join([
        f"- {speaker_id}: {input_widget.value}"
        for speaker_id, input_widget in sorted(role_inputs.items())
    ])
    mo.md(f"""
### Role Mapping Applied
{role_mapping}
""")
    return labeled_transcript, role_mapping
@app.cell(hide_code=True)
def _(mo):
    # Display-only cell: heading for the thematic-analysis step.
    mo.md(r"""
## Step 3: Thematic Analysis
Configure your analysis task and run the thematic analysis.
""")
    return
@app.cell
def _(mo):
    # Step 3a: free-text area defining the themes (one per line) used to
    # label the transcript.
    themes_input = mo.ui.text_area(
        value="""brand voice and tone
customer experience priorities
design system and consistency
AI and conversational interfaces""",
        label="Themes (one per line)",
        full_width=True,
        rows=6,
    )
    # Fix: only a cell's last expression is rendered by marimo, so the
    # instructional markdown was silently discarded; stack it with the
    # widget so both appear.
    mo.vstack([
        mo.md("""### Step 3a: Define Themes
Enter one theme per line. These will be used to
label each interview transcript. Themes may overlap; the
same section can relate to multiple themes.
"""),
        themes_input,
    ])
    return (themes_input,)
@app.cell
def _(themes_input):
    # Normalise the textarea contents into a clean list of theme names,
    # skipping blank lines and surrounding whitespace.
    theme_list = []
    if themes_input and themes_input.value:
        for line in themes_input.value.splitlines():
            theme = line.strip()
            if theme:
                theme_list.append(theme)
    return (theme_list,)
@app.cell
def _(Path, mo):
    # Directory that receives one JSON file per (interview, theme) pair.
    OUTPUT_DIR = Path("data/labels")
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    label_button = mo.ui.run_button(label="Run Theme Labelling for This Interview")
    # Fix: only a cell's last expression is rendered by marimo, so the
    # explanatory markdown was silently discarded; stack it with the button.
    mo.vstack([
        mo.md(f"""### Step 3b: LLM-based Theme Labelling
This step runs an LLM over the current interview transcript
for each defined theme and saves one JSON file per theme
for this interview in `{OUTPUT_DIR}`.
For each theme, the model will return full sections of the
conversation (multi-sentence chunks, not just short quotes)
that are about that theme.
"""),
        label_button,
    ])
    return OUTPUT_DIR, label_button
@app.cell
def _(
    MODEL,
    OUTPUT_DIR,
    Path,
    client,
    file_dropdown,
    label_button,
    labeled_transcript,
    mo,
    theme_list,
):
    # Fix: the parameter list previously named `OUTPUT_THEME_DIR` and
    # `theme_label_button`, but the producing cell returns `OUTPUT_DIR` and
    # `label_button` — marimo would report undefined names.
    import json
    from datetime import datetime, timezone

    # One parsed/normalised result dict per theme, keyed by theme name.
    theme_label_results = {}
    if label_button.value and file_dropdown.value and theme_list:
        interview_id = Path(file_dropdown.value).stem
        for theme in theme_list:
            # NOTE: the transcript is wrapped in <transcript> tags (matching
            # the role-inference prompt) — the bare triple quotes previously
            # used here terminated this f-string early and broke the cell.
            prompt = f"""You are an expert qualitative researcher.
You will analyse a single interview transcript for ONE specific theme.
Theme: "{theme}"
Tasks:
1. Decide if the theme is present in this interview.
2. If present, estimate how relevant it is on a 0-1 scale
where 0 = not mentioned, 0.5 = moderately important,
1 = central theme of the interview.
3. Identify all sections of the conversation that are
primarily about this theme. A section can span multiple
consecutive utterances and should form a coherent piece
of the dialogue about the theme, not just a single
sentence.
Each section should include:
- the dominant speaker label (or "mixed" if multiple)
- the full section text (one or more sentences)
Return your answer ONLY as a JSON object with this schema:
{{
"theme": string, // the theme name
"present": bool, // whether the theme appears
"relevance": float, // 0.0-1.0
"sections": [
{{
"speaker": string, // main speaker label for the section
"section_text": string // full section text about the theme
}}
]
}}
Transcript:
<transcript>
{labeled_transcript}
</transcript>
"""
            response = client.generate(model=MODEL, prompt=prompt)
            raw_text = response.response.strip()
            try:
                parsed = json.loads(raw_text)
            except json.JSONDecodeError:
                # Fallback: try to extract JSON between the outermost braces
                # (models often wrap JSON in prose or code fences).
                try:
                    start = raw_text.index("{")
                    end = raw_text.rindex("}") + 1
                    parsed = json.loads(raw_text[start:end])
                except Exception:
                    # Keep a schema-shaped placeholder so downstream code
                    # never crashes; preserve the raw output for debugging.
                    parsed = {
                        "theme": theme,
                        "present": False,
                        "relevance": 0.0,
                        "sections": [],
                        "_parse_error": True,
                        "_raw": raw_text,
                    }
            # Normalise fields so consumers can rely on the schema.
            parsed["theme"] = parsed.get("theme", theme)
            parsed["present"] = bool(parsed.get("present", False))
            try:
                parsed["relevance"] = float(parsed.get("relevance", 0.0))
            except (TypeError, ValueError):
                parsed["relevance"] = 0.0
            if not isinstance(parsed.get("sections"), list):
                parsed["sections"] = []
            theme_label_results[theme] = parsed
            # Write one JSON file per (interview, theme) pair.
            out_path = OUTPUT_DIR / f"{interview_id}__{theme.replace(' ', '_')}.json"
            out_data = {
                "interview_id": interview_id,
                "theme": parsed["theme"],
                "present": parsed["present"],
                "relevance": parsed["relevance"],
                "sections": parsed["sections"],
                # Timezone-aware replacement for the deprecated
                # datetime.utcnow(); keeps the trailing "Z" format.
                "generated_at": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
            }
            out_path.write_text(
                json.dumps(out_data, ensure_ascii=False, indent=2),
                encoding="utf-8",
            )
    # Status message reflecting what (if anything) was labelled.
    if label_button.value:
        if not file_dropdown.value:
            status = "No transcript selected."
        elif not theme_list:
            status = "No themes defined. Please add at least one theme."
        else:
            status = f"Labelled {len(theme_label_results)} themes for current interview. JSON files written to '{OUTPUT_DIR}'."
    else:
        status = "Click 'Run Theme Labelling for This Interview' to start."
    mo.md(f"""### Theme Labelling Status
{status}
""")
    # Fix: marimo cells return a tuple of exported names.
    return (theme_label_results,)
@app.cell
def _(mo):
    # Editable analysis-task prompt; the default instructs the model to
    # extract themes, supporting quotes, and insights for four focus areas.
    analysis_task_input = mo.ui.text_area(
        value="""Perform a thematic analysis of this interview transcript.
Identify and describe:
1. **Key Themes** - Major topics and ideas that emerge from the conversation
2. **Supporting Quotes** - Direct quotes that exemplify each theme (include speaker attribution)
3. **Insights** - Notable observations or implications from the discussion
Focus on themes related to:
- Brand voice and tone strategy
- Customer experience priorities
- Design system and consistency
- AI/conversational interface considerations""",
        label="Analysis Task",
        full_width=True,
        rows=12
    )
    analysis_task_input
    return (analysis_task_input,)
@app.cell
def _(analysis_task_input, labeled_transcript, mo, role_mapping):
    # Assemble the final prompt: researcher framing + confirmed speaker
    # roles + the user-editable task + the fully labeled transcript.
    full_analysis_prompt = f"""You are an expert qualitative researcher specializing in thematic analysis of interview data.
## Speaker Roles
{role_mapping}
## Task
{analysis_task_input.value}
## Interview Transcript
'''
<transcript>
{labeled_transcript}
</transcript>
'''
Provide your analysis in well-structured markdown format."""
    run_analysis_button = mo.ui.run_button(label="Run Thematic Analysis")
    # Show the approximate prompt size next to the run button.
    mo.vstack([
        mo.md(f"**Prompt length:** ~{len(full_analysis_prompt.split())} words"),
        run_analysis_button
    ])
    return full_analysis_prompt, run_analysis_button
@app.cell
def _(full_analysis_prompt, mo):
    # Render the assembled prompt so it can be reviewed before running.
    mo.md(rf"""
# Full Analysis Prompt
---
{full_analysis_prompt}
""")
    return
@app.cell
def _(MODEL, client, full_analysis_prompt, mo, run_analysis_button):
    # Execute the thematic analysis once the run button is pressed;
    # otherwise display a hint in place of results.
    if run_analysis_button.value:
        analysis_response = client.generate(
            model=MODEL, prompt=full_analysis_prompt
        ).response
    else:
        analysis_response = ""
    mo.md(f"""
## Analysis Results
{analysis_response if analysis_response else "_Click 'Run Thematic Analysis' to generate analysis_"}
""")
    return
if __name__ == "__main__":
    # Launch the marimo app when this file is executed as a script.
    app.run()