# Interview-Analysis/utils.py

"""
Standard utils for this repository
"""

import re
from pathlib import Path

import requests
from ollama import Client


def load_srt(path: str | Path) -> str:
    """Load and parse an SRT file, returning clean transcript with speaker labels.

    Args:
        path: Path to the SRT file

    Returns:
        Clean transcript string with format "SPEAKER_XX: text" per line,
        timestamps stripped, consecutive lines from same speaker merged.
    """
    path = Path(path)
    content = path.read_text(encoding='utf-8')

    # Parse SRT blocks: sequence number, timestamp, speaker|text
    # Pattern matches: number, timestamp line, content line(s)
    blocks = re.split(r'\n\n+', content.strip())

    turns = []
    for block in blocks:
        lines = block.strip().split('\n')
        if len(lines) < 3:
            continue

        # Skip sequence number (line 0) and timestamp (line 1)
        # Content is line 2 onwards
        text_lines = lines[2:]
        text = ' '.join(text_lines)

        # Parse speaker|text format
        if '|' in text:
            speaker, utterance = text.split('|', 1)
            speaker = speaker.strip()
            utterance = utterance.strip()
        else:
            speaker = "UNKNOWN"
            utterance = text.strip()

        turns.append((speaker, utterance))

    # Merge consecutive turns from same speaker
    merged = []
    for speaker, utterance in turns:
        if merged and merged[-1][0] == speaker:
            merged[-1] = (speaker, merged[-1][1] + ' ' + utterance)
        else:
            merged.append((speaker, utterance))

    # Format as "SPEAKER_XX: text"
    transcript_lines = [f"{speaker}: {utterance}" for speaker, utterance in merged]
    return '\n\n'.join(transcript_lines)


def connect_qumo_ollama(vm_name: str = 'ollama-lite') -> Client:
    """Establish a connection to a Qumo Ollama instance.

    The Ollama HTTP endpoint is probed first (5 second timeout) so that an
    unreachable VM is reported with a helpful hint. On success the WebUI
    address and the models offered by the server are printed.

    Args:
        vm_name: Name of the VM running the Ollama instance
            ('ollama-lite' or 'hiperf-gpu').

    Returns:
        Ollama client connected to the specified VM.

    Raises:
        requests.ConnectionError: If the Ollama endpoint cannot be reached.
        requests.Timeout: If the endpoint does not answer within the timeout.
    """
    ollama_url = f'http://{vm_name}.tail44fa00.ts.net:11434'

    try:
        requests.get(ollama_url, timeout=5)
    except (requests.ConnectionError, requests.Timeout):
        print(f"Failed to reach {ollama_url}. Check that the VM is running and Tailscale is up")
        # Re-raise instead of falling through: there is no usable client, and
        # continuing would report success and then fail on an unbound name.
        raise

    client = Client(host=ollama_url)

    print(f"Connection succesful. WebUI available at: http://{vm_name}.tail44fa00.ts.net:3000\nAvailable models:")
    for m in client.list().models:
        print(f"  - '{m.model}' ")
    return client