# Interview-Analysis/utils/transcript_utils.py


from pathlib import Path
import re

def load_srt(path: str | Path) -> str:
    """Load and parse an SRT file, returning clean transcript with speaker labels.

    Args:
        path: Path to the SRT file

    Returns:
        Clean transcript string with format "SPEAKER_XX: text" per line,
        timestamps stripped, consecutive lines from same speaker merged.
    """
    path = Path(path)
    content = path.read_text(encoding='utf-8')

    # Parse SRT blocks: sequence number, timestamp, speaker|text
    # Pattern matches: number, timestamp line, content line(s)
    blocks = re.split(r'\n\n+', content.strip())

    turns = []
    for block in blocks:
        lines = block.strip().split('\n')
        if len(lines) < 3:
            continue

        # Skip sequence number (line 0) and timestamp (line 1)
        # Content is line 2 onwards
        text_lines = lines[2:]
        text = ' '.join(text_lines)

        # Parse speaker|text format
        if '|' in text:
            speaker, utterance = text.split('|', 1)
            speaker = speaker.strip()
            utterance = utterance.strip()
        else:
            speaker = "UNKNOWN"
            utterance = text.strip()

        turns.append((speaker, utterance))

    # Merge consecutive turns from same speaker
    merged = []
    for speaker, utterance in turns:
        if merged and merged[-1][0] == speaker:
            merged[-1] = (speaker, merged[-1][1] + ' ' + utterance)
        else:
            merged.append((speaker, utterance))

    # Format as "SPEAKER_XX: text"
    transcript_lines = [f"{speaker}: {utterance}" for speaker, utterance in merged]
    return '\n\n'.join(transcript_lines)