from pathlib import Path import re def load_srt(path: str | Path) -> str: """Load and parse an SRT file, returning clean transcript with speaker labels. Args: path: Path to the SRT file Returns: Clean transcript string with format "SPEAKER_XX: text" per line, timestamps stripped, consecutive lines from same speaker merged. """ path = Path(path) content = path.read_text(encoding='utf-8') # Parse SRT blocks: sequence number, timestamp, speaker|text # Pattern matches: number, timestamp line, content line(s) blocks = re.split(r'\n\n+', content.strip()) turns = [] for block in blocks: lines = block.strip().split('\n') if len(lines) < 3: continue # Skip sequence number (line 0) and timestamp (line 1) # Content is line 2 onwards text_lines = lines[2:] text = ' '.join(text_lines) # Parse speaker|text format if '|' in text: speaker, utterance = text.split('|', 1) speaker = speaker.strip() utterance = utterance.strip() else: speaker = "UNKNOWN" utterance = text.strip() turns.append((speaker, utterance)) # Merge consecutive turns from same speaker merged = [] for speaker, utterance in turns: if merged and merged[-1][0] == speaker: merged[-1] = (speaker, merged[-1][1] + ' ' + utterance) else: merged.append((speaker, utterance)) # Format as "SPEAKER_XX: text" transcript_lines = [f"{speaker}: {utterance}" for speaker, utterance in merged] return '\n\n'.join(transcript_lines)