basic parsing working
This commit is contained in:
54
utils/transcript_utils.py
Normal file
54
utils/transcript_utils.py
Normal file
@@ -0,0 +1,54 @@
|
||||
|
||||
from pathlib import Path
|
||||
import re
|
||||
|
||||
def load_srt(path: str | Path) -> str:
|
||||
"""Load and parse an SRT file, returning clean transcript with speaker labels.
|
||||
|
||||
Args:
|
||||
path: Path to the SRT file
|
||||
|
||||
Returns:
|
||||
Clean transcript string with format "SPEAKER_XX: text" per line,
|
||||
timestamps stripped, consecutive lines from same speaker merged.
|
||||
"""
|
||||
path = Path(path)
|
||||
content = path.read_text(encoding='utf-8')
|
||||
|
||||
# Parse SRT blocks: sequence number, timestamp, speaker|text
|
||||
# Pattern matches: number, timestamp line, content line(s)
|
||||
blocks = re.split(r'\n\n+', content.strip())
|
||||
|
||||
turns = []
|
||||
for block in blocks:
|
||||
lines = block.strip().split('\n')
|
||||
if len(lines) < 3:
|
||||
continue
|
||||
|
||||
# Skip sequence number (line 0) and timestamp (line 1)
|
||||
# Content is line 2 onwards
|
||||
text_lines = lines[2:]
|
||||
text = ' '.join(text_lines)
|
||||
|
||||
# Parse speaker|text format
|
||||
if '|' in text:
|
||||
speaker, utterance = text.split('|', 1)
|
||||
speaker = speaker.strip()
|
||||
utterance = utterance.strip()
|
||||
else:
|
||||
speaker = "UNKNOWN"
|
||||
utterance = text.strip()
|
||||
|
||||
turns.append((speaker, utterance))
|
||||
|
||||
# Merge consecutive turns from same speaker
|
||||
merged = []
|
||||
for speaker, utterance in turns:
|
||||
if merged and merged[-1][0] == speaker:
|
||||
merged[-1] = (speaker, merged[-1][1] + ' ' + utterance)
|
||||
else:
|
||||
merged.append((speaker, utterance))
|
||||
|
||||
# Format as "SPEAKER_XX: text"
|
||||
transcript_lines = [f"{speaker}: {utterance}" for speaker, utterance in merged]
|
||||
return '\n\n'.join(transcript_lines)
|
||||
Reference in New Issue
Block a user