basic parsing working

2025-12-11 12:56:23 +01:00
parent b023d44934
commit e576f98cce
10 changed files with 411 additions and 183 deletions
--- a/utils/transcript_utils.py
+++ b/utils/transcript_utils.py
@@ -0,0 +1,54 @@
+
+from pathlib import Path
+import re
+
+def load_srt(path: str | Path) -> str:
+    """Load an SRT file and return a clean, speaker-labelled transcript.
+
+    Each cue is expected to hold a sequence number, a timestamp line and
+    one or more content lines of the form ``speaker|text``. Content with
+    no ``|`` is attributed to ``UNKNOWN``.
+
+    Args:
+        path: Path to the SRT file (read as UTF-8).
+
+    Returns:
+        One ``"SPEAKER_XX: text"`` entry per turn, entries separated by a
+        blank line. Timestamps are stripped, consecutive cues from the same
+        speaker are merged and cues without text are dropped. An empty
+        string is returned when the file has no usable cues.
+    """
+    content = Path(path).read_text(encoding='utf-8')
+
+    # Cues are separated by blank lines. Tolerate stray spaces/tabs on the
+    # separator line (\s*): otherwise neighbouring cues fuse and the next
+    # cue's sequence number and timestamp leak into the transcript.
+    blocks = re.split(r'\n\s*\n', content.strip())
+
+    merged: list[tuple[str, str]] = []
+    for block in blocks:
+        lines = block.strip().split('\n')
+        if len(lines) < 3:
+            continue  # incomplete cue: needs number, timestamp and content
+
+        # Line 0 is the sequence number and line 1 the timestamp; everything
+        # after that is content, re-joined into a single line.
+        text = ' '.join(line.strip() for line in lines[2:])
+
+        # Split "speaker|text" on the first "|" only, so the text may contain
+        # further "|" characters.
+        speaker, sep, utterance = text.partition('|')
+        if sep:
+            speaker, utterance = speaker.strip() or "UNKNOWN", utterance.strip()
+        else:
+            speaker, utterance = "UNKNOWN", text.strip()
+        if not utterance:
+            continue  # nothing was said; avoid a dangling "SPEAKER: " entry
+
+        # Extend the previous turn when the speaker has not changed.
+        if merged and merged[-1][0] == speaker:
+            merged[-1] = (speaker, f"{merged[-1][1]} {utterance}")
+        else:
+            merged.append((speaker, utterance))
+
+    return '\n\n'.join(f"{speaker}: {utterance}" for speaker, utterance in merged)