thematic analysis opzetje

This commit is contained in:
2025-12-01 15:09:16 +01:00
parent 74aecff2bd
commit 9499d6c068
4 changed files with 331 additions and 7 deletions

View File

@@ -2,11 +2,66 @@
Standard utils for this repository
"""
import re
from pathlib import Path
import requests
import ollama
from ollama import Client
def load_srt(path: str | Path) -> str:
"""Load and parse an SRT file, returning clean transcript with speaker labels.
Args:
path: Path to the SRT file
Returns:
Clean transcript string with format "SPEAKER_XX: text" per line,
timestamps stripped, consecutive lines from same speaker merged.
"""
path = Path(path)
content = path.read_text(encoding='utf-8')
# Parse SRT blocks: sequence number, timestamp, speaker|text
# Pattern matches: number, timestamp line, content line(s)
blocks = re.split(r'\n\n+', content.strip())
turns = []
for block in blocks:
lines = block.strip().split('\n')
if len(lines) < 3:
continue
# Skip sequence number (line 0) and timestamp (line 1)
# Content is line 2 onwards
text_lines = lines[2:]
text = ' '.join(text_lines)
# Parse speaker|text format
if '|' in text:
speaker, utterance = text.split('|', 1)
speaker = speaker.strip()
utterance = utterance.strip()
else:
speaker = "UNKNOWN"
utterance = text.strip()
turns.append((speaker, utterance))
# Merge consecutive turns from same speaker
merged = []
for speaker, utterance in turns:
if merged and merged[-1][0] == speaker:
merged[-1] = (speaker, merged[-1][1] + ' ' + utterance)
else:
merged.append((speaker, utterance))
# Format as "SPEAKER_XX: text"
transcript_lines = [f"{speaker}: {utterance}" for speaker, utterance in merged]
return '\n\n'.join(transcript_lines)
def connect_qumo_ollama(vm_name: str ='ollama-lite') -> Client:
"""Establish connection to Qumo Ollama instance
@@ -25,7 +80,7 @@ def connect_qumo_ollama(vm_name: str ='ollama-lite') -> Client:
except requests.ConnectionError:
print(f"Failed to reach {QUMO_OLLAMA_URL}. Check that the VM is running and Tailscale is up")
print("Connection succesful.\nAvailable models:")
print(f"Connection succesful. WebUI available at: http://{vm_name}.tail44fa00.ts.net:3000\nAvailable models:")
for m in client.list().models:
print(f" - '{m.model}' ")
return client