basic parsing working
This commit is contained in:
4
utils/__init__.py
Normal file
4
utils/__init__.py
Normal file
@@ -0,0 +1,4 @@
|
||||
from .ollama_utils import connect_qumo_ollama
|
||||
from .data_utils import create_sentiment_matrix, extract_theme
|
||||
from .transcript_utils import load_srt
|
||||
from .sentiment_analysis import dummy_sentiment_analysis, ollama_sentiment_analysis
|
||||
65
utils/data_utils.py
Normal file
65
utils/data_utils.py
Normal file
@@ -0,0 +1,65 @@
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def create_sentiment_matrix(doc_df, column_prefix='VT - |CT - ', row_prefix='_V-|_C-'):
    """
    Build a context-by-tag matrix of summed sentiment scores.

    Parameters:
    - doc_df: DataFrame providing the columns 'tag', '_context' and 'sentiment'
    - column_prefix: pattern given to `str.contains` to keep rows by their 'tag'
    - row_prefix: pattern given to `str.contains` to keep rows by their '_context'

    Returns:
    - DataFrame indexed by '_context' with one column per 'tag', each cell
      holding the summed 'sentiment' for that pair. An empty DataFrame is
      returned (after printing a notice) when either filter keeps no rows.
    """
    # Keep only rows whose tag matches the theme pattern (VT-/CT- by default)
    tag_mask = doc_df['tag'].str.contains(column_prefix, na=False)
    tagged = doc_df[tag_mask].copy()
    if tagged.empty:
        print("No sentiment data found")
        return pd.DataFrame()

    # Of those, keep only rows with a matching, non-null Voice/Character context
    context = tagged['_context']
    context_mask = context.notna() & (context.str.contains(row_prefix, na=False))
    scoped = tagged[context_mask].copy()
    if scoped.empty:
        print("No Voice/Character context found")
        return pd.DataFrame()

    # Sum sentiment per (context, tag) pair, then spread tags across columns
    totals = scoped.groupby(['_context', 'tag'])['sentiment'].sum().reset_index()
    return totals.pivot(index='_context', columns='tag', values='sentiment')
|
||||
|
||||
|
||||
|
||||
def extract_theme(tag: str, theme_prefixes='VT - |CT - ') -> str:
|
||||
"""
|
||||
Extract the theme from a tag string.
|
||||
|
||||
Parameters:
|
||||
- tag: str, the tag string (e.g., 'VT - Personal Experience')
|
||||
- theme_prefixes: str, prefixes to remove from the tag (e.g., 'VT - |CT - ')
|
||||
|
||||
Returns:
|
||||
- str, the extracted theme (e.g., 'Personal Experience')
|
||||
- None if no theme found
|
||||
"""
|
||||
for prefix in theme_prefixes.split('|'):
|
||||
if tag.startswith(prefix):
|
||||
return tag.replace(prefix, '').strip()
|
||||
return None
|
||||
|
||||
42
utils/ollama_utils.py
Normal file
42
utils/ollama_utils.py
Normal file
@@ -0,0 +1,42 @@
|
||||
|
||||
|
||||
|
||||
import requests
|
||||
from ollama import Client
|
||||
|
||||
|
||||
|
||||
|
||||
def connect_qumo_ollama(vm_name: str = 'ollama-lite', port='11434', print_models=True) -> tuple[Client | None, list[str] | None]:
    """Establish connection to Qumo Ollama instance

    vm_name: str ('ollama-lite' or 'hiperf-gpu')
        Name of the VM running the Ollama instance. 'localhost' and '0.0.0.0'
        are used as the host directly; any other name is resolved under the
        tail44fa00.ts.net domain.
    port: str or int
        Port the Ollama API listens on.
    print_models: bool
        When True, print the name of every model the instance reports.

    Returns:
        tuple(Client, list[str]): Ollama client connected to the specified VM
        and the names of its available models, or (None, None) when the
        instance does not answer within 5 seconds.
    """
    if vm_name in ['localhost', '0.0.0.0']:
        host = vm_name
    else:
        host = f'{vm_name}.tail44fa00.ts.net'

    QUMO_OLLAMA_URL = f'http://{host}:{port}'
    # Built from the host rather than by text-replacing the port inside the
    # URL, which breaks for an int port or a hostname containing the digits.
    webui_url = f'http://{host}:3000'

    # Probe reachability first. Timeout is caught too: a read timeout is not a
    # subclass of requests.ConnectionError and would otherwise propagate.
    try:
        requests.get(QUMO_OLLAMA_URL, timeout=5)
    except (requests.ConnectionError, requests.Timeout):
        print(f"Failed to reach {QUMO_OLLAMA_URL}. Check that the VM is running and Tailscale is up")
        return None, None

    client = Client(host=QUMO_OLLAMA_URL)
    print(f"Connection succesful. WebUI available at: {webui_url}")

    models = [m.model for m in client.list().models]
    if print_models:
        print("Available models:")
        for m in models:
            print(f" - '{m}' ")
    return client, models
|
||||
128
utils/sentiment_analysis.py
Normal file
128
utils/sentiment_analysis.py
Normal file
@@ -0,0 +1,128 @@
|
||||
import random
|
||||
import pandas as pd
|
||||
|
||||
from ollama import Client
|
||||
import json
|
||||
|
||||
def dummy_sentiment_analysis(content, tag):
    """Return a placeholder (sentiment, reason) pair without calling a model.

    Tags starting with 'VT -' or 'CT -' get a random score from {-1, 0, 1};
    every other tag gets the fixed ('test', 'not applicable') pair. `content`
    is accepted but not used.
    """
    is_theme_tag = tag.startswith(('VT -', 'CT -'))
    if not is_theme_tag:
        return 'test', 'not applicable'

    # Random sentiment for testing
    score = random.choice([-1, 0, 1])
    return score, 'random dummy sentiment'
|
||||
|
||||
|
||||
|
||||
def ollama_sentiment_analysis(content, theme, client: Client, model) -> tuple[list[str], int, str]:
|
||||
"""
|
||||
Perform sentiment analysis using Ollama model.
|
||||
|
||||
Parameters:
|
||||
- content: Text content to analyze
|
||||
- tag: Tag indicating the type of sentiment analysis (e.g., 'VT - Positive')
|
||||
|
||||
Returns:
|
||||
- sentiment score and reason
|
||||
"""
|
||||
prompt = f"""
|
||||
# Instructions
|
||||
You are an expert in sentiment analysis and natural language processing. You are given a quote from an interview along with a theme tag. Your task is to analyze the sentiment expressed in the quote in relation to the provided theme, and provide a short explanation for your assessment (max 10 words).
|
||||
|
||||
You need to deliver three pieces of information:
|
||||
1. A list of keywords from the quote quantify or qualify the theme, and that influenced your sentiment analysis (if any).
|
||||
2. A sentiment score: -1 for negative, 0 for neutral, and 1 for positive sentiment.
|
||||
3. A brief reason (max 10 words) explaining your sentiment score.
|
||||
|
||||
|
||||
# Guidelines
|
||||
Keywords should be directly relevant to the theme.
|
||||
|
||||
The reason should be extremely concise and to the point:
|
||||
- Does not need to be a full sentence.
|
||||
- Sentiment itself does not need to be stated in the explanation.
|
||||
- If keywords are present in the quote that directly capture the sentiment, give that as the reason..
|
||||
|
||||
|
||||
# Input
|
||||
|
||||
Theme: `{theme}`
|
||||
|
||||
Quote:
|
||||
```
|
||||
{content}
|
||||
```
|
||||
|
||||
# Response Format
|
||||
Provide your response in the following JSON format:
|
||||
{{
|
||||
"keywords": ["<list_of_relevant_keywords_if_any>"],
|
||||
"sentiment": <sentiment_score>,
|
||||
"reason": "<brief_explanation_max_10_words>"
|
||||
}}
|
||||
|
||||
|
||||
# Examples
|
||||
|
||||
** Example 1**
|
||||
- Theme: `Speed`
|
||||
- Quote: `It just was a little toned down. It was almost like he was talking like this. You know? It almost kind of this was a little slow for me.`
|
||||
|
||||
- Response: {{"keywords": ["slow"], "sentiment": -1, "reason": "States speed is slow, indicates dissatisfaction"}}
|
||||
|
||||
** Example 2**
|
||||
- Theme: `Friendliness / Empathy`
|
||||
- Quote: `Sound very welcoming`
|
||||
|
||||
- Response: {{ "keywords": ["welcoming"], "sentiment": 1, "reason": "Uses 'welcoming'" }}
|
||||
|
||||
"""
|
||||
|
||||
resp = client.generate(
|
||||
model=model,
|
||||
prompt=prompt,
|
||||
)
|
||||
|
||||
try:
|
||||
response_text = resp.response.strip()
|
||||
|
||||
# Extract JSON from response
|
||||
start_index = response_text.find('{')
|
||||
end_index = response_text.rfind('}') + 1
|
||||
json_str = response_text[start_index:end_index]
|
||||
|
||||
response_json = json.loads(json_str)
|
||||
keywords = response_json.get('keywords', [])
|
||||
sentiment = response_json.get('sentiment', 'test')
|
||||
reason = response_json.get('reason', 'no reason provided')
|
||||
return keywords, sentiment, reason
|
||||
except Exception as e:
|
||||
print(f"Error parsing response: {e}")
|
||||
return [], None, 'parsing error'
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test against a locally running Ollama instance.
    client = Client(
        host="http://localhost:11434"
    )

    sentiment_df = pd.DataFrame({
        'content': [
            "I love this product!",
            "This is the worst service ever.",
            "It's okay, not great but not terrible."
        ],
        'tag': [
            'VT - Personal Experience',
            'VT - Personal Experience',
            'VT - Personal Experience'
        ],
        'manual_analysis': [False, False, True]
    })

    # ollama_sentiment_analysis returns (keywords, sentiment, reason), so three
    # target columns are required; assigning the three-column result to only
    # two raises "Columns must be same length as key". Rows flagged for manual
    # analysis are skipped and end up with missing values in these columns.
    sentiment_df[['keywords', 'sentiment', 'reason']] = sentiment_df[~sentiment_df['manual_analysis']].apply(
        lambda row: pd.Series(ollama_sentiment_analysis(row['content'], row['tag'], client, model='llama3.2:latest')),
        axis=1
    )

    print(sentiment_df.head())
|
||||
|
||||
54
utils/transcript_utils.py
Normal file
54
utils/transcript_utils.py
Normal file
@@ -0,0 +1,54 @@
|
||||
|
||||
from pathlib import Path
|
||||
import re
|
||||
|
||||
def load_srt(path: str | Path) -> str:
|
||||
"""Load and parse an SRT file, returning clean transcript with speaker labels.
|
||||
|
||||
Args:
|
||||
path: Path to the SRT file
|
||||
|
||||
Returns:
|
||||
Clean transcript string with format "SPEAKER_XX: text" per line,
|
||||
timestamps stripped, consecutive lines from same speaker merged.
|
||||
"""
|
||||
path = Path(path)
|
||||
content = path.read_text(encoding='utf-8')
|
||||
|
||||
# Parse SRT blocks: sequence number, timestamp, speaker|text
|
||||
# Pattern matches: number, timestamp line, content line(s)
|
||||
blocks = re.split(r'\n\n+', content.strip())
|
||||
|
||||
turns = []
|
||||
for block in blocks:
|
||||
lines = block.strip().split('\n')
|
||||
if len(lines) < 3:
|
||||
continue
|
||||
|
||||
# Skip sequence number (line 0) and timestamp (line 1)
|
||||
# Content is line 2 onwards
|
||||
text_lines = lines[2:]
|
||||
text = ' '.join(text_lines)
|
||||
|
||||
# Parse speaker|text format
|
||||
if '|' in text:
|
||||
speaker, utterance = text.split('|', 1)
|
||||
speaker = speaker.strip()
|
||||
utterance = utterance.strip()
|
||||
else:
|
||||
speaker = "UNKNOWN"
|
||||
utterance = text.strip()
|
||||
|
||||
turns.append((speaker, utterance))
|
||||
|
||||
# Merge consecutive turns from same speaker
|
||||
merged = []
|
||||
for speaker, utterance in turns:
|
||||
if merged and merged[-1][0] == speaker:
|
||||
merged[-1] = (speaker, merged[-1][1] + ' ' + utterance)
|
||||
else:
|
||||
merged.append((speaker, utterance))
|
||||
|
||||
# Format as "SPEAKER_XX: text"
|
||||
transcript_lines = [f"{speaker}: {utterance}" for speaker, utterance in merged]
|
||||
return '\n\n'.join(transcript_lines)
|
||||
Reference in New Issue
Block a user