basic parsing working

2025-12-11 12:56:23 +01:00
parent b023d44934
commit e576f98cce
10 changed files with 411 additions and 183 deletions
--- a/utils/init.py
+++ b/utils/init.py
@@ -0,0 +1,4 @@
+from .ollama_utils import connect_qumo_ollama
+from .data_utils import create_sentiment_matrix, extract_theme
+from .transcript_utils import load_srt
+from .sentiment_analysis import dummy_sentiment_analysis, ollama_sentiment_analysis
--- a/utils/data_utils.py
+++ b/utils/data_utils.py
@@ -0,0 +1,65 @@
+import pandas as pd
+
+
+def create_sentiment_matrix(doc_df, column_prefix='VT - |CT - ', row_prefix='_V-|_C-'):
+    """
+    Build a context-by-tag matrix of summed sentiment scores.
+
+    Parameters:
+    - doc_df: DataFrame with at least the columns 'tag', '_context' and 'sentiment'
+    - column_prefix: regular expression (alternatives separated by '|'); a row is
+      kept only when its 'tag' contains a match
+    - row_prefix: regular expression; a row is kept only when its '_context'
+      contains a match
+
+    Returns:
+    - DataFrame indexed by '_context' with one column per 'tag', holding the
+      summed 'sentiment' of each pair (NaN where a pair never occurs)
+    - An empty DataFrame, after printing a notice, when no row survives
+      either filter
+    """
+
+    # Keep rows whose tag matches the sentiment prefixes (VT-/CT-).
+    # na=False makes a missing tag count as "no match".
+    sentiment_rows = doc_df[doc_df['tag'].str.contains(column_prefix, na=False)]
+
+    if sentiment_rows.empty:
+        print("No sentiment data found")
+        return pd.DataFrame()
+
+    # Keep rows with a Voice/Character context. na=False already rejects
+    # missing contexts, so no separate notna() mask is needed.
+    valid_rows = sentiment_rows[
+        sentiment_rows['_context'].str.contains(row_prefix, na=False)
+    ]
+
+    if valid_rows.empty:
+        print("No Voice/Character context found")
+        return pd.DataFrame()
+
+    # Sum sentiment per (context, tag) pair first so the pivot below never
+    # sees duplicate index/column combinations.
+    matrix_data = valid_rows.groupby(['_context', 'tag'])['sentiment'].sum().reset_index()
+
+    # Rows: Voice/Character context. Columns: theme tag.
+    return matrix_data.pivot(index='_context', columns='tag', values='sentiment')
+
+
+
+def extract_theme(tag: str, theme_prefixes='VT - |CT - ') -> str | None:
+    """
+    Extract the theme from a tag string.
+
+    Parameters:
+    - tag: str, the tag string (e.g., 'VT - Personal Experience')
+    - theme_prefixes: str, '|'-separated prefixes to strip from the start of
+      the tag (e.g., 'VT - |CT - ')
+
+    Returns:
+    - str, the tag with the first matching prefix removed and surrounding
+      whitespace stripped (e.g., 'Personal Experience')
+    - None if the tag starts with none of the prefixes
+    """
+    for prefix in theme_prefixes.split('|'):
+        if tag.startswith(prefix):
+            # Slice off only the leading prefix: str.replace would also delete
+            # any later occurrence of the prefix text inside the theme itself.
+            return tag[len(prefix):].strip()
+    return None
+    
--- a/utils/ollama_utils.py
+++ b/utils/ollama_utils.py
@@ -0,0 +1,42 @@
+
+
+
+import requests
+from ollama import Client
+
+
+
+
+def connect_qumo_ollama(vm_name: str = 'ollama-lite', port='11434', print_models=True) -> tuple[Client | None, list[str] | None]:
+    """Connect to a Qumo Ollama instance and list the models it serves.
+
+    Args:
+        vm_name: Name of the VM running the Ollama instance ('ollama-lite' or
+            'hiperf-gpu'). 'localhost' and '0.0.0.0' are used as-is; any other
+            name is addressed under the 'tail44fa00.ts.net' domain.
+        port: Port of the Ollama API, as a string or an int.
+        print_models: When True, print the name of every available model.
+
+    Returns:
+        (client, models) on success, where models holds the model names
+        reported by client.list(); (None, None) when the probe request
+        cannot connect or times out.
+    """
+    if vm_name in ('localhost', '0.0.0.0'):
+        host = vm_name
+    else:
+        host = f'{vm_name}.tail44fa00.ts.net'
+    QUMO_OLLAMA_URL = f'http://{host}:{port}'
+
+    try:
+        # Cheap reachability probe before building the client.
+        requests.get(QUMO_OLLAMA_URL, timeout=5)
+    except (requests.ConnectionError, requests.Timeout):
+        # Timeout is listed explicitly: a read timeout is not a ConnectionError.
+        print(f"Failed to reach {QUMO_OLLAMA_URL}. Check that the VM is running and Tailscale is up")
+        return None, None
+
+    client = Client(
+        host=QUMO_OLLAMA_URL
+    )
+
+    # Build the WebUI address from the host rather than by text-replacing the
+    # port inside the URL, which could also rewrite matching digits in the
+    # host name and fails outright when port is an int.
+    print(f"Connection succesful. WebUI available at: http://{host}:3000")
+    models = [m.model for m in client.list().models]
+    if print_models:
+        print("Available models:")
+        for m in models:
+            print(f"  - '{m}' ")
+    return client, models
--- a/utils/sentiment_analysis.py
+++ b/utils/sentiment_analysis.py
@@ -0,0 +1,128 @@
+import random
+import pandas as pd
+
+from ollama import Client
+import json
+
+def dummy_sentiment_analysis(content, tag):
+    """Return a placeholder (sentiment, reason) pair without calling a model.
+
+    Tags starting with 'VT -' or 'CT -' receive a random score drawn from
+    -1, 0 and 1; every other tag receives ('test', 'not applicable').
+    The content argument is not inspected.
+    """
+    is_theme_tag = tag.startswith(('VT -', 'CT -'))
+    if not is_theme_tag:
+        return 'test', 'not applicable'
+
+    # Random sentiment for testing
+    score = random.choice([-1, 0, 1])
+    return score, 'random dummy sentiment'
+
+
+
+def ollama_sentiment_analysis(content, theme, client: Client, model) -> tuple[list[str], int | str | None, str]:
+    """
+    Perform sentiment analysis on a quote using an Ollama model.
+
+    Parameters:
+    - content: Text content (the quote) to analyze
+    - theme: Theme the sentiment is judged against (e.g., 'Speed')
+    - client: Ollama client used to run the generation
+    - model: Name of the model passed to client.generate
+
+    Returns:
+    - (keywords, sentiment, reason) taken from the model's JSON answer.
+      Keys missing from the answer fall back to [], 'test' and
+      'no reason provided' respectively.
+    - ([], None, 'parsing error') when the answer cannot be read as JSON;
+      the cause is printed.
+    """
+    prompt = f"""
+    # Instructions
+    You are an expert in sentiment analysis and natural language processing. You are given a quote from an interview along with a theme tag. Your task is to analyze the sentiment expressed in the quote in relation to the provided theme, and provide a short explanation for your assessment (max 10 words).
+    
+    You need to deliver three pieces of information:
+    1. A list of keywords from the quote quantify or qualify the theme, and that influenced your sentiment analysis (if any).
+    2. A sentiment score: -1 for negative, 0 for neutral, and 1 for positive sentiment.
+    3. A brief reason (max 10 words) explaining your sentiment score.
+    
+
+    # Guidelines    
+    Keywords should be directly relevant to the theme.
+    
+    The reason should be extremely concise and to the point:
+    - Does not need to be a full sentence.
+    - Sentiment itself does not need to be stated in the explanation.
+    - If keywords are present in the quote that directly capture the sentiment, give that as the reason..
+    
+    
+    # Input
+    
+    Theme: `{theme}`
+    
+    Quote:
+    ```
+    {content}
+    ```
+    
+    # Response Format
+    Provide your response in the following JSON format:
+    {{
+        "keywords": ["<list_of_relevant_keywords_if_any>"],
+        "sentiment": <sentiment_score>,
+        "reason": "<brief_explanation_max_10_words>"
+    }}
+
+    
+    # Examples
+    
+    ** Example 1**
+    - Theme: `Speed`
+    - Quote: `It just was a little toned down. It was almost like he was talking like this. You know? It almost kind of this was a little slow for me.`
+
+    - Response: {{"keywords": ["slow"], "sentiment": -1, "reason": "States speed is slow, indicates dissatisfaction"}}
+    
+    ** Example 2**
+    - Theme: `Friendliness / Empathy`
+    - Quote: `Sound very welcoming`
+    
+    - Response: {{ "keywords": ["welcoming"], "sentiment": 1, "reason": "Uses 'welcoming'" }}
+    
+    """
+
+    resp = client.generate(
+        model=model,
+        prompt=prompt,
+    )
+
+    try:
+        response_text = resp.response.strip()
+
+        # The model may surround the JSON with extra text, so take the span
+        # from the first '{' to the last '}'.
+        start_index = response_text.find('{')
+        end_index = response_text.rfind('}')
+        if start_index == -1 or end_index < start_index:
+            # Fail with a clear message instead of slicing with a -1 index
+            # and surfacing an unrelated JSON decoder error.
+            raise ValueError("no JSON object found in model response")
+
+        response_json = json.loads(response_text[start_index:end_index + 1])
+        keywords = response_json.get('keywords', [])
+        sentiment = response_json.get('sentiment', 'test')
+        reason = response_json.get('reason', 'no reason provided')
+        return keywords, sentiment, reason
+    except Exception as e:
+        # Deliberately broad: model output is free-form text, and any failure
+        # to read it is reported and mapped to the error triple.
+        print(f"Error parsing response: {e}")
+        return [], None, 'parsing error'
+
+
+if __name__ == "__main__":
+    # Manual smoke test: needs an Ollama server on localhost:11434 that
+    # serves the 'llama3.2:latest' model.
+
+    client = Client(
+            host="http://localhost:11434"
+        )
+
+    sentiment_df = pd.DataFrame({
+        'content': [
+            "I love this product!",
+            "This is the worst service ever.",
+            "It's okay, not great but not terrible."
+        ],
+        'tag': [
+            'VT - Personal Experience',
+            'VT - Personal Experience',
+            'VT - Personal Experience'
+        ],
+        'manual_analysis': [False, False, True]
+    })
+
+    # ollama_sentiment_analysis returns three values (keywords, sentiment,
+    # reason), so three target columns are required; assigning them to only
+    # two columns raises a length-mismatch ValueError. Only rows not flagged
+    # for manual analysis are sent to the model.
+    sentiment_df[['keywords', 'sentiment', 'reason']] = sentiment_df[~sentiment_df['manual_analysis']].apply(
+        lambda row: pd.Series(ollama_sentiment_analysis(row['content'], row['tag'], client, model='llama3.2:latest')),
+        axis=1
+    )
+
+    print(sentiment_df.head())
+
--- a/utils/transcript_utils.py
+++ b/utils/transcript_utils.py
@@ -0,0 +1,54 @@
+
+from pathlib import Path
+import re
+
+def load_srt(path: str | Path) -> str:
+    """Read an SRT file and return it as a speaker-labelled transcript.
+
+    Args:
+        path: Location of the SRT file (read as UTF-8).
+
+    Returns:
+        One "SPEAKER: text" entry per speaker turn, entries separated by a
+        blank line. Sequence numbers and timestamps are dropped, and
+        consecutive cues from the same speaker are joined with a space.
+        Cue text without a '|' separator is attributed to "UNKNOWN".
+    """
+    raw = Path(path).read_text(encoding='utf-8')
+
+    # One entry per run of same-speaker cues: [speaker, [utterance, ...]]
+    grouped = []
+
+    # Cues are separated from each other by one or more blank lines.
+    for cue in re.split(r'\n\n+', raw.strip()):
+        cue_lines = cue.strip().split('\n')
+
+        # A usable cue has a sequence number, a timestamp and some text.
+        if len(cue_lines) < 3:
+            continue
+
+        # Everything after the first two lines is the spoken text.
+        text = ' '.join(cue_lines[2:])
+
+        # Expected layout is "speaker|utterance"; only the first '|' splits.
+        label, separator, spoken = text.partition('|')
+        if separator:
+            speaker, utterance = label.strip(), spoken.strip()
+        else:
+            speaker, utterance = "UNKNOWN", text.strip()
+
+        # Extend the current turn while the speaker stays the same.
+        if grouped and grouped[-1][0] == speaker:
+            grouped[-1][1].append(utterance)
+        else:
+            grouped.append([speaker, [utterance]])
+
+    # Format as "SPEAKER_XX: text"
+    return '\n\n'.join(
+        f"{speaker}: {' '.join(utterances)}" for speaker, utterances in grouped
+    )