basic parsing working
.vscode/launch.json (new file, vendored, 16 lines)

@@ -0,0 +1,16 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+
+        {
+            "name": "Python Debugger: Current File",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "${file}",
+            "console": "integratedTerminal"
+        }
+    ]
+}
@@ -16,7 +16,7 @@ def _():
     OLLAMA_LOCATION= 'localhost'
     # VM_NAME = 'ollama-lite'

-    client = connect_qumo_ollama(OLLAMA_LOCATION)
+    client, _models = connect_qumo_ollama(OLLAMA_LOCATION, print_models=False)

     TAGUETTE_EXPORT_DIR = Path('./data/transcripts/taguette_results')
     WORKING_DIR = Path('./data/processing/02_taguette_postprocess')
@@ -25,7 +25,23 @@ def _():
         WORKING_DIR.mkdir(parents=True)
     if not TAGUETTE_EXPORT_DIR.exists():
         TAGUETTE_EXPORT_DIR.mkdir(parents=True)
-    return TAGUETTE_EXPORT_DIR, WORKING_DIR, datetime, mo, pd
+
+    model_select = mo.ui.dropdown(
+        options=_models,
+        value=_models[0],
+        label="Select Ollama Model to use",
+        searchable=True,
+    )
+    model_select
+    return (
+        TAGUETTE_EXPORT_DIR,
+        WORKING_DIR,
+        client,
+        datetime,
+        mo,
+        model_select,
+        pd,
+    )


 @app.cell(hide_code=True)
@@ -89,7 +105,7 @@ def _(all_tags_df, interview_select, mo):
 @app.cell(hide_code=True)
 def _(mo):
     mo.md(r"""
-    ### Add `_context` column to track Voice / Character is being referred to per highlight
+    ## Add `_context` column to track Voice / Character is being referred to per highlight
     Create a new column 'context', which is defined by the last '_V-' or '_C-' tag seen in the 'tags' column', when moving row by row from top to bottom.

     1. Iterates through the dataframe in document order (row by row)
@@ -102,12 +118,12 @@ def _(mo):

     Example of challenging case:

-    | id | document | tag | content | _seq_id | _context |
-    |----|----------|-----|---------|---------|----------|
-    | 88 | P2 - Done | _V-54 | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 117 | _V-54, _V-41 |
-    | 88 | P2 - Done | _V-41 | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 118 | _V-54, _V-41 |
-    | 88 | P2 - Done | VT - Human / Artificial | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 119 | _V-54, _V-41 |
-    | 88 | P2 - Done | VT - Friendliness / Empathy | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 120 | _V-54, _V-41 |
+    | tag | content | _seq_id | _context |
+    |-----|---------|---------|----------|
+    | _V-54 | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 117 | _V-54, _V-41 |
+    | _V-41 | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 118 | _V-54, _V-41 |
+    | VT - Human / Artificial | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 119 | _V-54, _V-41 |
+    | VT - Friendliness / Empathy | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 120 | _V-54, _V-41 |
     """)
     return

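The cell that computes `_context` sits outside these hunks; a minimal sketch of the forward-fill pass described above (assuming a frame sorted by `_seq_id` with a `tag` column; the back-propagation of the merged context onto the context-tag rows themselves, as rows 117-118 in the table show, is left out):

```python
import pandas as pd

def assign_context(df: pd.DataFrame) -> pd.DataFrame:
    # Walk rows in document order, carrying the last _V-/_C- tag forward.
    # Consecutive context tags (same highlight) are merged into a
    # comma-separated _context such as '_V-54, _V-41'.
    df = df.sort_values('_seq_id').copy()
    contexts, current, prev_was_context = [], None, False
    for tag in df['tag']:
        if isinstance(tag, str) and tag.startswith(('_V-', '_C-')):
            current = f"{current}, {tag}" if prev_was_context else tag
            prev_was_context = True
        else:
            prev_was_context = False
        contexts.append(current)
    df['_context'] = contexts
    return df
```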
@@ -155,7 +171,7 @@ def _(df):
 @app.cell(hide_code=True)
 def _(mo):
     mo.md(r"""
-    ## Resolve multi-context rows (only VT- and CT- theme tags)
+    ## Split multi-context rows (only VT- and CT- theme tags)

     For rows that have multiple contexts (e.g., both _V-54 and _V-41)
     - split these into separate rows for each context.
@@ -165,7 +181,7 @@ def _(mo):


 @app.cell
-def _(df, mo, pd):
+def _(df, pd):
     # Expand rows that contain multiple contexts (comma-separated)
     expanded_rows = []

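The expansion loop itself falls outside this hunk; an equivalent vectorized sketch (assuming `_context` carries comma-separated tags and split rows are flagged via `manual_analysis`):

```python
# Split comma-separated contexts into one row per context, then flag
# rows that came from a multi-context highlight for manual review.
expanded_df_raw = df.copy()
expanded_df_raw['_context'] = expanded_df_raw['_context'].str.split(', ')
expanded_df_raw = expanded_df_raw.explode('_context').reset_index(drop=True)
expanded_df_raw['manual_analysis'] = expanded_df_raw.duplicated('_seq_id', keep=False)
```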
@@ -201,71 +217,32 @@ def _(df, mo, pd):
         expanded_df_raw['tag'].str.startswith(('VT -', 'CT -'), na=False)
     ].copy()

-    manual_rows = sentiment_df[sentiment_df['manual_analysis']]
-    split_rows_editor = None
-
-    if not manual_rows.empty:
-        print(
-            f"⚠️ {len(manual_rows)} rows were created from multi-context splits. "
-            "See next cell for manual review."
-        )
-
-        # Filter for rows that need review. Manual analysis and the tag starts with 'VT -' or 'CT -'
-        rows_to_edit = sentiment_df[
-            (sentiment_df['manual_analysis'])
-        ]
-
-        # Create data editor for split rows
-        split_rows_editor = mo.ui.data_editor(
-            rows_to_edit
-        ).form(label="Update Sentiment / Manual Flag")
-
-    else:
-        print("✓ No multi-context rows found")
-
-    return rows_to_edit, sentiment_df, split_rows_editor
-
-
-@app.cell(hide_code=True)
-def _(mo, rows_to_edit, split_rows_editor):
-    mo.vstack([
-        mo.md(f"""
-        ### ⚠️ Manual Review Required
-
-        **{len(rows_to_edit)} rows** were split from multi-context entries.
-        Please review them below:
-        1. Update the `sentiment` column (-1, 0, 1) for each row based on the specific context.
-        2. Click **Submit** to apply changes.
-        """),
-        split_rows_editor
-    ])
-    return
-
-
-@app.cell
-def _(mo, split_rows_editor):
-    # Capture the edited manual-analysis rows for validation
-    mo.stop(split_rows_editor.value is None, mo.md("Submit your sentiment analysis changes before continuing."))
-    reviewed_manual_rows = split_rows_editor.value
-
-    # Ensure all manual-analysis rows include a sentiment of -1, 0, or 1
-    if not reviewed_manual_rows.empty:
-        valid_sentiments = {-1, 0, 1}
-        needs_review = reviewed_manual_rows[
-            reviewed_manual_rows['manual_analysis']
-            & ~reviewed_manual_rows['sentiment'].isin(valid_sentiments)
-        ]
-        assert needs_review.empty, f"{len(needs_review)} manual-analysis rows missing sentiment -1/0/1"
-
-    print("Verification: ✓ All Manual-analysis rows have valid sentiment values")
-    return (reviewed_manual_rows,)
-
+    print(f"{len(sentiment_df[sentiment_df['manual_analysis']])} Rows with multiple contexts")
+
+    sentiment_df[sentiment_df['manual_analysis']]
+    return (sentiment_df,)


 @app.cell(hide_code=True)
 def _(mo):
     mo.md(r"""
-    # Highlight Sentiment Analysis
+    ## Create 'theme' column
+    """)
+    return
+
+
+@app.cell
+def _(sentiment_df):
+    from utils import extract_theme
+    sentiment_df['theme'] = sentiment_df.apply(lambda row: extract_theme(row['tag']), axis=1)
+    sentiment_df
+    return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""
+    # Extract Sentiment + Reasoning

     For each row in the dataframe, analyze the sentiment of the 'content' regarding the respective tag. This should be done for all 'VT -' and 'CT -' tags, since these represent the 'VoiceThemes' and 'CharacterThemes' respectively. The results should be stored in a new 'sentiment' column.

@@ -278,24 +255,106 @@ def _(mo):


 @app.cell
-def _(sentiment_df):
+def _(client, model_select, pd, sentiment_df):
     # for now, create an empty sentiment column with randomized dummy values for testing
     # only for 'VT -' and 'CT -' tags
-    import random
-
-    def dummy_sentiment_analysis(content, tag):
-        if tag.startswith('VT -') or tag.startswith('CT -'):
-            return random.choice([-1, 0, 1]) # Random sentiment for testing
-        return None
+    from utils import dummy_sentiment_analysis, ollama_sentiment_analysis

     # Only run on rows without manual_analysis

-    sentiment_df['sentiment'] = sentiment_df[~sentiment_df['manual_analysis']].apply(lambda row: dummy_sentiment_analysis(row['content'], row['tag']), axis=1)
+    # sentiment_df[['sentiment', 'reason']] = sentiment_df[~sentiment_df['manual_analysis']].apply(
+    #     lambda row: pd.Series(dummy_sentiment_analysis(row['content'], row['tag'])),
+    #     axis=1
+    # )
+
+    sentiment_df[['keywords', 'sentiment', 'reason']] = sentiment_df[~sentiment_df['manual_analysis']].apply(
+        lambda row: pd.Series(ollama_sentiment_analysis(row['content'], row['theme'], client=client, model=model_select.value)),
+        axis=1
+    )

-    sentiment_df[~sentiment_df['manual_analysis']]
     return
+
+
+@app.cell
+def _(sentiment_df):
+    sentiment_df.loc[~sentiment_df['manual_analysis'], ['theme', 'content', 'sentiment', 'reason', 'keywords']]
+    return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""
+    ## Multi-context tags
+    """)
+    return
+
+
+@app.cell
+def _(mo, sentiment_df):
+    manual_rows = sentiment_df[sentiment_df['manual_analysis']]
+    split_rows_editor = None
+    rows_to_edit = []
+
+    if not manual_rows.empty:
+        print(
+            f"⚠️ {len(manual_rows)} rows were created from multi-context splits. "
+            "See next cell for manual review."
+        )
+
+        # Filter for rows that need review. Manual analysis and the tag starts with 'VT -' or 'CT -'
+        rows_to_edit = sentiment_df[
+            (sentiment_df['manual_analysis'])
+        ]
+
+        # Create data editor for split rows
+        split_rows_editor = mo.ui.data_editor(
+            rows_to_edit
+        ).form(label="Update Sentiment / Manual Flag")
+
+    else:
+        print("✓ No multi-context rows found")
+    return rows_to_edit, split_rows_editor
+
+
+@app.cell(hide_code=True)
+def _(mo, rows_to_edit, split_rows_editor):
+    if split_rows_editor is not None:
+        mo.vstack([
+            mo.md(f"""
+            ### ⚠️ Manual Review Required
+
+            **{len(rows_to_edit)} rows** were split from multi-context entries.
+            Please review them below:
+            1. Update the `sentiment` column (-1, 0, 1) for each row based on the specific context.
+            2. Click **Submit** to apply changes.
+            """),
+            split_rows_editor
+        ])
+    return
+
+
+@app.cell
+def _(mo, split_rows_editor):
+    # Capture the edited manual-analysis rows for validation
+    reviewed_manual_rows = getattr(split_rows_editor, 'value', '')
+    mo.stop(reviewed_manual_rows is None, mo.md("Submit your sentiment analysis changes before continuing."))
+
+    # Ensure all manual-analysis rows include a sentiment of -1, 0, or 1
+
+    if (reviewed_manual_rows != '') and (not reviewed_manual_rows.empty):
+        valid_sentiments = {-1, 0, 1}
+        needs_review = reviewed_manual_rows[
+            reviewed_manual_rows['manual_analysis']
+            & ~reviewed_manual_rows['sentiment'].isin(valid_sentiments)
+        ]
+        assert needs_review.empty, f"{len(needs_review)} manual-analysis rows missing sentiment -1/0/1"
+
+    print("Verification: ✓ All Manual-analysis rows have valid sentiment values")
+    return (reviewed_manual_rows,)


 @app.cell(hide_code=True)
 def _(mo):
     mo.md(r"""
@@ -307,7 +366,10 @@ def _(mo):
 @app.cell
 def _(pd, reviewed_manual_rows, sentiment_df):
     _static_analysis_rows = sentiment_df[~sentiment_df['manual_analysis']]
-    recombined_df = pd.concat([_static_analysis_rows, reviewed_manual_rows]).sort_values(by='_seq_id').reset_index(drop=True)
+    if isinstance(reviewed_manual_rows, pd.DataFrame):
+        recombined_df = pd.concat([_static_analysis_rows, reviewed_manual_rows]).sort_values(by='_seq_id').reset_index(drop=True)
+    else:
+        recombined_df = sentiment_df

     recombined_df
     return (recombined_df,)
@@ -348,7 +410,7 @@ def _(mo):
 def _(WORKING_DIR, datetime, interview_select, recombined_df):
     # Save to CSV in working dir
     timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
-    filename = WORKING_DIR / f"{interview_select.value.split(' ')[0]}_sentiments_{timestamp}.csv"
+    filename = WORKING_DIR / f"{interview_select.value.split(' ')[0]}_sentiments.csv"
     recombined_df.to_csv(filename, index=False)

     print(f"✓ Saved processed data to '{filename}'")
@@ -9,14 +9,14 @@ def _():
     import marimo as mo
     import pandas as pd
    from pathlib import Path
+    from utils import create_sentiment_matrix

     INPUT_DIR = Path("./data/processing/02_taguette_postprocess")
     WORKING_DIR = Path('./data/processing/03_sentiment_analysis')

     if not WORKING_DIR.exists():
         WORKING_DIR.mkdir(parents=True)
-    return INPUT_DIR, Path, WORKING_DIR, mo, pd
+    return INPUT_DIR, Path, WORKING_DIR, create_sentiment_matrix, mo, pd


 @app.cell(hide_code=True)
@@ -62,55 +62,6 @@ def _(mo):
     return


-@app.cell
-def _(document_name, pd):
-    import numpy as np
-
-    def create_sentiment_matrix(doc_df, column_prefix='VT - |CT - ', row_prefix='_V-|_C-'):
-        """
-        Create a sentiment matrix for a specific document.
-
-        Parameters:
-        - df: DataFrame with columns ['document', 'tag', '_context', 'sentiment']
-        - document_name: Name of the document to filter by
-
-        Returns:
-        - DataFrame representing the sentiment matrix
-        """
-
-        # Filter for rows where the tag matches the sentiment prefixes (VT-/CT-)
-        sentiment_rows = doc_df[
-            doc_df['tag'].str.contains(column_prefix, na=False)
-        ].copy()
-
-        if sentiment_rows.empty:
-            print(f"No sentiment data found for document: {document_name}")
-            return pd.DataFrame()
-
-        # Filter for rows with valid Voice/Character context
-        valid_rows = sentiment_rows[
-            sentiment_rows['_context'].notna() &
-            (sentiment_rows['_context'].str.contains(row_prefix, na=False))
-        ].copy()
-
-        if valid_rows.empty:
-            print(f"No Voice/Character context found for document: {document_name}")
-            return pd.DataFrame()
-
-        # Create aggregation: group by Voice/Character (_context) and Theme (tag)
-        # Sum sentiment scores for each combination
-        matrix_data = valid_rows.groupby(['_context', 'tag'])['sentiment'].sum().reset_index()
-
-        # Pivot to create the matrix
-        matrix = matrix_data.pivot(index='_context', columns='tag', values='sentiment')
-
-        # # Convert to integers for cleaner display
-        # matrix = matrix.astype(int)
-
-        return matrix
-    return (create_sentiment_matrix,)
-
-
 @app.cell(hide_code=True)
 def _(mo):
     mo.md(r"""
@@ -17,18 +17,18 @@ services:
     # c) Explicitly override: docker compose run --gpus all ollama
     # 3. If your Docker/Compose version does NOT honor the reservation below, uncomment the
     #    'devices' section further down as a fallback (less portable).
-    # deploy:
-    #   resources:
-    #     reservations:
-    #       devices:
-    #         - driver: nvidia
-    #           count: all
-    #           capabilities: [gpu]
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]

-    # environment:
+    environment:
     #   Visible devices / capabilities for the NVIDIA container runtime
-    #   - NVIDIA_VISIBLE_DEVICES=all
-    #   - NVIDIA_DRIVER_CAPABILITIES=compute,utility
+      - NVIDIA_VISIBLE_DEVICES=all
+      - NVIDIA_DRIVER_CAPABILITIES=compute,utility

     # Fallback (UNCOMMENT ONLY if the reservation above is ignored and you still get errors):
     # devices:
utils/__init__.py (new file, 4 lines)

@@ -0,0 +1,4 @@
+from .ollama_utils import connect_qumo_ollama
+from .data_utils import create_sentiment_matrix, extract_theme
+from .transcript_utils import load_srt
+from .sentiment_analysis import dummy_sentiment_analysis, ollama_sentiment_analysis
utils/data_utils.py (new file, 65 lines)

@@ -0,0 +1,65 @@
+import pandas as pd
+
+
+def create_sentiment_matrix(doc_df, column_prefix='VT - |CT - ', row_prefix='_V-|_C-'):
+    """
+    Create a sentiment matrix for a specific document.
+
+    Parameters:
+    - df: DataFrame with columns ['document', 'tag', '_context', 'sentiment']
+    - document_name: Name of the document to filter by
+
+    Returns:
+    - DataFrame representing the sentiment matrix
+    """
+
+    # Filter for rows where the tag matches the sentiment prefixes (VT-/CT-)
+    sentiment_rows = doc_df[
+        doc_df['tag'].str.contains(column_prefix, na=False)
+    ].copy()
+
+    if sentiment_rows.empty:
+        print("No sentiment data found")
+        return pd.DataFrame()
+
+    # Filter for rows with valid Voice/Character context
+    valid_rows = sentiment_rows[
+        sentiment_rows['_context'].notna() &
+        (sentiment_rows['_context'].str.contains(row_prefix, na=False))
+    ].copy()
+
+    if valid_rows.empty:
+        print("No Voice/Character context found")
+        return pd.DataFrame()
+
+    # Create aggregation: group by Voice/Character (_context) and Theme (tag)
+    # Sum sentiment scores for each combination
+    matrix_data = valid_rows.groupby(['_context', 'tag'])['sentiment'].sum().reset_index()
+
+    # Pivot to create the matrix
+    matrix = matrix_data.pivot(index='_context', columns='tag', values='sentiment')
+
+    # # Convert to integers for cleaner display
+    # matrix = matrix.astype(int)
+
+    return matrix
+
+
+def extract_theme(tag: str, theme_prefixes='VT - |CT - ') -> str:
+    """
+    Extract the theme from a tag string.
+
+    Parameters:
+    - tag: str, the tag string (e.g., 'VT - Personal Experience')
+    - theme_prefixes: str, prefixes to remove from the tag (e.g., 'VT - |CT - ')
+
+    Returns:
+    - str, the extracted theme (e.g., 'Personal Experience')
+    - None if no theme found
+    """
+    for prefix in theme_prefixes.split('|'):
+        if tag.startswith(prefix):
+            return tag.replace(prefix, '').strip()
+    return None
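A quick usage sketch for the two helpers above (toy data; column names follow the functions' assumptions):

```python
import pandas as pd
from utils import create_sentiment_matrix, extract_theme

toy = pd.DataFrame({
    'tag':       ['VT - Speed', 'VT - Speed', 'CT - Humor'],
    '_context':  ['_V-54', '_V-41', '_C-12'],
    'sentiment': [1, -1, 0],
})

print(extract_theme('VT - Speed'))   # -> 'Speed'
print(create_sentiment_matrix(toy))  # contexts as rows, theme tags as columns
```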
utils/ollama_utils.py (new file, 42 lines)

@@ -0,0 +1,42 @@
+import requests
+from ollama import Client
+
+
+def connect_qumo_ollama(vm_name: str ='ollama-lite', port='11434', print_models=True) -> Client:
+    """Establish connection to Qumo Ollama instance
+
+    vm_name: str ('ollama-lite' or 'hiperf-gpu')
+        Name of the VM running the Ollama instance
+
+    Returns:
+        tuple(Client): Ollama client connected to the specified VM
+    """
+    QUMO_OLLAMA_URL = f'http://{vm_name}.tail44fa00.ts.net:{port}'
+
+    if vm_name in ['localhost', '0.0.0.0']:
+        QUMO_OLLAMA_URL = f"http://{vm_name}:{port}"
+
+    try:
+        requests.get(QUMO_OLLAMA_URL, timeout=5)
+        client = Client(
+            host=QUMO_OLLAMA_URL
+        )
+
+        print(f"Connection succesful. WebUI available at: {QUMO_OLLAMA_URL.replace(port, '3000')}")
+        models = [m.model for m in client.list().models]
+        if print_models:
+            print("Available models:")
+            for m in models:
+                print(f"  - '{m}' ")
+        return client, models
+
+    except requests.ConnectionError:
+        pass
+
+    print(f"Failed to reach {QUMO_OLLAMA_URL}. Check that the VM is running and Tailscale is up")
+    return None, None
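For reference, a typical call against a local instance (host and prompt are illustrative, not part of the commit):

```python
client, models = connect_qumo_ollama('localhost')
if client is not None:
    resp = client.generate(model=models[0], prompt='Reply with one word: ready?')
    print(resp.response)
```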
utils/sentiment_analysis.py (new file, 128 lines)

@@ -0,0 +1,128 @@
+import random
+import pandas as pd
+
+from ollama import Client
+import json
+
+
+def dummy_sentiment_analysis(content, tag):
+    if tag.startswith('VT -') or tag.startswith('CT -'):
+        return random.choice([-1, 0, 1]), 'random dummy sentiment'  # Random sentiment for testing
+
+    return 'test', 'not applicable'
+
+
+def ollama_sentiment_analysis(content, theme, client: Client, model) -> tuple[list[str], int, str]:
+    """
+    Perform sentiment analysis using Ollama model.
+
+    Parameters:
+    - content: Text content to analyze
+    - tag: Tag indicating the type of sentiment analysis (e.g., 'VT - Positive')
+
+    Returns:
+    - sentiment score and reason
+    """
+    prompt = f"""
+    # Instructions
+    You are an expert in sentiment analysis and natural language processing. You are given a quote from an interview along with a theme tag. Your task is to analyze the sentiment expressed in the quote in relation to the provided theme, and provide a short explanation for your assessment (max 10 words).
+
+    You need to deliver three pieces of information:
+    1. A list of keywords from the quote quantify or qualify the theme, and that influenced your sentiment analysis (if any).
+    2. A sentiment score: -1 for negative, 0 for neutral, and 1 for positive sentiment.
+    3. A brief reason (max 10 words) explaining your sentiment score.
+
+    # Guidelines
+    Keywords should be directly relevant to the theme.
+
+    The reason should be extremely concise and to the point:
+    - Does not need to be a full sentence.
+    - Sentiment itself does not need to be stated in the explanation.
+    - If keywords are present in the quote that directly capture the sentiment, give that as the reason..
+
+    # Input
+    Theme: `{theme}`
+
+    Quote:
+    ```
+    {content}
+    ```
+
+    # Response Format
+    Provide your response in the following JSON format:
+    {{
+        "keywords": ["<list_of_relevant_keywords_if_any>"],
+        "sentiment": <sentiment_score>,
+        "reason": "<brief_explanation_max_10_words>"
+    }}
+
+    # Examples
+
+    ** Example 1**
+    - Theme: `Speed`
+    - Quote: `It just was a little toned down. It was almost like he was talking like this. You know? It almost kind of this was a little slow for me.`
+    - Response: {{"keywords": ["slow"], "sentiment": -1, "reason": "States speed is slow, indicates dissatisfaction"}}
+
+    ** Example 2**
+    - Theme: `Friendliness / Empathy`
+    - Quote: `Sound very welcoming`
+    - Response: {{ "keywords": ["welcoming"], "sentiment": 1, "reason": "Uses 'welcoming'" }}
+    """
+
+    resp = client.generate(
+        model=model,
+        prompt=prompt,
+    )
+
+    try:
+        response_text = resp.response.strip()
+
+        # Extract JSON from response
+        start_index = response_text.find('{')
+        end_index = response_text.rfind('}') + 1
+        json_str = response_text[start_index:end_index]
+
+        response_json = json.loads(json_str)
+        keywords = response_json.get('keywords', [])
+        sentiment = response_json.get('sentiment', 'test')
+        reason = response_json.get('reason', 'no reason provided')
+        return keywords, sentiment, reason
+    except Exception as e:
+        print(f"Error parsing response: {e}")
+        return [], None, 'parsing error'
+
+
+if __name__ == "__main__":
+
+    client = Client(
+        host="http://localhost:11434"
+    )
+
+    sentiment_df = pd.DataFrame({
+        'content': [
+            "I love this product!",
+            "This is the worst service ever.",
+            "It's okay, not great but not terrible."
+        ],
+        'tag': [
+            'VT - Personal Experience',
+            'VT - Personal Experience',
+            'VT - Personal Experience'
+        ],
+        'manual_analysis': [False, False, True]
+    })
+
+    sentiment_df[['sentiment', 'reason']] = sentiment_df[~sentiment_df['manual_analysis']].apply(
+        lambda row: pd.Series(ollama_sentiment_analysis(row['content'], row['tag'], client, model='llama3.2:latest')),
+        axis=1
+    )
+
+    print(sentiment_df.head())
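The `find`/`rfind` slicing above tolerates models that wrap their JSON in prose. If the installed `ollama` client and server support the `format` argument, constrained decoding can replace the slicing; a variant under that assumption, inside `ollama_sentiment_analysis`:

```python
# Assumes a client/server version that supports format='json'.
resp = client.generate(model=model, prompt=prompt, format='json')
parsed = json.loads(resp.response)  # response should already be strict JSON
```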
@@ -1,13 +1,6 @@
-"""
-Standard utils for this repository
-"""
-
-import re
 from pathlib import Path
-import requests
-from ollama import Client
+import re


 def load_srt(path: str | Path) -> str:
     """Load and parse an SRT file, returning clean transcript with speaker labels.
@@ -58,37 +51,4 @@ def load_srt(path: str | Path) -> str:

     # Format as "SPEAKER_XX: text"
     transcript_lines = [f"{speaker}: {utterance}" for speaker, utterance in merged]
     return '\n\n'.join(transcript_lines)
-
-
-def connect_qumo_ollama(vm_name: str ='ollama-lite', port='11434') -> Client:
-    """Establish connection to Qumo Ollama instance
-
-    vm_name: str ('ollama-lite' or 'hiperf-gpu')
-        Name of the VM running the Ollama instance
-
-    Returns:
-        tuple(Client): Ollama client connected to the specified VM
-    """
-    QUMO_OLLAMA_URL = f'http://{vm_name}.tail44fa00.ts.net:{port}'
-
-    if vm_name in ['localhost', '0.0.0.0']:
-        QUMO_OLLAMA_URL = f"http://{vm_name}:{port}"
-
-    try:
-        requests.get(QUMO_OLLAMA_URL, timeout=5)
-        client = Client(
-            host=QUMO_OLLAMA_URL
-        )
-
-        print(f"Connection succesful. WebUI available at: {QUMO_OLLAMA_URL.replace(port, '3000')}\nAvailable models:")
-        for m in client.list().models:
-            print(f"  - '{m.model}' ")
-        return client
-
-    except requests.ConnectionError:
-        pass
-
-    print(f"Failed to reach {QUMO_OLLAMA_URL}. Check that the VM is running and Tailscale is up")
-    return None