llm processing of sentiment

basic parsing working
2025-12-12 14:28:51 +01:00 · 2025-12-11 12:56:23 +01:00
11 changed files with 479 additions and 199 deletions
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -0,0 +1,16 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        
+        {
+            "name": "Python Debugger: Current File",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "${file}",
+            "console": "integratedTerminal"
+        }
+    ]
+}
--- a/01_Taguette-Pre-Process.py
+++ b/01_Taguette-Pre-Process.py
@@ -70,13 +70,13 @@ def csv_to_markdown(df):
    return "\n\n".join(lines)


-@app.cell
+@app.cell(hide_code=True)
 def _(file_dropdown, mo, pd):
    # Preview
    preview = mo.md("")
    if file_dropdown.value:
        df = pd.read_csv(file_dropdown.value)
-        md_content = csv_to_markdown(df)
+        md_content = csv_to_markdown(df.head(10))
        preview = mo.md(md_content)

    preview
--- a/02_Taguette_Post-Process.py
+++ b/02_Taguette_Post-Process.py
@@ -16,28 +16,49 @@ def _():
    OLLAMA_LOCATION= 'localhost'
    # VM_NAME = 'ollama-lite'

-    client = connect_qumo_ollama(OLLAMA_LOCATION)
+    client, _models = connect_qumo_ollama(OLLAMA_LOCATION, print_models=False)

-    TAGUETTE_EXPORT_DIR = Path('./data/transcripts/taguette_results')
+    TAGUETTE_EXPORT_DIR = Path('./data/processing/02_taguette_export')
    WORKING_DIR = Path('./data/processing/02_taguette_postprocess')

    if not WORKING_DIR.exists():
        WORKING_DIR.mkdir(parents=True)
    if not TAGUETTE_EXPORT_DIR.exists():
        TAGUETTE_EXPORT_DIR.mkdir(parents=True)
-    return TAGUETTE_EXPORT_DIR, WORKING_DIR, datetime, mo, pd
+
+    model_select = mo.ui.dropdown(
+        options=_models,
+        value=_models[0],
+        label="Select Ollama Model to use",
+        searchable=True,
+    )
+    model_select
+    return (
+        TAGUETTE_EXPORT_DIR,
+        WORKING_DIR,
+        client,
+        datetime,
+        mo,
+        model_select,
+        pd,
+    )


@app.cell(hide_code=True)
 def _(TAGUETTE_EXPORT_DIR, mo):
    mo.md(rf"""
-    # Step 1: Export All Highlights out of Taguette
+    # Step 1: Export Data out of Taguette

-    1. Go to: http://taguette.tail44fa00.ts.net/project/1
-    2. Select 'Highlights' on left
-    3. Select 'See all hightlights'
-    4. Top right 'Export this view' > 'CSV'
-    5. Save to '{TAGUETTE_EXPORT_DIR}/all_tags.csv'
+    **Highlights**
+    1. Go to: https://taguette.qumo.io/project/1
+    2. Select 'Highlights' (left side) > 'See all hightlights' > 'Export this view' (top right) > 'CSV'
+    3. Save to '{TAGUETTE_EXPORT_DIR}/all_tags.csv'
+
+    **Tags Codebook**
+    1. Select 'Project Info' (left side) > 'Export codebook' > 'CSV'
+    2. Save to '{TAGUETTE_EXPORT_DIR}/codebook.csv'
+
+    _NOTE: Sometimes you need to explicitly allow 'Unsafe Download' in the browser's download manager_
    """)
    return

@@ -51,13 +72,21 @@ def _(mo):


@app.cell
-def _(pd):
-    all_tags_df = pd.read_csv('data/transcripts/taguette_results/all_tags.csv')
+def _(TAGUETTE_EXPORT_DIR, pd):
+    all_tags_df = pd.read_csv(f'{TAGUETTE_EXPORT_DIR}/all_tags.csv')
    all_tags_df['_seq_id'] = range(len(all_tags_df))
-    all_tags_df.head(20)
+    all_tags_df
    return (all_tags_df,)


+@app.cell
+def _(TAGUETTE_EXPORT_DIR, pd):
+    codebook_df = pd.read_csv(f'{TAGUETTE_EXPORT_DIR}/codebook.csv')
+    codebook_df.rename(columns={'description': 'theme_description'}, inplace=True)
+    codebook_df
+    return (codebook_df,)
+
+
@app.cell(hide_code=True)
 def _(mo):
    mo.md(r"""
@@ -89,7 +118,7 @@ def _(all_tags_df, interview_select, mo):
@app.cell(hide_code=True)
 def _(mo):
    mo.md(r"""
-    ### Add `_context` column to track Voice / Character is being referred to per highlight
+    ## Add `_context` column to track Voice / Character is being referred to per highlight
    Create a new column 'context', which is defined by the last '_V-' or '_C-' tag seen in the 'tags' column', when moving row by row from top to bottom.

    1. Iterates through the dataframe in document order (row by row)
@@ -102,12 +131,12 @@ def _(mo):

    Example of challenging case:

-    | id  | document | tag                                | content | _seq_id | _context         |
-    |-----|-------------|------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------|----------------------|
-    | 88  | P2 - Done   | _V-54                              | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them.                                                                                                                                                    | 117        | _V-54, _V-41         |
-    | 88  | P2 - Done   | _V-41                              | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them.                                                                                                                                                    | 118        | _V-54, _V-41         |
-    | 88  | P2 - Done   | VT - Human / Artificial            | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them.                                                                                                                                                    | 119        | _V-54, _V-41         |
-    | 88  | P2 - Done   | VT - Friendliness / Empathy        | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them.                                                                                                                                                    | 120        | _V-54, _V-41         |
+    | tag                                | content | _seq_id | _context         |
+    |------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------|----------------------|
+    |  _V-54                              | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them.                                                                                                                                                    | 117        | _V-54, _V-41         |
+    | _V-41                              | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them.                                                                                                                                                    | 118        | _V-54, _V-41         |
+    | VT - Human / Artificial            | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them.                                                                                                                                                    | 119        | _V-54, _V-41         |
+    | VT - Friendliness / Empathy        | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them.                                                                                                                                                    | 120        | _V-54, _V-41         |
    """)
    return

@@ -155,7 +184,7 @@ def _(df):
@app.cell(hide_code=True)
 def _(mo):
    mo.md(r"""
-    ## Resolve multi-context rows (only VT- and CT- theme tags)
+    ## Split multi-context rows (only VT- and CT- theme tags)

    For rows that have multiple contexts (e.g., both _V-54 and _V-41)
    - split these into separate rows for each context.
@@ -165,7 +194,7 @@ def _(mo):


@app.cell
-def _(df, mo, pd):
+def _(df, pd):
    # Expand rows that contain multiple contexts (comma-separated)
    expanded_rows = []

@@ -201,71 +230,32 @@ def _(df, mo, pd):
        expanded_df_raw['tag'].str.startswith(('VT -', 'CT -'), na=False)
    ].copy()

-    manual_rows = sentiment_df[sentiment_df['manual_analysis']]
-    split_rows_editor = None
+    print(f"{len(sentiment_df[sentiment_df['manual_analysis']])} Rows with multiple contexts")

-
-    if not manual_rows.empty:
-        print(
-            f"⚠️  {len(manual_rows)} rows were created from multi-context splits. "
-            "See next cell for manual review."
-        )
-
-        # Filter for rows that need review. Manual analysis and the tag starts with 'VT -' or 'CT -'
-        rows_to_edit = sentiment_df[
-            (sentiment_df['manual_analysis'])
-        ]
-    
-        # Create data editor for split rows
-        split_rows_editor = mo.ui.data_editor(
-            rows_to_edit
-    ).form(label="Update Sentiment / Manual Flag")
-    
-    else:
-        print("✓ No multi-context rows found")
-
-    return rows_to_edit, sentiment_df, split_rows_editor
-
-
-@app.cell(hide_code=True)
-def _(mo, rows_to_edit, split_rows_editor):
-    mo.vstack([
-        mo.md(f"""
-        ### ⚠️ Manual Review Required
-
-        **{len(rows_to_edit)} rows** were split from multi-context entries.
-        Please review them below:
-        1. Update the `sentiment` column (-1, 0, 1) for each row based on the specific context.
-        2. Click **Submit** to apply changes.
-        """),
-        split_rows_editor
-    ])
-    return
-
-
-@app.cell
-def _(mo, split_rows_editor):
-    # Capture the edited manual-analysis rows for validation
-    mo.stop(split_rows_editor.value is None, mo.md("Submit your sentiment analysis changes before continuing."))
-    reviewed_manual_rows = split_rows_editor.value
-
-    # Ensure all manual-analysis rows include a sentiment of -1, 0, or 1
-    if not reviewed_manual_rows.empty:
-        valid_sentiments = {-1, 0, 1}
-        needs_review = reviewed_manual_rows[
-            reviewed_manual_rows['manual_analysis']
-            & ~reviewed_manual_rows['sentiment'].isin(valid_sentiments)
-        ]
-        assert needs_review.empty, f"{len(needs_review)} manual-analysis rows missing sentiment -1/0/1"
-
-    print("Verification: ✓ All Manual-analysis rows have valid sentiment values")
-    return (reviewed_manual_rows,)
+    sentiment_df[sentiment_df['manual_analysis']]
+    return (sentiment_df,)


@app.cell(hide_code=True)
 def _(mo):
    mo.md(r"""
-    # Highlight Sentiment Analysis
+    ## Create 'theme' column
+    """)
+    return
+
+
+@app.cell
+def _(sentiment_df):
+    from utils import extract_theme
+    sentiment_df['theme'] = sentiment_df.apply(lambda row: extract_theme(row['tag']), axis=1)
+    sentiment_df
+    return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""
+    # Extract Sentiment + Reasoning

    For each row in the dataframe, analyze the sentiment of the 'content' regarding the respective tag. This should be done for all 'VT -' and 'CT -' tags, since these represent the 'VoiceThemes' and 'CharacterThemes' respectively. The results should be stored in a new 'sentiment' column.

@@ -278,24 +268,134 @@ def _(mo):


@app.cell
-def _(sentiment_df):
-    # for now, create an empty sentiment column with randomized dummy values for testing
-    # only for 'VT -' and 'CT -' tags
-    import random
+def _(mo):
+    start_processing_btn = mo.ui.button(
+        label="Start Sentiment Extraction",
+        kind="warn",
+        on_click=lambda val: True
+    )
+    start_processing_btn
+    return (start_processing_btn,)

-    def dummy_sentiment_analysis(content, tag):
-        if tag.startswith('VT -') or tag.startswith('CT -'):
-            return random.choice([-1, 0, 1])  # Random sentiment for testing
-        return None

-    # Only run on rows without manual_analysis
+@app.cell
+def _(
+    client,
+    codebook_df,
+    mo,
+    model_select,
+    pd,
+    sentiment_df,
+    start_processing_btn,
+):
+    from utils import dummy_sentiment_analysis, ollama_sentiment_analysis

-    sentiment_df['sentiment'] = sentiment_df[~sentiment_df['manual_analysis']].apply(lambda row: dummy_sentiment_analysis(row['content'], row['tag']), axis=1)
+    # add theme_description to be used in LLM prompt. Looked up with map() rather than merge():
+    # merge() renumbers the rows from 0, so the results assigned below would no longer line up
+    # with sentiment_df's own index
+    _descriptions = codebook_df.drop_duplicates('tag').set_index('tag')['theme_description']
+    _df = sentiment_df.assign(theme_description=sentiment_df['tag'].map(_descriptions))

-    sentiment_df[~sentiment_df['manual_analysis']]
+    # Wait for start processing button
+    mo.stop(not start_processing_btn.value, "Click button above to start processing")
+
+
+    # Rows flagged manual_analysis are skipped here and therefore receive NaN in the new columns
+    sentiment_df[['keywords', 'sentiment', 'reason']] = _df[~_df['manual_analysis']].apply(
+        lambda row: pd.Series(ollama_sentiment_analysis(
+            content=row['content'],
+            theme=row['theme'],
+            theme_description=row['theme_description'],
+            client=client,
+            model=model_select.value
+        )),
+        axis=1
+    )
    return


+@app.cell
+def _(mo, sentiment_df):
+    mo.stop(('sentiment' not in sentiment_df.columns), "Run above cells to extract sentiment analysis")
+    sentiment_df.loc[~sentiment_df['manual_analysis'], ['theme', 'content', 'sentiment', 'reason', 'keywords']]
+    return
+
+
+@app.cell(hide_code=True)
+def _(mo):
+    mo.md(r"""
+    ## Multi-context tags
+    """)
+    return
+
+
+@app.cell
+def _(mo, sentiment_df):
+    manual_rows = sentiment_df[sentiment_df['manual_analysis']]
+    split_rows_editor = None
+    rows_to_edit = []
+
+    if not manual_rows.empty:
+        print(
+            f"⚠️  {len(manual_rows)} rows were created from multi-context splits. "
+            "See next cell for manual review."
+        )
+
+        # Filter for rows that need review. Manual analysis and the tag starts with 'VT -' or 'CT -'
+        rows_to_edit = sentiment_df[
+            (sentiment_df['manual_analysis'])
+        ]
+
+        # Create data editor for split rows
+        split_rows_editor = mo.ui.data_editor(
+            rows_to_edit
+    ).form(label="Update Sentiment / Manual Flag")
+
+    else:
+        print("✓ No multi-context rows found")
+    return rows_to_edit, split_rows_editor
+
+
+@app.cell
+def _(split_rows_editor):
+    split_rows_editor
+
+    return
+
+
+@app.cell(hide_code=True)
+def _(mo, rows_to_edit, split_rows_editor):
+    if split_rows_editor is not None:
+        mo.vstack([
+            mo.md(f"""
+            ### ⚠️ Manual Review Required
+
+            **{len(rows_to_edit)} rows** were split from multi-context entries.
+            Please review them below:
+            1. Update the `sentiment` column (-1, 0, 1) for each row based on the specific context.
+            2. Click **Submit** to apply changes.
+            """),
+            split_rows_editor
+        ])
+    return
+
+
+@app.cell
+def _(mo, split_rows_editor):
+    # Capture the edited manual-analysis rows for validation
+    reviewed_manual_rows = getattr(split_rows_editor, 'value', '')
+    mo.stop(reviewed_manual_rows is None, mo.md("Submit your sentiment analysis changes before continuing."))
+
+    # Ensure all manual-analysis rows include a sentiment of -1, 0, or 1
+    # ('' means there is no editor; checked by type because DataFrame != '' is element-wise)
+    if (not isinstance(reviewed_manual_rows, str)) and (not reviewed_manual_rows.empty):
+        valid_sentiments = {-1, 0, 1}
+        needs_review = reviewed_manual_rows[
+            reviewed_manual_rows['manual_analysis']
+            & ~reviewed_manual_rows['sentiment'].isin(valid_sentiments)
+        ]
+        assert needs_review.empty, f"{len(needs_review)} manual-analysis rows missing sentiment -1/0/1"
+
+        print("Verification: ✓ All Manual-analysis rows have valid sentiment values")
+    return (reviewed_manual_rows,)
+
+
@app.cell(hide_code=True)
 def _(mo):
    mo.md(r"""
@@ -307,7 +407,10 @@ def _(mo):
@app.cell
 def _(pd, reviewed_manual_rows, sentiment_df):
    _static_analysis_rows = sentiment_df[~sentiment_df['manual_analysis']]
-    recombined_df = pd.concat([_static_analysis_rows, reviewed_manual_rows]).sort_values(by='_seq_id').reset_index(drop=True)
+    if isinstance(reviewed_manual_rows, pd.DataFrame):
+        recombined_df = pd.concat([_static_analysis_rows, reviewed_manual_rows]).sort_values(by='_seq_id').reset_index(drop=True)
+    else:
+        recombined_df = sentiment_df

    recombined_df
    return (recombined_df,)
@@ -348,7 +451,7 @@ def _(mo):
 def _(WORKING_DIR, datetime, interview_select, recombined_df):
    # Save to CSV in working dir
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
-    filename = WORKING_DIR / f"{interview_select.value.split(' ')[0]}_sentiments_{timestamp}.csv"
+    filename = WORKING_DIR / f"{interview_select.value.split(' ')[0]}_sentiments.csv"
    recombined_df.to_csv(filename, index=False)

    print(f"✓ Saved processed data to '{filename}'")
--- a/03_Sentiment_Analysis.py
+++ b/03_Sentiment_Analysis.py
@@ -9,14 +9,14 @@ def _():
    import marimo as mo
    import pandas as pd
    from pathlib import Path
+    from utils import create_sentiment_matrix

    INPUT_DIR = Path("./data/processing/02_taguette_postprocess")
    WORKING_DIR = Path('./data/processing/03_sentiment_analysis')

    if not WORKING_DIR.exists():
        WORKING_DIR.mkdir(parents=True)
-
-    return INPUT_DIR, Path, WORKING_DIR, mo, pd
+    return INPUT_DIR, Path, WORKING_DIR, create_sentiment_matrix, mo, pd


@app.cell(hide_code=True)
@@ -62,55 +62,6 @@ def _(mo):
    return


-@app.cell
-def _(document_name, pd):
-    import numpy as np
-
-    def create_sentiment_matrix(doc_df, column_prefix='VT - |CT - ', row_prefix='_V-|_C-'):
-        """
-        Create a sentiment matrix for a specific document.
-
-        Parameters:
-        - df: DataFrame with columns ['document', 'tag', '_context', 'sentiment']
-        - document_name: Name of the document to filter by
-
-        Returns:
-        - DataFrame representing the sentiment matrix
-        """
-
-        # Filter for rows where the tag matches the sentiment prefixes (VT-/CT-)
-        sentiment_rows = doc_df[
-            doc_df['tag'].str.contains(column_prefix, na=False)
-        ].copy()
-
-        if sentiment_rows.empty:
-            print(f"No sentiment data found for document: {document_name}")
-            return pd.DataFrame()
-
-        # Filter for rows with valid Voice/Character context
-        valid_rows = sentiment_rows[
-            sentiment_rows['_context'].notna() & 
-            (sentiment_rows['_context'].str.contains(row_prefix, na=False))
-        ].copy()
-
-        if valid_rows.empty:
-            print(f"No Voice/Character context found for document: {document_name}")
-            return pd.DataFrame()
-
-        # Create aggregation: group by Voice/Character (_context) and Theme (tag)
-        # Sum sentiment scores for each combination
-        matrix_data = valid_rows.groupby(['_context', 'tag'])['sentiment'].sum().reset_index()
-
-        # Pivot to create the matrix
-        matrix = matrix_data.pivot(index='_context', columns='tag', values='sentiment')
-
-        # # Convert to integers for cleaner display
-        # matrix = matrix.astype(int)
-
-        return matrix
-    return (create_sentiment_matrix,)
-
-
@app.cell(hide_code=True)
 def _(mo):
    mo.md(r"""
--- a/04_Sentiment_Aggregation.py
+++ b/04_Sentiment_Aggregation.py
@@ -32,7 +32,7 @@ def _(INPUT_DIR, mo):
    file_options = {f.stem: str(f) for f in voice_csv_files}

    voice_multiselect = mo.ui.multiselect(options=file_options, label="Select Voice CSV Files for Aggregation")
-    voice_multiselect
+
    return (voice_multiselect,)


--- a/ollama/docker-compose.yml
+++ b/ollama/docker-compose.yml
@@ -17,18 +17,22 @@ services:
    #       c) Explicitly override: docker compose run --gpus all ollama
    # 3. If your Docker/Compose version does NOT honor the reservation below, uncomment the
    #    'devices' section further down as a fallback (less portable).
-    # deploy:
-    #   resources:
-    #     reservations:
-    #       devices:
-    #         - driver: nvidia
-    #           count: all
-    #           capabilities: [gpu]

-    # environment:
+    ## NVIDIA GPU SUPPORT BLOCK - comment out the following block on hosts without an NVIDIA GPU ###
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
+
+    environment:
      # Visible devices / capabilities for the NVIDIA container runtime
-      # - NVIDIA_VISIBLE_DEVICES=all
-      # - NVIDIA_DRIVER_CAPABILITIES=compute,utility
+      - NVIDIA_VISIBLE_DEVICES=all
+      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
+    ## ---------- END GPU SUPPORT BLOCK ------------###
+

    # Fallback (UNCOMMENT ONLY if the reservation above is ignored and you still get errors):
    # devices:
--- a/utils/init.py
+++ b/utils/init.py
@@ -0,0 +1,4 @@
+from .ollama_utils import connect_qumo_ollama
+from .data_utils import create_sentiment_matrix, extract_theme
+from .transcript_utils import load_srt
+from .sentiment_analysis import dummy_sentiment_analysis, ollama_sentiment_analysis
--- a/utils/data_utils.py
+++ b/utils/data_utils.py
@@ -0,0 +1,65 @@
+import pandas as pd
+
+
+def create_sentiment_matrix(doc_df, column_prefix='VT - |CT - ', row_prefix='_V-|_C-'):
+    """
+    Build a Voice/Character-by-theme matrix of summed sentiment scores.
+
+    Parameters:
+    - doc_df: DataFrame with the columns 'tag', '_context' and 'sentiment'
+    - column_prefix: regex matched against 'tag' to pick the theme rows (matrix columns)
+    - row_prefix: regex matched against '_context' to pick the Voice/Character rows (matrix rows)
+
+    Returns:
+    - DataFrame indexed by '_context' with one column per 'tag' holding the summed sentiment,
+      or an empty DataFrame when no row survives the filters
+    """
+    is_theme_tag = doc_df['tag'].str.contains(column_prefix, na=False)
+    theme_rows = doc_df[is_theme_tag].copy()
+    if theme_rows.empty:
+        print("No sentiment data found")
+        return pd.DataFrame()
+
+    context_col = theme_rows['_context']
+    has_known_context = context_col.notna() & context_col.str.contains(row_prefix, na=False)
+    scored_rows = theme_rows[has_known_context].copy()
+    if scored_rows.empty:
+        print("No Voice/Character context found")
+        return pd.DataFrame()
+
+    # Sum the scores per (context, theme) pair, then spread the themes across columns
+    totals = scored_rows.groupby(['_context', 'tag'])['sentiment'].sum().reset_index()
+    return totals.pivot(index='_context', columns='tag', values='sentiment')
+
+
+
+def extract_theme(tag: str, theme_prefixes='VT - |CT - ') -> str | None:
+    """
+    Extract the theme from a tag string.
+
+    Parameters:
+    - tag: the tag string (e.g., 'VT - Personal Experience'); non-string values such as NaN yield None
+    - theme_prefixes: '|'-separated literal prefixes to strip from the tag (e.g., 'VT - |CT - ')
+
+    Returns:
+    - the tag without its leading prefix, whitespace-stripped (e.g., 'Personal Experience')
+    - None if the tag starts with none of the prefixes
+    """
+    if not isinstance(tag, str):
+        return None
+    for prefix in theme_prefixes.split('|'):
+        if tag.startswith(prefix):
+            # Slice rather than replace(): only the leading prefix goes, later repeats are kept
+            return tag[len(prefix):].strip()
+    return None
+    
--- a/utils/ollama_utils.py
+++ b/utils/ollama_utils.py
@@ -0,0 +1,42 @@
+
+
+
+import requests
+from ollama import Client
+
+
+
+
+def connect_qumo_ollama(vm_name: str ='ollama-lite', port='11434', print_models=True) -> tuple[Client | None, list[str] | None]:
+    """Establish connection to Qumo Ollama instance
+
+    vm_name: str ('ollama-lite' or 'hiperf-gpu')
+        Name of the VM running the Ollama instance; 'localhost' and '0.0.0.0' are
+        used as-is instead of being expanded to a Tailscale hostname
+    port: str
+        Port the Ollama API listens on
+    print_models: bool
+        When True, print the names of the models reported by the server
+
+    Returns:
+        tuple(Client, list[str]): Ollama client connected to the specified VM and the
+        names of its available models, or (None, None) if the server cannot be reached
+    """
+    host = vm_name if vm_name in ['localhost', '0.0.0.0'] else f'{vm_name}.tail44fa00.ts.net'
+    QUMO_OLLAMA_URL = f'http://{host}:{port}'
+
+    try:
+        requests.get(QUMO_OLLAMA_URL, timeout=5)
+    except requests.RequestException:
+        # RequestException also covers timeouts, which ConnectionError alone would let escape
+        print(f"Failed to reach {QUMO_OLLAMA_URL}. Check that the VM is running and Tailscale is up")
+        return None, None
+
+    client = Client(
+        host=QUMO_OLLAMA_URL
+    )
+
+    # The WebUI address is built from the host so that only the port differs from the API URL
+    print(f"Connection succesful. WebUI available at: http://{host}:3000")
+    models = [m.model for m in client.list().models]
+    if print_models:
+        print("Available models:")
+        for m in models:
+            print(f"  - '{m}' ")
+    return client, models
--- a/utils/sentiment_analysis.py
+++ b/utils/sentiment_analysis.py
@@ -0,0 +1,135 @@
+import random
+import pandas as pd
+
+from ollama import Client
+import json
+
+def dummy_sentiment_analysis(content, tag):
+    """Return a placeholder (sentiment, reason) pair for testing without an LLM."""
+    is_theme_tag = tag.startswith(('VT -', 'CT -'))
+    if not is_theme_tag:
+        return 'test', 'not applicable'
+
+    # Random sentiment for testing
+    return random.choice([-1, 0, 1]), 'random dummy sentiment'
+
+
+
+def ollama_sentiment_analysis(content, theme, theme_description, client: Client, model) -> tuple[list[str], int | None, str]:
+    """
+    Perform sentiment analysis using Ollama model.
+
+    Parameters:
+    - content: Text content (quote) to analyze
+    - theme: Theme the sentiment is judged against (e.g., 'Friendliness')
+    - theme_description: Description of the theme, included in the prompt
+    - client: Ollama client used to run the generation
+    - model: Name of the Ollama model to use
+
+    Returns:
+    - (keywords, sentiment, reason), where sentiment is -1, 0 or 1.
+      ([], None, 'parsing error') is returned when every attempt fails.
+    """
+    prompt = f"""
+    # Role
+    You are an expert in sentiment analysis. Your task is to analyze the sentiment of a quote in relation to a specific theme.
+
+    # Input
+    Theme: `{theme}`
+    Theme Description: `{theme_description}`
+    Quote:
+    ```
+    {content}
+    ```
+
+    # Instructions
+    1. Analyze the sentiment of the quote specifically regarding the theme.
+    2. Extract relevant keywords or phrases from the quote. Prioritize specific descriptors found in the text that match or relate to the theme.
+    3. Assign a sentiment score:
+       - -1: Negative (complaint, dissatisfaction, criticism)
+       - 0: Neutral (factual, mixed, or no strong opinion)
+       - 1: Positive (praise, satisfaction, agreement)
+    4. Provide a concise reason (max 10 words).
+
+    # Constraints
+    - Return ONLY a valid JSON object.
+    - Do not use Markdown formatting (no ```json blocks).
+    - Do not write any Python code or explanations outside the JSON.
+    - If the quote is irrelevant to the theme, return sentiment 0.
+
+    # Response Format
+    {{
+        "keywords": ["<list_of_keywords>"],
+        "sentiment": <integer_score>,
+        "reason": "<string_reason>"
+    }}
+
+    # Examples
+
+    Example 1:
+    Theme: `Speed`
+    Quote: `It was a little slow for me.`
+    Response: {{"keywords": ["slow"], "sentiment": -1, "reason": "Dissatisfaction with speed"}}
+
+    Example 2:
+    Theme: `Price`
+    Quote: `It costs $50.`
+    Response: {{"keywords": [], "sentiment": 0, "reason": "Factual statement"}}
+
+    Example 3:
+    Theme: `Friendliness`
+    Quote: `Sound very welcoming.`
+    Response: {{"keywords": ["welcoming"], "sentiment": 1, "reason": "Positive descriptor used"}}
+    """
+
+    max_retries = 3
+    for attempt in range(max_retries):
+        try:
+            resp = client.generate(
+                model=model,
+                prompt=prompt,
+            )
+
+            response_text = resp.response.strip()
+
+            # Extract JSON from response: keep the span from the first '{' to the last '}'
+            start_index = response_text.find('{')
+            end_index = response_text.rfind('}') + 1
+
+            if start_index == -1 or end_index == 0:
+                raise ValueError("No JSON found")
+
+            json_str = response_text[start_index:end_index]
+
+            response_json = json.loads(json_str)
+            keywords = response_json.get('keywords', [])
+            sentiment = response_json.get('sentiment')
+            # Reject a missing or out-of-range score so the attempt is retried instead of
+            # letting a non-numeric placeholder reach the sentiment column
+            if isinstance(sentiment, bool) or sentiment not in (-1, 0, 1):
+                raise ValueError(f"Invalid sentiment: {sentiment!r}")
+            reason = response_json.get('reason', 'no reason provided')
+            return keywords, int(sentiment), reason
+
+        except Exception as e:
+            # Any failure (request, JSON parsing, validation) counts as one used attempt
+            print(f"Attempt {attempt + 1}/{max_retries} failed: {e}")
+
+    return [], None, 'parsing error'
+
+
+if __name__ == "__main__":
+    # Manual smoke test: expects an Ollama server on localhost:11434 serving 'llama3.2:latest'
+
+    client = Client(
+            host="http://localhost:11434"
+        )
+
+    sentiment_df = pd.DataFrame({
+        'content': [
+            "I love this product!",
+            "This is the worst service ever.",
+            "It's okay, not great but not terrible."
+        ],
+        'tag': [
+            'VT - Personal Experience',
+            'VT - Personal Experience',
+            'VT - Personal Experience'
+        ],
+        'theme': [
+            'Personal Experience',
+            'Personal Experience',
+            'Personal Experience'
+        ],
+        'manual_analysis': [False, False, True]
+    })
+
+    # Keyword arguments keep every value in its own parameter; the function returns three
+    # values (keywords, sentiment, reason), so three columns are assigned
+    sentiment_df[['keywords', 'sentiment', 'reason']] = sentiment_df[~sentiment_df['manual_analysis']].apply(
+        lambda row: pd.Series(ollama_sentiment_analysis(
+            content=row['content'],
+            theme=row['theme'],
+            theme_description='How the participant personally experienced the product or service',
+            client=client,
+            model='llama3.2:latest'
+        )),
+        axis=1
+    )
+
+    print(sentiment_df.head())
+
--- a/utils/transcript_utils.py
+++ b/utils/transcript_utils.py
@@ -1,13 +1,6 @@
-"""
-Standard utils for this repository
-"""

-import re
 from pathlib import Path
-
-import requests
-from ollama import Client
-
+import re

 def load_srt(path: str | Path) -> str:
    """Load and parse an SRT file, returning clean transcript with speaker labels.
@@ -58,37 +51,4 @@ def load_srt(path: str | Path) -> str:
    
    # Format as "SPEAKER_XX: text"
    transcript_lines = [f"{speaker}: {utterance}" for speaker, utterance in merged]
-    return '\n\n'.join(transcript_lines)
-
-
-def connect_qumo_ollama(vm_name: str ='ollama-lite', port='11434') -> Client:
-    """Establish connection to Qumo Ollama instance
-    
-    vm_name: str ('ollama-lite' or 'hiperf-gpu')
-        Name of the VM running the Ollama instance
-    
-    Returns:
-        tuple(Client): Ollama client connected to the specified VM
-    """
-    QUMO_OLLAMA_URL = f'http://{vm_name}.tail44fa00.ts.net:{port}'
-
-    if vm_name in ['localhost', '0.0.0.0']:
-        QUMO_OLLAMA_URL = f"http://{vm_name}:{port}"
-  
-    try:
-        requests.get(QUMO_OLLAMA_URL, timeout=5)
-        client = Client(
-            host=QUMO_OLLAMA_URL
-        )
-    
-        print(f"Connection succesful. WebUI available at: {QUMO_OLLAMA_URL.replace(port, '3000')}\nAvailable models:")
-        for m in client.list().models:
-            print(f"  - '{m.model}' ")
-        return client
-    
-    except requests.ConnectionError:
-        pass
-    
-    print(f"Failed to reach {QUMO_OLLAMA_URL}. Check that the VM is running and Tailscale is up")
-    return None
-
+    return '\n\n'.join(transcript_lines)
Author	SHA1	Message	Date
Luigi Maiorano	ccc5154b93	llm processing of sentiment	2025-12-12 14:28:51 +01:00
Luigi Maiorano	e576f98cce	basic parsing working	2025-12-11 12:56:23 +01:00