Compare commits

...

2 Commits

Author SHA1 Message Date
ccc5154b93 llm processing of sentiment 2025-12-12 14:28:51 +01:00
e576f98cce basic parsing working 2025-12-11 12:56:23 +01:00
11 changed files with 479 additions and 199 deletions

16
.vscode/launch.json vendored Normal file
View File

@@ -0,0 +1,16 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python Debugger: Current File",
"type": "debugpy",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal"
}
]
}

View File

@@ -70,13 +70,13 @@ def csv_to_markdown(df):
return "\n\n".join(lines)
@app.cell
@app.cell(hide_code=True)
def _(file_dropdown, mo, pd):
# Preview
preview = mo.md("")
if file_dropdown.value:
df = pd.read_csv(file_dropdown.value)
md_content = csv_to_markdown(df)
md_content = csv_to_markdown(df.head(10))
preview = mo.md(md_content)
preview

View File

@@ -16,28 +16,49 @@ def _():
OLLAMA_LOCATION= 'localhost'
# VM_NAME = 'ollama-lite'
client = connect_qumo_ollama(OLLAMA_LOCATION)
client, _models = connect_qumo_ollama(OLLAMA_LOCATION, print_models=False)
TAGUETTE_EXPORT_DIR = Path('./data/transcripts/taguette_results')
TAGUETTE_EXPORT_DIR = Path('./data/processing/02_taguette_export')
WORKING_DIR = Path('./data/processing/02_taguette_postprocess')
if not WORKING_DIR.exists():
WORKING_DIR.mkdir(parents=True)
if not TAGUETTE_EXPORT_DIR.exists():
TAGUETTE_EXPORT_DIR.mkdir(parents=True)
return TAGUETTE_EXPORT_DIR, WORKING_DIR, datetime, mo, pd
model_select = mo.ui.dropdown(
options=_models,
value=_models[0],
label="Select Ollama Model to use",
searchable=True,
)
model_select
return (
TAGUETTE_EXPORT_DIR,
WORKING_DIR,
client,
datetime,
mo,
model_select,
pd,
)
@app.cell(hide_code=True)
def _(TAGUETTE_EXPORT_DIR, mo):
mo.md(rf"""
# Step 1: Export All Highlights out of Taguette
# Step 1: Export Data out of Taguette
1. Go to: http://taguette.tail44fa00.ts.net/project/1
2. Select 'Highlights' on left
3. Select 'See all hightlights'
4. Top right 'Export this view' > 'CSV'
5. Save to '{TAGUETTE_EXPORT_DIR}/all_tags.csv'
**Highlights**
1. Go to: https://taguette.qumo.io/project/1
2. Select 'Highlights' (left side) > 'See all highlights' > 'Export this view' (top right) > 'CSV'
3. Save to '{TAGUETTE_EXPORT_DIR}/all_tags.csv'
**Tags Codebook**
1. Select 'Project Info' (left side) > 'Export codebook' > 'CSV'
2. Save to '{TAGUETTE_EXPORT_DIR}/codebook.csv'
_NOTE: Sometimes you need to explicitly allow 'Unsafe Download' in the browser's download manager_
""")
return
@@ -51,13 +72,21 @@ def _(mo):
@app.cell
def _(pd):
all_tags_df = pd.read_csv('data/transcripts/taguette_results/all_tags.csv')
def _(TAGUETTE_EXPORT_DIR, pd):
all_tags_df = pd.read_csv(f'{TAGUETTE_EXPORT_DIR}/all_tags.csv')
all_tags_df['_seq_id'] = range(len(all_tags_df))
all_tags_df.head(20)
all_tags_df
return (all_tags_df,)
@app.cell
def _(TAGUETTE_EXPORT_DIR, pd):
    # Load the codebook CSV from the Taguette export directory and expose its
    # 'description' column under the name 'theme_description'.
    _codebook_path = f'{TAGUETTE_EXPORT_DIR}/codebook.csv'
    codebook_df = pd.read_csv(_codebook_path)
    codebook_df = codebook_df.rename(columns={'description': 'theme_description'})

    # Final expression: displayed as the cell output
    codebook_df
    return (codebook_df,)
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
@@ -89,7 +118,7 @@ def _(all_tags_df, interview_select, mo):
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
### Add `_context` column to track Voice / Character is being referred to per highlight
## Add `_context` column to track which Voice / Character is being referred to per highlight
Create a new column 'context', which is defined by the last '_V-' or '_C-' tag seen in the 'tags' column', when moving row by row from top to bottom.
1. Iterates through the dataframe in document order (row by row)
@@ -102,12 +131,12 @@ def _(mo):
Example of challenging case:
| id | document | tag | content | _seq_id | _context |
|-----|-------------|------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------|----------------------|
| 88 | P2 - Done | _V-54 | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 117 | _V-54, _V-41 |
| 88 | P2 - Done | _V-41 | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 118 | _V-54, _V-41 |
| 88 | P2 - Done | VT - Human / Artificial | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 119 | _V-54, _V-41 |
| 88 | P2 - Done | VT - Friendliness / Empathy | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 120 | _V-54, _V-41 |
| tag | content | _seq_id | _context |
|------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------|----------------------|
| _V-54 | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 117 | _V-54, _V-41 |
| _V-41 | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 118 | _V-54, _V-41 |
| VT - Human / Artificial | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 119 | _V-54, _V-41 |
| VT - Friendliness / Empathy | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 120 | _V-54, _V-41 |
""")
return
@@ -155,7 +184,7 @@ def _(df):
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
## Resolve multi-context rows (only VT- and CT- theme tags)
## Split multi-context rows (only VT- and CT- theme tags)
For rows that have multiple contexts (e.g., both _V-54 and _V-41)
- split these into separate rows for each context.
@@ -165,7 +194,7 @@ def _(mo):
@app.cell
def _(df, mo, pd):
def _(df, pd):
# Expand rows that contain multiple contexts (comma-separated)
expanded_rows = []
@@ -201,71 +230,32 @@ def _(df, mo, pd):
expanded_df_raw['tag'].str.startswith(('VT -', 'CT -'), na=False)
].copy()
manual_rows = sentiment_df[sentiment_df['manual_analysis']]
split_rows_editor = None
print(f"{len(sentiment_df[sentiment_df['manual_analysis']])} Rows with multiple contexts")
if not manual_rows.empty:
print(
f"⚠️ {len(manual_rows)} rows were created from multi-context splits. "
"See next cell for manual review."
)
# Filter for rows that need review. Manual analysis and the tag starts with 'VT -' or 'CT -'
rows_to_edit = sentiment_df[
(sentiment_df['manual_analysis'])
]
# Create data editor for split rows
split_rows_editor = mo.ui.data_editor(
rows_to_edit
).form(label="Update Sentiment / Manual Flag")
else:
print("✓ No multi-context rows found")
return rows_to_edit, sentiment_df, split_rows_editor
@app.cell(hide_code=True)
def _(mo, rows_to_edit, split_rows_editor):
mo.vstack([
mo.md(f"""
### ⚠️ Manual Review Required
**{len(rows_to_edit)} rows** were split from multi-context entries.
Please review them below:
1. Update the `sentiment` column (-1, 0, 1) for each row based on the specific context.
2. Click **Submit** to apply changes.
"""),
split_rows_editor
])
return
@app.cell
def _(mo, split_rows_editor):
# Capture the edited manual-analysis rows for validation
mo.stop(split_rows_editor.value is None, mo.md("Submit your sentiment analysis changes before continuing."))
reviewed_manual_rows = split_rows_editor.value
# Ensure all manual-analysis rows include a sentiment of -1, 0, or 1
if not reviewed_manual_rows.empty:
valid_sentiments = {-1, 0, 1}
needs_review = reviewed_manual_rows[
reviewed_manual_rows['manual_analysis']
& ~reviewed_manual_rows['sentiment'].isin(valid_sentiments)
]
assert needs_review.empty, f"{len(needs_review)} manual-analysis rows missing sentiment -1/0/1"
print("Verification: ✓ All Manual-analysis rows have valid sentiment values")
return (reviewed_manual_rows,)
sentiment_df[sentiment_df['manual_analysis']]
return (sentiment_df,)
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
# Highlight Sentiment Analysis
## Create 'theme' column
""")
return
@app.cell
def _(sentiment_df):
    from utils import extract_theme

    def _theme_of(row):
        # Theme name derived from this row's tag
        return extract_theme(row['tag'])

    # Adds the 'theme' column to the shared dataframe in place
    sentiment_df['theme'] = sentiment_df.apply(_theme_of, axis=1)

    sentiment_df
    return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
# Extract Sentiment + Reasoning
For each row in the dataframe, analyze the sentiment of the 'content' regarding the respective tag. This should be done for all 'VT -' and 'CT -' tags, since these represent the 'VoiceThemes' and 'CharacterThemes' respectively. The results should be stored in a new 'sentiment' column.
@@ -278,24 +268,134 @@ def _(mo):
@app.cell
def _(sentiment_df):
# for now, create an empty sentiment column with randomized dummy values for testing
# only for 'VT -' and 'CT -' tags
import random
def _(mo):
start_processing_btn = mo.ui.button(
label="Start Sentiment Extraction",
kind="warn",
on_click=lambda val: True
)
start_processing_btn
return (start_processing_btn,)
def dummy_sentiment_analysis(content, tag):
if tag.startswith('VT -') or tag.startswith('CT -'):
return random.choice([-1, 0, 1]) # Random sentiment for testing
return None
# Only run on rows without manual_analysis
@app.cell
def _(
client,
codebook_df,
mo,
model_select,
pd,
sentiment_df,
start_processing_btn,
):
from utils import dummy_sentiment_analysis, ollama_sentiment_analysis
sentiment_df['sentiment'] = sentiment_df[~sentiment_df['manual_analysis']].apply(lambda row: dummy_sentiment_analysis(row['content'], row['tag']), axis=1)
# add theme_description to be used in LLM prompt
_df = sentiment_df.merge(codebook_df, on='tag', how='left', suffixes=('', '_codebook'))
sentiment_df[~sentiment_df['manual_analysis']]
# Wait for start processing button
mo.stop(not start_processing_btn.value, "Click button above to start processing")
sentiment_df[['keywords', 'sentiment', 'reason']] = _df[~_df['manual_analysis']].apply(
lambda row: pd.Series(ollama_sentiment_analysis(
content=row['content'],
theme=row['theme'],
theme_description=row['theme_description'],
client=client,
model=model_select.value
)),
axis=1
)
return
@app.cell
def _(mo, sentiment_df):
    # Halt this cell until a 'sentiment' column exists on the dataframe
    _sentiment_missing = 'sentiment' not in sentiment_df.columns
    mo.stop(_sentiment_missing, "Run above cells to extract sentiment analysis")

    # Preview the result columns for rows not flagged for manual analysis
    _auto_rows = ~sentiment_df['manual_analysis']
    _preview_columns = ['theme', 'content', 'sentiment', 'reason', 'keywords']
    sentiment_df.loc[_auto_rows, _preview_columns]
    return
@app.cell(hide_code=True)
def _(mo):
    # Section heading for the multi-context review cells
    mo.md(r"""
    ## Multi-context tags
    """)
    return
@app.cell
def _(mo, sentiment_df):
    # Rows flagged for manual analysis (created by the multi-context split)
    manual_rows = sentiment_df[sentiment_df['manual_analysis']]

    if manual_rows.empty:
        # Nothing to review: no editor is created and the row list stays empty
        split_rows_editor = None
        rows_to_edit = []
        print("✓ No multi-context rows found")
    else:
        print(
            f"⚠️ {len(manual_rows)} rows were created from multi-context splits. "
            "See next cell for manual review."
        )

        # Rows that need review: everything flagged for manual analysis
        _needs_manual = sentiment_df['manual_analysis']
        rows_to_edit = sentiment_df[_needs_manual]

        # Editable table wrapped in a form, so edits are only applied on submit
        _editor = mo.ui.data_editor(rows_to_edit)
        split_rows_editor = _editor.form(label="Update Sentiment / Manual Flag")

    return rows_to_edit, split_rows_editor
@app.cell
def _(split_rows_editor):
    # Emit the editor form as this cell's output; it is None when no
    # multi-context rows were found
    split_rows_editor
    return
@app.cell(hide_code=True)
def _(mo, rows_to_edit, split_rows_editor):
    # marimo displays only the value of a cell's final top-level expression.
    # An expression nested inside an `if` body is never shown, so the panel is
    # built first and then emitted as the last statement of the cell.
    _review_panel = None
    if split_rows_editor is not None:
        _review_panel = mo.vstack([
            mo.md(f"""
    ### ⚠️ Manual Review Required
    **{len(rows_to_edit)} rows** were split from multi-context entries.
    Please review them below:
    1. Update the `sentiment` column (-1, 0, 1) for each row based on the specific context.
    2. Click **Submit** to apply changes.
    """),
            split_rows_editor
        ])

    # None (no multi-context rows) renders as an empty output
    _review_panel
    return
@app.cell
def _(mo, split_rows_editor):
    # Capture the edited manual-analysis rows for validation.
    # '' is the sentinel for "no editor exists" (split_rows_editor is None);
    # None means the form exists but has not been submitted yet.
    reviewed_manual_rows = getattr(split_rows_editor, 'value', '')

    mo.stop(reviewed_manual_rows is None, mo.md("Submit your sentiment analysis changes before continuing."))

    # Test for the sentinel by type. Comparing a DataFrame with `!= ''` is
    # element-wise, and using that result in `and` raises
    # "The truth value of a DataFrame is ambiguous".
    _has_reviewed_rows = (
        not isinstance(reviewed_manual_rows, str)
        and not reviewed_manual_rows.empty
    )

    # Ensure all manual-analysis rows include a sentiment of -1, 0, or 1
    if _has_reviewed_rows:
        valid_sentiments = {-1, 0, 1}
        needs_review = reviewed_manual_rows[
            reviewed_manual_rows['manual_analysis']
            & ~reviewed_manual_rows['sentiment'].isin(valid_sentiments)
        ]
        assert needs_review.empty, f"{len(needs_review)} manual-analysis rows missing sentiment -1/0/1"

        print("Verification: ✓ All Manual-analysis rows have valid sentiment values")

    return (reviewed_manual_rows,)
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
@@ -307,7 +407,10 @@ def _(mo):
@app.cell
def _(pd, reviewed_manual_rows, sentiment_df):
_static_analysis_rows = sentiment_df[~sentiment_df['manual_analysis']]
recombined_df = pd.concat([_static_analysis_rows, reviewed_manual_rows]).sort_values(by='_seq_id').reset_index(drop=True)
if isinstance(reviewed_manual_rows, pd.DataFrame):
recombined_df = pd.concat([_static_analysis_rows, reviewed_manual_rows]).sort_values(by='_seq_id').reset_index(drop=True)
else:
recombined_df = sentiment_df
recombined_df
return (recombined_df,)
@@ -348,7 +451,7 @@ def _(mo):
def _(WORKING_DIR, datetime, interview_select, recombined_df):
# Save to CSV in working dir
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
filename = WORKING_DIR / f"{interview_select.value.split(' ')[0]}_sentiments_{timestamp}.csv"
filename = WORKING_DIR / f"{interview_select.value.split(' ')[0]}_sentiments.csv"
recombined_df.to_csv(filename, index=False)
print(f"✓ Saved processed data to '{filename}'")

View File

@@ -9,14 +9,14 @@ def _():
import marimo as mo
import pandas as pd
from pathlib import Path
from utils import create_sentiment_matrix
INPUT_DIR = Path("./data/processing/02_taguette_postprocess")
WORKING_DIR = Path('./data/processing/03_sentiment_analysis')
if not WORKING_DIR.exists():
WORKING_DIR.mkdir(parents=True)
return INPUT_DIR, Path, WORKING_DIR, mo, pd
return INPUT_DIR, Path, WORKING_DIR, create_sentiment_matrix, mo, pd
@app.cell(hide_code=True)
@@ -62,55 +62,6 @@ def _(mo):
return
@app.cell
def _(document_name, pd):
    import numpy as np

    def create_sentiment_matrix(doc_df, column_prefix='VT - |CT - ', row_prefix='_V-|_C-'):
        """
        Build a context-by-theme matrix of summed sentiment scores.

        Parameters:
        - doc_df: DataFrame with at least the columns 'tag', '_context' and 'sentiment'
        - column_prefix: regex matched against 'tag' to select theme rows
        - row_prefix: regex matched against '_context' to select Voice/Character rows

        Returns:
        - DataFrame indexed by '_context' with one column per 'tag';
          an empty DataFrame when no row passes both filters
        """
        # Step 1: keep rows whose tag matches the theme pattern
        is_theme_tag = doc_df['tag'].str.contains(column_prefix, na=False)
        theme_rows = doc_df[is_theme_tag].copy()
        if theme_rows.empty:
            print(f"No sentiment data found for document: {document_name}")
            return pd.DataFrame()

        # Step 2: keep rows that carry a Voice/Character context
        context = theme_rows['_context']
        has_context = context.notna() & context.str.contains(row_prefix, na=False)
        scored_rows = theme_rows[has_context].copy()
        if scored_rows.empty:
            print(f"No Voice/Character context found for document: {document_name}")
            return pd.DataFrame()

        # Step 3: sum sentiment per (context, theme) pair and spread themes into columns
        totals = scored_rows.groupby(['_context', 'tag'])['sentiment'].sum().reset_index()
        return totals.pivot(index='_context', columns='tag', values='sentiment')

    return (create_sentiment_matrix,)
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""

View File

@@ -32,7 +32,7 @@ def _(INPUT_DIR, mo):
file_options = {f.stem: str(f) for f in voice_csv_files}
voice_multiselect = mo.ui.multiselect(options=file_options, label="Select Voice CSV Files for Aggregation")
voice_multiselect
return (voice_multiselect,)

View File

@@ -17,18 +17,22 @@ services:
# c) Explicitly override: docker compose run --gpus all ollama
# 3. If your Docker/Compose version does NOT honor the reservation below, uncomment the
# 'devices' section further down as a fallback (less portable).
# deploy:
# resources:
# reservations:
# devices:
# - driver: nvidia
# count: all
# capabilities: [gpu]
# environment:
## UNCOMMENT THE FOLLOWING BLOCK FOR NVIDIA GPU SUPPORT ###
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities: [gpu]
environment:
# Visible devices / capabilities for the NVIDIA container runtime
# - NVIDIA_VISIBLE_DEVICES=all
# - NVIDIA_DRIVER_CAPABILITIES=compute,utility
- NVIDIA_VISIBLE_DEVICES=all
- NVIDIA_DRIVER_CAPABILITIES=compute,utility
## ---------- END GPU SUPPORT BLOCK ------------###
# Fallback (UNCOMMENT ONLY if the reservation above is ignored and you still get errors):
# devices:

4
utils/__init__.py Normal file
View File

@@ -0,0 +1,4 @@
from .ollama_utils import connect_qumo_ollama
from .data_utils import create_sentiment_matrix, extract_theme
from .transcript_utils import load_srt
from .sentiment_analysis import dummy_sentiment_analysis, ollama_sentiment_analysis

65
utils/data_utils.py Normal file
View File

@@ -0,0 +1,65 @@
import pandas as pd
def create_sentiment_matrix(doc_df, column_prefix='VT - |CT - ', row_prefix='_V-|_C-'):
    """
    Build a context-by-theme matrix of summed sentiment scores.

    Parameters:
    - doc_df: DataFrame with at least the columns 'tag', '_context' and 'sentiment'
    - column_prefix: regex matched against 'tag' to select theme rows
    - row_prefix: regex matched against '_context' to select Voice/Character rows

    Returns:
    - DataFrame indexed by '_context' with one column per 'tag';
      an empty DataFrame when no row passes both filters
    """
    # Step 1: keep rows whose tag matches the theme pattern
    is_theme_tag = doc_df['tag'].str.contains(column_prefix, na=False)
    theme_rows = doc_df[is_theme_tag].copy()
    if theme_rows.empty:
        print("No sentiment data found")
        return pd.DataFrame()

    # Step 2: keep rows that carry a Voice/Character context
    context = theme_rows['_context']
    has_context = context.notna() & context.str.contains(row_prefix, na=False)
    scored_rows = theme_rows[has_context].copy()
    if scored_rows.empty:
        print("No Voice/Character context found")
        return pd.DataFrame()

    # Step 3: sum sentiment per (context, theme) pair and spread themes into columns
    totals = scored_rows.groupby(['_context', 'tag'])['sentiment'].sum().reset_index()
    return totals.pivot(index='_context', columns='tag', values='sentiment')
def extract_theme(tag: str, theme_prefixes='VT - |CT - ') -> "str | None":
    """
    Extract the theme from a tag string.

    Parameters:
    - tag: the tag string (e.g., 'VT - Personal Experience'). Non-string
      values (such as NaN from an empty CSV field) yield None.
    - theme_prefixes: '|'-separated prefixes to strip (e.g., 'VT - |CT - ')

    Returns:
    - the theme with the leading prefix removed and surrounding whitespace
      stripped (e.g., 'Personal Experience')
    - None if the tag does not start with any of the prefixes
    """
    if not isinstance(tag, str):
        return None

    for prefix in theme_prefixes.split('|'):
        if tag.startswith(prefix):
            # Slice off the leading prefix only; str.replace would also delete
            # any later occurrence of the prefix inside the theme text.
            return tag[len(prefix):].strip()
    return None

42
utils/ollama_utils.py Normal file
View File

@@ -0,0 +1,42 @@
import requests
from ollama import Client
def connect_qumo_ollama(vm_name: str = 'ollama-lite', port='11434', print_models=True) -> "tuple[Client | None, list[str] | None]":
    """Establish connection to Qumo Ollama instance

    Parameters:
    - vm_name: name of the VM running the Ollama instance ('ollama-lite' or
      'hiperf-gpu'), or 'localhost' / '0.0.0.0' for a local instance
    - port: port of the Ollama API (str or int)
    - print_models: when True, print the names of the available models

    Returns:
    - (client, models): the connected Ollama client and the list of model
      names it reports; (None, None) when the instance cannot be reached
    """
    port = str(port)  # accept an int port as well
    if vm_name in ('localhost', '0.0.0.0'):
        host = vm_name
    else:
        host = f'{vm_name}.tail44fa00.ts.net'
    ollama_url = f'http://{host}:{port}'

    # Probe the endpoint first. RequestException also covers read timeouts,
    # which are not subclasses of ConnectionError.
    try:
        requests.get(ollama_url, timeout=5)
    except requests.RequestException:
        print(f"Failed to reach {ollama_url}. Check that the VM is running and Tailscale is up")
        return None, None

    client = Client(host=ollama_url)

    # Build the WebUI address from the host; a textual replace of the port
    # could also rewrite matching digits elsewhere in the URL.
    print(f"Connection successful. WebUI available at: http://{host}:3000")

    models = [m.model for m in client.list().models]
    if print_models:
        print("Available models:")
        for m in models:
            print(f" - '{m}' ")
    return client, models

135
utils/sentiment_analysis.py Normal file
View File

@@ -0,0 +1,135 @@
import random
import pandas as pd
from ollama import Client
import json
def dummy_sentiment_analysis(content, tag):
    """
    Stand-in for real sentiment analysis, for testing.

    Parameters:
    - content: quote text (not used)
    - tag: tag string of the row

    Returns:
    - (score, reason): a random score from -1/0/1 for 'VT -' / 'CT -' tags,
      otherwise the fixed pair ('test', 'not applicable')
    """
    is_theme_tag = tag.startswith(('VT -', 'CT -'))
    if not is_theme_tag:
        return 'test', 'not applicable'

    # Random sentiment for testing
    score = random.choice([-1, 0, 1])
    return score, 'random dummy sentiment'
def _build_sentiment_prompt(content, theme, theme_description) -> str:
    """Render the LLM instruction prompt for one quote / theme pair."""
    return f"""
# Role
You are an expert in sentiment analysis. Your task is to analyze the sentiment of a quote in relation to a specific theme.
# Input
Theme: `{theme}`
Theme Description: `{theme_description}`
Quote:
```
{content}
```
# Instructions
1. Analyze the sentiment of the quote specifically regarding the theme.
2. Extract relevant keywords or phrases from the quote. Prioritize specific descriptors found in the text that match or relate to the theme.
3. Assign a sentiment score:
- -1: Negative (complaint, dissatisfaction, criticism)
- 0: Neutral (factual, mixed, or no strong opinion)
- 1: Positive (praise, satisfaction, agreement)
4. Provide a concise reason (max 10 words).
# Constraints
- Return ONLY a valid JSON object.
- Do not use Markdown formatting (no ```json blocks).
- Do not write any Python code or explanations outside the JSON.
- If the quote is irrelevant to the theme, return sentiment 0.
# Response Format
{{
"keywords": ["<list_of_keywords>"],
"sentiment": <integer_score>,
"reason": "<string_reason>"
}}
# Examples
Example 1:
Theme: `Speed`
Quote: `It was a little slow for me.`
Response: {{"keywords": ["slow"], "sentiment": -1, "reason": "Dissatisfaction with speed"}}
Example 2:
Theme: `Price`
Quote: `It costs $50.`
Response: {{"keywords": [], "sentiment": 0, "reason": "Factual statement"}}
Example 3:
Theme: `Friendliness`
Quote: `Sound very welcoming.`
Response: {{"keywords": ["welcoming"], "sentiment": 1, "reason": "Positive descriptor used"}}
"""


def _parse_sentiment_response(response_text):
    """Extract (keywords, sentiment, reason) from raw model output; raise ValueError if unusable."""
    response_text = response_text.strip()

    # The model may wrap the JSON in extra text: take the outermost {...} span
    start_index = response_text.find('{')
    end_index = response_text.rfind('}') + 1
    if start_index == -1 or end_index <= start_index:
        raise ValueError("No JSON found")

    payload = json.loads(response_text[start_index:end_index])
    if not isinstance(payload, dict):
        raise ValueError("JSON response is not an object")

    # A missing or out-of-range score is an invalid answer and triggers a
    # retry, rather than leaking a placeholder into the sentiment column.
    sentiment = payload.get('sentiment')
    if isinstance(sentiment, str):
        sentiment = int(sentiment.strip())  # raises ValueError on non-numeric text
    if isinstance(sentiment, bool) or sentiment not in (-1, 0, 1):
        raise ValueError(f"Invalid sentiment value: {sentiment!r}")

    keywords = payload.get('keywords', [])
    reason = payload.get('reason', 'no reason provided')
    return keywords, int(sentiment), reason


def ollama_sentiment_analysis(content, theme, theme_description, client: "Client", model) -> "tuple[list[str], int | None, str]":
    """
    Perform sentiment analysis of a quote regarding a theme using an Ollama model.

    Parameters:
    - content: Text content (quote) to analyze
    - theme: Name of the theme the sentiment refers to
    - theme_description: Description of the theme, included in the prompt
    - client: Ollama client used to generate the response
    - model: Name of the model to use

    Returns:
    - (keywords, sentiment, reason): sentiment is -1, 0 or 1.
      ([], None, 'parsing error') when no valid answer was obtained after
      all retries.
    """
    prompt = _build_sentiment_prompt(content, theme, theme_description)

    max_retries = 3
    for attempt in range(1, max_retries + 1):
        try:
            resp = client.generate(
                model=model,
                prompt=prompt,
            )
            return _parse_sentiment_response(resp.response)
        except Exception as e:
            # Retry boundary: transport errors and unusable answers are both
            # reported and retried, so one bad row does not abort a whole run.
            print(f"Attempt {attempt}/{max_retries} failed: {e}")

    return [], None, 'parsing error'
if __name__ == "__main__":
    # Manual smoke test against a local Ollama instance
    client = Client(
        host="http://localhost:11434"
    )

    sentiment_df = pd.DataFrame({
        'content': [
            "I love this product!",
            "This is the worst service ever.",
            "It's okay, not great but not terrible."
        ],
        'tag': [
            'VT - Personal Experience',
            'VT - Personal Experience',
            'VT - Personal Experience'
        ],
        'manual_analysis': [False, False, True]
    })

    # ollama_sentiment_analysis returns three values, so three result columns
    # are needed. Arguments are passed by keyword so that `client` cannot end
    # up in the `theme_description` slot.
    result_columns = ['keywords', 'sentiment', 'reason']
    sentiment_df[result_columns] = sentiment_df[~sentiment_df['manual_analysis']].apply(
        lambda row: pd.Series(
            ollama_sentiment_analysis(
                content=row['content'],
                theme=row['tag'],
                theme_description='Personal experience with the product or service',
                client=client,
                model='llama3.2:latest',
            ),
            index=result_columns,
        ),
        axis=1
    )

    print(sentiment_df.head())

View File

@@ -1,13 +1,6 @@
"""
Standard utils for this repository
"""
import re
from pathlib import Path
import requests
from ollama import Client
import re
def load_srt(path: str | Path) -> str:
"""Load and parse an SRT file, returning clean transcript with speaker labels.
@@ -58,37 +51,4 @@ def load_srt(path: str | Path) -> str:
# Format as "SPEAKER_XX: text"
transcript_lines = [f"{speaker}: {utterance}" for speaker, utterance in merged]
return '\n\n'.join(transcript_lines)
def connect_qumo_ollama(vm_name: str = 'ollama-lite', port='11434') -> "Client | None":
    """Establish connection to Qumo Ollama instance

    Parameters:
    - vm_name: name of the VM running the Ollama instance ('ollama-lite' or
      'hiperf-gpu'), or 'localhost' / '0.0.0.0' for a local instance
    - port: port of the Ollama API (str or int)

    Returns:
    - Ollama client connected to the specified VM, or None when the instance
      cannot be reached
    """
    port = str(port)  # accept an int port as well
    if vm_name in ('localhost', '0.0.0.0'):
        host = vm_name
    else:
        host = f'{vm_name}.tail44fa00.ts.net'
    ollama_url = f'http://{host}:{port}'

    # Probe the endpoint first. RequestException also covers read timeouts,
    # which are not subclasses of ConnectionError.
    try:
        requests.get(ollama_url, timeout=5)
    except requests.RequestException:
        print(f"Failed to reach {ollama_url}. Check that the VM is running and Tailscale is up")
        return None

    client = Client(host=ollama_url)

    # Build the WebUI address from the host; a textual replace of the port
    # could also rewrite matching digits elsewhere in the URL.
    print(f"Connection successful. WebUI available at: http://{host}:3000\nAvailable models:")
    for m in client.list().models:
        print(f" - '{m.model}' ")
    return client
return '\n\n'.join(transcript_lines)