basic parsing working

This commit is contained in:
2025-12-11 12:56:23 +01:00
parent b023d44934
commit e576f98cce
10 changed files with 411 additions and 183 deletions

16
.vscode/launch.json vendored Normal file
View File

@@ -0,0 +1,16 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python Debugger: Current File",
"type": "debugpy",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal"
}
]
}

View File

@@ -16,7 +16,7 @@ def _():
OLLAMA_LOCATION= 'localhost' OLLAMA_LOCATION= 'localhost'
# VM_NAME = 'ollama-lite' # VM_NAME = 'ollama-lite'
client = connect_qumo_ollama(OLLAMA_LOCATION) client, _models = connect_qumo_ollama(OLLAMA_LOCATION, print_models=False)
TAGUETTE_EXPORT_DIR = Path('./data/transcripts/taguette_results') TAGUETTE_EXPORT_DIR = Path('./data/transcripts/taguette_results')
WORKING_DIR = Path('./data/processing/02_taguette_postprocess') WORKING_DIR = Path('./data/processing/02_taguette_postprocess')
@@ -25,7 +25,23 @@ def _():
WORKING_DIR.mkdir(parents=True) WORKING_DIR.mkdir(parents=True)
if not TAGUETTE_EXPORT_DIR.exists(): if not TAGUETTE_EXPORT_DIR.exists():
TAGUETTE_EXPORT_DIR.mkdir(parents=True) TAGUETTE_EXPORT_DIR.mkdir(parents=True)
return TAGUETTE_EXPORT_DIR, WORKING_DIR, datetime, mo, pd
model_select = mo.ui.dropdown(
options=_models,
value=_models[0],
label="Select Ollama Model to use",
searchable=True,
)
model_select
return (
TAGUETTE_EXPORT_DIR,
WORKING_DIR,
client,
datetime,
mo,
model_select,
pd,
)
@app.cell(hide_code=True) @app.cell(hide_code=True)
@@ -89,7 +105,7 @@ def _(all_tags_df, interview_select, mo):
@app.cell(hide_code=True) @app.cell(hide_code=True)
def _(mo): def _(mo):
mo.md(r""" mo.md(r"""
### Add `_context` column to track Voice / Character is being referred to per highlight ## Add `_context` column to track Voice / Character is being referred to per highlight
Create a new column 'context', which is defined by the last '_V-' or '_C-' tag seen in the 'tags' column', when moving row by row from top to bottom. Create a new column 'context', which is defined by the last '_V-' or '_C-' tag seen in the 'tags' column', when moving row by row from top to bottom.
1. Iterates through the dataframe in document order (row by row) 1. Iterates through the dataframe in document order (row by row)
@@ -102,12 +118,12 @@ def _(mo):
Example of challenging case: Example of challenging case:
| id | document | tag | content | _seq_id | _context | | tag | content | _seq_id | _context |
|-----|-------------|------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------|----------------------| |------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------|----------------------|
| 88 | P2 - Done | _V-54 | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 117 | _V-54, _V-41 | | _V-54 | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 117 | _V-54, _V-41 |
| 88 | P2 - Done | _V-41 | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 118 | _V-54, _V-41 | | _V-41 | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 118 | _V-54, _V-41 |
| 88 | P2 - Done | VT - Human / Artificial | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 119 | _V-54, _V-41 | | VT - Human / Artificial | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 119 | _V-54, _V-41 |
| 88 | P2 - Done | VT - Friendliness / Empathy | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 120 | _V-54, _V-41 | | VT - Friendliness / Empathy | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 120 | _V-54, _V-41 |
""") """)
return return
@@ -155,7 +171,7 @@ def _(df):
@app.cell(hide_code=True) @app.cell(hide_code=True)
def _(mo): def _(mo):
mo.md(r""" mo.md(r"""
## Resolve multi-context rows (only VT- and CT- theme tags) ## Split multi-context rows (only VT- and CT- theme tags)
For rows that have multiple contexts (e.g., both _V-54 and _V-41) For rows that have multiple contexts (e.g., both _V-54 and _V-41)
- split these into separate rows for each context. - split these into separate rows for each context.
@@ -165,7 +181,7 @@ def _(mo):
@app.cell @app.cell
def _(df, mo, pd): def _(df, pd):
# Expand rows that contain multiple contexts (comma-separated) # Expand rows that contain multiple contexts (comma-separated)
expanded_rows = [] expanded_rows = []
@@ -201,71 +217,32 @@ def _(df, mo, pd):
expanded_df_raw['tag'].str.startswith(('VT -', 'CT -'), na=False) expanded_df_raw['tag'].str.startswith(('VT -', 'CT -'), na=False)
].copy() ].copy()
manual_rows = sentiment_df[sentiment_df['manual_analysis']] print(f"{len(sentiment_df[sentiment_df['manual_analysis']])} Rows with multiple contexts")
split_rows_editor = None
sentiment_df[sentiment_df['manual_analysis']]
if not manual_rows.empty: return (sentiment_df,)
print(
f"⚠️ {len(manual_rows)} rows were created from multi-context splits. "
"See next cell for manual review."
)
# Filter for rows that need review. Manual analysis and the tag starts with 'VT -' or 'CT -'
rows_to_edit = sentiment_df[
(sentiment_df['manual_analysis'])
]
# Create data editor for split rows
split_rows_editor = mo.ui.data_editor(
rows_to_edit
).form(label="Update Sentiment / Manual Flag")
else:
print("✓ No multi-context rows found")
return rows_to_edit, sentiment_df, split_rows_editor
@app.cell(hide_code=True)
def _(mo, rows_to_edit, split_rows_editor):
mo.vstack([
mo.md(f"""
### ⚠️ Manual Review Required
**{len(rows_to_edit)} rows** were split from multi-context entries.
Please review them below:
1. Update the `sentiment` column (-1, 0, 1) for each row based on the specific context.
2. Click **Submit** to apply changes.
"""),
split_rows_editor
])
return
@app.cell
def _(mo, split_rows_editor):
# Capture the edited manual-analysis rows for validation
mo.stop(split_rows_editor.value is None, mo.md("Submit your sentiment analysis changes before continuing."))
reviewed_manual_rows = split_rows_editor.value
# Ensure all manual-analysis rows include a sentiment of -1, 0, or 1
if not reviewed_manual_rows.empty:
valid_sentiments = {-1, 0, 1}
needs_review = reviewed_manual_rows[
reviewed_manual_rows['manual_analysis']
& ~reviewed_manual_rows['sentiment'].isin(valid_sentiments)
]
assert needs_review.empty, f"{len(needs_review)} manual-analysis rows missing sentiment -1/0/1"
print("Verification: ✓ All Manual-analysis rows have valid sentiment values")
return (reviewed_manual_rows,)
@app.cell(hide_code=True) @app.cell(hide_code=True)
def _(mo): def _(mo):
mo.md(r""" mo.md(r"""
# Highlight Sentiment Analysis ## Create 'theme' column
""")
return
@app.cell
def _(sentiment_df):
from utils import extract_theme
sentiment_df['theme'] = sentiment_df.apply(lambda row: extract_theme(row['tag']), axis=1)
sentiment_df
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
# Extract Sentiment + Reasoning
For each row in the dataframe, analyze the sentiment of the 'content' regarding the respective tag. This should be done for all 'VT -' and 'CT -' tags, since these represent the 'VoiceThemes' and 'CharacterThemes' respectively. The results should be stored in a new 'sentiment' column. For each row in the dataframe, analyze the sentiment of the 'content' regarding the respective tag. This should be done for all 'VT -' and 'CT -' tags, since these represent the 'VoiceThemes' and 'CharacterThemes' respectively. The results should be stored in a new 'sentiment' column.
@@ -278,24 +255,106 @@ def _(mo):
@app.cell @app.cell
def _(sentiment_df): def _(client, model_select, pd, sentiment_df):
# for now, create an empty sentiment column with randomized dummy values for testing # for now, create an empty sentiment column with randomized dummy values for testing
# only for 'VT -' and 'CT -' tags # only for 'VT -' and 'CT -' tags
import random
def dummy_sentiment_analysis(content, tag): from utils import dummy_sentiment_analysis, ollama_sentiment_analysis
if tag.startswith('VT -') or tag.startswith('CT -'):
return random.choice([-1, 0, 1]) # Random sentiment for testing
return None
# Only run on rows without manual_analysis # Only run on rows without manual_analysis
sentiment_df['sentiment'] = sentiment_df[~sentiment_df['manual_analysis']].apply(lambda row: dummy_sentiment_analysis(row['content'], row['tag']), axis=1) # sentiment_df[['sentiment', 'reason']] = sentiment_df[~sentiment_df['manual_analysis']].apply(
# lambda row: pd.Series(dummy_sentiment_analysis(row['content'], row['tag'])),
# axis=1
# )
sentiment_df[['keywords', 'sentiment', 'reason']] = sentiment_df[~sentiment_df['manual_analysis']].apply(
lambda row: pd.Series(ollama_sentiment_analysis(row['content'], row['theme'], client=client, model=model_select.value)),
axis=1
)
sentiment_df[~sentiment_df['manual_analysis']]
return return
@app.cell
def _(sentiment_df):
sentiment_df.loc[~sentiment_df['manual_analysis'], ['theme', 'content', 'sentiment', 'reason', 'keywords']]
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
## Multi-context tags
""")
return
@app.cell
def _(mo, sentiment_df):
manual_rows = sentiment_df[sentiment_df['manual_analysis']]
split_rows_editor = None
rows_to_edit = []
if not manual_rows.empty:
print(
f"⚠️ {len(manual_rows)} rows were created from multi-context splits. "
"See next cell for manual review."
)
# Filter for rows that need review. Manual analysis and the tag starts with 'VT -' or 'CT -'
rows_to_edit = sentiment_df[
(sentiment_df['manual_analysis'])
]
# Create data editor for split rows
split_rows_editor = mo.ui.data_editor(
rows_to_edit
).form(label="Update Sentiment / Manual Flag")
else:
print("✓ No multi-context rows found")
return rows_to_edit, split_rows_editor
@app.cell(hide_code=True)
def _(mo, rows_to_edit, split_rows_editor):
if split_rows_editor is not None:
mo.vstack([
mo.md(f"""
### ⚠️ Manual Review Required
**{len(rows_to_edit)} rows** were split from multi-context entries.
Please review them below:
1. Update the `sentiment` column (-1, 0, 1) for each row based on the specific context.
2. Click **Submit** to apply changes.
"""),
split_rows_editor
])
return
@app.cell
def _(mo, split_rows_editor):
# Capture the edited manual-analysis rows for validation
reviewed_manual_rows = getattr(split_rows_editor, 'value', '')
mo.stop(reviewed_manual_rows is None, mo.md("Submit your sentiment analysis changes before continuing."))
# Ensure all manual-analysis rows include a sentiment of -1, 0, or 1
if (reviewed_manual_rows != '') and (not reviewed_manual_rows.empty):
valid_sentiments = {-1, 0, 1}
needs_review = reviewed_manual_rows[
reviewed_manual_rows['manual_analysis']
& ~reviewed_manual_rows['sentiment'].isin(valid_sentiments)
]
assert needs_review.empty, f"{len(needs_review)} manual-analysis rows missing sentiment -1/0/1"
print("Verification: ✓ All Manual-analysis rows have valid sentiment values")
return (reviewed_manual_rows,)
@app.cell(hide_code=True) @app.cell(hide_code=True)
def _(mo): def _(mo):
mo.md(r""" mo.md(r"""
@@ -307,7 +366,10 @@ def _(mo):
@app.cell @app.cell
def _(pd, reviewed_manual_rows, sentiment_df): def _(pd, reviewed_manual_rows, sentiment_df):
_static_analysis_rows = sentiment_df[~sentiment_df['manual_analysis']] _static_analysis_rows = sentiment_df[~sentiment_df['manual_analysis']]
recombined_df = pd.concat([_static_analysis_rows, reviewed_manual_rows]).sort_values(by='_seq_id').reset_index(drop=True) if isinstance(reviewed_manual_rows, pd.DataFrame):
recombined_df = pd.concat([_static_analysis_rows, reviewed_manual_rows]).sort_values(by='_seq_id').reset_index(drop=True)
else:
recombined_df = sentiment_df
recombined_df recombined_df
return (recombined_df,) return (recombined_df,)
@@ -348,7 +410,7 @@ def _(mo):
def _(WORKING_DIR, datetime, interview_select, recombined_df): def _(WORKING_DIR, datetime, interview_select, recombined_df):
# Save to CSV in working dir # Save to CSV in working dir
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S") timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
filename = WORKING_DIR / f"{interview_select.value.split(' ')[0]}_sentiments_{timestamp}.csv" filename = WORKING_DIR / f"{interview_select.value.split(' ')[0]}_sentiments.csv"
recombined_df.to_csv(filename, index=False) recombined_df.to_csv(filename, index=False)
print(f"✓ Saved processed data to '{filename}'") print(f"✓ Saved processed data to '{filename}'")

View File

@@ -9,14 +9,14 @@ def _():
import marimo as mo import marimo as mo
import pandas as pd import pandas as pd
from pathlib import Path from pathlib import Path
from utils import create_sentiment_matrix
INPUT_DIR = Path("./data/processing/02_taguette_postprocess") INPUT_DIR = Path("./data/processing/02_taguette_postprocess")
WORKING_DIR = Path('./data/processing/03_sentiment_analysis') WORKING_DIR = Path('./data/processing/03_sentiment_analysis')
if not WORKING_DIR.exists(): if not WORKING_DIR.exists():
WORKING_DIR.mkdir(parents=True) WORKING_DIR.mkdir(parents=True)
return INPUT_DIR, Path, WORKING_DIR, create_sentiment_matrix, mo, pd
return INPUT_DIR, Path, WORKING_DIR, mo, pd
@app.cell(hide_code=True) @app.cell(hide_code=True)
@@ -62,55 +62,6 @@ def _(mo):
return return
@app.cell
def _(document_name, pd):
import numpy as np
def create_sentiment_matrix(doc_df, column_prefix='VT - |CT - ', row_prefix='_V-|_C-'):
"""
Create a sentiment matrix for a specific document.
Parameters:
- df: DataFrame with columns ['document', 'tag', '_context', 'sentiment']
- document_name: Name of the document to filter by
Returns:
- DataFrame representing the sentiment matrix
"""
# Filter for rows where the tag matches the sentiment prefixes (VT-/CT-)
sentiment_rows = doc_df[
doc_df['tag'].str.contains(column_prefix, na=False)
].copy()
if sentiment_rows.empty:
print(f"No sentiment data found for document: {document_name}")
return pd.DataFrame()
# Filter for rows with valid Voice/Character context
valid_rows = sentiment_rows[
sentiment_rows['_context'].notna() &
(sentiment_rows['_context'].str.contains(row_prefix, na=False))
].copy()
if valid_rows.empty:
print(f"No Voice/Character context found for document: {document_name}")
return pd.DataFrame()
# Create aggregation: group by Voice/Character (_context) and Theme (tag)
# Sum sentiment scores for each combination
matrix_data = valid_rows.groupby(['_context', 'tag'])['sentiment'].sum().reset_index()
# Pivot to create the matrix
matrix = matrix_data.pivot(index='_context', columns='tag', values='sentiment')
# # Convert to integers for cleaner display
# matrix = matrix.astype(int)
return matrix
return (create_sentiment_matrix,)
@app.cell(hide_code=True) @app.cell(hide_code=True)
def _(mo): def _(mo):
mo.md(r""" mo.md(r"""

View File

@@ -17,18 +17,18 @@ services:
# c) Explicitly override: docker compose run --gpus all ollama # c) Explicitly override: docker compose run --gpus all ollama
# 3. If your Docker/Compose version does NOT honor the reservation below, uncomment the # 3. If your Docker/Compose version does NOT honor the reservation below, uncomment the
# 'devices' section further down as a fallback (less portable). # 'devices' section further down as a fallback (less portable).
# deploy: deploy:
# resources: resources:
# reservations: reservations:
# devices: devices:
# - driver: nvidia - driver: nvidia
# count: all count: all
# capabilities: [gpu] capabilities: [gpu]
# environment: environment:
# Visible devices / capabilities for the NVIDIA container runtime # Visible devices / capabilities for the NVIDIA container runtime
# - NVIDIA_VISIBLE_DEVICES=all - NVIDIA_VISIBLE_DEVICES=all
# - NVIDIA_DRIVER_CAPABILITIES=compute,utility - NVIDIA_DRIVER_CAPABILITIES=compute,utility
# Fallback (UNCOMMENT ONLY if the reservation above is ignored and you still get errors): # Fallback (UNCOMMENT ONLY if the reservation above is ignored and you still get errors):
# devices: # devices:

4
utils/__init__.py Normal file
View File

@@ -0,0 +1,4 @@
from .ollama_utils import connect_qumo_ollama
from .data_utils import create_sentiment_matrix, extract_theme
from .transcript_utils import load_srt
from .sentiment_analysis import dummy_sentiment_analysis, ollama_sentiment_analysis

65
utils/data_utils.py Normal file
View File

@@ -0,0 +1,65 @@
import pandas as pd
def create_sentiment_matrix(doc_df, column_prefix='VT - |CT - ', row_prefix='_V-|_C-'):
    """
    Aggregate sentiment scores into a (context x theme-tag) matrix.

    Parameters:
    - doc_df: DataFrame with at least the columns ['tag', '_context', 'sentiment']
    - column_prefix: '|'-separated regex of tag prefixes that become matrix
      columns (default: Voice/Character theme tags 'VT - ' / 'CT - ')
    - row_prefix: '|'-separated regex of context prefixes that become matrix
      rows (default: Voice/Character contexts '_V-' / '_C-')

    Returns:
    - DataFrame pivot: rows are contexts, columns are theme tags, values are
      summed sentiment scores; empty DataFrame when nothing matches.
    """
    # Keep only rows whose tag matches one of the theme prefixes
    # (str.contains treats the '|' as regex alternation).
    sentiment_rows = doc_df[
        doc_df['tag'].str.contains(column_prefix, na=False)
    ].copy()
    if sentiment_rows.empty:
        print("No sentiment data found")
        return pd.DataFrame()

    # Keep only rows with a valid Voice/Character context.
    valid_rows = sentiment_rows[
        sentiment_rows['_context'].notna()
        & sentiment_rows['_context'].str.contains(row_prefix, na=False)
    ].copy()
    if valid_rows.empty:
        print("No Voice/Character context found")
        return pd.DataFrame()

    # Sum sentiment per (context, tag) pair, then pivot into the matrix.
    matrix_data = valid_rows.groupby(['_context', 'tag'])['sentiment'].sum().reset_index()
    matrix = matrix_data.pivot(index='_context', columns='tag', values='sentiment')
    # NOTE: cells stay float/NaN where a (context, tag) pair never occurs;
    # casting to int would fail on those, so it is intentionally not done here.
    return matrix
def extract_theme(tag: str, theme_prefixes='VT - |CT - ') -> str:
"""
Extract the theme from a tag string.
Parameters:
- tag: str, the tag string (e.g., 'VT - Personal Experience')
- theme_prefixes: str, prefixes to remove from the tag (e.g., 'VT - |CT - ')
Returns:
- str, the extracted theme (e.g., 'Personal Experience')
- None if no theme found
"""
for prefix in theme_prefixes.split('|'):
if tag.startswith(prefix):
return tag.replace(prefix, '').strip()
return None

42
utils/ollama_utils.py Normal file
View File

@@ -0,0 +1,42 @@
import requests
from ollama import Client
def connect_qumo_ollama(vm_name: str = 'ollama-lite', port='11434', print_models=True):
    """Establish a connection to a Qumo Ollama instance.

    Parameters:
    - vm_name: name of the VM running the Ollama instance
      (e.g. 'ollama-lite' or 'hiperf-gpu'); 'localhost' / '0.0.0.0'
      target a local instance instead of the Tailscale hostname.
    - port: port the Ollama API listens on.
    - print_models: when True, print the list of available model names.

    Returns:
    - (client, models): connected ollama Client and list of model names, or
    - (None, None) when the instance is unreachable.
    """
    QUMO_OLLAMA_URL = f'http://{vm_name}.tail44fa00.ts.net:{port}'
    if vm_name in ['localhost', '0.0.0.0']:
        QUMO_OLLAMA_URL = f"http://{vm_name}:{port}"
    try:
        # Probe first so an unreachable host fails fast with a clear message.
        requests.get(QUMO_OLLAMA_URL, timeout=5)
        client = Client(
            host=QUMO_OLLAMA_URL
        )
        print(f"Connection successful. WebUI available at: {QUMO_OLLAMA_URL.replace(port, '3000')}")
        models = [m.model for m in client.list().models]
        if print_models:
            print("Available models:")
            for m in models:
                print(f" - '{m}' ")
        return client, models
    except requests.RequestException:
        # RequestException covers both ConnectionError AND Timeout: the probe
        # above uses timeout=5, and a Timeout previously escaped the
        # ConnectionError-only handler and crashed the caller.
        print(f"Failed to reach {QUMO_OLLAMA_URL}. Check that the VM is running and Tailscale is up")
        return None, None

128
utils/sentiment_analysis.py Normal file
View File

@@ -0,0 +1,128 @@
import random
import pandas as pd
from ollama import Client
import json
def dummy_sentiment_analysis(content, tag):
    """Return a placeholder (sentiment, reason) pair for testing.

    Theme tags ('VT -' / 'CT -') get a random sentiment of -1, 0, or 1;
    any other tag gets a non-applicable marker. `content` is ignored.
    """
    if not tag.startswith(('VT -', 'CT -')):
        return 'test', 'not applicable'
    # Random sentiment stands in for a real model call during development.
    return random.choice([-1, 0, 1]), 'random dummy sentiment'
def ollama_sentiment_analysis(content, theme, client: Client, model) -> tuple[list[str], int, str]:
    """
    Analyze the sentiment of a quote with respect to a theme via an Ollama model.

    Parameters:
    - content: text content (interview quote) to analyze
    - theme: theme name the sentiment refers to (e.g., 'Speed')
    - client: connected ollama Client used to generate the response
    - model: name of the Ollama model to query

    Returns:
    - (keywords, sentiment, reason): keywords that influenced the analysis,
      sentiment score (-1 / 0 / 1), and a brief explanation;
    - ([], None, 'parsing error') when the model output cannot be parsed.
    """
    prompt = f"""
    # Instructions
    You are an expert in sentiment analysis and natural language processing. You are given a quote from an interview along with a theme tag. Your task is to analyze the sentiment expressed in the quote in relation to the provided theme, and provide a short explanation for your assessment (max 10 words).
    You need to deliver three pieces of information:
    1. A list of keywords from the quote quantify or qualify the theme, and that influenced your sentiment analysis (if any).
    2. A sentiment score: -1 for negative, 0 for neutral, and 1 for positive sentiment.
    3. A brief reason (max 10 words) explaining your sentiment score.
    # Guidelines
    Keywords should be directly relevant to the theme.
    The reason should be extremely concise and to the point:
    - Does not need to be a full sentence.
    - Sentiment itself does not need to be stated in the explanation.
    - If keywords are present in the quote that directly capture the sentiment, give that as the reason..
    # Input
    Theme: `{theme}`
    Quote:
    ```
    {content}
    ```
    # Response Format
    Provide your response in the following JSON format:
    {{
    "keywords": ["<list_of_relevant_keywords_if_any>"],
    "sentiment": <sentiment_score>,
    "reason": "<brief_explanation_max_10_words>"
    }}
    # Examples
    ** Example 1**
    - Theme: `Speed`
    - Quote: `It just was a little toned down. It was almost like he was talking like this. You know? It almost kind of this was a little slow for me.`
    - Response: {{"keywords": ["slow"], "sentiment": -1, "reason": "States speed is slow, indicates dissatisfaction"}}
    ** Example 2**
    - Theme: `Friendliness / Empathy`
    - Quote: `Sound very welcoming`
    - Response: {{ "keywords": ["welcoming"], "sentiment": 1, "reason": "Uses 'welcoming'" }}
    """
    resp = client.generate(
        model=model,
        prompt=prompt,
    )
    try:
        response_text = resp.response.strip()
        # Extract the first '{' ... last '}' span as the JSON payload; models
        # often wrap the JSON in prose or code fences.
        start_index = response_text.find('{')
        end_index = response_text.rfind('}') + 1
        if start_index == -1 or end_index == 0:
            # No JSON object at all. Previously this produced a bogus slice
            # (find() == -1); raise explicitly so the shared error path runs.
            raise ValueError("no JSON object found in model response")
        json_str = response_text[start_index:end_index]
        response_json = json.loads(json_str)
        keywords = response_json.get('keywords', [])
        sentiment = response_json.get('sentiment', 'test')
        reason = response_json.get('reason', 'no reason provided')
        return keywords, sentiment, reason
    except Exception as e:
        # Broad on purpose: any malformed model output degrades to a
        # sentinel result instead of crashing the batch apply().
        print(f"Error parsing response: {e}")
        return [], None, 'parsing error'
if __name__ == "__main__":
    # Ad-hoc smoke test against a locally running Ollama instance.
    client = Client(
        host="http://localhost:11434"
    )
    sentiment_df = pd.DataFrame({
        'content': [
            "I love this product!",
            "This is the worst service ever.",
            "It's okay, not great but not terrible."
        ],
        'tag': [
            'VT - Personal Experience',
            'VT - Personal Experience',
            'VT - Personal Experience'
        ],
        'manual_analysis': [False, False, True]
    })
    # ollama_sentiment_analysis returns a 3-tuple (keywords, sentiment, reason),
    # so THREE target columns are required — assigning to only
    # ['sentiment', 'reason'] raised a ValueError (columns/length mismatch).
    sentiment_df[['keywords', 'sentiment', 'reason']] = sentiment_df[~sentiment_df['manual_analysis']].apply(
        lambda row: pd.Series(ollama_sentiment_analysis(row['content'], row['tag'], client, model='llama3.2:latest')),
        axis=1
    )
    print(sentiment_df.head())

View File

@@ -1,13 +1,6 @@
"""
Standard utils for this repository
"""
import re
from pathlib import Path from pathlib import Path
import re
import requests
from ollama import Client
def load_srt(path: str | Path) -> str: def load_srt(path: str | Path) -> str:
"""Load and parse an SRT file, returning clean transcript with speaker labels. """Load and parse an SRT file, returning clean transcript with speaker labels.
@@ -58,37 +51,4 @@ def load_srt(path: str | Path) -> str:
# Format as "SPEAKER_XX: text" # Format as "SPEAKER_XX: text"
transcript_lines = [f"{speaker}: {utterance}" for speaker, utterance in merged] transcript_lines = [f"{speaker}: {utterance}" for speaker, utterance in merged]
return '\n\n'.join(transcript_lines) return '\n\n'.join(transcript_lines)
def connect_qumo_ollama(vm_name: str ='ollama-lite', port='11434') -> Client:
"""Establish connection to Qumo Ollama instance
vm_name: str ('ollama-lite' or 'hiperf-gpu')
Name of the VM running the Ollama instance
Returns:
tuple(Client): Ollama client connected to the specified VM
"""
QUMO_OLLAMA_URL = f'http://{vm_name}.tail44fa00.ts.net:{port}'
if vm_name in ['localhost', '0.0.0.0']:
QUMO_OLLAMA_URL = f"http://{vm_name}:{port}"
try:
requests.get(QUMO_OLLAMA_URL, timeout=5)
client = Client(
host=QUMO_OLLAMA_URL
)
print(f"Connection succesful. WebUI available at: {QUMO_OLLAMA_URL.replace(port, '3000')}\nAvailable models:")
for m in client.list().models:
print(f" - '{m.model}' ")
return client
except requests.ConnectionError:
pass
print(f"Failed to reach {QUMO_OLLAMA_URL}. Check that the VM is running and Tailscale is up")
return None