basic parsing working

This commit is contained in:
2025-12-11 12:56:23 +01:00
parent b023d44934
commit e576f98cce
10 changed files with 411 additions and 183 deletions

16
.vscode/launch.json vendored Normal file
View File

@@ -0,0 +1,16 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python Debugger: Current File",
"type": "debugpy",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal"
}
]
}

View File

@@ -16,7 +16,7 @@ def _():
OLLAMA_LOCATION= 'localhost'
# VM_NAME = 'ollama-lite'
client = connect_qumo_ollama(OLLAMA_LOCATION)
client, _models = connect_qumo_ollama(OLLAMA_LOCATION, print_models=False)
TAGUETTE_EXPORT_DIR = Path('./data/transcripts/taguette_results')
WORKING_DIR = Path('./data/processing/02_taguette_postprocess')
@@ -25,7 +25,23 @@ def _():
WORKING_DIR.mkdir(parents=True)
if not TAGUETTE_EXPORT_DIR.exists():
TAGUETTE_EXPORT_DIR.mkdir(parents=True)
return TAGUETTE_EXPORT_DIR, WORKING_DIR, datetime, mo, pd
model_select = mo.ui.dropdown(
options=_models,
value=_models[0],
label="Select Ollama Model to use",
searchable=True,
)
model_select
return (
TAGUETTE_EXPORT_DIR,
WORKING_DIR,
client,
datetime,
mo,
model_select,
pd,
)
@app.cell(hide_code=True)
@@ -89,7 +105,7 @@ def _(all_tags_df, interview_select, mo):
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
### Add `_context` column to track Voice / Character is being referred to per highlight
## Add `_context` column to track Voice / Character is being referred to per highlight
Create a new column 'context', which is defined by the last '_V-' or '_C-' tag seen in the 'tags' column', when moving row by row from top to bottom.
1. Iterates through the dataframe in document order (row by row)
@@ -102,12 +118,12 @@ def _(mo):
Example of challenging case:
| id | document | tag | content | _seq_id | _context |
|-----|-------------|------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------|----------------------|
| 88 | P2 - Done | _V-54 | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 117 | _V-54, _V-41 |
| 88 | P2 - Done | _V-41 | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 118 | _V-54, _V-41 |
| 88 | P2 - Done | VT - Human / Artificial | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 119 | _V-54, _V-41 |
| 88 | P2 - Done | VT - Friendliness / Empathy | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 120 | _V-54, _V-41 |
| tag | content | _seq_id | _context |
|------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------|----------------------|
| _V-54 | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 117 | _V-54, _V-41 |
| _V-41 | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 118 | _V-54, _V-41 |
| VT - Human / Artificial | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 119 | _V-54, _V-41 |
| VT - Friendliness / Empathy | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 120 | _V-54, _V-41 |
""")
return
@@ -155,7 +171,7 @@ def _(df):
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
## Resolve multi-context rows (only VT- and CT- theme tags)
## Split multi-context rows (only VT- and CT- theme tags)
For rows that have multiple contexts (e.g., both _V-54 and _V-41)
- split these into separate rows for each context.
@@ -165,7 +181,7 @@ def _(mo):
@app.cell
def _(df, mo, pd):
def _(df, pd):
# Expand rows that contain multiple contexts (comma-separated)
expanded_rows = []
@@ -201,71 +217,32 @@ def _(df, mo, pd):
expanded_df_raw['tag'].str.startswith(('VT -', 'CT -'), na=False)
].copy()
manual_rows = sentiment_df[sentiment_df['manual_analysis']]
split_rows_editor = None
print(f"{len(sentiment_df[sentiment_df['manual_analysis']])} Rows with multiple contexts")
if not manual_rows.empty:
print(
f"⚠️ {len(manual_rows)} rows were created from multi-context splits. "
"See next cell for manual review."
)
# Filter for rows that need review. Manual analysis and the tag starts with 'VT -' or 'CT -'
rows_to_edit = sentiment_df[
(sentiment_df['manual_analysis'])
]
# Create data editor for split rows
split_rows_editor = mo.ui.data_editor(
rows_to_edit
).form(label="Update Sentiment / Manual Flag")
else:
print("✓ No multi-context rows found")
return rows_to_edit, sentiment_df, split_rows_editor
@app.cell(hide_code=True)
def _(mo, rows_to_edit, split_rows_editor):
mo.vstack([
mo.md(f"""
### ⚠️ Manual Review Required
**{len(rows_to_edit)} rows** were split from multi-context entries.
Please review them below:
1. Update the `sentiment` column (-1, 0, 1) for each row based on the specific context.
2. Click **Submit** to apply changes.
"""),
split_rows_editor
])
return
@app.cell
def _(mo, split_rows_editor):
# Capture the edited manual-analysis rows for validation
mo.stop(split_rows_editor.value is None, mo.md("Submit your sentiment analysis changes before continuing."))
reviewed_manual_rows = split_rows_editor.value
# Ensure all manual-analysis rows include a sentiment of -1, 0, or 1
if not reviewed_manual_rows.empty:
valid_sentiments = {-1, 0, 1}
needs_review = reviewed_manual_rows[
reviewed_manual_rows['manual_analysis']
& ~reviewed_manual_rows['sentiment'].isin(valid_sentiments)
]
assert needs_review.empty, f"{len(needs_review)} manual-analysis rows missing sentiment -1/0/1"
print("Verification: ✓ All Manual-analysis rows have valid sentiment values")
return (reviewed_manual_rows,)
sentiment_df[sentiment_df['manual_analysis']]
return (sentiment_df,)
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
# Highlight Sentiment Analysis
## Create 'theme' column
""")
return
@app.cell
def _(sentiment_df):
from utils import extract_theme
sentiment_df['theme'] = sentiment_df.apply(lambda row: extract_theme(row['tag']), axis=1)
sentiment_df
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
# Extract Sentiment + Reasoning
For each row in the dataframe, analyze the sentiment of the 'content' regarding the respective tag. This should be done for all 'VT -' and 'CT -' tags, since these represent the 'VoiceThemes' and 'CharacterThemes' respectively. The results should be stored in a new 'sentiment' column.
@@ -278,24 +255,106 @@ def _(mo):
@app.cell
def _(sentiment_df):
def _(client, model_select, pd, sentiment_df):
# for now, create an empty sentiment column with randomized dummy values for testing
# only for 'VT -' and 'CT -' tags
import random
def dummy_sentiment_analysis(content, tag):
if tag.startswith('VT -') or tag.startswith('CT -'):
return random.choice([-1, 0, 1]) # Random sentiment for testing
return None
from utils import dummy_sentiment_analysis, ollama_sentiment_analysis
# Only run on rows without manual_analysis
sentiment_df['sentiment'] = sentiment_df[~sentiment_df['manual_analysis']].apply(lambda row: dummy_sentiment_analysis(row['content'], row['tag']), axis=1)
# sentiment_df[['sentiment', 'reason']] = sentiment_df[~sentiment_df['manual_analysis']].apply(
# lambda row: pd.Series(dummy_sentiment_analysis(row['content'], row['tag'])),
# axis=1
# )
sentiment_df[['keywords', 'sentiment', 'reason']] = sentiment_df[~sentiment_df['manual_analysis']].apply(
lambda row: pd.Series(ollama_sentiment_analysis(row['content'], row['theme'], client=client, model=model_select.value)),
axis=1
)
sentiment_df[~sentiment_df['manual_analysis']]
return
@app.cell
def _(sentiment_df):
sentiment_df.loc[~sentiment_df['manual_analysis'], ['theme', 'content', 'sentiment', 'reason', 'keywords']]
return
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
## Multi-context tags
""")
return
@app.cell
def _(mo, sentiment_df):
manual_rows = sentiment_df[sentiment_df['manual_analysis']]
split_rows_editor = None
rows_to_edit = []
if not manual_rows.empty:
print(
f"⚠️ {len(manual_rows)} rows were created from multi-context splits. "
"See next cell for manual review."
)
# Filter for rows that need review. Manual analysis and the tag starts with 'VT -' or 'CT -'
rows_to_edit = sentiment_df[
(sentiment_df['manual_analysis'])
]
# Create data editor for split rows
split_rows_editor = mo.ui.data_editor(
rows_to_edit
).form(label="Update Sentiment / Manual Flag")
else:
print("✓ No multi-context rows found")
return rows_to_edit, split_rows_editor
@app.cell(hide_code=True)
def _(mo, rows_to_edit, split_rows_editor):
if split_rows_editor is not None:
mo.vstack([
mo.md(f"""
### ⚠️ Manual Review Required
**{len(rows_to_edit)} rows** were split from multi-context entries.
Please review them below:
1. Update the `sentiment` column (-1, 0, 1) for each row based on the specific context.
2. Click **Submit** to apply changes.
"""),
split_rows_editor
])
return
@app.cell
def _(mo, split_rows_editor):
# Capture the edited manual-analysis rows for validation
reviewed_manual_rows = getattr(split_rows_editor, 'value', '')
mo.stop(reviewed_manual_rows is None, mo.md("Submit your sentiment analysis changes before continuing."))
# Ensure all manual-analysis rows include a sentiment of -1, 0, or 1
if (reviewed_manual_rows != '') and (not reviewed_manual_rows.empty):
valid_sentiments = {-1, 0, 1}
needs_review = reviewed_manual_rows[
reviewed_manual_rows['manual_analysis']
& ~reviewed_manual_rows['sentiment'].isin(valid_sentiments)
]
assert needs_review.empty, f"{len(needs_review)} manual-analysis rows missing sentiment -1/0/1"
print("Verification: ✓ All Manual-analysis rows have valid sentiment values")
return (reviewed_manual_rows,)
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""
@@ -307,7 +366,10 @@ def _(mo):
@app.cell
def _(pd, reviewed_manual_rows, sentiment_df):
_static_analysis_rows = sentiment_df[~sentiment_df['manual_analysis']]
recombined_df = pd.concat([_static_analysis_rows, reviewed_manual_rows]).sort_values(by='_seq_id').reset_index(drop=True)
if isinstance(reviewed_manual_rows, pd.DataFrame):
recombined_df = pd.concat([_static_analysis_rows, reviewed_manual_rows]).sort_values(by='_seq_id').reset_index(drop=True)
else:
recombined_df = sentiment_df
recombined_df
return (recombined_df,)
@@ -348,7 +410,7 @@ def _(mo):
def _(WORKING_DIR, datetime, interview_select, recombined_df):
# Save to CSV in working dir
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
filename = WORKING_DIR / f"{interview_select.value.split(' ')[0]}_sentiments_{timestamp}.csv"
filename = WORKING_DIR / f"{interview_select.value.split(' ')[0]}_sentiments.csv"
recombined_df.to_csv(filename, index=False)
print(f"✓ Saved processed data to '{filename}'")

View File

@@ -9,14 +9,14 @@ def _():
import marimo as mo
import pandas as pd
from pathlib import Path
from utils import create_sentiment_matrix
INPUT_DIR = Path("./data/processing/02_taguette_postprocess")
WORKING_DIR = Path('./data/processing/03_sentiment_analysis')
if not WORKING_DIR.exists():
WORKING_DIR.mkdir(parents=True)
return INPUT_DIR, Path, WORKING_DIR, mo, pd
return INPUT_DIR, Path, WORKING_DIR, create_sentiment_matrix, mo, pd
@app.cell(hide_code=True)
@@ -62,55 +62,6 @@ def _(mo):
return
@app.cell
def _(document_name, pd):
import numpy as np
def create_sentiment_matrix(doc_df, column_prefix='VT - |CT - ', row_prefix='_V-|_C-'):
"""
Create a sentiment matrix for a specific document.
Parameters:
- df: DataFrame with columns ['document', 'tag', '_context', 'sentiment']
- document_name: Name of the document to filter by
Returns:
- DataFrame representing the sentiment matrix
"""
# Filter for rows where the tag matches the sentiment prefixes (VT-/CT-)
sentiment_rows = doc_df[
doc_df['tag'].str.contains(column_prefix, na=False)
].copy()
if sentiment_rows.empty:
print(f"No sentiment data found for document: {document_name}")
return pd.DataFrame()
# Filter for rows with valid Voice/Character context
valid_rows = sentiment_rows[
sentiment_rows['_context'].notna() &
(sentiment_rows['_context'].str.contains(row_prefix, na=False))
].copy()
if valid_rows.empty:
print(f"No Voice/Character context found for document: {document_name}")
return pd.DataFrame()
# Create aggregation: group by Voice/Character (_context) and Theme (tag)
# Sum sentiment scores for each combination
matrix_data = valid_rows.groupby(['_context', 'tag'])['sentiment'].sum().reset_index()
# Pivot to create the matrix
matrix = matrix_data.pivot(index='_context', columns='tag', values='sentiment')
# # Convert to integers for cleaner display
# matrix = matrix.astype(int)
return matrix
return (create_sentiment_matrix,)
@app.cell(hide_code=True)
def _(mo):
mo.md(r"""

View File

@@ -17,18 +17,18 @@ services:
# c) Explicitly override: docker compose run --gpus all ollama
# 3. If your Docker/Compose version does NOT honor the reservation below, uncomment the
# 'devices' section further down as a fallback (less portable).
# deploy:
# resources:
# reservations:
# devices:
# - driver: nvidia
# count: all
# capabilities: [gpu]
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities: [gpu]
# environment:
environment:
# Visible devices / capabilities for the NVIDIA container runtime
# - NVIDIA_VISIBLE_DEVICES=all
# - NVIDIA_DRIVER_CAPABILITIES=compute,utility
- NVIDIA_VISIBLE_DEVICES=all
- NVIDIA_DRIVER_CAPABILITIES=compute,utility
# Fallback (UNCOMMENT ONLY if the reservation above is ignored and you still get errors):
# devices:

4
utils/__init__.py Normal file
View File

@@ -0,0 +1,4 @@
from .ollama_utils import connect_qumo_ollama
from .data_utils import create_sentiment_matrix, extract_theme
from .transcript_utils import load_srt
from .sentiment_analysis import dummy_sentiment_analysis, ollama_sentiment_analysis

65
utils/data_utils.py Normal file
View File

@@ -0,0 +1,65 @@
import pandas as pd
def create_sentiment_matrix(doc_df, column_prefix='VT - |CT - ', row_prefix='_V-|_C-'):
    """
    Build a context-by-theme matrix of summed sentiment scores.

    Parameters:
    - doc_df: DataFrame with at least the columns 'tag', '_context' and 'sentiment'
    - column_prefix: regex pattern; rows whose 'tag' matches it supply the matrix columns
    - row_prefix: regex pattern; rows whose '_context' matches it supply the matrix rows

    Returns:
    - DataFrame indexed by '_context' with one column per 'tag'; each cell holds
      the sum of 'sentiment' for that pair (NaN where the pair never occurs).
      An empty DataFrame is returned when no row passes the filters.
    """
    # Keep only the theme rows (tag matches the VT-/CT- pattern)
    is_theme_tag = doc_df['tag'].str.contains(column_prefix, na=False)
    theme_rows = doc_df[is_theme_tag].copy()
    if theme_rows.empty:
        print("No sentiment data found")
        return pd.DataFrame()

    # Of those, keep the rows tied to a Voice/Character context
    context_col = theme_rows['_context']
    has_context = context_col.notna() & (context_col.str.contains(row_prefix, na=False))
    scored_rows = theme_rows[has_context].copy()
    if scored_rows.empty:
        print("No Voice/Character context found")
        return pd.DataFrame()

    # Sum sentiment per (context, theme) pair, then spread the themes across columns
    totals = scored_rows.groupby(['_context', 'tag'])['sentiment'].sum().reset_index()
    return totals.pivot(index='_context', columns='tag', values='sentiment')
def extract_theme(tag: str, theme_prefixes='VT - |CT - ') -> "str | None":
    """
    Extract the theme from a tag string.

    Parameters:
    - tag: str, the tag string (e.g., 'VT - Personal Experience')
    - theme_prefixes: str, '|'-separated prefixes to strip from the start of
      the tag (e.g., 'VT - |CT - ')

    Returns:
    - str, the tag with its leading prefix removed and surrounding whitespace
      stripped (e.g., 'Personal Experience')
    - None if the tag is not a string or starts with none of the prefixes
    """
    # Non-string tags (e.g. NaN coming out of a DataFrame) cannot carry a theme
    if not isinstance(tag, str):
        return None

    for prefix in theme_prefixes.split('|'):
        if tag.startswith(prefix):
            # Slice off only the leading prefix; str.replace would also delete
            # any later occurrence of the prefix text inside the theme itself.
            return tag[len(prefix):].strip()
    return None

42
utils/ollama_utils.py Normal file
View File

@@ -0,0 +1,42 @@
import requests
from ollama import Client
def connect_qumo_ollama(vm_name: str = 'ollama-lite', port='11434', print_models=True) -> "tuple[Client | None, list[str] | None]":
    """Establish connection to Qumo Ollama instance

    vm_name: str ('ollama-lite' or 'hiperf-gpu')
        Name of the VM running the Ollama instance. 'localhost' and '0.0.0.0'
        are used as-is; any other name is resolved under the Tailscale domain.
    port: port the Ollama API listens on
    print_models: when True, print the model names reported by the server

    Returns:
        tuple(Client, list[str]): Ollama client connected to the specified VM
        and the names of the models it reports, or (None, None) when the
        server cannot be reached.
    """
    if vm_name in ['localhost', '0.0.0.0']:
        base_url = f"http://{vm_name}"
    else:
        base_url = f"http://{vm_name}.tail44fa00.ts.net"
    QUMO_OLLAMA_URL = f"{base_url}:{port}"

    # Probe reachability first; only this request is expected to raise
    # requests exceptions, so keep the try body limited to it.
    try:
        requests.get(QUMO_OLLAMA_URL, timeout=5)
    except (requests.ConnectionError, requests.Timeout):
        # A read timeout is a Timeout but not a ConnectionError, so both are listed.
        print(f"Failed to reach {QUMO_OLLAMA_URL}. Check that the VM is running and Tailscale is up")
        return None, None

    client = Client(
        host=QUMO_OLLAMA_URL
    )
    # Build the WebUI address from the base URL rather than substituting the
    # port text inside the full URL, which could also rewrite the host name
    # and fails outright when port is passed as an int.
    print(f"Connection succesful. WebUI available at: {base_url}:3000")

    models = [m.model for m in client.list().models]
    if print_models:
        print("Available models:")
        for m in models:
            print(f" - '{m}' ")
    return client, models

128
utils/sentiment_analysis.py Normal file
View File

@@ -0,0 +1,128 @@
import random
import pandas as pd
from ollama import Client
import json
def dummy_sentiment_analysis(content, tag):
    """
    Stand-in for real sentiment analysis, used for testing.

    Parameters:
    - content: Text content (unused by this dummy)
    - tag: Tag string; only tags starting with 'VT -' or 'CT -' get a score

    Returns:
    - (sentiment, reason): a random score from -1/0/1 with a fixed reason for
      theme tags, otherwise the placeholder pair ('test', 'not applicable')
    """
    is_theme_tag = tag.startswith(('VT -', 'CT -'))
    if not is_theme_tag:
        return 'test', 'not applicable'
    # Random sentiment for testing
    return random.choice([-1, 0, 1]), 'random dummy sentiment'
def ollama_sentiment_analysis(content, theme, client: "Client", model) -> "tuple[list[str], int | None, str]":
    """
    Perform sentiment analysis using Ollama model.

    Parameters:
    - content: Text content (interview quote) to analyze
    - theme: Theme the sentiment is judged against (e.g., 'Speed')
    - client: Ollama client; its generate(model=..., prompt=...) is called once
    - model: Name of the model passed to client.generate

    Returns:
    - (keywords, sentiment, reason). sentiment is -1, 0 or 1, or None when the
      model gave no usable score. If the response cannot be parsed as JSON the
      result is ([], None, 'parsing error').
    """
    prompt = f"""
# Instructions
You are an expert in sentiment analysis and natural language processing. You are given a quote from an interview along with a theme tag. Your task is to analyze the sentiment expressed in the quote in relation to the provided theme, and provide a short explanation for your assessment (max 10 words).
You need to deliver three pieces of information:
1. A list of keywords from the quote quantify or qualify the theme, and that influenced your sentiment analysis (if any).
2. A sentiment score: -1 for negative, 0 for neutral, and 1 for positive sentiment.
3. A brief reason (max 10 words) explaining your sentiment score.
# Guidelines
Keywords should be directly relevant to the theme.
The reason should be extremely concise and to the point:
- Does not need to be a full sentence.
- Sentiment itself does not need to be stated in the explanation.
- If keywords are present in the quote that directly capture the sentiment, give that as the reason..
# Input
Theme: `{theme}`
Quote:
```
{content}
```
# Response Format
Provide your response in the following JSON format:
{{
"keywords": ["<list_of_relevant_keywords_if_any>"],
"sentiment": <sentiment_score>,
"reason": "<brief_explanation_max_10_words>"
}}
# Examples
** Example 1**
- Theme: `Speed`
- Quote: `It just was a little toned down. It was almost like he was talking like this. You know? It almost kind of this was a little slow for me.`
- Response: {{"keywords": ["slow"], "sentiment": -1, "reason": "States speed is slow, indicates dissatisfaction"}}
** Example 2**
- Theme: `Friendliness / Empathy`
- Quote: `Sound very welcoming`
- Response: {{ "keywords": ["welcoming"], "sentiment": 1, "reason": "Uses 'welcoming'" }}
"""
    resp = client.generate(
        model=model,
        prompt=prompt,
    )

    try:
        response_text = resp.response.strip()
        # Models often wrap the JSON in extra prose; keep only the outermost
        # {...} span. With no braces the slice is empty and json.loads raises.
        start_index = response_text.find('{')
        end_index = response_text.rfind('}') + 1
        response_json = json.loads(response_text[start_index:end_index])

        keywords = response_json.get('keywords', [])
        # A missing or malformed score becomes None rather than a placeholder
        # string, so the sentiment column stays numeric for later aggregation.
        sentiment = _normalize_sentiment(response_json.get('sentiment'))
        reason = response_json.get('reason', 'no reason provided')
        return keywords, sentiment, reason
    except (ValueError, AttributeError, TypeError) as e:
        # ValueError covers json.JSONDecodeError; AttributeError/TypeError cover
        # a missing response text or a JSON payload that is not an object.
        print(f"Error parsing response: {e}")
        return [], None, 'parsing error'


def _normalize_sentiment(raw):
    """Return raw as the int -1, 0 or 1, or None when it is not one of those scores."""
    if isinstance(raw, bool):
        return None
    try:
        score = float(raw)
    except (TypeError, ValueError):
        return None
    return int(score) if score in (-1.0, 0.0, 1.0) else None
if __name__ == "__main__":
    # Manual smoke test: runs the analysis against a local Ollama server.
    client = Client(
        host="http://localhost:11434"
    )

    sentiment_df = pd.DataFrame({
        'content': [
            "I love this product!",
            "This is the worst service ever.",
            "It's okay, not great but not terrible."
        ],
        'tag': [
            'VT - Personal Experience',
            'VT - Personal Experience',
            'VT - Personal Experience'
        ],
        'manual_analysis': [False, False, True]
    })

    # ollama_sentiment_analysis returns (keywords, sentiment, reason), so all
    # three target columns must be named; assigning the three-value result to
    # two columns fails with a length mismatch. Rows flagged for manual
    # analysis are skipped and end up with missing values in these columns.
    sentiment_df[['keywords', 'sentiment', 'reason']] = sentiment_df[~sentiment_df['manual_analysis']].apply(
        lambda row: pd.Series(ollama_sentiment_analysis(row['content'], row['tag'], client, model='llama3.2:latest')),
        axis=1
    )

    print(sentiment_df.head())

View File

@@ -1,13 +1,6 @@
"""
Standard utils for this repository
"""
import re
from pathlib import Path
import requests
from ollama import Client
import re
def load_srt(path: str | Path) -> str:
"""Load and parse an SRT file, returning clean transcript with speaker labels.
@@ -58,37 +51,4 @@ def load_srt(path: str | Path) -> str:
# Format as "SPEAKER_XX: text"
transcript_lines = [f"{speaker}: {utterance}" for speaker, utterance in merged]
return '\n\n'.join(transcript_lines)
def connect_qumo_ollama(vm_name: str ='ollama-lite', port='11434') -> Client:
"""Establish connection to Qumo Ollama instance
vm_name: str ('ollama-lite' or 'hiperf-gpu')
Name of the VM running the Ollama instance
Returns:
tuple(Client): Ollama client connected to the specified VM
"""
QUMO_OLLAMA_URL = f'http://{vm_name}.tail44fa00.ts.net:{port}'
if vm_name in ['localhost', '0.0.0.0']:
QUMO_OLLAMA_URL = f"http://{vm_name}:{port}"
try:
requests.get(QUMO_OLLAMA_URL, timeout=5)
client = Client(
host=QUMO_OLLAMA_URL
)
print(f"Connection succesful. WebUI available at: {QUMO_OLLAMA_URL.replace(port, '3000')}\nAvailable models:")
for m in client.list().models:
print(f" - '{m.model}' ")
return client
except requests.ConnectionError:
pass
print(f"Failed to reach {QUMO_OLLAMA_URL}. Check that the VM is running and Tailscale is up")
return None
return '\n\n'.join(transcript_lines)