Compare commits
2 Commits
b023d44934
...
ccc5154b93
| Author | SHA1 | Date | |
|---|---|---|---|
| ccc5154b93 | |||
| e576f98cce |
16
.vscode/launch.json
vendored
Normal file
16
.vscode/launch.json
vendored
Normal file
@@ -0,0 +1,16 @@
|
||||
{
|
||||
// Use IntelliSense to learn about possible attributes.
|
||||
// Hover to view descriptions of existing attributes.
|
||||
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
||||
"version": "0.2.0",
|
||||
"configurations": [
|
||||
|
||||
{
|
||||
"name": "Python Debugger: Current File",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"program": "${file}",
|
||||
"console": "integratedTerminal"
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -70,13 +70,13 @@ def csv_to_markdown(df):
|
||||
return "\n\n".join(lines)
|
||||
|
||||
|
||||
@app.cell
|
||||
@app.cell(hide_code=True)
|
||||
def _(file_dropdown, mo, pd):
|
||||
# Preview
|
||||
preview = mo.md("")
|
||||
if file_dropdown.value:
|
||||
df = pd.read_csv(file_dropdown.value)
|
||||
md_content = csv_to_markdown(df)
|
||||
md_content = csv_to_markdown(df.head(10))
|
||||
preview = mo.md(md_content)
|
||||
|
||||
preview
|
||||
|
||||
@@ -16,28 +16,49 @@ def _():
|
||||
OLLAMA_LOCATION= 'localhost'
|
||||
# VM_NAME = 'ollama-lite'
|
||||
|
||||
client = connect_qumo_ollama(OLLAMA_LOCATION)
|
||||
client, _models = connect_qumo_ollama(OLLAMA_LOCATION, print_models=False)
|
||||
|
||||
TAGUETTE_EXPORT_DIR = Path('./data/transcripts/taguette_results')
|
||||
TAGUETTE_EXPORT_DIR = Path('./data/processing/02_taguette_export')
|
||||
WORKING_DIR = Path('./data/processing/02_taguette_postprocess')
|
||||
|
||||
if not WORKING_DIR.exists():
|
||||
WORKING_DIR.mkdir(parents=True)
|
||||
if not TAGUETTE_EXPORT_DIR.exists():
|
||||
TAGUETTE_EXPORT_DIR.mkdir(parents=True)
|
||||
return TAGUETTE_EXPORT_DIR, WORKING_DIR, datetime, mo, pd
|
||||
|
||||
model_select = mo.ui.dropdown(
|
||||
options=_models,
|
||||
value=_models[0],
|
||||
label="Select Ollama Model to use",
|
||||
searchable=True,
|
||||
)
|
||||
model_select
|
||||
return (
|
||||
TAGUETTE_EXPORT_DIR,
|
||||
WORKING_DIR,
|
||||
client,
|
||||
datetime,
|
||||
mo,
|
||||
model_select,
|
||||
pd,
|
||||
)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(TAGUETTE_EXPORT_DIR, mo):
|
||||
mo.md(rf"""
|
||||
# Step 1: Export All Highlights out of Taguette
|
||||
# Step 1: Export Data out of Taguette
|
||||
|
||||
1. Go to: http://taguette.tail44fa00.ts.net/project/1
|
||||
2. Select 'Highlights' on left
|
||||
3. Select 'See all hightlights'
|
||||
4. Top right 'Export this view' > 'CSV'
|
||||
5. Save to '{TAGUETTE_EXPORT_DIR}/all_tags.csv'
|
||||
**Highlights**
|
||||
1. Go to: https://taguette.qumo.io/project/1
|
||||
2. Select 'Highlights' (left side) > 'See all hightlights' > 'Export this view' (top right) > 'CSV'
|
||||
3. Save to '{TAGUETTE_EXPORT_DIR}/all_tags.csv'
|
||||
|
||||
**Tags Codebook**
|
||||
1. Select 'Project Info' (left side) > 'Export codebook' > 'CSV'
|
||||
2. Save to '{TAGUETTE_EXPORT_DIR}/codebook.csv'
|
||||
|
||||
_NOTE: Sometimes you need to explicitly allow 'Unsafe Download' in the browser's download manager_
|
||||
""")
|
||||
return
|
||||
|
||||
@@ -51,13 +72,21 @@ def _(mo):
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(pd):
|
||||
all_tags_df = pd.read_csv('data/transcripts/taguette_results/all_tags.csv')
|
||||
def _(TAGUETTE_EXPORT_DIR, pd):
|
||||
all_tags_df = pd.read_csv(f'{TAGUETTE_EXPORT_DIR}/all_tags.csv')
|
||||
all_tags_df['_seq_id'] = range(len(all_tags_df))
|
||||
all_tags_df.head(20)
|
||||
all_tags_df
|
||||
return (all_tags_df,)
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(TAGUETTE_EXPORT_DIR, pd):
|
||||
codebook_df = pd.read_csv(f'{TAGUETTE_EXPORT_DIR}/codebook.csv')
|
||||
codebook_df.rename(columns={'description': 'theme_description'}, inplace=True)
|
||||
codebook_df
|
||||
return (codebook_df,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
@@ -89,7 +118,7 @@ def _(all_tags_df, interview_select, mo):
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
### Add `_context` column to track Voice / Character is being referred to per highlight
|
||||
## Add `_context` column to track Voice / Character is being referred to per highlight
|
||||
Create a new column 'context', which is defined by the last '_V-' or '_C-' tag seen in the 'tags' column', when moving row by row from top to bottom.
|
||||
|
||||
1. Iterates through the dataframe in document order (row by row)
|
||||
@@ -102,12 +131,12 @@ def _(mo):
|
||||
|
||||
Example of challenging case:
|
||||
|
||||
| id | document | tag | content | _seq_id | _context |
|
||||
|-----|-------------|------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------|----------------------|
|
||||
| 88 | P2 - Done | _V-54 | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 117 | _V-54, _V-41 |
|
||||
| 88 | P2 - Done | _V-41 | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 118 | _V-54, _V-41 |
|
||||
| 88 | P2 - Done | VT - Human / Artificial | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 119 | _V-54, _V-41 |
|
||||
| 88 | P2 - Done | VT - Friendliness / Empathy | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 120 | _V-54, _V-41 |
|
||||
| tag | content | _seq_id | _context |
|
||||
|------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------|----------------------|
|
||||
| _V-54 | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 117 | _V-54, _V-41 |
|
||||
| _V-41 | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 118 | _V-54, _V-41 |
|
||||
| VT - Human / Artificial | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 119 | _V-54, _V-41 |
|
||||
| VT - Friendliness / Empathy | They I feel like they're like twins in that sense. Like, they both had this calming, like, calming voice that was smooth. It felt, like, but articulated and helpful, and, like, I felt reassured listening to them. | 120 | _V-54, _V-41 |
|
||||
""")
|
||||
return
|
||||
|
||||
@@ -155,7 +184,7 @@ def _(df):
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## Resolve multi-context rows (only VT- and CT- theme tags)
|
||||
## Split multi-context rows (only VT- and CT- theme tags)
|
||||
|
||||
For rows that have multiple contexts (e.g., both _V-54 and _V-41)
|
||||
- split these into separate rows for each context.
|
||||
@@ -165,7 +194,7 @@ def _(mo):
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(df, mo, pd):
|
||||
def _(df, pd):
|
||||
# Expand rows that contain multiple contexts (comma-separated)
|
||||
expanded_rows = []
|
||||
|
||||
@@ -201,9 +230,106 @@ def _(df, mo, pd):
|
||||
expanded_df_raw['tag'].str.startswith(('VT -', 'CT -'), na=False)
|
||||
].copy()
|
||||
|
||||
print(f"{len(sentiment_df[sentiment_df['manual_analysis']])} Rows with multiple contexts")
|
||||
|
||||
sentiment_df[sentiment_df['manual_analysis']]
|
||||
return (sentiment_df,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## Create 'theme' column
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
def _(sentiment_df):
    from utils import extract_theme

    # Derive the bare theme name (tag without its 'VT - ' / 'CT - ' prefix).
    # Mapping over the 'tag' column always yields a Series, whereas a row-wise
    # DataFrame.apply(axis=1) returns a DataFrame when the frame is empty and
    # then cannot be assigned to a single column.
    sentiment_df['theme'] = sentiment_df['tag'].map(extract_theme)

    # Last expression of the cell: rendered as the cell output.
    sentiment_df
    return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# Extract Sentiment + Reasoning
|
||||
|
||||
For each row in the dataframe, analyze the sentiment of the 'content' regarding the respective tag. This should be done for all 'VT -' and 'CT -' tags, since these represent the 'VoiceThemes' and 'CharacterThemes' respectively. The results should be stored in a new 'sentiment' column.
|
||||
|
||||
Values to be used:
|
||||
- Positive: +1
|
||||
- Neutral: 0
|
||||
- Negative: -1
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mo):
|
||||
start_processing_btn = mo.ui.button(
|
||||
label="Start Sentiment Extraction",
|
||||
kind="warn",
|
||||
on_click=lambda val: True
|
||||
)
|
||||
start_processing_btn
|
||||
return (start_processing_btn,)
|
||||
|
||||
|
||||
@app.cell
def _(
    client,
    codebook_df,
    mo,
    model_select,
    pd,
    sentiment_df,
    start_processing_btn,
):
    from utils import dummy_sentiment_analysis, ollama_sentiment_analysis

    # Add theme_description (from the codebook) to be used in the LLM prompt.
    # Duplicate codebook tags are dropped so the left merge keeps exactly one
    # output row per sentiment_df row, in the same order.
    _df = sentiment_df.merge(
        codebook_df.drop_duplicates(subset='tag'),
        on='tag',
        how='left',
        suffixes=('', '_codebook'),
    )
    # merge() returns a fresh 0..n-1 index; restore the original labels so the
    # results below are written back to the rows they were computed from.
    _df.index = sentiment_df.index

    # Wait for start processing button
    mo.stop(not start_processing_btn.value, "Click button above to start processing")

    # Only rows that are not flagged for manual analysis are sent to the model.
    _auto_rows = _df[~_df['manual_analysis']]
    _results = [
        ollama_sentiment_analysis(
            content=_row['content'],
            theme=_row['theme'],
            theme_description=_row['theme_description'],
            client=client,
            model=model_select.value,
        )
        for _row in _auto_rows.to_dict('records')
    ]

    # Building the frame with explicit columns also works when there are no
    # rows to analyse; manual-analysis rows are left as NaN by index alignment.
    sentiment_df[['keywords', 'sentiment', 'reason']] = pd.DataFrame(
        _results,
        columns=['keywords', 'sentiment', 'reason'],
        index=_auto_rows.index,
    )
    return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mo, sentiment_df):
|
||||
mo.stop(('sentiment' not in sentiment_df.columns), "Run above cells to extract sentiment analysis")
|
||||
sentiment_df.loc[~sentiment_df['manual_analysis'], ['theme', 'content', 'sentiment', 'reason', 'keywords']]
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
## Multi-context tags
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mo, sentiment_df):
|
||||
manual_rows = sentiment_df[sentiment_df['manual_analysis']]
|
||||
split_rows_editor = None
|
||||
|
||||
rows_to_edit = []
|
||||
|
||||
if not manual_rows.empty:
|
||||
print(
|
||||
@@ -223,34 +349,42 @@ def _(df, mo, pd):
|
||||
|
||||
else:
|
||||
print("✓ No multi-context rows found")
|
||||
return rows_to_edit, split_rows_editor
|
||||
|
||||
return rows_to_edit, sentiment_df, split_rows_editor
|
||||
|
||||
@app.cell
|
||||
def _(split_rows_editor):
|
||||
split_rows_editor
|
||||
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo, rows_to_edit, split_rows_editor):
|
||||
mo.vstack([
|
||||
mo.md(f"""
|
||||
### ⚠️ Manual Review Required
|
||||
if split_rows_editor is not None:
|
||||
mo.vstack([
|
||||
mo.md(f"""
|
||||
### ⚠️ Manual Review Required
|
||||
|
||||
**{len(rows_to_edit)} rows** were split from multi-context entries.
|
||||
Please review them below:
|
||||
1. Update the `sentiment` column (-1, 0, 1) for each row based on the specific context.
|
||||
2. Click **Submit** to apply changes.
|
||||
"""),
|
||||
split_rows_editor
|
||||
])
|
||||
**{len(rows_to_edit)} rows** were split from multi-context entries.
|
||||
Please review them below:
|
||||
1. Update the `sentiment` column (-1, 0, 1) for each row based on the specific context.
|
||||
2. Click **Submit** to apply changes.
|
||||
"""),
|
||||
split_rows_editor
|
||||
])
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(mo, split_rows_editor):
|
||||
# Capture the edited manual-analysis rows for validation
|
||||
mo.stop(split_rows_editor.value is None, mo.md("Submit your sentiment analysis changes before continuing."))
|
||||
reviewed_manual_rows = split_rows_editor.value
|
||||
reviewed_manual_rows = getattr(split_rows_editor, 'value', '')
|
||||
mo.stop(reviewed_manual_rows is None, mo.md("Submit your sentiment analysis changes before continuing."))
|
||||
|
||||
# Ensure all manual-analysis rows include a sentiment of -1, 0, or 1
|
||||
if not reviewed_manual_rows.empty:
|
||||
|
||||
if (reviewed_manual_rows != '') and (not reviewed_manual_rows.empty):
|
||||
valid_sentiments = {-1, 0, 1}
|
||||
needs_review = reviewed_manual_rows[
|
||||
reviewed_manual_rows['manual_analysis']
|
||||
@@ -258,44 +392,10 @@ def _(mo, split_rows_editor):
|
||||
]
|
||||
assert needs_review.empty, f"{len(needs_review)} manual-analysis rows missing sentiment -1/0/1"
|
||||
|
||||
print("Verification: ✓ All Manual-analysis rows have valid sentiment values")
|
||||
print("Verification: ✓ All Manual-analysis rows have valid sentiment values")
|
||||
return (reviewed_manual_rows,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
# Highlight Sentiment Analysis
|
||||
|
||||
For each row in the dataframe, analyze the sentiment of the 'content' regarding the respective tag. This should be done for all 'VT -' and 'CT -' tags, since these represent the 'VoiceThemes' and 'CharacterThemes' respectively. The results should be stored in a new 'sentiment' column.
|
||||
|
||||
Values to be used:
|
||||
- Positive: +1
|
||||
- Neutral: 0
|
||||
- Negative: -1
|
||||
""")
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(sentiment_df):
|
||||
# for now, create an empty sentiment column with randomized dummy values for testing
|
||||
# only for 'VT -' and 'CT -' tags
|
||||
import random
|
||||
|
||||
def dummy_sentiment_analysis(content, tag):
|
||||
if tag.startswith('VT -') or tag.startswith('CT -'):
|
||||
return random.choice([-1, 0, 1]) # Random sentiment for testing
|
||||
return None
|
||||
|
||||
# Only run on rows without manual_analysis
|
||||
|
||||
sentiment_df['sentiment'] = sentiment_df[~sentiment_df['manual_analysis']].apply(lambda row: dummy_sentiment_analysis(row['content'], row['tag']), axis=1)
|
||||
|
||||
sentiment_df[~sentiment_df['manual_analysis']]
|
||||
return
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
@@ -307,7 +407,10 @@ def _(mo):
|
||||
@app.cell
|
||||
def _(pd, reviewed_manual_rows, sentiment_df):
|
||||
_static_analysis_rows = sentiment_df[~sentiment_df['manual_analysis']]
|
||||
recombined_df = pd.concat([_static_analysis_rows, reviewed_manual_rows]).sort_values(by='_seq_id').reset_index(drop=True)
|
||||
if isinstance(reviewed_manual_rows, pd.DataFrame):
|
||||
recombined_df = pd.concat([_static_analysis_rows, reviewed_manual_rows]).sort_values(by='_seq_id').reset_index(drop=True)
|
||||
else:
|
||||
recombined_df = sentiment_df
|
||||
|
||||
recombined_df
|
||||
return (recombined_df,)
|
||||
@@ -348,7 +451,7 @@ def _(mo):
|
||||
def _(WORKING_DIR, datetime, interview_select, recombined_df):
|
||||
# Save to CSV in working dir
|
||||
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
|
||||
filename = WORKING_DIR / f"{interview_select.value.split(' ')[0]}_sentiments_{timestamp}.csv"
|
||||
filename = WORKING_DIR / f"{interview_select.value.split(' ')[0]}_sentiments.csv"
|
||||
recombined_df.to_csv(filename, index=False)
|
||||
|
||||
print(f"✓ Saved processed data to '{filename}'")
|
||||
|
||||
@@ -9,14 +9,14 @@ def _():
|
||||
import marimo as mo
|
||||
import pandas as pd
|
||||
from pathlib import Path
|
||||
from utils import create_sentiment_matrix
|
||||
|
||||
INPUT_DIR = Path("./data/processing/02_taguette_postprocess")
|
||||
WORKING_DIR = Path('./data/processing/03_sentiment_analysis')
|
||||
|
||||
if not WORKING_DIR.exists():
|
||||
WORKING_DIR.mkdir(parents=True)
|
||||
|
||||
return INPUT_DIR, Path, WORKING_DIR, mo, pd
|
||||
return INPUT_DIR, Path, WORKING_DIR, create_sentiment_matrix, mo, pd
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
@@ -62,55 +62,6 @@ def _(mo):
|
||||
return
|
||||
|
||||
|
||||
@app.cell
|
||||
def _(document_name, pd):
|
||||
import numpy as np
|
||||
|
||||
def create_sentiment_matrix(doc_df, column_prefix='VT - |CT - ', row_prefix='_V-|_C-'):
|
||||
"""
|
||||
Create a sentiment matrix for a specific document.
|
||||
|
||||
Parameters:
|
||||
- df: DataFrame with columns ['document', 'tag', '_context', 'sentiment']
|
||||
- document_name: Name of the document to filter by
|
||||
|
||||
Returns:
|
||||
- DataFrame representing the sentiment matrix
|
||||
"""
|
||||
|
||||
# Filter for rows where the tag matches the sentiment prefixes (VT-/CT-)
|
||||
sentiment_rows = doc_df[
|
||||
doc_df['tag'].str.contains(column_prefix, na=False)
|
||||
].copy()
|
||||
|
||||
if sentiment_rows.empty:
|
||||
print(f"No sentiment data found for document: {document_name}")
|
||||
return pd.DataFrame()
|
||||
|
||||
# Filter for rows with valid Voice/Character context
|
||||
valid_rows = sentiment_rows[
|
||||
sentiment_rows['_context'].notna() &
|
||||
(sentiment_rows['_context'].str.contains(row_prefix, na=False))
|
||||
].copy()
|
||||
|
||||
if valid_rows.empty:
|
||||
print(f"No Voice/Character context found for document: {document_name}")
|
||||
return pd.DataFrame()
|
||||
|
||||
# Create aggregation: group by Voice/Character (_context) and Theme (tag)
|
||||
# Sum sentiment scores for each combination
|
||||
matrix_data = valid_rows.groupby(['_context', 'tag'])['sentiment'].sum().reset_index()
|
||||
|
||||
# Pivot to create the matrix
|
||||
matrix = matrix_data.pivot(index='_context', columns='tag', values='sentiment')
|
||||
|
||||
# # Convert to integers for cleaner display
|
||||
# matrix = matrix.astype(int)
|
||||
|
||||
return matrix
|
||||
return (create_sentiment_matrix,)
|
||||
|
||||
|
||||
@app.cell(hide_code=True)
|
||||
def _(mo):
|
||||
mo.md(r"""
|
||||
|
||||
@@ -32,7 +32,7 @@ def _(INPUT_DIR, mo):
|
||||
file_options = {f.stem: str(f) for f in voice_csv_files}
|
||||
|
||||
voice_multiselect = mo.ui.multiselect(options=file_options, label="Select Voice CSV Files for Aggregation")
|
||||
voice_multiselect
|
||||
|
||||
return (voice_multiselect,)
|
||||
|
||||
|
||||
@@ -17,18 +17,22 @@ services:
|
||||
# c) Explicitly override: docker compose run --gpus all ollama
|
||||
# 3. If your Docker/Compose version does NOT honor the reservation below, uncomment the
|
||||
# 'devices' section further down as a fallback (less portable).
|
||||
# deploy:
|
||||
# resources:
|
||||
# reservations:
|
||||
# devices:
|
||||
# - driver: nvidia
|
||||
# count: all
|
||||
# capabilities: [gpu]
|
||||
|
||||
# environment:
|
||||
## UNCOMMENT THE FOLLOWING BLOCK FOR NVIDIA GPU SUPPORT ###
|
||||
deploy:
|
||||
resources:
|
||||
reservations:
|
||||
devices:
|
||||
- driver: nvidia
|
||||
count: all
|
||||
capabilities: [gpu]
|
||||
|
||||
environment:
|
||||
# Visible devices / capabilities for the NVIDIA container runtime
|
||||
# - NVIDIA_VISIBLE_DEVICES=all
|
||||
# - NVIDIA_DRIVER_CAPABILITIES=compute,utility
|
||||
- NVIDIA_VISIBLE_DEVICES=all
|
||||
- NVIDIA_DRIVER_CAPABILITIES=compute,utility
|
||||
## ---------- END GPU SUPPORT BLOCK ------------###
|
||||
|
||||
|
||||
# Fallback (UNCOMMENT ONLY if the reservation above is ignored and you still get errors):
|
||||
# devices:
|
||||
|
||||
4
utils/__init__.py
Normal file
4
utils/__init__.py
Normal file
@@ -0,0 +1,4 @@
|
||||
from .ollama_utils import connect_qumo_ollama
|
||||
from .data_utils import create_sentiment_matrix, extract_theme
|
||||
from .transcript_utils import load_srt
|
||||
from .sentiment_analysis import dummy_sentiment_analysis, ollama_sentiment_analysis
|
||||
65
utils/data_utils.py
Normal file
65
utils/data_utils.py
Normal file
@@ -0,0 +1,65 @@
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def create_sentiment_matrix(doc_df, column_prefix='VT - |CT - ', row_prefix='_V-|_C-'):
    """
    Build a context-by-theme matrix of summed sentiment scores.

    Parameters:
    - doc_df: DataFrame with at least the columns 'tag', '_context' and 'sentiment'
    - column_prefix: pattern matched against 'tag'; matching rows supply the matrix columns
    - row_prefix: pattern matched against '_context'; matching rows supply the matrix rows

    Returns:
    - DataFrame indexed by '_context' with one column per 'tag', holding the
      summed 'sentiment' of each pair (NaN where a pair never occurs), or an
      empty DataFrame when no row passes one of the two filters
    """
    # Keep only theme rows (tags matching the VT-/CT- pattern).
    is_theme_tag = doc_df['tag'].str.contains(column_prefix, na=False)
    theme_rows = doc_df[is_theme_tag].copy()
    if theme_rows.empty:
        print("No sentiment data found")
        return pd.DataFrame()

    # Of those, keep only rows attributed to a Voice/Character context.
    context_col = theme_rows['_context']
    has_context = context_col.notna() & context_col.str.contains(row_prefix, na=False)
    context_rows = theme_rows[has_context].copy()
    if context_rows.empty:
        print("No Voice/Character context found")
        return pd.DataFrame()

    # Sum the scores per (context, theme) pair, then spread themes into columns.
    totals = (
        context_rows
        .groupby(['_context', 'tag'])['sentiment']
        .sum()
        .reset_index()
    )
    return totals.pivot(index='_context', columns='tag', values='sentiment')
|
||||
|
||||
|
||||
|
||||
def extract_theme(tag: str, theme_prefixes='VT - |CT - ') -> "str | None":
    """
    Extract the theme from a tag string.

    Parameters:
    - tag: str, the tag string (e.g., 'VT - Personal Experience')
    - theme_prefixes: str, '|'-separated prefixes to strip from the tag (e.g., 'VT - |CT - ')

    Returns:
    - str, the extracted theme (e.g., 'Personal Experience')
    - None if the tag is not a string or starts with none of the prefixes
    """
    # Missing values read from CSV arrive as NaN/None rather than str.
    if not isinstance(tag, str):
        return None

    for prefix in theme_prefixes.split('|'):
        if tag.startswith(prefix):
            # Slice off only the leading prefix; str.replace would also delete
            # any later occurrence of the prefix inside the theme text.
            return tag[len(prefix):].strip()
    return None
|
||||
|
||||
42
utils/ollama_utils.py
Normal file
42
utils/ollama_utils.py
Normal file
@@ -0,0 +1,42 @@
|
||||
|
||||
|
||||
|
||||
import requests
|
||||
from ollama import Client
|
||||
|
||||
|
||||
|
||||
|
||||
def connect_qumo_ollama(vm_name: str = 'ollama-lite', port='11434', print_models=True) -> "tuple[Client | None, list[str] | None]":
    """Establish connection to Qumo Ollama instance

    vm_name: str ('ollama-lite' or 'hiperf-gpu')
        Name of the VM running the Ollama instance. 'localhost' and '0.0.0.0'
        are used as-is; any other name is resolved under the
        'tail44fa00.ts.net' domain.
    port: str or int
        Port the Ollama API listens on.
    print_models: bool
        When True, print the names of the models available on the instance.

    Returns:
        tuple(Client, list[str]): Ollama client connected to the specified VM
        and the names of its models, or (None, None) if the instance could
        not be reached.
    """
    if vm_name in ['localhost', '0.0.0.0']:
        host = vm_name
    else:
        host = f'{vm_name}.tail44fa00.ts.net'
    QUMO_OLLAMA_URL = f'http://{host}:{port}'

    # Probe the endpoint first. RequestException covers refused connections
    # as well as the read timeouts that `timeout=5` can raise, so neither
    # escapes to the caller.
    try:
        requests.get(QUMO_OLLAMA_URL, timeout=5)
    except requests.RequestException:
        print(f"Failed to reach {QUMO_OLLAMA_URL}. Check that the VM is running and Tailscale is up")
        return None, None

    client = Client(
        host=QUMO_OLLAMA_URL
    )

    # Build the WebUI address from the host rather than string-replacing the
    # port inside the URL, which fails when `port` is passed as an int.
    print(f"Connection succesful. WebUI available at: http://{host}:3000")
    models = [m.model for m in client.list().models]
    if print_models:
        print("Available models:")
        for m in models:
            print(f" - '{m}' ")
    return client, models
|
||||
135
utils/sentiment_analysis.py
Normal file
135
utils/sentiment_analysis.py
Normal file
@@ -0,0 +1,135 @@
|
||||
import random
|
||||
import pandas as pd
|
||||
|
||||
from ollama import Client
|
||||
import json
|
||||
|
||||
def dummy_sentiment_analysis(content, tag):
    """
    Produce a placeholder (sentiment, reason) pair without calling a model.

    Parameters:
    - content: quote text (unused; kept for signature parity with the caller)
    - tag: tag string; only tags starting with 'VT -' or 'CT -' get a score

    Returns:
    - (random choice of -1/0/1, 'random dummy sentiment') for theme tags
    - ('test', 'not applicable') for every other tag
    """
    is_theme_tag = tag.startswith(('VT -', 'CT -'))
    if not is_theme_tag:
        return 'test', 'not applicable'

    # Random sentiment for testing
    score = random.choice([-1, 0, 1])
    return score, 'random dummy sentiment'
|
||||
|
||||
|
||||
|
||||
def _coerce_sentiment(raw):
    """Return `raw` as the int -1, 0 or 1; raise ValueError for anything else."""
    if isinstance(raw, bool):
        raise ValueError(f"Invalid sentiment value: {raw!r}")
    try:
        number = float(raw)
    except (TypeError, ValueError):
        raise ValueError(f"Invalid sentiment value: {raw!r}") from None
    if number not in (-1.0, 0.0, 1.0):
        raise ValueError(f"Invalid sentiment value: {raw!r}")
    return int(number)


def ollama_sentiment_analysis(content, theme, theme_description, client: "Client", model, max_retries: int = 3) -> "tuple[list[str], int | None, str]":
    """
    Perform sentiment analysis using Ollama model.

    Parameters:
    - content: Text content (quote) to analyze
    - theme: Name of the theme the sentiment is judged against
    - theme_description: Description of the theme, included in the prompt
    - client: Ollama client; its `generate(model=..., prompt=...)` is called
    - model: Name of the model passed to `client.generate`
    - max_retries: Number of attempts before giving up

    Returns:
    - (keywords, sentiment, reason) where sentiment is -1, 0 or 1
    - ([], None, 'parsing error') when no attempt produced a usable answer
    """
    # The prompt text is kept flush-left so that no source indentation ends up
    # in the text sent to the model.
    prompt = f"""
# Role
You are an expert in sentiment analysis. Your task is to analyze the sentiment of a quote in relation to a specific theme.

# Input
Theme: `{theme}`
Theme Description: `{theme_description}`
Quote:
```
{content}
```

# Instructions
1. Analyze the sentiment of the quote specifically regarding the theme.
2. Extract relevant keywords or phrases from the quote. Prioritize specific descriptors found in the text that match or relate to the theme.
3. Assign a sentiment score:
- -1: Negative (complaint, dissatisfaction, criticism)
- 0: Neutral (factual, mixed, or no strong opinion)
- 1: Positive (praise, satisfaction, agreement)
4. Provide a concise reason (max 10 words).

# Constraints
- Return ONLY a valid JSON object.
- Do not use Markdown formatting (no ```json blocks).
- Do not write any Python code or explanations outside the JSON.
- If the quote is irrelevant to the theme, return sentiment 0.

# Response Format
{{
"keywords": ["<list_of_keywords>"],
"sentiment": <integer_score>,
"reason": "<string_reason>"
}}

# Examples

Example 1:
Theme: `Speed`
Quote: `It was a little slow for me.`
Response: {{"keywords": ["slow"], "sentiment": -1, "reason": "Dissatisfaction with speed"}}

Example 2:
Theme: `Price`
Quote: `It costs $50.`
Response: {{"keywords": [], "sentiment": 0, "reason": "Factual statement"}}

Example 3:
Theme: `Friendliness`
Quote: `Sound very welcoming.`
Response: {{"keywords": ["welcoming"], "sentiment": 1, "reason": "Positive descriptor used"}}
"""

    for attempt in range(max_retries):
        # Broad catch is deliberate: transport errors, malformed JSON and
        # invalid scores are all logged and treated as a failed attempt.
        try:
            resp = client.generate(
                model=model,
                prompt=prompt,
            )
            response_text = resp.response.strip()

            # Extract JSON from response (models often wrap it in extra text)
            start_index = response_text.find('{')
            end_index = response_text.rfind('}') + 1
            if start_index == -1 or end_index <= start_index:
                raise ValueError("No JSON found")

            response_json = json.loads(response_text[start_index:end_index])

            keywords = response_json.get('keywords') or []
            if isinstance(keywords, str):
                # A lone keyword returned as a string is wrapped, not split
                # into characters.
                keywords = [keywords]

            # A missing or out-of-range score counts as a failed attempt so
            # that only -1/0/1 ever reaches the downstream aggregation.
            sentiment = _coerce_sentiment(response_json.get('sentiment'))
            reason = response_json.get('reason', 'no reason provided')
            return list(keywords), sentiment, reason

        except Exception as e:
            print(f"Attempt {attempt + 1}/{max_retries} failed: {e}")

    return [], None, 'parsing error'
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test against a local Ollama instance: scores the rows that
    # are not flagged for manual analysis and prints the resulting frame.
    client = Client(
        host="http://localhost:11434"
    )

    sentiment_df = pd.DataFrame({
        'content': [
            "I love this product!",
            "This is the worst service ever.",
            "It's okay, not great but not terrible."
        ],
        'tag': [
            'VT - Personal Experience',
            'VT - Personal Experience',
            'VT - Personal Experience'
        ],
        'manual_analysis': [False, False, True]
    })

    # ollama_sentiment_analysis returns (keywords, sentiment, reason), so three
    # target columns are needed, and `client` must be passed by keyword:
    # passed positionally it would land in the `theme_description` slot.
    sentiment_df[['keywords', 'sentiment', 'reason']] = sentiment_df[~sentiment_df['manual_analysis']].apply(
        lambda row: pd.Series(ollama_sentiment_analysis(
            content=row['content'],
            theme=row['tag'],
            theme_description="Statements about the participant's own experience",
            client=client,
            model='llama3.2:latest',
        )),
        axis=1
    )

    print(sentiment_df.head())
|
||||
|
||||
@@ -1,13 +1,6 @@
|
||||
"""
|
||||
Standard utils for this repository
|
||||
"""
|
||||
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
from ollama import Client
|
||||
|
||||
import re
|
||||
|
||||
def load_srt(path: str | Path) -> str:
|
||||
"""Load and parse an SRT file, returning clean transcript with speaker labels.
|
||||
@@ -59,36 +52,3 @@ def load_srt(path: str | Path) -> str:
|
||||
# Format as "SPEAKER_XX: text"
|
||||
transcript_lines = [f"{speaker}: {utterance}" for speaker, utterance in merged]
|
||||
return '\n\n'.join(transcript_lines)
|
||||
|
||||
|
||||
def connect_qumo_ollama(vm_name: str ='ollama-lite', port='11434') -> Client:
|
||||
"""Establish connection to Qumo Ollama instance
|
||||
|
||||
vm_name: str ('ollama-lite' or 'hiperf-gpu')
|
||||
Name of the VM running the Ollama instance
|
||||
|
||||
Returns:
|
||||
tuple(Client): Ollama client connected to the specified VM
|
||||
"""
|
||||
QUMO_OLLAMA_URL = f'http://{vm_name}.tail44fa00.ts.net:{port}'
|
||||
|
||||
if vm_name in ['localhost', '0.0.0.0']:
|
||||
QUMO_OLLAMA_URL = f"http://{vm_name}:{port}"
|
||||
|
||||
try:
|
||||
requests.get(QUMO_OLLAMA_URL, timeout=5)
|
||||
client = Client(
|
||||
host=QUMO_OLLAMA_URL
|
||||
)
|
||||
|
||||
print(f"Connection succesful. WebUI available at: {QUMO_OLLAMA_URL.replace(port, '3000')}\nAvailable models:")
|
||||
for m in client.list().models:
|
||||
print(f" - '{m.model}' ")
|
||||
return client
|
||||
|
||||
except requests.ConnectionError:
|
||||
pass
|
||||
|
||||
print(f"Failed to reach {QUMO_OLLAMA_URL}. Check that the VM is running and Tailscale is up")
|
||||
return None
|
||||
|
||||
Reference in New Issue
Block a user