llm processing of sentiment

This commit is contained in:
2025-12-12 14:28:51 +01:00
parent e576f98cce
commit ccc5154b93
5 changed files with 135 additions and 83 deletions

View File

@@ -70,13 +70,13 @@ def csv_to_markdown(df):
return "\n\n".join(lines) return "\n\n".join(lines)
@app.cell @app.cell(hide_code=True)
def _(file_dropdown, mo, pd): def _(file_dropdown, mo, pd):
# Preview # Preview
preview = mo.md("") preview = mo.md("")
if file_dropdown.value: if file_dropdown.value:
df = pd.read_csv(file_dropdown.value) df = pd.read_csv(file_dropdown.value)
md_content = csv_to_markdown(df) md_content = csv_to_markdown(df.head(10))
preview = mo.md(md_content) preview = mo.md(md_content)
preview preview

View File

@@ -18,7 +18,7 @@ def _():
client, _models = connect_qumo_ollama(OLLAMA_LOCATION, print_models=False) client, _models = connect_qumo_ollama(OLLAMA_LOCATION, print_models=False)
TAGUETTE_EXPORT_DIR = Path('./data/transcripts/taguette_results') TAGUETTE_EXPORT_DIR = Path('./data/processing/02_taguette_export')
WORKING_DIR = Path('./data/processing/02_taguette_postprocess') WORKING_DIR = Path('./data/processing/02_taguette_postprocess')
if not WORKING_DIR.exists(): if not WORKING_DIR.exists():
@@ -47,13 +47,18 @@ def _():
@app.cell(hide_code=True) @app.cell(hide_code=True)
def _(TAGUETTE_EXPORT_DIR, mo): def _(TAGUETTE_EXPORT_DIR, mo):
mo.md(rf""" mo.md(rf"""
# Step 1: Export All Highlights out of Taguette # Step 1: Export Data out of Taguette
1. Go to: http://taguette.tail44fa00.ts.net/project/1 **Highlights**
2. Select 'Highlights' on left 1. Go to: https://taguette.qumo.io/project/1
3. Select 'See all highlights' 2. Select 'Highlights' (left side) > 'See all highlights' > 'Export this view' (top right) > 'CSV'
4. Top right 'Export this view' > 'CSV' 3. Save to '{TAGUETTE_EXPORT_DIR}/all_tags.csv'
5. Save to '{TAGUETTE_EXPORT_DIR}/all_tags.csv'
**Tags Codebook**
1. Select 'Project Info' (left side) > 'Export codebook' > 'CSV'
2. Save to '{TAGUETTE_EXPORT_DIR}/codebook.csv'
_NOTE: Sometimes you need to explicitly allow 'Unsafe Download' in the browser's download manager_
""") """)
return return
@@ -67,13 +72,21 @@ def _(mo):
@app.cell @app.cell
def _(pd): def _(TAGUETTE_EXPORT_DIR, pd):
all_tags_df = pd.read_csv('data/transcripts/taguette_results/all_tags.csv') all_tags_df = pd.read_csv(f'{TAGUETTE_EXPORT_DIR}/all_tags.csv')
all_tags_df['_seq_id'] = range(len(all_tags_df)) all_tags_df['_seq_id'] = range(len(all_tags_df))
all_tags_df.head(20) all_tags_df
return (all_tags_df,) return (all_tags_df,)
# Marimo cell: load the tag codebook that was exported from Taguette as CSV.
@app.cell
def _(TAGUETTE_EXPORT_DIR, pd):
# Read 'codebook.csv' from the Taguette export directory.
codebook_df = pd.read_csv(f'{TAGUETTE_EXPORT_DIR}/codebook.csv')
# Rename 'description' to 'theme_description' in place; the sentiment cell
# merges this frame on 'tag' and reads row['theme_description'] for the LLM prompt.
codebook_df.rename(columns={'description': 'theme_description'}, inplace=True)
# Bare expression: marimo renders a cell's last expression as the cell output.
codebook_df
# Export codebook_df so cells that list it as a parameter can use it.
return (codebook_df,)
@app.cell(hide_code=True) @app.cell(hide_code=True)
def _(mo): def _(mo):
mo.md(r""" mo.md(r"""
@@ -255,30 +268,51 @@ def _(mo):
@app.cell @app.cell
def _(client, model_select, pd, sentiment_df): def _(mo):
# for now, create an empty sentiment column with randomized dummy values for testing start_processing_btn = mo.ui.button(
# only for 'VT -' and 'CT -' tags label="Start Sentiment Extraction",
kind="warn",
on_click=lambda val: True
)
start_processing_btn
return (start_processing_btn,)
@app.cell
def _(
client,
codebook_df,
mo,
model_select,
pd,
sentiment_df,
start_processing_btn,
):
from utils import dummy_sentiment_analysis, ollama_sentiment_analysis from utils import dummy_sentiment_analysis, ollama_sentiment_analysis
# Only run on rows without manual_analysis # add theme_description to be used in LLM prompt
_df = sentiment_df.merge(codebook_df, on='tag', how='left', suffixes=('', '_codebook'))
# sentiment_df[['sentiment', 'reason']] = sentiment_df[~sentiment_df['manual_analysis']].apply( # Wait for start processing button
# lambda row: pd.Series(dummy_sentiment_analysis(row['content'], row['tag'])), mo.stop(not start_processing_btn.value, "Click button above to start processing")
# axis=1
# )
sentiment_df[['keywords', 'sentiment', 'reason']] = sentiment_df[~sentiment_df['manual_analysis']].apply(
lambda row: pd.Series(ollama_sentiment_analysis(row['content'], row['theme'], client=client, model=model_select.value)), sentiment_df[['keywords', 'sentiment', 'reason']] = _df[~_df['manual_analysis']].apply(
lambda row: pd.Series(ollama_sentiment_analysis(
content=row['content'],
theme=row['theme'],
theme_description=row['theme_description'],
client=client,
model=model_select.value
)),
axis=1 axis=1
) )
return return
@app.cell @app.cell
def _(sentiment_df): def _(mo, sentiment_df):
mo.stop(('sentiment' not in sentiment_df.columns), "Run above cells to extract sentiment analysis")
sentiment_df.loc[~sentiment_df['manual_analysis'], ['theme', 'content', 'sentiment', 'reason', 'keywords']] sentiment_df.loc[~sentiment_df['manual_analysis'], ['theme', 'content', 'sentiment', 'reason', 'keywords']]
return return
@@ -318,6 +352,13 @@ def _(mo, sentiment_df):
return rows_to_edit, split_rows_editor return rows_to_edit, split_rows_editor
# Marimo cell: display split_rows_editor, which the preceding cell returns.
@app.cell
def _(split_rows_editor):
# Bare expression so the value is rendered as this cell's output.
# NOTE(review): a later cell checks `split_rows_editor is not None`, so this may be None.
split_rows_editor
# No new names are defined here, so nothing is exported.
return
@app.cell(hide_code=True) @app.cell(hide_code=True)
def _(mo, rows_to_edit, split_rows_editor): def _(mo, rows_to_edit, split_rows_editor):
if split_rows_editor is not None: if split_rows_editor is not None:

View File

@@ -32,7 +32,7 @@ def _(INPUT_DIR, mo):
file_options = {f.stem: str(f) for f in voice_csv_files} file_options = {f.stem: str(f) for f in voice_csv_files}
voice_multiselect = mo.ui.multiselect(options=file_options, label="Select Voice CSV Files for Aggregation") voice_multiselect = mo.ui.multiselect(options=file_options, label="Select Voice CSV Files for Aggregation")
voice_multiselect
return (voice_multiselect,) return (voice_multiselect,)

View File

@@ -17,6 +17,8 @@ services:
# c) Explicitly override: docker compose run --gpus all ollama # c) Explicitly override: docker compose run --gpus all ollama
# 3. If your Docker/Compose version does NOT honor the reservation below, uncomment the # 3. If your Docker/Compose version does NOT honor the reservation below, uncomment the
# 'devices' section further down as a fallback (less portable). # 'devices' section further down as a fallback (less portable).
## UNCOMMENT THE FOLLOWING BLOCK FOR NVIDIA GPU SUPPORT ###
deploy: deploy:
resources: resources:
reservations: reservations:
@@ -29,6 +31,8 @@ services:
# Visible devices / capabilities for the NVIDIA container runtime # Visible devices / capabilities for the NVIDIA container runtime
- NVIDIA_VISIBLE_DEVICES=all - NVIDIA_VISIBLE_DEVICES=all
- NVIDIA_DRIVER_CAPABILITIES=compute,utility - NVIDIA_DRIVER_CAPABILITIES=compute,utility
## ---------- END GPU SUPPORT BLOCK ------------###
# Fallback (UNCOMMENT ONLY if the reservation above is ignored and you still get errors): # Fallback (UNCOMMENT ONLY if the reservation above is ignored and you still get errors):
# devices: # devices:

View File

@@ -12,7 +12,7 @@ def dummy_sentiment_analysis(content, tag):
def ollama_sentiment_analysis(content, theme, client: Client, model) -> tuple[list[str], int, str]: def ollama_sentiment_analysis(content, theme, theme_description, client: Client, model) -> tuple[list[str], int, str]:
""" """
Perform sentiment analysis using Ollama model. Perform sentiment analysis using Ollama model.
@@ -24,79 +24,86 @@ def ollama_sentiment_analysis(content, theme, client: Client, model) -> tuple[li
- sentiment score and reason - sentiment score and reason
""" """
prompt = f""" prompt = f"""
# Instructions # Role
You are an expert in sentiment analysis and natural language processing. You are given a quote from an interview along with a theme tag. Your task is to analyze the sentiment expressed in the quote in relation to the provided theme, and provide a short explanation for your assessment (max 10 words). You are an expert in sentiment analysis. Your task is to analyze the sentiment of a quote in relation to a specific theme.
You need to deliver three pieces of information:
1. A list of keywords from the quote that quantify or qualify the theme, and that influenced your sentiment analysis (if any).
2. A sentiment score: -1 for negative, 0 for neutral, and 1 for positive sentiment.
3. A brief reason (max 10 words) explaining your sentiment score.
# Guidelines
Keywords should be directly relevant to the theme.
The reason should be extremely concise and to the point:
- Does not need to be a full sentence.
- Sentiment itself does not need to be stated in the explanation.
- If keywords are present in the quote that directly capture the sentiment, give that as the reason.
# Input # Input
Theme: `{theme}` Theme: `{theme}`
Theme Description: `{theme_description}`
Quote: Quote:
``` ```
{content} {content}
``` ```
# Instructions
1. Analyze the sentiment of the quote specifically regarding the theme.
2. Extract relevant keywords or phrases from the quote. Prioritize specific descriptors found in the text that match or relate to the theme.
3. Assign a sentiment score:
- -1: Negative (complaint, dissatisfaction, criticism)
- 0: Neutral (factual, mixed, or no strong opinion)
- 1: Positive (praise, satisfaction, agreement)
4. Provide a concise reason (max 10 words).
# Constraints
- Return ONLY a valid JSON object.
- Do not use Markdown formatting (no ```json blocks).
- Do not write any Python code or explanations outside the JSON.
- If the quote is irrelevant to the theme, return sentiment 0.
# Response Format # Response Format
Provide your response in the following JSON format:
{{ {{
"keywords": ["<list_of_relevant_keywords_if_any>"], "keywords": ["<list_of_keywords>"],
"sentiment": <sentiment_score>, "sentiment": <integer_score>,
"reason": "<brief_explanation_max_10_words>" "reason": "<string_reason>"
}} }}
# Examples # Examples
** Example 1**
- Theme: `Speed`
- Quote: `It just was a little toned down. It was almost like he was talking like this. You know? It almost kind of this was a little slow for me.`
- Response: {{"keywords": ["slow"], "sentiment": -1, "reason": "States speed is slow, indicates dissatisfaction"}} Example 1:
Theme: `Speed`
** Example 2** Quote: `It was a little slow for me.`
- Theme: `Friendliness / Empathy` Response: {{"keywords": ["slow"], "sentiment": -1, "reason": "Dissatisfaction with speed"}}
- Quote: `Sound very welcoming`
Example 2:
- Response: {{ "keywords": ["welcoming"], "sentiment": 1, "reason": "Uses 'welcoming'" }} Theme: `Price`
Quote: `It costs $50.`
Response: {{"keywords": [], "sentiment": 0, "reason": "Factual statement"}}
Example 3:
Theme: `Friendliness`
Quote: `Sound very welcoming.`
Response: {{"keywords": ["welcoming"], "sentiment": 1, "reason": "Positive descriptor used"}}
""" """
resp = client.generate( max_retries = 3
model=model, for attempt in range(max_retries):
prompt=prompt, try:
) resp = client.generate(
model=model,
try: prompt=prompt,
response_text = resp.response.strip() )
response_text = resp.response.strip()
# Extract JSON from response # Extract JSON from response
start_index = response_text.find('{') start_index = response_text.find('{')
end_index = response_text.rfind('}') + 1 end_index = response_text.rfind('}') + 1
json_str = response_text[start_index:end_index]
if start_index == -1 or end_index == 0:
raise ValueError("No JSON found")
json_str = response_text[start_index:end_index]
response_json = json.loads(json_str)
keywords = response_json.get('keywords', [])
sentiment = response_json.get('sentiment', 'test')
reason = response_json.get('reason', 'no reason provided')
return keywords, sentiment, reason
response_json = json.loads(json_str) except Exception as e:
keywords = response_json.get('keywords', []) print(f"Attempt {attempt + 1}/{max_retries} failed: {e}")
sentiment = response_json.get('sentiment', 'test') if attempt == max_retries - 1:
reason = response_json.get('reason', 'no reason provided') return [], None, 'parsing error'
return keywords, sentiment, reason
except Exception as e:
print(f"Error parsing response: {e}")
return [], None, 'parsing error'
if __name__ == "__main__": if __name__ == "__main__":