# NOTE(review): removed page-scrape viewer metadata ("109 lines / 3.4 KiB / Python")
# that was not part of the original source and is not valid Python.
import pandas as pd
|
|
|
|
from ollama import Client
|
|
import json
|
|
import matplotlib.pyplot as plt
|
|
|
|
import random
|
|
import matplotlib.colors as mcolors
|
|
|
|
def blue_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    """Word-cloud color function: return a random dark-blue hex color.

    Matches the wordcloud `color_func` callback signature; `word`,
    `font_size`, `position`, and `orientation` are accepted but unused.

    Parameters:
    - random_state: optional RNG providing `.uniform()` for reproducibility;
      falls back to the global `random` module when None.

    Returns:
    - Hex color string (e.g. '#1f77b4') sampled from the dark end of 'Blues'.
    """
    # Use `is not None` (not truthiness) so any caller-supplied RNG is
    # honored regardless of its truth value; fall back to the random module.
    rng = random_state if random_state is not None else random

    # Sample from the darker end of the 'Blues' colormap:
    # 0.0 is white/light, 1.0 is dark blue.
    min_val, max_val = 0.4, 1.0
    color_val = rng.uniform(min_val, max_val)

    # Map the scalar to an RGBA tuple via the matplotlib colormap.
    rgba = plt.cm.Blues(color_val)
    return mcolors.to_hex(rgba)
|
|
|
|
|
|
def worker_extraction(row, host, model):
    """Per-worker entry point: extract keywords for one dataframe row.

    Builds a dedicated Ollama client for this worker/thread (clients are
    not shared across workers) and delegates to ollama_keyword_extraction.

    Parameters:
    - row: mapping with 'content' (quote text) and 'tag' (context label)
    - host: Ollama server host passed to the Client constructor
    - model: model name forwarded to the extraction call

    Returns:
    - Whatever ollama_keyword_extraction returns for this row.
    """
    return ollama_keyword_extraction(
        content=row['content'],
        tag=row['tag'],
        # Fresh client per worker so threads never share a connection.
        client=Client(host=host),
        model=model,
    )
|
|
|
|
|
|
def ollama_keyword_extraction(content, tag, client: "Client", model) -> list:
    """
    Extract thematic keywords from an interview quote using an Ollama model.

    Parameters:
    - content: Text content (quote) to analyze
    - tag: Context tag guiding the extraction (e.g., 'VT - Positive')
    - client: Ollama Client (duck-typed: needs `.generate(model=, prompt=, format=)`
      returning an object with a `.response` string); annotation is a string so
      this module imports even when `ollama` is unavailable
    - model: Name of the Ollama model to run

    Returns:
    - A single-element list wrapping the extracted keyword list,
      e.g. [["reliable", "old-school"]]; [[]] when all retries fail.
    """
    # Construct prompt for Ollama model
    # Prompt optimized for small models (Llama 3.2):
    # - Fewer rules, prioritized by importance
    # - Explicit verbatim instruction (prevents truncation errors)
    # - Examples that reinforce exact copying
    # - Positive framing (do X) instead of negative (don't do Y)
    # - Minimal formatting overhead
    prompt = f"""Extract keywords from interview quotes for thematic analysis.

RULES (in priority order):
1. Extract only keywords RELEVANT to the given context. Ignore off-topic content. Do NOT invent keywords.
2. Use words from the quote, but generalize for clustering (e.g., "not youthful" → "traditional").
3. Extract 1-5 keywords or short phrases that capture key themes.
4. Prefer descriptive phrases over vague single words (e.g., "tech forward" not "tech").

EXAMPLES:

Context: Chase as a Brand
Quote: "It's definitely not, like, youthful or trendy."
Output: {{"keywords": ["traditional", "established"]}}

Context: App Usability
Quote: "There are so many options when I try to pay, it's confusing."
Output: {{"keywords": ["confusing", "overwhelming options"]}}

Context: Brand Perception
Quote: "I would say reliable, trustworthy, kind of old-school."
Output: {{"keywords": ["reliable", "trustworthy", "old-school"]}}

NOW EXTRACT KEYWORDS:

Context: {tag}
Quote: "{content}"
Output:"""

    max_retries = 3
    for attempt in range(max_retries):
        # Reset before the try so the except block never references an
        # undefined name when client.generate() itself raises (the original
        # code hit a NameError on `response_text` in that case, aborting
        # the retry loop instead of retrying).
        response_text = ""
        try:
            resp = client.generate(
                model=model,
                prompt=prompt,
                format='json',
            )

            response_text = resp.response.strip()

            # Extract JSON from response: small models sometimes prepend
            # chatter, so scan forward to the first '{'.
            start_index = response_text.find('{')

            if start_index == -1:
                raise ValueError("No JSON found")

            # raw_decode tolerates trailing text after the JSON object.
            response_json, _ = json.JSONDecoder().raw_decode(response_text[start_index:])
            keywords = response_json.get('keywords', [])
            return [keywords]

        except Exception as e:
            print(f"Attempt {attempt + 1}/{max_retries} failed: {e}. Output was: {response_text}")
            if attempt == max_retries - 1:
                return [[]]