# Interview-Analysis/utils/keyword_analysis.py

import pandas as pd

from ollama import Client
import json
import matplotlib.pyplot as plt

import random
import matplotlib.colors as mcolors

def blue_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    """Return a random hex colour taken from the darker part of matplotlib's 'Blues' colormap.

    The word, font_size, position, orientation and extra keyword arguments are
    accepted but not used; the signature presumably mirrors the colour callback
    expected by a word-cloud library (verify against the caller).

    Parameters:
    - random_state: object exposing ``uniform(a, b)``; when it is falsy the
      global ``random`` module is used instead.

    Returns:
    - Hex colour string produced by ``matplotlib.colors.to_hex``.
    """
    # Fall back to the module-level generator when no usable random_state is given
    rng = random_state or random

    # Colormap positions run from 0.0 (white/light) to 1.0 (dark blue);
    # drawing from [0.4, 1.0] stays in the darker range.
    shade = rng.uniform(0.4, 1.0)

    return mcolors.to_hex(plt.cm.Blues(shade))


def worker_extraction(row, host, model):
    """Run keyword extraction for one row using a client created just for this call.

    Parameters:
    - row: mapping-like object providing the 'content' and 'tag' entries.
    - host: host passed to the ``Client`` constructor.
    - model: model name forwarded to ``ollama_keyword_extraction``.

    Returns:
    - Whatever ``ollama_keyword_extraction`` returns for this row.
    """
    # Each call builds its own client, so a worker/thread never shares one
    per_call_client = Client(host=host)

    extraction_args = {
        'content': row['content'],
        'tag': row['tag'],
        'client': per_call_client,
        'model': model,
    }
    return ollama_keyword_extraction(**extraction_args)


def ollama_keyword_extraction(content, tag, client: Client, model) -> list:
    """
    Extract thematic keywords from an interview quote using an Ollama model.

    Parameters:
    - content: Quote text, inserted into the prompt as the "Quote"
    - tag: Context label for the quote, inserted into the prompt as the "Context"
    - client: Ollama client whose `generate` method is called
    - model: Model name forwarded to `client.generate`

    Returns:
    - A one-element list wrapping the value of the "keywords" field of the
      model's JSON reply (an empty list when the field is missing).
      If all three attempts fail, `[[]]` is returned instead of raising.
    """

    # Construct prompt for Ollama model
    # Prompt optimized for small models (Llama 3.2):
    # - Fewer rules, prioritized by importance
    # - Explicit verbatim instruction (prevents truncation errors)
    # - Examples that reinforce exact copying
    # - Positive framing (do X) instead of negative (don't do Y)
    # - Minimal formatting overhead
    prompt = f"""Extract keywords from interview quotes for thematic analysis.

RULES (in priority order):
1. Extract only keywords RELEVANT to the given context. Ignore off-topic content. Do NOT invent keywords.
2. Use words from the quote, but generalize for clustering (e.g., "not youthful" → "traditional").
3. Extract 1-5 keywords or short phrases that capture key themes.
4. Prefer descriptive phrases over vague single words (e.g., "tech forward" not "tech").

EXAMPLES:

Context: Chase as a Brand
Quote: "It's definitely not, like, youthful or trendy."
Output: {{"keywords": ["traditional", "established"]}}

Context: App Usability
Quote: "There are so many options when I try to pay, it's confusing."
Output: {{"keywords": ["confusing", "overwhelming options"]}}

Context: Brand Perception
Quote: "I would say reliable, trustworthy, kind of old-school."
Output: {{"keywords": ["reliable", "trustworthy", "old-school"]}}

NOW EXTRACT KEYWORDS:

Context: {tag}
Quote: "{content}"
Output:"""

    max_retries = 3
    for attempt in range(max_retries):
        # Reset on every attempt: if the request itself fails, the log line
        # below must not hit an unbound name or show a stale earlier reply.
        response_text = '<no response received>'
        try:
            resp = client.generate(
                model=model,
                prompt=prompt,
                format='json',
            )

            response_text = resp.response.strip()

            # Parse from the first '{'; raw_decode ignores any trailing text
            start_index = response_text.find('{')

            if start_index == -1:
                raise ValueError("No JSON found")

            response_json, _ = json.JSONDecoder().raw_decode(response_text[start_index:])
            keywords = response_json.get('keywords', [])
            return [keywords]

        except Exception as e:
            # Deliberately best-effort: any failure is logged and the request retried
            print(f"Attempt {attempt + 1}/{max_retries} failed: {e}. Output was: {response_text}")

    # Every attempt failed; keep the usual list-in-list return shape
    return [[]]