llm processing of sentiment

2025-12-12 14:28:51 +01:00
parent e576f98cce
commit ccc5154b93
5 changed files with 135 additions and 83 deletions
--- a/01_Taguette-Pre-Process.py
+++ b/01_Taguette-Pre-Process.py
@@ -70,13 +70,13 @@ def csv_to_markdown(df):
    return "\n\n".join(lines)
-@app.cell
+@app.cell(hide_code=True)
 def _(file_dropdown, mo, pd):
    # Preview
    preview = mo.md("")
    if file_dropdown.value:
        df = pd.read_csv(file_dropdown.value)
-        md_content = csv_to_markdown(df)
+        md_content = csv_to_markdown(df.head(10))
        preview = mo.md(md_content)
    preview
--- a/02_Taguette_Post-Process.py
+++ b/02_Taguette_Post-Process.py
@@ -18,7 +18,7 @@ def _():
    client, _models = connect_qumo_ollama(OLLAMA_LOCATION, print_models=False)
-    TAGUETTE_EXPORT_DIR = Path('./data/transcripts/taguette_results')
+    TAGUETTE_EXPORT_DIR = Path('./data/processing/02_taguette_export')
    WORKING_DIR = Path('./data/processing/02_taguette_postprocess')
    if not WORKING_DIR.exists():
@@ -47,13 +47,18 @@ def _():
@app.cell(hide_code=True)
 def _(TAGUETTE_EXPORT_DIR, mo):
    mo.md(rf"""
-    # Step 1: Export All Highlights out of Taguette
+    # Step 1: Export Data out of Taguette
-    1. Go to: http://taguette.tail44fa00.ts.net/project/1
+    **Highlights**
-    2. Select 'Highlights' on left
+    1. Go to: https://taguette.qumo.io/project/1
-    3. Select 'See all hightlights'
+    2. Select 'Highlights' (left side) > 'See all hightlights' > 'Export this view' (top right) > 'CSV'
-    4. Top right 'Export this view' > 'CSV'
+    3. Save to '{TAGUETTE_EXPORT_DIR}/all_tags.csv'
-    5. Save to '{TAGUETTE_EXPORT_DIR}/all_tags.csv'
+
    **Tags Codebook**
    1. Select 'Project Info' (left side) > 'Export codebook' > 'CSV'
    2. Save to '{TAGUETTE_EXPORT_DIR}/codebook.csv'
    _NOTE: Sometimes you need to explicitly allow 'Unsafe Download' in the browser's download manager_
    """)
    return
@@ -67,13 +72,21 @@ def _(mo):
@app.cell
-def _(pd):
+def _(TAGUETTE_EXPORT_DIR, pd):
-    all_tags_df = pd.read_csv('data/transcripts/taguette_results/all_tags.csv')
+    all_tags_df = pd.read_csv(f'{TAGUETTE_EXPORT_DIR}/all_tags.csv')
    all_tags_df['_seq_id'] = range(len(all_tags_df))
-    all_tags_df.head(20)
+    all_tags_df
    return (all_tags_df,)
@app.cell
 def _(TAGUETTE_EXPORT_DIR, pd):
    codebook_df = pd.read_csv(f'{TAGUETTE_EXPORT_DIR}/codebook.csv')
    codebook_df.rename(columns={'description': 'theme_description'}, inplace=True)
    codebook_df
    return (codebook_df,)
@app.cell(hide_code=True)
 def _(mo):
    mo.md(r"""
@@ -255,30 +268,51 @@ def _(mo):
@app.cell
-def _(client, model_select, pd, sentiment_df):
+def _(mo):
-    # for now, create an empty sentiment column with randomized dummy values for testing
+    start_processing_btn = mo.ui.button(
-    # only for 'VT -' and 'CT -' tags
+        label="Start Sentiment Extraction",
        kind="warn",
        on_click=lambda val: True
    )
    start_processing_btn
    return (start_processing_btn,)
@app.cell
 def _(
    client,
    codebook_df,
    mo,
    model_select,
    pd,
    sentiment_df,
    start_processing_btn,
 ):
    from utils import dummy_sentiment_analysis, ollama_sentiment_analysis
-    # Only run on rows without manual_analysis
+    # add theme_description to be used in LLM prompt
    _df = sentiment_df.merge(codebook_df, on='tag', how='left', suffixes=('', '_codebook'))
-    # sentiment_df[['sentiment', 'reason']] = sentiment_df[~sentiment_df['manual_analysis']].apply(
+    # Wait for start processing button
-    #     lambda row: pd.Series(dummy_sentiment_analysis(row['content'], row['tag'])),
+    mo.stop(not start_processing_btn.value, "Click button above to start processing")
    #     axis=1
    # )
-    sentiment_df[['keywords', 'sentiment', 'reason']] = sentiment_df[~sentiment_df['manual_analysis']].apply(
+
-        lambda row: pd.Series(ollama_sentiment_analysis(row['content'], row['theme'], client=client, model=model_select.value)),
+    sentiment_df[['keywords', 'sentiment', 'reason']] = _df[~_df['manual_analysis']].apply(
        lambda row: pd.Series(ollama_sentiment_analysis(
            content=row['content'], 
            theme=row['theme'], 
            theme_description=row['theme_description'],
            client=client, 
            model=model_select.value
        )),
        axis=1
    )
    return
@app.cell
-def _(sentiment_df):
+def _(mo, sentiment_df):
    mo.stop(('sentiment' not in sentiment_df.columns), "Run above cells to extract sentiment analysis")
    sentiment_df.loc[~sentiment_df['manual_analysis'], ['theme', 'content', 'sentiment', 'reason', 'keywords']]
    return
@@ -318,6 +352,13 @@ def _(mo, sentiment_df):
    return rows_to_edit, split_rows_editor
@app.cell
 def _(split_rows_editor):
    split_rows_editor
    return
@app.cell(hide_code=True)
 def _(mo, rows_to_edit, split_rows_editor):
    if split_rows_editor is not None:
--- a/04_Results_Aggregation.py
+++ b/04_Results_Aggregation.py
@@ -32,7 +32,7 @@ def _(INPUT_DIR, mo):
    file_options = {f.stem: str(f) for f in voice_csv_files}
    voice_multiselect = mo.ui.multiselect(options=file_options, label="Select Voice CSV Files for Aggregation")
-    voice_multiselect
+
    return (voice_multiselect,)
--- a/ollama/docker-compose.yml
+++ b/ollama/docker-compose.yml
@@ -17,6 +17,8 @@ services:
    #       c) Explicitly override: docker compose run --gpus all ollama
    # 3. If your Docker/Compose version does NOT honor the reservation below, uncomment the
    #    'devices' section further down as a fallback (less portable).
    ## UNCOMMENT THE FOLLOWING BLOCK FOR NVIDIA GPU SUPPORT ###
    deploy:
      resources:
        reservations:
@@ -29,6 +31,8 @@ services:
      # Visible devices / capabilities for the NVIDIA container runtime
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
    ## ---------- END GPU SUPPORT BLOCK ------------###
    # Fallback (UNCOMMENT ONLY if the reservation above is ignored and you still get errors):
    # devices:
--- a/utils/sentiment_analysis.py
+++ b/utils/sentiment_analysis.py
@@ -12,7 +12,7 @@ def dummy_sentiment_analysis(content, tag):
-def ollama_sentiment_analysis(content, theme, client: Client, model) -> tuple[list[str], int, str]:
+def ollama_sentiment_analysis(content, theme, theme_description, client: Client, model) -> tuple[list[str], int, str]:
    """
    Perform sentiment analysis using Ollama model.
@@ -24,79 +24,86 @@ def ollama_sentiment_analysis(content, theme, client: Client, model) -> tuple[li
    - sentiment score and reason
    """
    prompt = f"""
-    # Instructions
+    # Role
-    You are an expert in sentiment analysis and natural language processing. You are given a quote from an interview along with a theme tag. Your task is to analyze the sentiment expressed in the quote in relation to the provided theme, and provide a short explanation for your assessment (max 10 words).
+    You are an expert in sentiment analysis. Your task is to analyze the sentiment of a quote in relation to a specific theme.
    You need to deliver three pieces of information:
    1. A list of keywords from the quote quantify or qualify the theme, and that influenced your sentiment analysis (if any).
    2. A sentiment score: -1 for negative, 0 for neutral, and 1 for positive sentiment.
    3. A brief reason (max 10 words) explaining your sentiment score.
    # Guidelines    
    Keywords should be directly relevant to the theme.
    The reason should be extremely concise and to the point:
    - Does not need to be a full sentence.
    - Sentiment itself does not need to be stated in the explanation.
    - If keywords are present in the quote that directly capture the sentiment, give that as the reason..
    # Input
    Theme: `{theme}`
-    
+    Theme Description: `{theme_description}`
    Quote:
    ```
    {content}
    ```
-    
+
    # Instructions
    1. Analyze the sentiment of the quote specifically regarding the theme.
    2. Extract relevant keywords or phrases from the quote. Prioritize specific descriptors found in the text that match or relate to the theme.
    3. Assign a sentiment score:
       - -1: Negative (complaint, dissatisfaction, criticism)
       - 0: Neutral (factual, mixed, or no strong opinion)
       - 1: Positive (praise, satisfaction, agreement)
    4. Provide a concise reason (max 10 words).
    # Constraints
    - Return ONLY a valid JSON object.
    - Do not use Markdown formatting (no ```json blocks).
    - Do not write any Python code or explanations outside the JSON.
    - If the quote is irrelevant to the theme, return sentiment 0.
    # Response Format
    Provide your response in the following JSON format:
    {{
-        "keywords": ["<list_of_relevant_keywords_if_any>"],
+        "keywords": ["<list_of_keywords>"],
-        "sentiment": <sentiment_score>,
+        "sentiment": <integer_score>,
-        "reason": "<brief_explanation_max_10_words>"
+        "reason": "<string_reason>"
    }}
    # Examples
    ** Example 1**
    - Theme: `Speed`
    - Quote: `It just was a little toned down. It was almost like he was talking like this. You know? It almost kind of this was a little slow for me.`
-    - Response: {{"keywords": ["slow"], "sentiment": -1, "reason": "States speed is slow, indicates dissatisfaction"}}
+    Example 1:
-    
+    Theme: `Speed`
-    ** Example 2**
+    Quote: `It was a little slow for me.`
-    - Theme: `Friendliness / Empathy`
+    Response: {{"keywords": ["slow"], "sentiment": -1, "reason": "Dissatisfaction with speed"}}
-    - Quote: `Sound very welcoming`
+
-    
+    Example 2:
-    - Response: {{ "keywords": ["welcoming"], "sentiment": 1, "reason": "Uses 'welcoming'" }}
+    Theme: `Price`
-    
+    Quote: `It costs $50.`
    Response: {{"keywords": [], "sentiment": 0, "reason": "Factual statement"}}
    Example 3:
    Theme: `Friendliness`
    Quote: `Sound very welcoming.`
    Response: {{"keywords": ["welcoming"], "sentiment": 1, "reason": "Positive descriptor used"}}
    """
-    resp = client.generate(
+    max_retries = 3
-        model=model,
+    for attempt in range(max_retries):
-        prompt=prompt,
+        try:
-    )
+            resp = client.generate(
-    
+                model=model,
-    try:
+                prompt=prompt,
-        response_text = resp.response.strip()
+            )
            response_text = resp.response.strip()
-        # Extract JSON from response
+            # Extract JSON from response
-        start_index = response_text.find('{')
+            start_index = response_text.find('{')
-        end_index = response_text.rfind('}') + 1
+            end_index = response_text.rfind('}') + 1
-        json_str = response_text[start_index:end_index]
+            
            if start_index == -1 or end_index == 0:
                raise ValueError("No JSON found")
            json_str = response_text[start_index:end_index]
            response_json = json.loads(json_str)
            keywords = response_json.get('keywords', [])
            sentiment = response_json.get('sentiment', 'test')
            reason = response_json.get('reason', 'no reason provided')
            return keywords, sentiment, reason
-        response_json = json.loads(json_str)
+        except Exception as e:
-        keywords = response_json.get('keywords', [])
+            print(f"Attempt {attempt + 1}/{max_retries} failed: {e}")
-        sentiment = response_json.get('sentiment', 'test')
+            if attempt == max_retries - 1:
-        reason = response_json.get('reason', 'no reason provided')
+                return [], None, 'parsing error'
        return keywords, sentiment, reason
    except Exception as e:
        print(f"Error parsing response: {e}")
        return [], None, 'parsing error'
 if __name__ == "__main__":