llm processing of sentiment

2025-12-12 14:28:51 +01:00
parent e576f98cce
commit ccc5154b93
5 changed files with 135 additions and 83 deletions
--- a/01_Taguette-Pre-Process.py
+++ b/01_Taguette-Pre-Process.py
@@ -70,13 +70,13 @@ def csv_to_markdown(df):
    return "\n\n".join(lines)


-@app.cell
+@app.cell(hide_code=True)
 def _(file_dropdown, mo, pd):
    # Preview
    preview = mo.md("")
    if file_dropdown.value:
        df = pd.read_csv(file_dropdown.value)
-        md_content = csv_to_markdown(df)
+        md_content = csv_to_markdown(df.head(10))
        preview = mo.md(md_content)

    preview
--- a/02_Taguette_Post-Process.py
+++ b/02_Taguette_Post-Process.py
@@ -18,7 +18,7 @@ def _():

    client, _models = connect_qumo_ollama(OLLAMA_LOCATION, print_models=False)

-    TAGUETTE_EXPORT_DIR = Path('./data/transcripts/taguette_results')
+    TAGUETTE_EXPORT_DIR = Path('./data/processing/02_taguette_export')
    WORKING_DIR = Path('./data/processing/02_taguette_postprocess')

    if not WORKING_DIR.exists():
@@ -47,13 +47,18 @@ def _():
@app.cell(hide_code=True)
 def _(TAGUETTE_EXPORT_DIR, mo):
    mo.md(rf"""
-    # Step 1: Export All Highlights out of Taguette
+    # Step 1: Export Data out of Taguette

-    1. Go to: http://taguette.tail44fa00.ts.net/project/1
-    2. Select 'Highlights' on left
-    3. Select 'See all hightlights'
-    4. Top right 'Export this view' > 'CSV'
-    5. Save to '{TAGUETTE_EXPORT_DIR}/all_tags.csv'
+    **Highlights**
+    1. Go to: https://taguette.qumo.io/project/1
+    2. Select 'Highlights' (left side) > 'See all highlights' > 'Export this view' (top right) > 'CSV'
+    3. Save to '{TAGUETTE_EXPORT_DIR}/all_tags.csv'
+
+    **Tags Codebook**
+    1. Select 'Project Info' (left side) > 'Export codebook' > 'CSV'
+    2. Save to '{TAGUETTE_EXPORT_DIR}/codebook.csv'
+
+    _NOTE: Sometimes you need to explicitly allow 'Unsafe Download' in the browser's download manager_
    """)
    return

@@ -67,13 +72,21 @@ def _(mo):


@app.cell
-def _(pd):
-    all_tags_df = pd.read_csv('data/transcripts/taguette_results/all_tags.csv')
+def _(TAGUETTE_EXPORT_DIR, pd):
+    all_tags_df = pd.read_csv(f'{TAGUETTE_EXPORT_DIR}/all_tags.csv')
    all_tags_df['_seq_id'] = range(len(all_tags_df))
-    all_tags_df.head(20)
+    all_tags_df
    return (all_tags_df,)


+@app.cell
+def _(TAGUETTE_EXPORT_DIR, pd):
+    codebook_df = pd.read_csv(f'{TAGUETTE_EXPORT_DIR}/codebook.csv')
+    codebook_df.rename(columns={'description': 'theme_description'}, inplace=True)
+    codebook_df
+    return (codebook_df,)
+
+
@app.cell(hide_code=True)
 def _(mo):
    mo.md(r"""
@@ -255,30 +268,51 @@ def _(mo):


@app.cell
-def _(client, model_select, pd, sentiment_df):
-    # for now, create an empty sentiment column with randomized dummy values for testing
-    # only for 'VT -' and 'CT -' tags
+def _(mo):
+    start_processing_btn = mo.ui.button(
+        label="Start Sentiment Extraction",
+        kind="warn",
+        on_click=lambda val: True
+    )
+    start_processing_btn
+    return (start_processing_btn,)

+
+@app.cell
+def _(
+    client,
+    codebook_df,
+    mo,
+    model_select,
+    pd,
+    sentiment_df,
+    start_processing_btn,
+):
    from utils import dummy_sentiment_analysis, ollama_sentiment_analysis

-    # Only run on rows without manual_analysis
+    # add theme_description to be used in LLM prompt
+    _df = sentiment_df.merge(codebook_df, on='tag', how='left', suffixes=('', '_codebook'))

-    # sentiment_df[['sentiment', 'reason']] = sentiment_df[~sentiment_df['manual_analysis']].apply(
-    #     lambda row: pd.Series(dummy_sentiment_analysis(row['content'], row['tag'])),
-    #     axis=1
-    # )
+    # Wait for start processing button
+    mo.stop(not start_processing_btn.value, "Click button above to start processing")

-    sentiment_df[['keywords', 'sentiment', 'reason']] = sentiment_df[~sentiment_df['manual_analysis']].apply(
-        lambda row: pd.Series(ollama_sentiment_analysis(row['content'], row['theme'], client=client, model=model_select.value)),
+
+    sentiment_df[['keywords', 'sentiment', 'reason']] = _df[~_df['manual_analysis']].apply(
+        lambda row: pd.Series(ollama_sentiment_analysis(
+            content=row['content'], 
+            theme=row['theme'], 
+            theme_description=row['theme_description'],
+            client=client, 
+            model=model_select.value
+        )),
        axis=1
    )
-
-
    return


@app.cell
-def _(sentiment_df):
+def _(mo, sentiment_df):
+    mo.stop(('sentiment' not in sentiment_df.columns), "Run above cells to extract sentiment analysis")
    sentiment_df.loc[~sentiment_df['manual_analysis'], ['theme', 'content', 'sentiment', 'reason', 'keywords']]
    return

@@ -318,6 +352,13 @@ def _(mo, sentiment_df):
    return rows_to_edit, split_rows_editor


+@app.cell
+def _(split_rows_editor):
+    split_rows_editor
+
+    return
+
+
@app.cell(hide_code=True)
 def _(mo, rows_to_edit, split_rows_editor):
    if split_rows_editor is not None:
--- a/04_Results_Aggregation.py
+++ b/04_Results_Aggregation.py
@@ -32,7 +32,7 @@ def _(INPUT_DIR, mo):
    file_options = {f.stem: str(f) for f in voice_csv_files}

    voice_multiselect = mo.ui.multiselect(options=file_options, label="Select Voice CSV Files for Aggregation")
-    voice_multiselect
+
    return (voice_multiselect,)


--- a/ollama/docker-compose.yml
+++ b/ollama/docker-compose.yml
@@ -17,6 +17,8 @@ services:
    #       c) Explicitly override: docker compose run --gpus all ollama
    # 3. If your Docker/Compose version does NOT honor the reservation below, uncomment the
    #    'devices' section further down as a fallback (less portable).
+
+    ## BEGIN GPU SUPPORT BLOCK (must stay uncommented for NVIDIA GPU support) ###
    deploy:
      resources:
        reservations:
@@ -29,6 +31,8 @@ services:
      # Visible devices / capabilities for the NVIDIA container runtime
      - NVIDIA_VISIBLE_DEVICES=all
      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
+    ## ---------- END GPU SUPPORT BLOCK ------------###
+

    # Fallback (UNCOMMENT ONLY if the reservation above is ignored and you still get errors):
    # devices:
--- a/utils/sentiment_analysis.py
+++ b/utils/sentiment_analysis.py
@@ -12,7 +12,7 @@ def dummy_sentiment_analysis(content, tag):



-def ollama_sentiment_analysis(content, theme, client: Client, model) -> tuple[list[str], int, str]:
+def ollama_sentiment_analysis(content, theme, theme_description, client: Client, model) -> tuple[list[str], int, str]:
    """
    Perform sentiment analysis using Ollama model.

@@ -24,79 +24,86 @@ def ollama_sentiment_analysis(content, theme, client: Client, model) -> tuple[li
    - sentiment score and reason
    """
    prompt = f"""
-    # Instructions
-    You are an expert in sentiment analysis and natural language processing. You are given a quote from an interview along with a theme tag. Your task is to analyze the sentiment expressed in the quote in relation to the provided theme, and provide a short explanation for your assessment (max 10 words).
-    
-    You need to deliver three pieces of information:
-    1. A list of keywords from the quote quantify or qualify the theme, and that influenced your sentiment analysis (if any).
-    2. A sentiment score: -1 for negative, 0 for neutral, and 1 for positive sentiment.
-    3. A brief reason (max 10 words) explaining your sentiment score.
-    
-
-    # Guidelines    
-    Keywords should be directly relevant to the theme.
-    
-    The reason should be extremely concise and to the point:
-    - Does not need to be a full sentence.
-    - Sentiment itself does not need to be stated in the explanation.
-    - If keywords are present in the quote that directly capture the sentiment, give that as the reason..
-    
+    # Role
+    You are an expert in sentiment analysis. Your task is to analyze the sentiment of a quote in relation to a specific theme.

    # Input
-    
    Theme: `{theme}`
-    
+    Theme Description: `{theme_description}`
    Quote:
    ```
    {content}
    ```

-    # Response Format
-    Provide your response in the following JSON format:
-    {{
-        "keywords": ["<list_of_relevant_keywords_if_any>"],
-        "sentiment": <sentiment_score>,
-        "reason": "<brief_explanation_max_10_words>"
-    }}
+    # Instructions
+    1. Analyze the sentiment of the quote specifically regarding the theme.
+    2. Extract relevant keywords or phrases from the quote. Prioritize specific descriptors found in the text that match or relate to the theme.
+    3. Assign a sentiment score:
+       - -1: Negative (complaint, dissatisfaction, criticism)
+       - 0: Neutral (factual, mixed, or no strong opinion)
+       - 1: Positive (praise, satisfaction, agreement)
+    4. Provide a concise reason (max 10 words).

+    # Constraints
+    - Return ONLY a valid JSON object.
+    - Do not use Markdown formatting (no ```json blocks).
+    - Do not write any Python code or explanations outside the JSON.
+    - If the quote is irrelevant to the theme, return sentiment 0.
+
+    # Response Format
+    {{
+        "keywords": ["<list_of_keywords>"],
+        "sentiment": <integer_score>,
+        "reason": "<string_reason>"
+    }}

    # Examples

-    ** Example 1**
-    - Theme: `Speed`
-    - Quote: `It just was a little toned down. It was almost like he was talking like this. You know? It almost kind of this was a little slow for me.`
+    Example 1:
+    Theme: `Speed`
+    Quote: `It was a little slow for me.`
+    Response: {{"keywords": ["slow"], "sentiment": -1, "reason": "Dissatisfaction with speed"}}

-    - Response: {{"keywords": ["slow"], "sentiment": -1, "reason": "States speed is slow, indicates dissatisfaction"}}
-    
-    ** Example 2**
-    - Theme: `Friendliness / Empathy`
-    - Quote: `Sound very welcoming`
-    
-    - Response: {{ "keywords": ["welcoming"], "sentiment": 1, "reason": "Uses 'welcoming'" }}
+    Example 2:
+    Theme: `Price`
+    Quote: `It costs $50.`
+    Response: {{"keywords": [], "sentiment": 0, "reason": "Factual statement"}}

+    Example 3:
+    Theme: `Friendliness`
+    Quote: `Sound very welcoming.`
+    Response: {{"keywords": ["welcoming"], "sentiment": 1, "reason": "Positive descriptor used"}}
    """

-    resp = client.generate(
-        model=model,
-        prompt=prompt,
-    )
+    max_retries = 3
+    for attempt in range(max_retries):
+        try:
+            resp = client.generate(
+                model=model,
+                prompt=prompt,
+            )
            
-    try:
-        response_text = resp.response.strip()
+            response_text = resp.response.strip()

-        # Extract JSON from response
-        start_index = response_text.find('{')
-        end_index = response_text.rfind('}') + 1
-        json_str = response_text[start_index:end_index]
+            # Extract JSON from response
+            start_index = response_text.find('{')
+            end_index = response_text.rfind('}') + 1
            
-        response_json = json.loads(json_str)
-        keywords = response_json.get('keywords', [])
-        sentiment = response_json.get('sentiment', 'test')
-        reason = response_json.get('reason', 'no reason provided')
-        return keywords, sentiment, reason
-    except Exception as e:
-        print(f"Error parsing response: {e}")
-        return [], None, 'parsing error'
+            if start_index == -1 or end_index == 0:
+                raise ValueError("No JSON found")
+
+            json_str = response_text[start_index:end_index]
+            
+            response_json = json.loads(json_str)
+            keywords = response_json.get('keywords', [])
+            sentiment = response_json.get('sentiment', 'test')
+            reason = response_json.get('reason', 'no reason provided')
+            return keywords, sentiment, reason
+        
+        except Exception as e:
+            print(f"Attempt {attempt + 1}/{max_retries} failed: {e}")
+            if attempt == max_retries - 1:
+                return [], None, 'parsing error'


 if __name__ == "__main__":