diff --git a/01_Taguette-Pre-Process.py b/01_Taguette-Pre-Process.py index db809cb..7d3c2f0 100644 --- a/01_Taguette-Pre-Process.py +++ b/01_Taguette-Pre-Process.py @@ -70,13 +70,13 @@ def csv_to_markdown(df): return "\n\n".join(lines) -@app.cell +@app.cell(hide_code=True) def _(file_dropdown, mo, pd): # Preview preview = mo.md("") if file_dropdown.value: df = pd.read_csv(file_dropdown.value) - md_content = csv_to_markdown(df) + md_content = csv_to_markdown(df.head(10)) preview = mo.md(md_content) preview diff --git a/02_Taguette_Post-Process.py b/02_Taguette_Post-Process.py index 10ea1b7..082ea59 100644 --- a/02_Taguette_Post-Process.py +++ b/02_Taguette_Post-Process.py @@ -18,7 +18,7 @@ def _(): client, _models = connect_qumo_ollama(OLLAMA_LOCATION, print_models=False) - TAGUETTE_EXPORT_DIR = Path('./data/transcripts/taguette_results') + TAGUETTE_EXPORT_DIR = Path('./data/processing/02_taguette_export') WORKING_DIR = Path('./data/processing/02_taguette_postprocess') if not WORKING_DIR.exists(): @@ -47,13 +47,18 @@ def _(): @app.cell(hide_code=True) def _(TAGUETTE_EXPORT_DIR, mo): mo.md(rf""" - # Step 1: Export All Highlights out of Taguette + # Step 1: Export Data out of Taguette - 1. Go to: http://taguette.tail44fa00.ts.net/project/1 - 2. Select 'Highlights' on left - 3. Select 'See all hightlights' - 4. Top right 'Export this view' > 'CSV' - 5. Save to '{TAGUETTE_EXPORT_DIR}/all_tags.csv' + **Highlights** + 1. Go to: https://taguette.qumo.io/project/1 + 2. Select 'Highlights' (left side) > 'See all hightlights' > 'Export this view' (top right) > 'CSV' + 3. Save to '{TAGUETTE_EXPORT_DIR}/all_tags.csv' + + **Tags Codebook** + 1. Select 'Project Info' (left side) > 'Export codebook' > 'CSV' + 2. Save to '{TAGUETTE_EXPORT_DIR}/codebook.csv' + + _NOTE: Sometimes you need to explicitly allow 'Unsafe Download' in the browser's download manager_ """) return @@ -67,13 +72,21 @@ def _(mo): @app.cell -def _(pd): - all_tags_df = pd.read_csv('data/transcripts/taguette_results/all_tags.csv') +def _(TAGUETTE_EXPORT_DIR, pd): + all_tags_df = pd.read_csv(f'{TAGUETTE_EXPORT_DIR}/all_tags.csv') all_tags_df['_seq_id'] = range(len(all_tags_df)) - all_tags_df.head(20) + all_tags_df return (all_tags_df,) +@app.cell +def _(TAGUETTE_EXPORT_DIR, pd): + codebook_df = pd.read_csv(f'{TAGUETTE_EXPORT_DIR}/codebook.csv') + codebook_df.rename(columns={'description': 'theme_description'}, inplace=True) + codebook_df + return (codebook_df,) + + @app.cell(hide_code=True) def _(mo): mo.md(r""" @@ -255,30 +268,51 @@ def _(mo): @app.cell -def _(client, model_select, pd, sentiment_df): - # for now, create an empty sentiment column with randomized dummy values for testing - # only for 'VT -' and 'CT -' tags +def _(mo): + start_processing_btn = mo.ui.button( + label="Start Sentiment Extraction", + kind="warn", + on_click=lambda val: True + ) + start_processing_btn + return (start_processing_btn,) + +@app.cell +def _( + client, + codebook_df, + mo, + model_select, + pd, + sentiment_df, + start_processing_btn, +): from utils import dummy_sentiment_analysis, ollama_sentiment_analysis - # Only run on rows without manual_analysis + # add theme_description to be used in LLM prompt + _df = sentiment_df.merge(codebook_df, on='tag', how='left', suffixes=('', '_codebook')) - # sentiment_df[['sentiment', 'reason']] = sentiment_df[~sentiment_df['manual_analysis']].apply( - # lambda row: pd.Series(dummy_sentiment_analysis(row['content'], row['tag'])), - # axis=1 - # ) + # Wait for start processing button + mo.stop(not start_processing_btn.value, "Click button above to start processing") - sentiment_df[['keywords', 'sentiment', 'reason']] = sentiment_df[~sentiment_df['manual_analysis']].apply( - lambda row: pd.Series(ollama_sentiment_analysis(row['content'], row['theme'], client=client, model=model_select.value)), + + sentiment_df[['keywords', 'sentiment', 'reason']] = _df[~_df['manual_analysis']].apply( + lambda row: pd.Series(ollama_sentiment_analysis( + content=row['content'], + theme=row['theme'], + theme_description=row['theme_description'], + client=client, + model=model_select.value + )), axis=1 ) - - return @app.cell -def _(sentiment_df): +def _(mo, sentiment_df): + mo.stop(('sentiment' not in sentiment_df.columns), "Run above cells to extract sentiment analysis") sentiment_df.loc[~sentiment_df['manual_analysis'], ['theme', 'content', 'sentiment', 'reason', 'keywords']] return @@ -318,6 +352,13 @@ def _(mo, sentiment_df): return rows_to_edit, split_rows_editor +@app.cell +def _(split_rows_editor): + split_rows_editor + + return + + @app.cell(hide_code=True) def _(mo, rows_to_edit, split_rows_editor): if split_rows_editor is not None: diff --git a/04_Results_Aggregation.py b/04_Results_Aggregation.py index 3b99d8b..ac670ea 100644 --- a/04_Results_Aggregation.py +++ b/04_Results_Aggregation.py @@ -32,7 +32,7 @@ def _(INPUT_DIR, mo): file_options = {f.stem: str(f) for f in voice_csv_files} voice_multiselect = mo.ui.multiselect(options=file_options, label="Select Voice CSV Files for Aggregation") - voice_multiselect + return (voice_multiselect,) diff --git a/ollama/docker-compose.yml b/ollama/docker-compose.yml index 75647e7..b34364b 100644 --- a/ollama/docker-compose.yml +++ b/ollama/docker-compose.yml @@ -17,6 +17,8 @@ services: # c) Explicitly override: docker compose run --gpus all ollama # 3. If your Docker/Compose version does NOT honor the reservation below, uncomment the # 'devices' section further down as a fallback (less portable). + + ## UNCOMMENT THE FOLLOWING BLOCK FOR NVIDIA GPU SUPPORT ### deploy: resources: reservations: @@ -29,6 +31,8 @@ services: # Visible devices / capabilities for the NVIDIA container runtime - NVIDIA_VISIBLE_DEVICES=all - NVIDIA_DRIVER_CAPABILITIES=compute,utility + ## ---------- END GPU SUPPORT BLOCK ------------### + # Fallback (UNCOMMENT ONLY if the reservation above is ignored and you still get errors): # devices: diff --git a/utils/sentiment_analysis.py b/utils/sentiment_analysis.py index ed46edc..8b4c280 100644 --- a/utils/sentiment_analysis.py +++ b/utils/sentiment_analysis.py @@ -12,7 +12,7 @@ def dummy_sentiment_analysis(content, tag): -def ollama_sentiment_analysis(content, theme, client: Client, model) -> tuple[list[str], int, str]: +def ollama_sentiment_analysis(content, theme, theme_description, client: Client, model) -> tuple[list[str], int, str]: """ Perform sentiment analysis using Ollama model. @@ -24,79 +24,86 @@ def ollama_sentiment_analysis(content, theme, client: Client, model) -> tuple[li - sentiment score and reason """ prompt = f""" - # Instructions - You are an expert in sentiment analysis and natural language processing. You are given a quote from an interview along with a theme tag. Your task is to analyze the sentiment expressed in the quote in relation to the provided theme, and provide a short explanation for your assessment (max 10 words). - - You need to deliver three pieces of information: - 1. A list of keywords from the quote quantify or qualify the theme, and that influenced your sentiment analysis (if any). - 2. A sentiment score: -1 for negative, 0 for neutral, and 1 for positive sentiment. - 3. A brief reason (max 10 words) explaining your sentiment score. - + # Role + You are an expert in sentiment analysis. Your task is to analyze the sentiment of a quote in relation to a specific theme. - # Guidelines - Keywords should be directly relevant to the theme. - - The reason should be extremely concise and to the point: - - Does not need to be a full sentence. - - Sentiment itself does not need to be stated in the explanation. - - If keywords are present in the quote that directly capture the sentiment, give that as the reason.. - - # Input - Theme: `{theme}` - + Theme Description: `{theme_description}` Quote: ``` {content} ``` - + + # Instructions + 1. Analyze the sentiment of the quote specifically regarding the theme. + 2. Extract relevant keywords or phrases from the quote. Prioritize specific descriptors found in the text that match or relate to the theme. + 3. Assign a sentiment score: + - -1: Negative (complaint, dissatisfaction, criticism) + - 0: Neutral (factual, mixed, or no strong opinion) + - 1: Positive (praise, satisfaction, agreement) + 4. Provide a concise reason (max 10 words). + + # Constraints + - Return ONLY a valid JSON object. + - Do not use Markdown formatting (no ```json blocks). + - Do not write any Python code or explanations outside the JSON. + - If the quote is irrelevant to the theme, return sentiment 0. + # Response Format - Provide your response in the following JSON format: {{ - "keywords": [""], - "sentiment": , - "reason": "" + "keywords": [""], + "sentiment": , + "reason": "" }} - # Examples - - ** Example 1** - - Theme: `Speed` - - Quote: `It just was a little toned down. It was almost like he was talking like this. You know? It almost kind of this was a little slow for me.` - - Response: {{"keywords": ["slow"], "sentiment": -1, "reason": "States speed is slow, indicates dissatisfaction"}} - - ** Example 2** - - Theme: `Friendliness / Empathy` - - Quote: `Sound very welcoming` - - - Response: {{ "keywords": ["welcoming"], "sentiment": 1, "reason": "Uses 'welcoming'" }} - + Example 1: + Theme: `Speed` + Quote: `It was a little slow for me.` + Response: {{"keywords": ["slow"], "sentiment": -1, "reason": "Dissatisfaction with speed"}} + + Example 2: + Theme: `Price` + Quote: `It costs $50.` + Response: {{"keywords": [], "sentiment": 0, "reason": "Factual statement"}} + + Example 3: + Theme: `Friendliness` + Quote: `Sound very welcoming.` + Response: {{"keywords": ["welcoming"], "sentiment": 1, "reason": "Positive descriptor used"}} """ - resp = client.generate( - model=model, - prompt=prompt, - ) - - try: - response_text = resp.response.strip() + max_retries = 3 + for attempt in range(max_retries): + try: + resp = client.generate( + model=model, + prompt=prompt, + ) + + response_text = resp.response.strip() - # Extract JSON from response - start_index = response_text.find('{') - end_index = response_text.rfind('}') + 1 - json_str = response_text[start_index:end_index] + # Extract JSON from response + start_index = response_text.find('{') + end_index = response_text.rfind('}') + 1 + + if start_index == -1 or end_index == 0: + raise ValueError("No JSON found") + + json_str = response_text[start_index:end_index] + + response_json = json.loads(json_str) + keywords = response_json.get('keywords', []) + sentiment = response_json.get('sentiment', 'test') + reason = response_json.get('reason', 'no reason provided') + return keywords, sentiment, reason - response_json = json.loads(json_str) - keywords = response_json.get('keywords', []) - sentiment = response_json.get('sentiment', 'test') - reason = response_json.get('reason', 'no reason provided') - return keywords, sentiment, reason - except Exception as e: - print(f"Error parsing response: {e}") - return [], None, 'parsing error' + except Exception as e: + print(f"Attempt {attempt + 1}/{max_retries} failed: {e}") + if attempt == max_retries - 1: + return [], None, 'parsing error' if __name__ == "__main__":