From ad00860fa128a238959780c1f7f5f17870b3fc8d Mon Sep 17 00:00:00 2001 From: Luigi Maiorano Date: Wed, 10 Dec 2025 08:28:01 +0100 Subject: [PATCH] added local ollama support --- .gitignore | 4 ++- 02_Taguette_Post-Process.py | 7 +++++ ollama/docker-compose.yml | 56 +++++++++++++++++++++++++++++++++++++ utils.py | 10 +++++-- 4 files changed, 73 insertions(+), 4 deletions(-) create mode 100644 ollama/docker-compose.yml diff --git a/.gitignore b/.gitignore index dceb869..3c1ca91 100644 --- a/.gitignore +++ b/.gitignore @@ -11,4 +11,6 @@ __marimo__ __pycache__/ -data/ \ No newline at end of file +data/ +docker-volumes/ +logs/ \ No newline at end of file diff --git a/02_Taguette_Post-Process.py b/02_Taguette_Post-Process.py index f456a3a..8ea44cc 100644 --- a/02_Taguette_Post-Process.py +++ b/02_Taguette_Post-Process.py @@ -11,6 +11,13 @@ def _(): from pathlib import Path from datetime import datetime + from utils import connect_qumo_ollama + + OLLAMA_LOCATION= 'localhost' + # VM_NAME = 'ollama-lite' + + client = connect_qumo_ollama(OLLAMA_LOCATION) + TAGUETTE_EXPORT_DIR = Path('./data/transcripts/taguette_results') WORKING_DIR = Path('./data/processing/02_taguette_postprocess') diff --git a/ollama/docker-compose.yml b/ollama/docker-compose.yml new file mode 100644 index 0000000..c5f903f --- /dev/null +++ b/ollama/docker-compose.yml @@ -0,0 +1,56 @@ +services: + ollama: + image: ollama/ollama:latest + ports: + - 11434:11434 + volumes: + - ./docker-volumes/ollama:/root/.ollama + container_name: ollama + tty: true + restart: unless-stopped + # GPU SUPPORT NOTES: + # 1. The "deploy" section is ignored by classic 'docker-compose'; it's honored in Swarm. + # 2. For local 'docker compose up' with NVIDIA GPUs you need the host configured with + # nvidia-container-toolkit. 
Then either: + # a) Leave the reservation block (Compose V2 now honors it) OR + # b) Start with: docker compose up --build (Compose will request GPUs) OR + # c) Explicitly override: docker compose run --gpus all ollama + # 3. If your Docker/Compose version does NOT honor the reservation below, uncomment the + # 'devices' section further down as a fallback (less portable). + # deploy: + # resources: + # reservations: + # devices: + # - driver: nvidia + # count: all + # capabilities: [gpu] + + # environment: + # Visible devices / capabilities for the NVIDIA container runtime + # - NVIDIA_VISIBLE_DEVICES=all + # - NVIDIA_DRIVER_CAPABILITIES=compute,utility + + # Fallback (UNCOMMENT ONLY if the reservation above is ignored and you still get errors): + # devices: + # - /dev/nvidiactl:/dev/nvidiactl + # - /dev/nvidia-uvm:/dev/nvidia-uvm + # - /dev/nvidia-uvm-tools:/dev/nvidia-uvm-tools + # - /dev/nvidia0:/dev/nvidia0 + + open-webui: + image: ghcr.io/open-webui/open-webui:main + container_name: open-webui + volumes: + - ./docker-volumes/open-webui:/app/backend/data + depends_on: + - ollama + ports: + - 3000:8080 + environment: + - 'OLLAMA_BASE_URL=http://ollama:11434' + - 'ENABLE_OLLAMA_API=true' + - 'WEBUI_SECRET_KEY=' + + extra_hosts: + - host.docker.internal:host-gateway + restart: unless-stopped diff --git a/utils.py b/utils.py index 830a9ec..c40aad1 100644 --- a/utils.py +++ b/utils.py @@ -61,7 +61,7 @@ def load_srt(path: str | Path) -> str: return '\n\n'.join(transcript_lines) -def connect_qumo_ollama(vm_name: str ='ollama-lite') -> Client: +def connect_qumo_ollama(vm_name: str ='ollama-lite', port='11434') -> Client: """Establish connection to Qumo Ollama instance vm_name: str ('ollama-lite' or 'hiperf-gpu') @@ -70,14 +70,18 @@ def connect_qumo_ollama(vm_name: str ='ollama-lite') -> Client: Returns: tuple(Client): Ollama client connected to the specified VM """ - QUMO_OLLAMA_URL = f'http://{vm_name}.tail44fa00.ts.net:11434' + QUMO_OLLAMA_URL = 
f'http://{vm_name}.tail44fa00.ts.net:{port}' + + if vm_name in ['localhost', '0.0.0.0']: + QUMO_OLLAMA_URL = f"http://{vm_name}:{port}" + try: requests.get(QUMO_OLLAMA_URL, timeout=5) client = Client( host=QUMO_OLLAMA_URL ) - print(f"Connection succesful. WebUI available at: http://{vm_name}.tail44fa00.ts.net:3000\nAvailable models:") + print(f"Connection successful. WebUI available at: {QUMO_OLLAMA_URL.replace(port, '3000')}\nAvailable models:") for m in client.list().models: print(f" - '{m.model}' ") return client