Deploy Ollama LLM server on ringtail (#277)

## Summary

- Deploy Ollama as a new ArgoCD-managed service on ringtail's k3s cluster with GPU acceleration
- Declarative model management via `models.txt` + sidecar sync script (mirrors kiwix torrent pattern)
- Initial models: `qwen2.5:14b`, `deepseek-r1:14b`, `phi4:14b`, `gemma3:12b`
- hostPath PV on `/mnt/storage1/ollama` for fast local model storage (200Gi)
- Tailscale ingress at `ollama.ops.eblu.me` for API access from tailnet
- Enable GPU time-slicing (`replicas: 2`) on nvidia-device-plugin so Frigate and Ollama share the RTX 4080

## Deployment and Testing

- [ ] Deploy nvidia-device-plugin changes first: `argocd app sync nvidia-device-plugin`
- [ ] Verify GPU time-slicing: `kubectl describe node ringtail --context=k3s-ringtail` shows `nvidia.com/gpu: 2`
- [ ] Sync `apps` app with `--revision feature/ollama-ringtail`
- [ ] Set ollama app to branch: `argocd app set ollama --revision feature/ollama-ringtail && argocd app sync ollama`
- [ ] Verify model-sync sidecar pulls models: `kubectl logs -n ollama deploy/ollama -c model-sync --context=k3s-ringtail`
- [ ] Test API: `curl https://ollama.ops.eblu.me/api/tags`
- [ ] Test inference: `curl https://ollama.ops.eblu.me/api/generate -d '{"model":"qwen2.5:14b","prompt":"Hello"}'`
- [ ] Verify Frigate still works after GPU sharing change
- [ ] After merge: `argocd app set ollama --revision main && argocd app sync ollama`

Reviewed-on: https://forge.ops.eblu.me/eblume/blumeops/pulls/277
2026-03-02 20:39:51 -08:00 · 2026-03-02 20:39:51 -08:00 · 31d925814f
commit 31d925814f
parent 0f79c61c42
15 changed files with 292 additions and 0 deletions
--- a/ansible/roles/caddy/defaults/main.yml
+++ b/ansible/roles/caddy/defaults/main.yml
@ -85,6 +85,9 @@ caddy_services:
  - name: ntfy
    host: "ntfy.{{ caddy_domain }}"
    backend: "https://ntfy.tail8d86e.ts.net"
+  - name: ollama
+    host: "ollama.{{ caddy_domain }}"
+    backend: "https://ollama.tail8d86e.ts.net"
  - name: sifaka
    host: "nas.{{ caddy_domain }}"
    backend: "http://sifaka:5000"
--- a/argocd/apps/ollama.yaml
+++ b/argocd/apps/ollama.yaml
@ -0,0 +1,18 @@
+---
+apiVersion: argoproj.io/v1alpha1
+kind: Application  # Argo CD app for argocd/manifests/ollama
+metadata:
+  name: ollama
+  namespace: argocd
+spec:
+  project: default
+  source:
+    repoURL: ssh://forgejo@forge.ops.eblu.me:2222/eblume/blumeops.git
+    targetRevision: main  # tracks the main branch, not a pinned commit
+    path: argocd/manifests/ollama
+  destination:
+    server: https://ringtail.tail8d86e.ts.net:6443  # ringtail cluster API
+    namespace: ollama
+  syncPolicy:  # no `automated` key here, so syncs are triggered manually
+    syncOptions:
+      - CreateNamespace=true  # create the ollama namespace if missing
--- a/argocd/manifests/nvidia-device-plugin/daemonset.yaml
+++ b/argocd/manifests/nvidia-device-plugin/daemonset.yaml
@ -25,6 +25,7 @@ spec:
          image: nvcr.io/nvidia/k8s-device-plugin
          args:
            - --device-id-strategy=index
+            - --config-file=/config/config.yaml
          env:
            - name: LD_LIBRARY_PATH
              value: /run/nvidia/lib
@ -39,6 +40,9 @@ spec:
            - name: nvidia-libs
              mountPath: /run/nvidia/lib
              readOnly: true
+            - name: plugin-config
+              mountPath: /config
+              readOnly: true
      volumes:
        - name: device-plugins
          hostPath:
@ -49,3 +53,6 @@ spec:
        - name: nvidia-libs
          hostPath:
            path: /etc/nvidia-driver/lib
+        - name: plugin-config
+          configMap:
+            name: nvidia-device-plugin-config
--- a/argocd/manifests/nvidia-device-plugin/kustomization.yaml
+++ b/argocd/manifests/nvidia-device-plugin/kustomization.yaml
@ -6,6 +6,7 @@ namespace: nvidia-device-plugin
 resources:
  - daemonset.yaml
  - runtime-class.yaml
+  - time-slicing-config.yaml

 images:
  - name: nvcr.io/nvidia/k8s-device-plugin
--- a/argocd/manifests/nvidia-device-plugin/time-slicing-config.yaml
+++ b/argocd/manifests/nvidia-device-plugin/time-slicing-config.yaml
@ -0,0 +1,14 @@
+---
+apiVersion: v1
+kind: ConfigMap  # time-slicing config: nvidia.com/gpu shared as 2 replicas
+metadata:
+  name: nvidia-device-plugin-config  # name the daemonset volume refers to
+  namespace: nvidia-device-plugin
+data:  # config.yaml is mounted under /config and passed via --config-file
+  config.yaml: |
+    version: v1
+    sharing:
+      timeSlicing:
+        resources:
+          - name: nvidia.com/gpu
+            replicas: 2
--- a/argocd/manifests/ollama/deployment.yaml
+++ b/argocd/manifests/ollama/deployment.yaml
@ -0,0 +1,84 @@
+---
+apiVersion: apps/v1
+kind: Deployment  # Ollama server plus a model-sync sidecar in one pod
+metadata:
+  name: ollama
+  namespace: ollama
+spec:
+  replicas: 1
+  strategy:
+    type: Recreate  # old pod is terminated before the new one is created
+  selector:
+    matchLabels:
+      app: ollama
+  template:
+    metadata:
+      labels:
+        app: ollama  # also the selector used by the ollama Service
+    spec:
+      runtimeClassName: nvidia  # presumably defined in runtime-class.yaml
+      containers:
+        - name: ollama
+          image: ollama/ollama  # tag is pinned in kustomization.yaml (images)
+          ports:
+            - containerPort: 11434
+              name: http
+          env:
+            - name: OLLAMA_MODELS
+              value: /models  # model store on the PVC mounted below
+            - name: OLLAMA_HOST
+              value: "0.0.0.0:11434"  # bind address: all interfaces
+          volumeMounts:
+            - name: models
+              mountPath: /models
+          resources:
+            requests:
+              memory: "512Mi"
+              cpu: "500m"
+            limits:
+              memory: "16Gi"
+              cpu: "4000m"
+              nvidia.com/gpu: "1"  # one GPU share from the device plugin
+          livenessProbe:
+            httpGet:
+              path: /api/tags  # model-list endpoint doubles as health check
+              port: 11434
+            initialDelaySeconds: 30
+            periodSeconds: 30
+          readinessProbe:
+            httpGet:
+              path: /api/tags
+              port: 11434
+            initialDelaySeconds: 10
+            periodSeconds: 10
+        - name: model-sync  # pulls the listed models via the ollama CLI
+          image: ollama/ollama
+          command: ["/bin/bash", "/scripts/sync-models.sh"]
+          env:
+            - name: MODEL_LIST
+              value: /config/models.txt
+            - name: OLLAMA_HOST
+              value: "http://localhost:11434"  # server container, same pod
+          volumeMounts:  # config and script only; /models is not mounted
+            - name: models-config
+              mountPath: /config
+            - name: sync-script
+              mountPath: /scripts
+          resources:
+            requests:
+              memory: "64Mi"
+              cpu: "50m"
+            limits:
+              memory: "256Mi"
+              cpu: "200m"
+      volumes:
+        - name: models
+          persistentVolumeClaim:
+            claimName: ollama-models  # PVC defined in pvc.yaml
+        - name: models-config
+          configMap:
+            name: ollama-models  # generated from models.txt by kustomize
+        - name: sync-script
+          configMap:
+            name: ollama-sync-script  # generated from sync-models.sh
+            defaultMode: 0755 # yamllint disable-line rule:octal-values
--- a/argocd/manifests/ollama/ingress-tailscale.yaml
+++ b/argocd/manifests/ollama/ingress-tailscale.yaml
@ -0,0 +1,26 @@
+---
+apiVersion: networking.k8s.io/v1
+kind: Ingress  # Tailscale ingress in front of the ollama Service
+metadata:
+  name: ollama-tailscale
+  namespace: ollama
+  annotations:
+    tailscale.com/proxy-class: "default"
+    tailscale.com/proxy-group: "ingress"
+    gethomepage.dev/enabled: "true"  # presumably Homepage dashboard metadata
+    gethomepage.dev/name: "Ollama"
+    gethomepage.dev/group: "AI"
+    gethomepage.dev/icon: "ollama.png"
+    gethomepage.dev/description: "LLM inference server"
+    gethomepage.dev/href: "https://ollama.ops.eblu.me"
+    gethomepage.dev/pod-selector: "app=ollama"
+spec:
+  ingressClassName: tailscale
+  defaultBackend:  # no rules: everything goes to Service ollama:11434
+    service:
+      name: ollama
+      port:
+        number: 11434
+  tls:
+    - hosts:
+        - ollama  # NOTE(review): presumably becomes the tailnet host name
--- a/argocd/manifests/ollama/kustomization.yaml
+++ b/argocd/manifests/ollama/kustomization.yaml
@ -0,0 +1,22 @@
+---
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization  # assembles the ollama manifests and ConfigMaps
+namespace: ollama
+resources:
+  - pv-hostpath.yaml
+  - pvc.yaml
+  - deployment.yaml
+  - service.yaml
+  - ingress-tailscale.yaml
+
+images:
+  - name: ollama/ollama  # both deployment containers use this image name
+    newTag: "0.17.5"  # same version as recorded in service-versions.yaml
+
+configMapGenerator:  # one ConfigMap per file; both mounted by deployment.yaml
+  - name: ollama-models
+    files:
+      - models.txt
+  - name: ollama-sync-script
+    files:
+      - sync-models.sh
--- a/argocd/manifests/ollama/models.txt
+++ b/argocd/manifests/ollama/models.txt
@ -0,0 +1,6 @@
+# Models to pull from Ollama registry
+# One model per line. Lines starting with # are ignored (no inline comments).
+qwen2.5:14b
+deepseek-r1:14b
+phi4:14b
+gemma3:12b
--- a/argocd/manifests/ollama/pv-hostpath.yaml
+++ b/argocd/manifests/ollama/pv-hostpath.yaml
@ -0,0 +1,15 @@
+---
+apiVersion: v1
+kind: PersistentVolume  # pre-provisioned hostPath volume for Ollama models
+metadata:
+  name: ollama-models-pv  # bound explicitly via volumeName in pvc.yaml
+spec:
+  capacity:
+    storage: 200Gi
+  accessModes:
+    - ReadWriteOnce
+  persistentVolumeReclaimPolicy: Retain  # keep data when the claim goes away
+  storageClassName: ""  # empty class; the PVC uses the same empty value
+  hostPath:
+    path: /mnt/storage1/ollama  # directory on the node's filesystem
+    type: DirectoryOrCreate  # created on the host if it does not exist
--- a/argocd/manifests/ollama/pvc.yaml
+++ b/argocd/manifests/ollama/pvc.yaml
@ -0,0 +1,14 @@
+---
+apiVersion: v1
+kind: PersistentVolumeClaim  # claim mounted at /models by the deployment
+metadata:
+  name: ollama-models
+  namespace: ollama
+spec:
+  accessModes:
+    - ReadWriteOnce
+  storageClassName: ""  # empty class, same as the PV it binds to
+  volumeName: ollama-models-pv  # bind to this specific PersistentVolume
+  resources:
+    requests:
+      storage: 200Gi  # equals the PV's declared capacity
--- a/argocd/manifests/ollama/service.yaml
+++ b/argocd/manifests/ollama/service.yaml
@ -0,0 +1,13 @@
+---
+apiVersion: v1
+kind: Service  # in-cluster endpoint for the Ollama API; no type is set
+metadata:
+  name: ollama  # backend name referenced by the Tailscale ingress
+  namespace: ollama
+spec:
+  selector:
+    app: ollama  # matches the pod label set in deployment.yaml
+  ports:
+    - name: http
+      port: 11434
+      targetPort: 11434  # the container port named http in the deployment
--- a/argocd/manifests/ollama/sync-models.sh
+++ b/argocd/manifests/ollama/sync-models.sh
@ -0,0 +1,61 @@
+#!/bin/bash
+# Sync models from ConfigMap to Ollama server
+# Runs as a sidecar in the ollama deployment, using the ollama CLI
+set -euo pipefail
+
+MODEL_LIST="${MODEL_LIST:-/config/models.txt}"  # one model per line
+OLLAMA_HOST="${OLLAMA_HOST:-http://localhost:11434}"  # server to sync
+SYNC_INTERVAL="${SYNC_INTERVAL:-1800}"  # seconds between sync passes
+
+export OLLAMA_HOST
+
+echo "Syncing models from ${MODEL_LIST} via ollama CLI (host: ${OLLAMA_HOST})"
+
+while true; do
+    # Wait for ollama server to be ready
+    echo "Waiting for Ollama API..."
+    max_attempts=60  # with the 5s sleep below: give up after ~5 minutes
+    attempt=0
+    until ollama list > /dev/null 2>&1; do
+        attempt=$((attempt + 1))
+        if [[ $attempt -ge $max_attempts ]]; then
+            echo "Ollama not ready after ${max_attempts} attempts, will retry next cycle"
+            sleep "$SYNC_INTERVAL"
+            continue 2  # restart the outer sync loop
+        fi
+        sleep 5
+    done
+    echo "Ollama is ready"
+
+    # Get list of currently pulled models (first column, header row dropped)
+    current=$(ollama list 2>/dev/null | tail -n +2 | awk '{print $1}' || true)
+    pulled=0
+    skipped=0
+
+    while IFS= read -r model || [[ -n "$model" ]]; do
+        # Skip empty lines and comments
+        [[ -z "$model" || "$model" =~ ^[[:space:]]*# ]] && continue
+        model=$(echo "$model" | xargs)  # trim whitespace
+        [[ -z "$model" ]] && continue
+        # ollama list shows name:tag, so compare untagged entries as name:latest
+        wanted="$model"
+        [[ "${model##*/}" == *:* ]] || wanted="${model}:latest"
+        # Whole-line match: substring matching would also accept longer tags
+        if grep -qxF -- "$wanted" <<< "$current"; then
+            echo "Already present: $model"
+            ((skipped++)) || true  # post-increment of 0 exits 1 under set -e
+        else
+            echo "Pulling: $model"
+            if ollama pull "$model"; then
+                echo "Pulled: $model"
+                ((pulled++)) || true
+            else
+                echo "Warning: Failed to pull $model" >&2
+            fi
+        fi
+    done < "$MODEL_LIST"
+
+    echo "Sync complete: $pulled pulled, $skipped already present"
+    echo "Next sync in ${SYNC_INTERVAL}s"
+    sleep "$SYNC_INTERVAL"
+done
--- a/docs/changelog.d/feature-ollama-ringtail.feature.md
+++ b/docs/changelog.d/feature-ollama-ringtail.feature.md
@ -0,0 +1 @@
+Deploy Ollama LLM server on ringtail with GPU acceleration and declarative model management
--- a/service-versions.yaml
+++ b/service-versions.yaml
@ -135,6 +135,13 @@ services:
    current-version: "2026.2.0"
    upstream-source: https://github.com/goauthentik/authentik/releases

+  - name: ollama
+    type: argocd
+    last-reviewed: "2026-03-02"
+    current-version: "0.17.5"
+    upstream-source: https://github.com/ollama/ollama/releases
+    notes: LLM inference server on ringtail (GPU); upstream container image
+
  - name: navidrome
    type: argocd
    last-reviewed: 2026-03-02
				`@ -0,0 +1 @@`
				`Deploy Ollama LLM server on ringtail with GPU acceleration and declarative model management`