From 31d925814fc2aa1ff5c19b7b0707c7234af5537f Mon Sep 17 00:00:00 2001
From: Erich Blume
Date: Mon, 2 Mar 2026 20:39:51 -0800
Subject: [PATCH] Deploy Ollama LLM server on ringtail (#277)

## Summary
- Deploy Ollama as a new ArgoCD-managed service on ringtail's k3s cluster with GPU acceleration
- Declarative model management via `models.txt` + sidecar sync script (mirrors kiwix torrent pattern)
- Initial models: `qwen2.5:14b`, `deepseek-r1:14b`, `phi4:14b`, `gemma3:12b`
- hostPath PV on `/mnt/storage1/ollama` for fast local model storage (200Gi)
- Tailscale ingress at `ollama.ops.eblu.me` for API access from tailnet
- Enable GPU time-slicing (`replicas: 2`) on nvidia-device-plugin so Frigate and Ollama share the RTX 4080

## Deployment and Testing
- [ ] Deploy nvidia-device-plugin changes first: `argocd app sync nvidia-device-plugin`
- [ ] Verify GPU time-slicing: `kubectl describe node ringtail --context=k3s-ringtail` shows `nvidia.com/gpu: 2`
- [ ] Sync `apps` app with `--revision feature/ollama-ringtail`
- [ ] Set ollama app to branch: `argocd app set ollama --revision feature/ollama-ringtail && argocd app sync ollama`
- [ ] Verify model-sync sidecar pulls models: `kubectl logs -n ollama deploy/ollama -c model-sync --context=k3s-ringtail`
- [ ] Test API: `curl https://ollama.ops.eblu.me/api/tags`
- [ ] Test inference: `curl https://ollama.ops.eblu.me/api/generate -d '{"model":"qwen2.5:14b","prompt":"Hello"}'`
- [ ] Verify Frigate still works after GPU sharing change
- [ ] After merge: `argocd app set ollama --revision main && argocd app sync ollama`

Reviewed-on: https://forge.ops.eblu.me/eblume/blumeops/pulls/277
---
Review note (ignored by git-am): in sync-models.sh the "already pulled" check
now matches model names exactly (grep -x, fed by a here-string) instead of by
substring, so a requested "foo:7b" is no longer satisfied by an installed
"foo:7b-instruct". The blob hash on that file's index line predates this
amendment.

 ansible/roles/caddy/defaults/main.yml         |  3 +
 argocd/apps/ollama.yaml                       | 18 ++++
 .../nvidia-device-plugin/daemonset.yaml       |  7 ++
 .../nvidia-device-plugin/kustomization.yaml   |  1 +
 .../time-slicing-config.yaml                  | 14 ++++
 argocd/manifests/ollama/deployment.yaml       | 84 +++++++++++++++++++
 .../manifests/ollama/ingress-tailscale.yaml   | 26 ++++++
 argocd/manifests/ollama/kustomization.yaml    | 22 +++++
 argocd/manifests/ollama/models.txt            |  6 ++
 argocd/manifests/ollama/pv-hostpath.yaml      | 15 ++++
 argocd/manifests/ollama/pvc.yaml              | 14 ++++
 argocd/manifests/ollama/service.yaml          | 13 +++
 argocd/manifests/ollama/sync-models.sh        | 61 ++++++++++++++
 .../feature-ollama-ringtail.feature.md        |  1 +
 service-versions.yaml                         |  7 ++
 15 files changed, 292 insertions(+)
 create mode 100644 argocd/apps/ollama.yaml
 create mode 100644 argocd/manifests/nvidia-device-plugin/time-slicing-config.yaml
 create mode 100644 argocd/manifests/ollama/deployment.yaml
 create mode 100644 argocd/manifests/ollama/ingress-tailscale.yaml
 create mode 100644 argocd/manifests/ollama/kustomization.yaml
 create mode 100644 argocd/manifests/ollama/models.txt
 create mode 100644 argocd/manifests/ollama/pv-hostpath.yaml
 create mode 100644 argocd/manifests/ollama/pvc.yaml
 create mode 100644 argocd/manifests/ollama/service.yaml
 create mode 100644 argocd/manifests/ollama/sync-models.sh
 create mode 100644 docs/changelog.d/feature-ollama-ringtail.feature.md

diff --git a/ansible/roles/caddy/defaults/main.yml b/ansible/roles/caddy/defaults/main.yml
index b0fc046..464d331 100644
--- a/ansible/roles/caddy/defaults/main.yml
+++ b/ansible/roles/caddy/defaults/main.yml
@@ -85,6 +85,9 @@ caddy_services:
   - name: ntfy
     host: "ntfy.{{ caddy_domain }}"
     backend: "https://ntfy.tail8d86e.ts.net"
+  - name: ollama
+    host: "ollama.{{ caddy_domain }}"
+    backend: "https://ollama.tail8d86e.ts.net"
   - name: sifaka
     host: "nas.{{ caddy_domain }}"
     backend: "http://sifaka:5000"
diff --git a/argocd/apps/ollama.yaml b/argocd/apps/ollama.yaml
new file mode 100644
index 0000000..bb7a6a9
--- /dev/null
+++ b/argocd/apps/ollama.yaml
@@ -0,0 +1,18 @@
+---
+apiVersion: argoproj.io/v1alpha1
+kind: Application
+metadata:
+  name: ollama
+  namespace: argocd
+spec:
+  project: default
+  source:
+    repoURL: ssh://forgejo@forge.ops.eblu.me:2222/eblume/blumeops.git
+    targetRevision: main
+    path: argocd/manifests/ollama
+  destination:
+    server: https://ringtail.tail8d86e.ts.net:6443
+    namespace: ollama
+  syncPolicy:
+    syncOptions:
+      - CreateNamespace=true
diff --git a/argocd/manifests/nvidia-device-plugin/daemonset.yaml b/argocd/manifests/nvidia-device-plugin/daemonset.yaml
index 4c57a76..b484959 100644
--- a/argocd/manifests/nvidia-device-plugin/daemonset.yaml
+++ b/argocd/manifests/nvidia-device-plugin/daemonset.yaml
@@ -25,6 +25,7 @@ spec:
           image: nvcr.io/nvidia/k8s-device-plugin
           args:
             - --device-id-strategy=index
+            - --config-file=/config/config.yaml
           env:
             - name: LD_LIBRARY_PATH
               value: /run/nvidia/lib
@@ -39,6 +40,9 @@ spec:
             - name: nvidia-libs
               mountPath: /run/nvidia/lib
               readOnly: true
+            - name: plugin-config
+              mountPath: /config
+              readOnly: true
       volumes:
         - name: device-plugins
           hostPath:
@@ -49,3 +53,6 @@ spec:
         - name: nvidia-libs
           hostPath:
             path: /etc/nvidia-driver/lib
+        - name: plugin-config
+          configMap:
+            name: nvidia-device-plugin-config
diff --git a/argocd/manifests/nvidia-device-plugin/kustomization.yaml b/argocd/manifests/nvidia-device-plugin/kustomization.yaml
index 4ffe2d9..102127f 100644
--- a/argocd/manifests/nvidia-device-plugin/kustomization.yaml
+++ b/argocd/manifests/nvidia-device-plugin/kustomization.yaml
@@ -6,6 +6,7 @@ namespace: nvidia-device-plugin
 resources:
   - daemonset.yaml
   - runtime-class.yaml
+  - time-slicing-config.yaml
 
 images:
   - name: nvcr.io/nvidia/k8s-device-plugin
diff --git a/argocd/manifests/nvidia-device-plugin/time-slicing-config.yaml b/argocd/manifests/nvidia-device-plugin/time-slicing-config.yaml
new file mode 100644
index 0000000..dee2fd7
--- /dev/null
+++ b/argocd/manifests/nvidia-device-plugin/time-slicing-config.yaml
@@ -0,0 +1,14 @@
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: nvidia-device-plugin-config
+  namespace: nvidia-device-plugin
+data:
+  config.yaml: |
+    version: v1
+    sharing:
+      timeSlicing:
+        resources:
+          - name: nvidia.com/gpu
+            replicas: 2
diff --git a/argocd/manifests/ollama/deployment.yaml b/argocd/manifests/ollama/deployment.yaml
new file mode 100644
index 0000000..2b68e55
--- /dev/null
+++ b/argocd/manifests/ollama/deployment.yaml
@@ -0,0 +1,84 @@
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: ollama
+  namespace: ollama
+spec:
+  replicas: 1
+  strategy:
+    type: Recreate
+  selector:
+    matchLabels:
+      app: ollama
+  template:
+    metadata:
+      labels:
+        app: ollama
+    spec:
+      runtimeClassName: nvidia
+      containers:
+        - name: ollama
+          image: ollama/ollama
+          ports:
+            - containerPort: 11434
+              name: http
+          env:
+            - name: OLLAMA_MODELS
+              value: /models
+            - name: OLLAMA_HOST
+              value: "0.0.0.0:11434"
+          volumeMounts:
+            - name: models
+              mountPath: /models
+          resources:
+            requests:
+              memory: "512Mi"
+              cpu: "500m"
+            limits:
+              memory: "16Gi"
+              cpu: "4000m"
+              nvidia.com/gpu: "1"
+          livenessProbe:
+            httpGet:
+              path: /api/tags
+              port: 11434
+            initialDelaySeconds: 30
+            periodSeconds: 30
+          readinessProbe:
+            httpGet:
+              path: /api/tags
+              port: 11434
+            initialDelaySeconds: 10
+            periodSeconds: 10
+        - name: model-sync
+          image: ollama/ollama
+          command: ["/bin/bash", "/scripts/sync-models.sh"]
+          env:
+            - name: MODEL_LIST
+              value: /config/models.txt
+            - name: OLLAMA_HOST
+              value: "http://localhost:11434"
+          volumeMounts:
+            - name: models-config
+              mountPath: /config
+            - name: sync-script
+              mountPath: /scripts
+          resources:
+            requests:
+              memory: "64Mi"
+              cpu: "50m"
+            limits:
+              memory: "256Mi"
+              cpu: "200m"
+      volumes:
+        - name: models
+          persistentVolumeClaim:
+            claimName: ollama-models
+        - name: models-config
+          configMap:
+            name: ollama-models
+        - name: sync-script
+          configMap:
+            name: ollama-sync-script
+            defaultMode: 0755  # yamllint disable-line rule:octal-values
diff --git a/argocd/manifests/ollama/ingress-tailscale.yaml b/argocd/manifests/ollama/ingress-tailscale.yaml
new file mode 100644
index 0000000..bada466
--- /dev/null
+++ b/argocd/manifests/ollama/ingress-tailscale.yaml
@@ -0,0 +1,26 @@
+---
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: ollama-tailscale
+  namespace: ollama
+  annotations:
+    tailscale.com/proxy-class: "default"
+    tailscale.com/proxy-group: "ingress"
+    gethomepage.dev/enabled: "true"
+    gethomepage.dev/name: "Ollama"
+    gethomepage.dev/group: "AI"
+    gethomepage.dev/icon: "ollama.png"
+    gethomepage.dev/description: "LLM inference server"
+    gethomepage.dev/href: "https://ollama.ops.eblu.me"
+    gethomepage.dev/pod-selector: "app=ollama"
+spec:
+  ingressClassName: tailscale
+  defaultBackend:
+    service:
+      name: ollama
+      port:
+        number: 11434
+  tls:
+    - hosts:
+        - ollama
diff --git a/argocd/manifests/ollama/kustomization.yaml b/argocd/manifests/ollama/kustomization.yaml
new file mode 100644
index 0000000..75add74
--- /dev/null
+++ b/argocd/manifests/ollama/kustomization.yaml
@@ -0,0 +1,22 @@
+---
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+namespace: ollama
+resources:
+  - pv-hostpath.yaml
+  - pvc.yaml
+  - deployment.yaml
+  - service.yaml
+  - ingress-tailscale.yaml
+
+images:
+  - name: ollama/ollama
+    newTag: "0.17.5"
+
+configMapGenerator:
+  - name: ollama-models
+    files:
+      - models.txt
+  - name: ollama-sync-script
+    files:
+      - sync-models.sh
diff --git a/argocd/manifests/ollama/models.txt b/argocd/manifests/ollama/models.txt
new file mode 100644
index 0000000..a998019
--- /dev/null
+++ b/argocd/manifests/ollama/models.txt
@@ -0,0 +1,6 @@
+# Models to pull from Ollama registry
+# One model per line. Comments with #.
+qwen2.5:14b
+deepseek-r1:14b
+phi4:14b
+gemma3:12b
diff --git a/argocd/manifests/ollama/pv-hostpath.yaml b/argocd/manifests/ollama/pv-hostpath.yaml
new file mode 100644
index 0000000..d25dbcc
--- /dev/null
+++ b/argocd/manifests/ollama/pv-hostpath.yaml
@@ -0,0 +1,15 @@
+---
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: ollama-models-pv
+spec:
+  capacity:
+    storage: 200Gi
+  accessModes:
+    - ReadWriteOnce
+  persistentVolumeReclaimPolicy: Retain
+  storageClassName: ""
+  hostPath:
+    path: /mnt/storage1/ollama
+    type: DirectoryOrCreate
diff --git a/argocd/manifests/ollama/pvc.yaml b/argocd/manifests/ollama/pvc.yaml
new file mode 100644
index 0000000..76c79a8
--- /dev/null
+++ b/argocd/manifests/ollama/pvc.yaml
@@ -0,0 +1,14 @@
+---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: ollama-models
+  namespace: ollama
+spec:
+  accessModes:
+    - ReadWriteOnce
+  storageClassName: ""
+  volumeName: ollama-models-pv
+  resources:
+    requests:
+      storage: 200Gi
diff --git a/argocd/manifests/ollama/service.yaml b/argocd/manifests/ollama/service.yaml
new file mode 100644
index 0000000..d9680e1
--- /dev/null
+++ b/argocd/manifests/ollama/service.yaml
@@ -0,0 +1,13 @@
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: ollama
+  namespace: ollama
+spec:
+  selector:
+    app: ollama
+  ports:
+    - name: http
+      port: 11434
+      targetPort: 11434
diff --git a/argocd/manifests/ollama/sync-models.sh b/argocd/manifests/ollama/sync-models.sh
new file mode 100644
index 0000000..9430704
--- /dev/null
+++ b/argocd/manifests/ollama/sync-models.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+# Sync models from ConfigMap to Ollama server
+# Runs as a sidecar in the ollama deployment, using the ollama CLI
+set -euo pipefail
+
+MODEL_LIST="${MODEL_LIST:-/config/models.txt}"
+OLLAMA_HOST="${OLLAMA_HOST:-http://localhost:11434}"
+SYNC_INTERVAL="${SYNC_INTERVAL:-1800}"
+
+export OLLAMA_HOST
+
+echo "Syncing models from ${MODEL_LIST} via ollama CLI (host: ${OLLAMA_HOST})"
+
+while true; do
+  # Wait for ollama server to be ready
+  echo "Waiting for Ollama API..."
+  max_attempts=60
+  attempt=0
+  until ollama list > /dev/null 2>&1; do
+    attempt=$((attempt + 1))
+    if [[ $attempt -ge $max_attempts ]]; then
+      echo "Ollama not ready after ${max_attempts} attempts, will retry next cycle"
+      sleep "$SYNC_INTERVAL"
+      continue 2
+    fi
+    sleep 5
+  done
+  echo "Ollama is ready"
+
+  # Get list of currently pulled models
+  current=$(ollama list 2>/dev/null | tail -n +2 | awk '{print $1}' || true)
+
+  pulled=0
+  skipped=0
+
+  while IFS= read -r model || [[ -n "$model" ]]; do
+    # Skip empty lines and comments
+    [[ -z "$model" || "$model" =~ ^[[:space:]]*# ]] && continue
+    # Trim whitespace
+    model=$(echo "$model" | xargs)
+    [[ -z "$model" ]] && continue
+
+    # Exact whole-line match on the `ollama list` names (bare names appear as name:latest)
+    if grep -qxF -e "$model" -e "${model}:latest" <<< "$current"; then
+      echo "Already present: $model"
+      ((skipped++)) || true
+    else
+      echo "Pulling: $model"
+      if ollama pull "$model"; then
+        echo "Pulled: $model"
+        ((pulled++)) || true
+      else
+        echo "Warning: Failed to pull $model" >&2
+      fi
+    fi
+  done < "$MODEL_LIST"
+
+  echo "Sync complete: $pulled pulled, $skipped already present"
+  echo "Next sync in ${SYNC_INTERVAL}s"
+  sleep "$SYNC_INTERVAL"
+done
diff --git a/docs/changelog.d/feature-ollama-ringtail.feature.md b/docs/changelog.d/feature-ollama-ringtail.feature.md
new file mode 100644
index 0000000..648757e
--- /dev/null
+++ b/docs/changelog.d/feature-ollama-ringtail.feature.md
@@ -0,0 +1 @@
+Deploy Ollama LLM server on ringtail with GPU acceleration and declarative model management
diff --git a/service-versions.yaml b/service-versions.yaml
index c1c48e1..00e1084 100644
--- a/service-versions.yaml
+++ b/service-versions.yaml
@@ -135,6 +135,13 @@ services:
     current-version: "2026.2.0"
     upstream-source: https://github.com/goauthentik/authentik/releases
 
+  - name: ollama
+    type: argocd
+    last-reviewed: "2026-03-02"
+    current-version: "0.17.5"
+    upstream-source: https://github.com/ollama/ollama/releases
+    notes: LLM inference server on ringtail (GPU); upstream container image
+
   - name: navidrome
     type: argocd
     last-reviewed: 2026-03-02