Deploy Ollama LLM server on ringtail (#277)

## Summary

- Deploy Ollama as a new ArgoCD-managed service on ringtail's k3s cluster with GPU acceleration
- Declarative model management via `models.txt` + sidecar sync script (mirrors kiwix torrent pattern)
- Initial models: `qwen2.5:14b`, `deepseek-r1:14b`, `phi4:14b`, `gemma3:12b`
- hostPath PV on `/mnt/storage1/ollama` for fast local model storage (200Gi)
- Tailscale ingress at `ollama.ops.eblu.me` for API access from tailnet
- Enable GPU time-slicing (`replicas: 2`) on nvidia-device-plugin so Frigate and Ollama share the RTX 4080

## Deployment and Testing

- [ ] Deploy nvidia-device-plugin changes first: `argocd app sync nvidia-device-plugin`
- [ ] Verify GPU time-slicing: `kubectl describe node ringtail --context=k3s-ringtail` shows `nvidia.com/gpu: 2`
- [ ] Sync `apps` app with `--revision feature/ollama-ringtail`
- [ ] Set ollama app to branch: `argocd app set ollama --revision feature/ollama-ringtail && argocd app sync ollama`
- [ ] Verify model-sync sidecar pulls models: `kubectl logs -n ollama deploy/ollama -c model-sync --context=k3s-ringtail`
- [ ] Test API: `curl https://ollama.ops.eblu.me/api/tags`
- [ ] Test inference: `curl https://ollama.ops.eblu.me/api/generate -d '{"model":"qwen2.5:14b","prompt":"Hello"}'`
- [ ] Verify Frigate still works after GPU sharing change
- [ ] After merge: `argocd app set ollama --revision main && argocd app sync ollama`

Reviewed-on: https://forge.ops.eblu.me/eblume/blumeops/pulls/277
2026-03-02 20:39:51 -08:00 · 2026-03-02 20:39:51 -08:00 · 31d925814f
commit 31d925814f
parent 0f79c61c42
15 changed files with 292 additions and 0 deletions
--- a/ansible/roles/caddy/defaults/main.yml
+++ b/ansible/roles/caddy/defaults/main.yml
@ -85,6 +85,9 @@ caddy_services:
  - name: ntfy
    host: "ntfy.{{ caddy_domain }}"
    backend: "https://ntfy.tail8d86e.ts.net"
  - name: ollama
    host: "ollama.{{ caddy_domain }}"
    backend: "https://ollama.tail8d86e.ts.net"
  - name: sifaka
    host: "nas.{{ caddy_domain }}"
    backend: "http://sifaka:5000"
--- a/argocd/apps/ollama.yaml
+++ b/argocd/apps/ollama.yaml
@ -0,0 +1,18 @@
 ---
 # ArgoCD Application for the Ollama LLM server.
 # Applies the manifests under argocd/manifests/ollama from the blumeops
 # repository to the ringtail cluster, into the `ollama` namespace.
 apiVersion: argoproj.io/v1alpha1
 kind: Application
 metadata:
  name: ollama
  namespace: argocd
 spec:
  project: default
  source:
    repoURL: ssh://forgejo@forge.ops.eblu.me:2222/eblume/blumeops.git
    # Revision tracked by default; a feature branch can be selected
    # temporarily with `argocd app set ollama --revision <branch>`.
    targetRevision: main
    path: argocd/manifests/ollama
  destination:
    server: https://ringtail.tail8d86e.ts.net:6443
    namespace: ollama
  syncPolicy:
    # Only sync options are declared; no `automated` policy is set here.
    syncOptions:
      # The destination namespace is created during sync if it is missing.
      - CreateNamespace=true
--- a/argocd/manifests/nvidia-device-plugin/daemonset.yaml
+++ b/argocd/manifests/nvidia-device-plugin/daemonset.yaml
@ -25,6 +25,7 @@ spec:
          image: nvcr.io/nvidia/k8s-device-plugin
          args:
            - --device-id-strategy=index
            - --config-file=/config/config.yaml
          env:
            - name: LD_LIBRARY_PATH
              value: /run/nvidia/lib
@ -39,6 +40,9 @@ spec:
            - name: nvidia-libs
              mountPath: /run/nvidia/lib
              readOnly: true
            - name: plugin-config
              mountPath: /config
              readOnly: true
      volumes:
        - name: device-plugins
          hostPath:
@ -49,3 +53,6 @@ spec:
        - name: nvidia-libs
          hostPath:
            path: /etc/nvidia-driver/lib
        - name: plugin-config
          configMap:
            name: nvidia-device-plugin-config
--- a/argocd/manifests/nvidia-device-plugin/kustomization.yaml
+++ b/argocd/manifests/nvidia-device-plugin/kustomization.yaml
@ -6,6 +6,7 @@ namespace: nvidia-device-plugin
 resources:
  - daemonset.yaml
  - runtime-class.yaml
  - time-slicing-config.yaml
 images:
  - name: nvcr.io/nvidia/k8s-device-plugin
--- a/argocd/manifests/nvidia-device-plugin/time-slicing-config.yaml
+++ b/argocd/manifests/nvidia-device-plugin/time-slicing-config.yaml
@ -0,0 +1,14 @@
 ---
 # Configuration file for the NVIDIA device plugin.
 # The daemonset mounts this ConfigMap at /config and passes
 # --config-file=/config/config.yaml to the plugin.
 apiVersion: v1
 kind: ConfigMap
 metadata:
  name: nvidia-device-plugin-config
  namespace: nvidia-device-plugin
 data:
  # Time-slicing: nvidia.com/gpu is shared with replicas: 2 so that two
  # workloads (Frigate and Ollama) can each request one GPU on this node.
  # NOTE(review): time-slicing presumably gives no memory isolation between
  # the sharing pods - confirm against the device plugin documentation.
  config.yaml: |
    version: v1
    sharing:
      timeSlicing:
        resources:
          - name: nvidia.com/gpu
            replicas: 2
--- a/argocd/manifests/ollama/deployment.yaml
+++ b/argocd/manifests/ollama/deployment.yaml
@ -0,0 +1,84 @@
 ---
 # Ollama LLM server plus a model-sync sidecar.
 # The `ollama` container serves the API on port 11434 and stores models on
 # the `ollama-models` PVC; the `model-sync` container runs sync-models.sh to
 # pull the models listed in the `ollama-models` ConfigMap.
 apiVersion: apps/v1
 kind: Deployment
 metadata:
  name: ollama
  namespace: ollama
 spec:
  replicas: 1
  strategy:
    # The old pod is stopped before the new one starts.
    # NOTE(review): presumably required because the ReadWriteOnce model
    # volume and the GPU request cannot be held by two pods at once - confirm.
    type: Recreate
  selector:
    matchLabels:
      app: ollama
  template:
    metadata:
      labels:
        app: ollama
    spec:
      # NOTE(review): presumably the RuntimeClass shipped with the
      # nvidia-device-plugin manifests (runtime-class.yaml) - confirm.
      runtimeClassName: nvidia
      containers:
        # Main container: the Ollama API server.
        - name: ollama
          # Tag is supplied by the `images` entry in kustomization.yaml.
          image: ollama/ollama
          ports:
            - containerPort: 11434
              name: http
          env:
            # Model storage directory; matches the `models` volume mount below.
            - name: OLLAMA_MODELS
              value: /models
            # Listen on all interfaces so the Service can reach the server.
            - name: OLLAMA_HOST
              value: "0.0.0.0:11434"
          volumeMounts:
            - name: models
              mountPath: /models
          resources:
            requests:
              memory: "512Mi"
              cpu: "500m"
            limits:
              memory: "16Gi"
              cpu: "4000m"
              # One of the nvidia.com/gpu resources advertised by the
              # device plugin.
              nvidia.com/gpu: "1"
          # Both probes issue GET /api/tags against the API port.
          livenessProbe:
            httpGet:
              path: /api/tags
              port: 11434
            initialDelaySeconds: 30
            periodSeconds: 30
          readinessProbe:
            httpGet:
              path: /api/tags
              port: 11434
            initialDelaySeconds: 10
            periodSeconds: 10
        # Sidecar: uses the same image for its `ollama` CLI and runs the
        # sync script mounted from the `ollama-sync-script` ConfigMap.
        - name: model-sync
          image: ollama/ollama
          command: ["/bin/bash", "/scripts/sync-models.sh"]
          env:
            # File read by sync-models.sh; provided by the `models-config`
            # volume mounted at /config.
            - name: MODEL_LIST
              value: /config/models.txt
            # Address of the server container in this pod.
            - name: OLLAMA_HOST
              value: "http://localhost:11434"
          volumeMounts:
            - name: models-config
              mountPath: /config
            - name: sync-script
              mountPath: /scripts
          resources:
            requests:
              memory: "64Mi"
              cpu: "50m"
            limits:
              memory: "256Mi"
              cpu: "200m"
      volumes:
        # Persistent model storage (see pvc.yaml / pv-hostpath.yaml).
        - name: models
          persistentVolumeClaim:
            claimName: ollama-models
        # models.txt, generated by kustomize's configMapGenerator.
        - name: models-config
          configMap:
            name: ollama-models
        # sync-models.sh, generated by kustomize's configMapGenerator.
        - name: sync-script
          configMap:
            name: ollama-sync-script
            defaultMode: 0755 # yamllint disable-line rule:octal-values
--- a/argocd/manifests/ollama/ingress-tailscale.yaml
+++ b/argocd/manifests/ollama/ingress-tailscale.yaml
@ -0,0 +1,26 @@
 ---
 # Tailscale ingress exposing the `ollama` Service (port 11434).
 apiVersion: networking.k8s.io/v1
 kind: Ingress
 metadata:
  name: ollama-tailscale
  namespace: ollama
  annotations:
    # Tailscale operator settings for this ingress.
    tailscale.com/proxy-class: "default"
    tailscale.com/proxy-group: "ingress"
    # Dashboard metadata.
    # NOTE(review): presumably consumed by a gethomepage.dev (Homepage)
    # instance doing ingress discovery - confirm.
    gethomepage.dev/enabled: "true"
    gethomepage.dev/name: "Ollama"
    gethomepage.dev/group: "AI"
    gethomepage.dev/icon: "ollama.png"
    gethomepage.dev/description: "LLM inference server"
    gethomepage.dev/href: "https://ollama.ops.eblu.me"
    gethomepage.dev/pod-selector: "app=ollama"
 spec:
  ingressClassName: tailscale
  # All requests are routed to the ollama Service; no per-path rules.
  defaultBackend:
    service:
      name: ollama
      port:
        number: 11434
  tls:
    - hosts:
        # NOTE(review): presumably becomes the tailnet machine name
        # (ollama.<tailnet>.ts.net) that the Caddy backend points at - confirm.
        - ollama
--- a/argocd/manifests/ollama/kustomization.yaml
+++ b/argocd/manifests/ollama/kustomization.yaml
@ -0,0 +1,22 @@
 ---
 # Kustomization for the Ollama service: lists the manifests, pins the
 # container image tag, and generates the two ConfigMaps the deployment mounts.
 apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 namespace: ollama
 resources:
  - pv-hostpath.yaml
  - pvc.yaml
  - deployment.yaml
  - service.yaml
  - ingress-tailscale.yaml
 images:
  # Applies to every container using the untagged `ollama/ollama` image
  # (both the server and the model-sync sidecar).
  - name: ollama/ollama
    newTag: "0.17.5"
 # NOTE(review): unless name-suffix hashing is disabled elsewhere, kustomize
 # presumably appends a content hash to these names and rewrites the
 # references in deployment.yaml - confirm.
 configMapGenerator:
  # Declarative model list, mounted at /config in the sidecar.
  - name: ollama-models
    files:
      - models.txt
  # Sync script, mounted at /scripts in the sidecar.
  - name: ollama-sync-script
    files:
      - sync-models.sh
--- a/argocd/manifests/ollama/models.txt
+++ b/argocd/manifests/ollama/models.txt
@ -0,0 +1,6 @@
 # Models to pull from Ollama registry
 # One model per line. Comments with #.
 qwen2.5:14b
 deepseek-r1:14b
 phi4:14b
 gemma3:12b
--- a/argocd/manifests/ollama/pv-hostpath.yaml
+++ b/argocd/manifests/ollama/pv-hostpath.yaml
@ -0,0 +1,15 @@
 ---
 # Statically defined volume for Ollama model storage, backed by a directory
 # on the node's local disk. Bound by the `ollama-models` PVC via volumeName.
 apiVersion: v1
 kind: PersistentVolume
 metadata:
  name: ollama-models-pv
 spec:
  capacity:
    storage: 200Gi
  accessModes:
    - ReadWriteOnce
  # Keep the volume (and the models on disk) if the claim is deleted.
  persistentVolumeReclaimPolicy: Retain
  # Empty class, matching the PVC.
  # NOTE(review): presumably keeps the cluster's default dynamic provisioner
  # from handling the claim - confirm.
  storageClassName: ""
  # NOTE(review): no nodeAffinity is set; this assumes the cluster has a
  # single node that owns /mnt/storage1 - confirm.
  hostPath:
    path: /mnt/storage1/ollama
    # The directory is created on the host if it does not exist yet.
    type: DirectoryOrCreate
--- a/argocd/manifests/ollama/pvc.yaml
+++ b/argocd/manifests/ollama/pvc.yaml
@ -0,0 +1,14 @@
 ---
 # Claim for the Ollama model store; mounted at /models by the deployment.
 apiVersion: v1
 kind: PersistentVolumeClaim
 metadata:
  name: ollama-models
  namespace: ollama
 spec:
  accessModes:
    - ReadWriteOnce
  # Empty class and explicit volumeName: bind to the pre-created hostPath PV
  # rather than requesting a dynamically provisioned volume.
  storageClassName: ""
  volumeName: ollama-models-pv
  resources:
    requests:
      # Matches the capacity declared on ollama-models-pv.
      storage: 200Gi
--- a/argocd/manifests/ollama/service.yaml
+++ b/argocd/manifests/ollama/service.yaml
@ -0,0 +1,13 @@
 ---
 # Service in front of the Ollama pods; it is the backend of the
 # ollama-tailscale ingress.
 apiVersion: v1
 kind: Service
 metadata:
  name: ollama
  namespace: ollama
 spec:
  # Selects the pods created by the ollama deployment.
  selector:
    app: ollama
  ports:
    # Ollama HTTP API; same port on the Service and in the container.
    - name: http
      port: 11434
      targetPort: 11434
--- a/argocd/manifests/ollama/sync-models.sh
+++ b/argocd/manifests/ollama/sync-models.sh
@ -0,0 +1,61 @@
 #!/bin/bash
 # Sync models listed in a ConfigMap-mounted file into the Ollama server.
 #
 # Runs as a sidecar in the ollama deployment and drives the server through
 # the ollama CLI. Loops forever: wait for the API, pull every listed model
 # that `ollama list` does not already report, then sleep SYNC_INTERVAL.
 #
 # Environment:
 #   MODEL_LIST     path of the model list file (default /config/models.txt)
 #   OLLAMA_HOST    server address used by the ollama CLI
 #                  (default http://localhost:11434)
 #   SYNC_INTERVAL  seconds to sleep between sync cycles (default 1800)
 #
 # Model list format: one model per line; blank lines are ignored and
 # everything from a `#` to the end of the line is treated as a comment.
 set -euo pipefail
 MODEL_LIST="${MODEL_LIST:-/config/models.txt}"
 OLLAMA_HOST="${OLLAMA_HOST:-http://localhost:11434}"
 SYNC_INTERVAL="${SYNC_INTERVAL:-1800}"
 export OLLAMA_HOST
 echo "Syncing models from ${MODEL_LIST} via ollama CLI (host: ${OLLAMA_HOST})"
 while true; do
    # Wait for ollama server to be ready (up to max_attempts polls, 5s apart)
    echo "Waiting for Ollama API..."
    max_attempts=60
    attempt=0
    until ollama list > /dev/null 2>&1; do
        attempt=$((attempt + 1))
        if [[ $attempt -ge $max_attempts ]]; then
            echo "Ollama not ready after ${max_attempts} attempts, will retry next cycle"
            sleep "$SYNC_INTERVAL"
            # Restart the outer `while true` loop, not just this `until` loop
            continue 2
        fi
        sleep 5
    done
    echo "Ollama is ready"
    # Names (first column, header row dropped) of the currently pulled models
    current=$(ollama list 2>/dev/null | tail -n +2 | awk '{print $1}' || true)
    pulled=0
    skipped=0
    while IFS= read -r model || [[ -n "$model" ]]; do
        # Drop comments (whole-line or trailing), then trim surrounding
        # whitespace with parameter expansion; unlike xargs this never
        # reinterprets quotes or backslashes in the line.
        model="${model%%#*}"
        model="${model#"${model%%[![:space:]]*}"}"
        model="${model%"${model##*[![:space:]]}"}"
        [[ -z "$model" ]] && continue
        # ollama list shows name:tag, so compare against the tagged form.
        # An entry without a tag on its last path segment means name:latest.
        wanted="$model"
        [[ "${model##*/}" == *:* ]] || wanted="${model}:latest"
        # Whole-line fixed-string match: a substring match would report
        # "gemma3" as present when only "gemma3:12b" is installed. The
        # here-string also avoids a pipeline that could fail under pipefail
        # when grep -q exits before its input has been fully written.
        if grep -qxF -- "$wanted" <<< "$current"; then
            echo "Already present: $model"
            skipped=$((skipped + 1))
        else
            echo "Pulling: $model"
            # stdin is detached so the CLI can never consume the remaining
            # lines of the model list that feeds this loop.
            if ollama pull "$model" < /dev/null; then
                echo "Pulled: $model"
                pulled=$((pulled + 1))
            else
                echo "Warning: Failed to pull $model" >&2
            fi
        fi
    done < "$MODEL_LIST"
    echo "Sync complete: $pulled pulled, $skipped already present"
    echo "Next sync in ${SYNC_INTERVAL}s"
    sleep "$SYNC_INTERVAL"
 done
--- a/docs/changelog.d/feature-ollama-ringtail.feature.md
+++ b/docs/changelog.d/feature-ollama-ringtail.feature.md
@ -0,0 +1 @@
 Deploy Ollama LLM server on ringtail with GPU acceleration and declarative model management
--- a/service-versions.yaml
+++ b/service-versions.yaml
@ -135,6 +135,13 @@ services:
    current-version: "2026.2.0"
    upstream-source: https://github.com/goauthentik/authentik/releases
  - name: ollama
    type: argocd
    last-reviewed: "2026-03-02"
    current-version: "0.17.5"
    upstream-source: https://github.com/ollama/ollama/releases
    notes: LLM inference server on ringtail (GPU); upstream container image
  - name: navidrome
    type: argocd
    last-reviewed: 2026-03-02
		`@ -0,0 +1 @@`
							`Deploy Ollama LLM server on ringtail with GPU acceleration and declarative model management`