Deploy Ollama LLM server on ringtail (#277)

## Summary
- Deploy Ollama as a new ArgoCD-managed service on ringtail's k3s cluster with GPU acceleration
- Declarative model management via `models.txt` + sidecar sync script (mirrors kiwix torrent pattern)
- Initial models: `qwen2.5:14b`, `deepseek-r1:14b`, `phi4:14b`, `gemma3:12b`
- hostPath PV on `/mnt/storage1/ollama` for fast local model storage (200Gi)
- Tailscale ingress at `ollama.ops.eblu.me` for API access from tailnet
- Enable GPU time-slicing (`replicas: 2`) on nvidia-device-plugin so Frigate and Ollama share the RTX 4080

## Deployment and Testing
- [ ] Deploy nvidia-device-plugin changes first: `argocd app sync nvidia-device-plugin`
- [ ] Verify GPU time-slicing: `kubectl describe node ringtail --context=k3s-ringtail` shows `nvidia.com/gpu: 2`
- [ ] Sync `apps` app with `--revision feature/ollama-ringtail`
- [ ] Set ollama app to branch: `argocd app set ollama --revision feature/ollama-ringtail && argocd app sync ollama`
- [ ] Verify model-sync sidecar pulls models: `kubectl logs -n ollama deploy/ollama -c model-sync --context=k3s-ringtail`
- [ ] Test API: `curl https://ollama.ops.eblu.me/api/tags`
- [ ] Test inference: `curl https://ollama.ops.eblu.me/api/generate -d '{"model":"qwen2.5:14b","prompt":"Hello"}'`
- [ ] Verify Frigate still works after GPU sharing change
- [ ] After merge: `argocd app set ollama --revision main && argocd app sync ollama`

Reviewed-on: https://forge.ops.eblu.me/eblume/blumeops/pulls/277
This commit is contained in:
Erich Blume 2026-03-02 20:39:51 -08:00
commit 31d925814f
15 changed files with 292 additions and 0 deletions

View file

@ -85,6 +85,9 @@ caddy_services:
- name: ntfy
host: "ntfy.{{ caddy_domain }}"
backend: "https://ntfy.tail8d86e.ts.net"
- name: ollama
host: "ollama.{{ caddy_domain }}"
backend: "https://ollama.tail8d86e.ts.net"
- name: sifaka
host: "nas.{{ caddy_domain }}"
backend: "http://sifaka:5000"

18
argocd/apps/ollama.yaml Normal file
View file

@ -0,0 +1,18 @@
---
# ArgoCD Application that deploys the Ollama manifests from the blumeops
# repository to the ringtail cluster.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: ollama
  namespace: argocd
spec:
  project: default
  # Where the manifests live: repo, branch and directory.
  source:
    repoURL: ssh://forgejo@forge.ops.eblu.me:2222/eblume/blumeops.git
    path: argocd/manifests/ollama
    targetRevision: main
  # Target cluster API server and the namespace the resources land in.
  destination:
    server: https://ringtail.tail8d86e.ts.net:6443
    namespace: ollama
  # Only sync options are set here; no automated sync policy is configured.
  syncPolicy:
    syncOptions:
      # Have ArgoCD create the destination namespace if it is missing.
      - CreateNamespace=true

View file

@ -25,6 +25,7 @@ spec:
image: nvcr.io/nvidia/k8s-device-plugin
args:
- --device-id-strategy=index
- --config-file=/config/config.yaml
env:
- name: LD_LIBRARY_PATH
value: /run/nvidia/lib
@ -39,6 +40,9 @@ spec:
- name: nvidia-libs
mountPath: /run/nvidia/lib
readOnly: true
- name: plugin-config
mountPath: /config
readOnly: true
volumes:
- name: device-plugins
hostPath:
@ -49,3 +53,6 @@ spec:
- name: nvidia-libs
hostPath:
path: /etc/nvidia-driver/lib
- name: plugin-config
configMap:
name: nvidia-device-plugin-config

View file

@ -6,6 +6,7 @@ namespace: nvidia-device-plugin
resources:
- daemonset.yaml
- runtime-class.yaml
- time-slicing-config.yaml
images:
- name: nvcr.io/nvidia/k8s-device-plugin

View file

@ -0,0 +1,14 @@
---
# Configuration file for the NVIDIA device plugin. The daemonset mounts this
# ConfigMap at /config and passes --config-file=/config/config.yaml.
apiVersion: v1
kind: ConfigMap
metadata:
  name: nvidia-device-plugin-config
  namespace: nvidia-device-plugin
data:
  # Time-slicing: with replicas set to 2 the node is expected to report
  # nvidia.com/gpu: 2, so two pods can each request one GPU.
  config.yaml: |
    version: v1
    sharing:
      timeSlicing:
        resources:
          - name: nvidia.com/gpu
            replicas: 2

View file

@ -0,0 +1,84 @@
---
# Ollama server plus a "model-sync" sidecar that pulls the models listed in
# models.txt through the server's API.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ollama
  namespace: ollama
spec:
  replicas: 1
  # Recreate: the running pod is terminated before its replacement is started.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: ollama
  template:
    metadata:
      labels:
        app: ollama
    spec:
      runtimeClassName: nvidia
      containers:
        # --- Ollama API server -------------------------------------------
        - name: ollama
          # Untagged on purpose: the tag is set by the kustomization's
          # `images` entry for ollama/ollama.
          image: ollama/ollama
          ports:
            - name: http
              containerPort: 11434
          env:
            # Model store lives on the persistent volume mounted at /models.
            - name: OLLAMA_MODELS
              value: /models
            # Listen on all interfaces so the Service can reach the pod.
            - name: OLLAMA_HOST
              value: "0.0.0.0:11434"
          volumeMounts:
            - name: models
              mountPath: /models
          resources:
            requests:
              cpu: "500m"
              memory: "512Mi"
            limits:
              cpu: "4000m"
              memory: "16Gi"
              # One (time-sliced) GPU replica.
              nvidia.com/gpu: "1"
          # Both probes hit the model-listing endpoint of the API.
          livenessProbe:
            httpGet:
              path: /api/tags
              port: 11434
            initialDelaySeconds: 30
            periodSeconds: 30
          readinessProbe:
            httpGet:
              path: /api/tags
              port: 11434
            initialDelaySeconds: 10
            periodSeconds: 10
        # --- Model sync sidecar ------------------------------------------
        # Same image (for the ollama CLI), but it runs the sync script
        # instead of the server and talks to the server over localhost.
        - name: model-sync
          image: ollama/ollama
          command:
            - /bin/bash
            - /scripts/sync-models.sh
          env:
            - name: MODEL_LIST
              value: /config/models.txt
            - name: OLLAMA_HOST
              value: "http://localhost:11434"
          volumeMounts:
            - name: models-config
              mountPath: /config
            - name: sync-script
              mountPath: /scripts
          resources:
            requests:
              cpu: "50m"
              memory: "64Mi"
            limits:
              cpu: "200m"
              memory: "256Mi"
      volumes:
        # Persistent model storage.
        - name: models
          persistentVolumeClaim:
            claimName: ollama-models
        # models.txt, generated into a ConfigMap by kustomize.
        - name: models-config
          configMap:
            name: ollama-models
        # sync-models.sh, generated into a ConfigMap by kustomize.
        - name: sync-script
          configMap:
            name: ollama-sync-script
            defaultMode: 0755  # yamllint disable-line rule:octal-values

View file

@ -0,0 +1,26 @@
---
# Publishes the ollama Service through the `tailscale` ingress class and
# registers it on the homepage dashboard.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: ollama-tailscale
  namespace: ollama
  annotations:
    # Tailscale proxy selection.
    tailscale.com/proxy-class: "default"
    tailscale.com/proxy-group: "ingress"
    # Homepage (gethomepage.dev) tile for this service.
    gethomepage.dev/enabled: "true"
    gethomepage.dev/name: "Ollama"
    gethomepage.dev/group: "AI"
    gethomepage.dev/icon: "ollama.png"
    gethomepage.dev/description: "LLM inference server"
    gethomepage.dev/href: "https://ollama.ops.eblu.me"
    gethomepage.dev/pod-selector: "app=ollama"
spec:
  ingressClassName: tailscale
  # No path rules are defined; all traffic goes to the default backend,
  # the ollama Service on port 11434.
  defaultBackend:
    service:
      name: ollama
      port:
        number: 11434
  tls:
    - hosts:
        - ollama

View file

@ -0,0 +1,22 @@
---
# Kustomize entry point for the Ollama manifests.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: ollama
resources:
  - pv-hostpath.yaml
  - pvc.yaml
  - deployment.yaml
  - service.yaml
  - ingress-tailscale.yaml
# Pin the tag for every container that uses the ollama/ollama image.
images:
  - name: ollama/ollama
    newTag: "0.17.5"
# ConfigMaps generated from files in this directory; the Deployment mounts
# them as the model list and the sync script.
configMapGenerator:
  - name: ollama-models
    files:
      - models.txt
  - name: ollama-sync-script
    files:
      - sync-models.sh

View file

@ -0,0 +1,6 @@
# Models to pull from the Ollama registry.
# One model per line. Blank lines and lines starting with # are ignored.
qwen2.5:14b
deepseek-r1:14b
phi4:14b
gemma3:12b

View file

@ -0,0 +1,15 @@
---
# Statically defined volume backed by a host directory; bound by name from
# the ollama-models claim.
apiVersion: v1
kind: PersistentVolume
metadata:
  name: ollama-models-pv
spec:
  # Empty storage class: this volume is not tied to any StorageClass.
  storageClassName: ""
  capacity:
    storage: 200Gi
  accessModes:
    - ReadWriteOnce
  # Retain: the volume is not deleted when its claim is removed.
  persistentVolumeReclaimPolicy: Retain
  hostPath:
    path: /mnt/storage1/ollama
    # Create the directory on the host if it does not exist yet.
    type: DirectoryOrCreate

View file

@ -0,0 +1,14 @@
---
# Claim for the model store, bound explicitly to the ollama-models-pv volume.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: ollama-models
  namespace: ollama
spec:
  # Empty storage class plus an explicit volumeName: bind to the named
  # volume rather than requesting one from a StorageClass.
  storageClassName: ""
  volumeName: ollama-models-pv
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 200Gi

View file

@ -0,0 +1,13 @@
---
# Service in front of the pods labelled app=ollama; it is the backend of the
# ollama-tailscale Ingress.
apiVersion: v1
kind: Service
metadata:
  name: ollama
  namespace: ollama
spec:
  selector:
    app: ollama
  ports:
    # Ollama HTTP API, same port on the Service and on the container.
    - name: http
      port: 11434
      targetPort: 11434

View file

@ -0,0 +1,61 @@
#!/bin/bash
# Sync models from ConfigMap to Ollama server
# Runs as a sidecar in the ollama deployment, using the ollama CLI
#
# Environment:
#   MODEL_LIST     path of the model list file (default: /config/models.txt)
#   OLLAMA_HOST    address of the Ollama API   (default: http://localhost:11434)
#   SYNC_INTERVAL  seconds between sync passes (default: 1800)
#
# Model list format: one model per line; blank lines and everything from a
# '#' to the end of the line are ignored.
#
# The sync is additive: models missing on the server are pulled, models that
# are present but not listed are left untouched.
set -euo pipefail

MODEL_LIST="${MODEL_LIST:-/config/models.txt}"
OLLAMA_HOST="${OLLAMA_HOST:-http://localhost:11434}"
SYNC_INTERVAL="${SYNC_INTERVAL:-1800}"
export OLLAMA_HOST

echo "Syncing models from ${MODEL_LIST} via ollama CLI (host: ${OLLAMA_HOST})"

while true; do
    # Wait for ollama server to be ready (up to max_attempts polls, 5s apart)
    echo "Waiting for Ollama API..."
    max_attempts=60
    attempt=0
    until ollama list > /dev/null 2>&1; do
        attempt=$((attempt + 1))
        if [[ $attempt -ge $max_attempts ]]; then
            echo "Ollama not ready after ${max_attempts} attempts, will retry next cycle"
            sleep "$SYNC_INTERVAL"
            # Level 2 = the outer `while true`: start a fresh sync pass.
            continue 2
        fi
        sleep 5
    done
    echo "Ollama is ready"

    # Names of the models already on the server: first column of
    # `ollama list` with the header row dropped.
    current=$(ollama list 2>/dev/null | tail -n +2 | awk '{print $1}' || true)

    pulled=0
    skipped=0
    # `|| [[ -n "$model" ]]` keeps a final line that has no trailing newline.
    while IFS= read -r model || [[ -n "$model" ]]; do
        # Drop comments, both whole-line and trailing.
        model="${model%%#*}"
        # Trim leading/trailing whitespace (including a stray CR) in pure
        # bash; xargs would also interpret quotes and backslashes.
        model="${model#"${model%%[![:space:]]*}"}"
        model="${model%"${model##*[![:space:]]}"}"
        [[ -z "$model" ]] && continue

        # `ollama list` always prints name:tag, and an untagged name means
        # the "latest" tag, so compare against the fully tagged form. Only
        # the last path segment is inspected so that a registry host:port
        # prefix is not mistaken for a tag.
        wanted="$model"
        [[ "${model##*/}" == *:* ]] || wanted="${model}:latest"

        # Exact whole-line match (-x): a substring match would treat
        # "foo:14b" as present when only "foo:14b-instruct" is installed.
        # A here-string is used instead of `echo | grep -q` so that an early
        # grep exit cannot turn into a pipefail failure (false "missing").
        if grep -qxF -- "$wanted" <<< "$current"; then
            echo "Already present: $model"
            skipped=$((skipped + 1))
        else
            echo "Pulling: $model"
            # stdin is redirected so the CLI can never consume the rest of
            # the model list that this loop is reading.
            if ollama pull "$model" < /dev/null; then
                echo "Pulled: $model"
                pulled=$((pulled + 1))
            else
                # Best effort: report and carry on with the next model.
                echo "Warning: Failed to pull $model" >&2
            fi
        fi
    done < "$MODEL_LIST"

    echo "Sync complete: $pulled pulled, $skipped already present"
    echo "Next sync in ${SYNC_INTERVAL}s"
    sleep "$SYNC_INTERVAL"
done

View file

@ -0,0 +1 @@
Deploy Ollama LLM server on ringtail with GPU acceleration and declarative model management

View file

@ -135,6 +135,13 @@ services:
current-version: "2026.2.0"
upstream-source: https://github.com/goauthentik/authentik/releases
- name: ollama
type: argocd
last-reviewed: "2026-03-02"
current-version: "0.17.5"
upstream-source: https://github.com/ollama/ollama/releases
notes: LLM inference server on ringtail (GPU); upstream container image
- name: navidrome
type: argocd
last-reviewed: 2026-03-02