## Summary

- Deploy Ollama as a new ArgoCD-managed service on ringtail's k3s cluster with GPU acceleration
- Declarative model management via `models.txt` + sidecar sync script (mirrors kiwix torrent pattern)
- Initial models: `qwen2.5:14b`, `deepseek-r1:14b`, `phi4:14b`, `gemma3:12b`
- hostPath PV on `/mnt/storage1/ollama` for fast local model storage (200Gi)
- Tailscale ingress at `ollama.ops.eblu.me` for API access from tailnet
- Enable GPU time-slicing (`replicas: 2`) on nvidia-device-plugin so Frigate and Ollama share the RTX 4080

## Deployment and Testing

- [ ] Deploy nvidia-device-plugin changes first: `argocd app sync nvidia-device-plugin`
- [ ] Verify GPU time-slicing: `kubectl describe node ringtail --context=k3s-ringtail` shows `nvidia.com/gpu: 2`
- [ ] Sync `apps` app with `--revision feature/ollama-ringtail`
- [ ] Set ollama app to branch: `argocd app set ollama --revision feature/ollama-ringtail && argocd app sync ollama`
- [ ] Verify model-sync sidecar pulls models: `kubectl logs -n ollama deploy/ollama -c model-sync --context=k3s-ringtail`
- [ ] Test API: `curl https://ollama.ops.eblu.me/api/tags`
- [ ] Test inference: `curl https://ollama.ops.eblu.me/api/generate -d '{"model":"qwen2.5:14b","prompt":"Hello"}'`
- [ ] Verify Frigate still works after GPU sharing change
- [ ] After merge: `argocd app set ollama --revision main && argocd app sync ollama`

Reviewed-on: https://forge.ops.eblu.me/eblume/blumeops/pulls/277
61 lines
1.9 KiB
Bash
61 lines
1.9 KiB
Bash
#!/bin/bash
# Sync models from ConfigMap to Ollama server
# Runs as a sidecar in the ollama deployment, using the ollama CLI
#
# Each cycle waits until `ollama list` succeeds, then runs `ollama pull` for
# every model named in MODEL_LIST that is not already listed. Nothing is ever
# removed. The loop repeats forever, sleeping SYNC_INTERVAL between cycles.
#
# Environment:
#   MODEL_LIST     File with one model per line; blank lines and lines whose
#                  first non-blank character is '#' are ignored
#                  (default: /config/models.txt)
#   OLLAMA_HOST    Server address, exported so the ollama CLI picks it up
#                  (default: http://localhost:11434)
#   SYNC_INTERVAL  Seconds to sleep between cycles (default: 1800)
set -euo pipefail

MODEL_LIST="${MODEL_LIST:-/config/models.txt}"
OLLAMA_HOST="${OLLAMA_HOST:-http://localhost:11434}"
SYNC_INTERVAL="${SYNC_INTERVAL:-1800}"

export OLLAMA_HOST

echo "Syncing models from ${MODEL_LIST} via ollama CLI (host: ${OLLAMA_HOST})"

while true; do
  # Wait for ollama server to be ready (up to max_attempts polls, 5s apart)
  echo "Waiting for Ollama API..."
  max_attempts=60
  attempt=0
  until ollama list > /dev/null 2>&1; do
    attempt=$((attempt + 1))
    if [[ $attempt -ge $max_attempts ]]; then
      echo "Ollama not ready after ${max_attempts} attempts, will retry next cycle"
      sleep "$SYNC_INTERVAL"
      # Level 2 = the outer `while true`: give up on this cycle entirely
      continue 2
    fi
    sleep 5
  done
  echo "Ollama is ready"

  # Get list of currently pulled models: first column of `ollama list`,
  # header row dropped. A failure here is tolerated (treated as "nothing
  # pulled yet"), so every model simply gets a pull attempt below.
  current=$(ollama list 2>/dev/null | awk 'NR > 1 {print $1}' || true)

  pulled=0
  skipped=0

  # The list is read on FD 3 so a child process (ollama pull) that reads
  # stdin cannot swallow the remaining lines. The `|| [[ -n ... ]]` clause
  # keeps a final line that lacks a trailing newline.
  while IFS= read -r -u 3 model || [[ -n "$model" ]]; do
    # Skip empty lines and comments
    if [[ -z "$model" || "$model" =~ ^[[:space:]]*# ]]; then
      continue
    fi

    # Trim leading/trailing whitespace (including a stray CR) with parameter
    # expansion; unlike `xargs` this never interprets quotes or backslashes.
    model="${model#"${model%%[![:space:]]*}"}"
    model="${model%"${model##*[![:space:]]}"}"
    if [[ -z "$model" ]]; then
      continue
    fi

    # ollama list shows name:tag, so an entry without a tag is compared as
    # NAME:latest. Only the last path component is inspected so a registry
    # port (host:5000/model) is not mistaken for a tag.
    wanted="$model"
    if [[ "${model##*/}" != *:* ]]; then
      wanted="${model}:latest"
    fi

    # Whole-line fixed-string match: a substring match would treat
    # "qwen2.5:14b" as present when only "qwen2.5:14b-instruct" is installed.
    # A here-string (rather than echo | grep -q) avoids a spurious pipefail
    # failure when grep exits before the writer has finished.
    if grep -qxF -- "$wanted" <<< "$current"; then
      echo "Already present: $model"
      skipped=$((skipped + 1))
    else
      echo "Pulling: $model"
      if ollama pull "$model"; then
        echo "Pulled: $model"
        pulled=$((pulled + 1))
      else
        # Best effort: keep going with the remaining models
        echo "Warning: Failed to pull $model" >&2
      fi
    fi
  done 3< "$MODEL_LIST"

  echo "Sync complete: $pulled pulled, $skipped already present"
  echo "Next sync in ${SYNC_INTERVAL}s"
  sleep "$SYNC_INTERVAL"
done