Limit ollama to one loaded model and one parallel request

Prevents OOM when switching between models — only one 14B model, together with its KV cache for context, fits in 16GB VRAM at a time.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-02 21:23:12 -08:00 · 2026-03-02 21:23:12 -08:00 · a32c99a252
commit a32c99a252
parent 203e3cd567
1 changed files with 4 additions and 0 deletions
--- a/argocd/manifests/ollama/deployment.yaml
+++ b/argocd/manifests/ollama/deployment.yaml
@@ -28,6 +28,10 @@ spec:
              value: /models
            - name: OLLAMA_HOST
              value: "0.0.0.0:11434"
+            - name: OLLAMA_MAX_LOADED_MODELS
+              value: "1"
+            - name: OLLAMA_NUM_PARALLEL
+              value: "1"
          volumeMounts:
            - name: models
              mountPath: /models