From a32c99a2526f87ecd9881e04465244a8ca0b53b2 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Mon, 2 Mar 2026 21:23:12 -0800 Subject: [PATCH] Limit ollama to one loaded model and one parallel request MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Prevents OOM when switching between models — only one 14B model fits in 16GB VRAM at a time with KV cache for context. Co-Authored-By: Claude Opus 4.6 --- argocd/manifests/ollama/deployment.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/argocd/manifests/ollama/deployment.yaml b/argocd/manifests/ollama/deployment.yaml index 2b68e55..795c458 100644 --- a/argocd/manifests/ollama/deployment.yaml +++ b/argocd/manifests/ollama/deployment.yaml @@ -28,6 +28,10 @@ spec: value: /models - name: OLLAMA_HOST value: "0.0.0.0:11434" + - name: OLLAMA_MAX_LOADED_MODELS + value: "1" + - name: OLLAMA_NUM_PARALLEL + value: "1" volumeMounts: - name: models mountPath: /models