From a32c99a2526f87ecd9881e04465244a8ca0b53b2 Mon Sep 17 00:00:00 2001
From: Erich Blume <blume.erich@gmail.com>
Date: Mon, 2 Mar 2026 21:23:12 -0800
Subject: [PATCH] Limit ollama to one loaded model and one parallel request
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

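Prevents OOM when switching between models — only one 14B model
fits in 16 GB of VRAM at a time once the KV cache for its context is
included. Capping parallel requests at one keeps that KV cache from
being sized for multiple concurrent request slots.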

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 argocd/manifests/ollama/deployment.yaml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/argocd/manifests/ollama/deployment.yaml b/argocd/manifests/ollama/deployment.yaml
index 2b68e55..795c458 100644
--- a/argocd/manifests/ollama/deployment.yaml
+++ b/argocd/manifests/ollama/deployment.yaml
@@ -28,6 +28,10 @@ spec:
               value: /models
             - name: OLLAMA_HOST
               value: "0.0.0.0:11434"
+            - name: OLLAMA_MAX_LOADED_MODELS
+              value: "1"
+            - name: OLLAMA_NUM_PARALLEL
+              value: "1"
           volumeMounts:
             - name: models
               mountPath: /models