From c26026f4e9010d2a51c1464264d86d16baaf4187 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Wed, 11 Mar 2026 20:33:22 -0700 Subject: [PATCH] Bump Ollama memory to 24Gi and enable flash attention The 27B Q4_K_M model needs ~7.3 GiB system RAM for CPU-offloaded layers but only 6.8 GiB was available within the 22Gi cgroup. Bumping to 24Gi and enabling flash attention (reduces memory usage as context grows, and is a prerequisite for quantizing the KV cache) should provide enough headroom. Co-Authored-By: Claude Opus 4.6 --- argocd/manifests/ollama/deployment.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/argocd/manifests/ollama/deployment.yaml b/argocd/manifests/ollama/deployment.yaml index 6d02ca3..060fe8f 100644 --- a/argocd/manifests/ollama/deployment.yaml +++ b/argocd/manifests/ollama/deployment.yaml @@ -32,6 +32,8 @@ spec: value: "1" - name: OLLAMA_NUM_PARALLEL value: "1" + - name: OLLAMA_FLASH_ATTENTION + value: "1" volumeMounts: - name: models mountPath: /models @@ -40,7 +42,7 @@ spec: memory: "512Mi" cpu: "500m" limits: - memory: "22Gi" + memory: "24Gi" cpu: "4000m" nvidia.com/gpu: "1" livenessProbe: