From c26026f4e9010d2a51c1464264d86d16baaf4187 Mon Sep 17 00:00:00 2001
From: Erich Blume <blume.erich@gmail.com>
Date: Wed, 11 Mar 2026 20:33:22 -0700
Subject: [PATCH] Bump Ollama memory to 24Gi and enable flash attention

The 27B Q4_K_M model needs ~7.3 GiB of system RAM for CPU-offloaded
layers, but only 6.8 GiB was available within the 22Gi cgroup limit.
Bumping the container memory limit to 24Gi and enabling flash attention
(reduces KV cache memory) should provide enough headroom.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 argocd/manifests/ollama/deployment.yaml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/argocd/manifests/ollama/deployment.yaml b/argocd/manifests/ollama/deployment.yaml
index 6d02ca3..060fe8f 100644
--- a/argocd/manifests/ollama/deployment.yaml
+++ b/argocd/manifests/ollama/deployment.yaml
@@ -32,6 +32,8 @@ spec:
               value: "1"
             - name: OLLAMA_NUM_PARALLEL
               value: "1"
+            - name: OLLAMA_FLASH_ATTENTION
+              value: "1"
           volumeMounts:
             - name: models
               mountPath: /models
@@ -40,7 +42,7 @@ spec:
               memory: "512Mi"
               cpu: "500m"
             limits:
-              memory: "22Gi"
+              memory: "24Gi"
               cpu: "4000m"
               nvidia.com/gpu: "1"
           livenessProbe: