From 91d84e54d58b539ecfe44d9cd8226ca2ae4894ed Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Wed, 4 Mar 2026 20:58:11 -0800 Subject: [PATCH] Replace OOMKilled stat with detail table, shrink waiting reason panel The count-only stat wasn't actionable. New table shows pod name, container, restart count, and memory limit for each OOMKilled container. Waiting reason panel narrowed to make room. Co-Authored-By: Claude Opus 4.6 --- .../dashboards/configmap-kubernetes.yaml | 27 ++++++++++++++----- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/argocd/manifests/grafana-config/dashboards/configmap-kubernetes.yaml b/argocd/manifests/grafana-config/dashboards/configmap-kubernetes.yaml index 0b74e9d..61258de 100644 --- a/argocd/manifests/grafana-config/dashboards/configmap-kubernetes.yaml +++ b/argocd/manifests/grafana-config/dashboards/configmap-kubernetes.yaml @@ -96,16 +96,29 @@ data: "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, + "custom": { "align": "auto", "cellOptions": { "type": "auto" }, "inspect": false }, "mappings": [], - "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] } - } + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] } + }, + "overrides": [ + { "matcher": { "id": "byName", "options": "Memory Limit" }, "properties": [{ "id": "unit", "value": "bytes" }] }, + { "matcher": { "id": "byName", "options": "Restarts" }, "properties": [{ "id": "custom.cellOptions", "value": { "type": "color-text" } }, { "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "yellow", "value": null }, { "color": "red", "value": 5 }] } }] } + ] }, - "gridPos": { "h": 4, "w": 4, "x": 4, "y": 3 }, + "gridPos": { "h": 4, "w": 12, "x": 4, "y": 3 }, "id": 13, - "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value" }, - "targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\", cluster=~\"$cluster\", namespace=~\"$namespace\"} == 1) or vector(0)", "refId": "A" }], + "options": { "cellHeight": "sm", "footer": { "countRows": false, "fields": "", "reducer": ["sum"], "show": false }, "showHeader": true, "sortBy": [{ "desc": true, "displayName": "Restarts" }] }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\", cluster=~\"$cluster\", namespace=~\"$namespace\"} == 1", "format": "table", "instant": true, "refId": "oom" }, + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "kube_pod_container_status_restarts_total{cluster=~\"$cluster\", namespace=~\"$namespace\"} * on(namespace, pod, container, cluster) kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\", cluster=~\"$cluster\", namespace=~\"$namespace\"}", "format": "table", "instant": true, "refId": "restarts" }, + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "kube_pod_container_resource_limits{resource=\"memory\", cluster=~\"$cluster\", namespace=~\"$namespace\"} * on(namespace, pod, container, cluster) kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\", cluster=~\"$cluster\", namespace=~\"$namespace\"}", "format": "table", "instant": true, "refId": "memlim" } + ], "title": "OOMKilled Containers", - "type": "stat" + "transformations": [ + { "id": "seriesToColumns", "options": { "byField": "pod" } }, + { "id": "organize", "options": { "excludeByName": { "Time": true, "Time 1": true, "Time 2": true, "Time 3": true, "Value #oom": true, "__name__": true, "__name__ 1": true, "__name__ 2": true, "cluster": true, "cluster 1": true, "cluster 2": true, "container 1": true, "container 2": true, "instance": true, "instance 1": true, "instance 2": true, "job": true, "job 1": true, "job 2": true, "namespace 1": true, "namespace 2": true, "reason": true, "resource": true, "uid": true, "uid 1": true, "uid 2": true, "unit": true }, "renameByName": { "namespace": "Namespace", "pod": "Pod", "container": "Container", "Value #restarts": "Restarts", "Value #memlim": "Memory Limit" } } } + ], + "type": "table" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, @@ -118,7 +131,7 @@ data: "unit": "short" } }, - "gridPos": { "h": 4, "w": 16, "x": 8, "y": 3 }, + "gridPos": { "h": 4, "w": 8, "x": 16, "y": 3 }, "id": 8, "options": { "legend": { "calcs": ["lastNotNull"], "displayMode": "table", "placement": "right", "showLegend": true, "sortBy": "Last *", "sortDesc": true }, "tooltip": { "mode": "multi", "sort": "desc" } }, "targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (reason) (kube_pod_container_status_waiting_reason{cluster=~\"$cluster\", namespace=~\"$namespace\"})", "legendFormat": "{{reason}}", "refId": "A" }],