From 49b4a9f5bee0f840a36819af2c201b343c9e1c86 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Tue, 3 Feb 2026 07:11:47 -0800 Subject: [PATCH 1/2] Add pod state observability to minikube dashboard - Add "Unhealthy Pods" stat panel that shows count of pods in error states (ImagePullBackOff, CrashLoopBackOff, etc.) with red background when > 0 - Add "Pods by Waiting Reason" time series showing container waiting states - This provides visibility into stuck pods that ArgoCD doesn't track Co-Authored-By: Claude Opus 4.5 --- .../dashboards/configmap-minikube.yaml | 42 +++++++++++++++++-- 1 file changed, 38 insertions(+), 4 deletions(-) diff --git a/argocd/manifests/grafana-config/dashboards/configmap-minikube.yaml b/argocd/manifests/grafana-config/dashboards/configmap-minikube.yaml index 2b1956f..cbca654 100644 --- a/argocd/manifests/grafana-config/dashboards/configmap-minikube.yaml +++ b/argocd/manifests/grafana-config/dashboards/configmap-minikube.yaml @@ -107,6 +107,22 @@ data: "title": "CPU Requests (cores)", "type": "stat" }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] } + } + }, + "gridPos": { "h": 4, "w": 6, "x": 0, "y": 3 }, + "id": 13, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value" }, + "targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(sum by (namespace, pod) (kube_pod_container_status_waiting_reason{namespace=~\"$namespace\", reason=~\"ImagePullBackOff|ErrImagePull|CrashLoopBackOff|CreateContainerConfigError|CreateContainerError|InvalidImageName|RunContainerError\"}) > 0) or vector(0)", "refId": "A" }], + "title": "Unhealthy Pods", + "type": "stat" + }, { "datasource": { "type": "prometheus", "uid": 
"prometheus" }, "fieldConfig": { @@ -118,7 +134,25 @@ data: "unit": "short" } }, - "gridPos": { "h": 8, "w": 12, "x": 0, "y": 3 }, + "gridPos": { "h": 4, "w": 18, "x": 6, "y": 3 }, + "id": 14, + "options": { "legend": { "calcs": ["lastNotNull"], "displayMode": "table", "placement": "right", "showLegend": true, "sortBy": "Last *", "sortDesc": true }, "tooltip": { "mode": "multi", "sort": "desc" } }, + "targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (reason) (kube_pod_container_status_waiting_reason{namespace=~\"$namespace\"})", "legendFormat": "{{reason}}", "refId": "A" }], + "title": "Pods by Waiting Reason", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "bars", "fillOpacity": 80, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "normal" }, "thresholdsStyle": { "mode": "off" } }, + "mappings": [], + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }, + "unit": "short" + } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 7 }, "id": 9, "options": { "legend": { "calcs": ["lastNotNull"], "displayMode": "table", "placement": "right", "showLegend": true, "sortBy": "Last *", "sortDesc": true }, "tooltip": { "mode": "multi", "sort": "desc" } }, "targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count by (namespace) (kube_pod_info{namespace=~\"$namespace\"})", "legendFormat": "{{namespace}}", "refId": "A" }], @@ -136,7 +170,7 @@ data: 
"unit": "bytes" } }, - "gridPos": { "h": 8, "w": 12, "x": 12, "y": 3 }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 7 }, "id": 10, "options": { "legend": { "calcs": ["lastNotNull"], "displayMode": "table", "placement": "right", "showLegend": true, "sortBy": "Last *", "sortDesc": true }, "tooltip": { "mode": "multi", "sort": "desc" } }, "targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (namespace) (kube_pod_container_resource_requests{resource=\"memory\",namespace=~\"$namespace\"})", "legendFormat": "{{namespace}}", "refId": "A" }], @@ -159,7 +193,7 @@ data: { "matcher": { "id": "byName", "options": "CPU Limits" }, "properties": [{ "id": "unit", "value": "short" }] } ] }, - "gridPos": { "h": 8, "w": 24, "x": 0, "y": 11 }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 15 }, "id": 11, "options": { "cellHeight": "sm", "footer": { "countRows": false, "fields": "", "reducer": ["sum"], "show": false }, "showHeader": true, "sortBy": [{ "desc": true, "displayName": "Pods" }] }, "targets": [ @@ -178,7 +212,7 @@ data: }, { "datasource": { "type": "loki", "uid": "loki" }, - "gridPos": { "h": 8, "w": 24, "x": 0, "y": 19 }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 23 }, "id": 12, "options": { "dedupStrategy": "none", "enableLogDetails": true, "prettifyLogMessage": false, "showCommonLabels": false, "showLabels": true, "showTime": true, "sortOrder": "Descending", "wrapLogMessage": false }, "targets": [{ "datasource": { "type": "loki", "uid": "loki" }, "expr": "{namespace=~\"$namespace\"}", "refId": "A" }], -- 2.50.1 (Apple Git-155) From 4f2e49f2192b22b883c16766b81d0c766d084830 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Tue, 3 Feb 2026 07:18:49 -0800 Subject: [PATCH 2/2] Add TTL to zim-watcher CronJob for automatic cleanup Jobs created by the CronJob will now auto-delete 4 days after completion. 
This prevents zombie Jobs from accumulating when the CronJob spec changes (e.g., image updates), since ArgoCD only tracks the CronJob resource itself, not the Jobs it spawns. Co-Authored-By: Claude Opus 4.5 --- argocd/manifests/kiwix/cronjob-zim-watcher.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/argocd/manifests/kiwix/cronjob-zim-watcher.yaml b/argocd/manifests/kiwix/cronjob-zim-watcher.yaml index 3532676..5de0990 100644 --- a/argocd/manifests/kiwix/cronjob-zim-watcher.yaml +++ b/argocd/manifests/kiwix/cronjob-zim-watcher.yaml @@ -9,6 +9,7 @@ spec: concurrencyPolicy: Forbid jobTemplate: spec: + ttlSecondsAfterFinished: 345600 # Auto-delete after 4 days template: spec: serviceAccountName: zim-watcher -- 2.50.1 (Apple Git-155)