From 737371ab59bd7462dbc39edb06fb6cd093d04fa3 Mon Sep 17 00:00:00 2001
From: Erich Blume
Date: Tue, 3 Feb 2026 07:20:05 -0800
Subject: [PATCH] Add pod state observability to minikube dashboard (#83)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Summary

- Add "Unhealthy Pods" stat panel showing count of pods in error states (ImagePullBackOff, CrashLoopBackOff, etc.) with red background when > 0
- Add "Pods by Waiting Reason" time series chart showing container waiting states over time
- Provides visibility into stuck pods that ArgoCD doesn't track (since it manages CronJobs, not the Jobs/Pods they spawn)

## Context

This addresses the issue where a `zim-watcher` cronjob pod was stuck in `ImagePullBackOff` for 11 days without any alerting. ArgoCD showed the CronJob as "Synced, Healthy" because it only manages the CronJob resource, not its spawned Jobs/Pods.

## Deployment and Testing

- [ ] Sync grafana-config app to test branch
- [ ] Verify dashboard renders correctly
- [ ] Confirm "Unhealthy Pods" shows 0 (green) when no issues
- [ ] Reset to main after merge

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Reviewed-on: https://forge.ops.eblu.me/eblume/blumeops/pulls/83
---
 .../dashboards/configmap-minikube.yaml | 42 +++++++++++++++++--
 .../manifests/kiwix/cronjob-zim-watcher.yaml | 1 +
 2 files changed, 39 insertions(+), 4 deletions(-)

diff --git a/argocd/manifests/grafana-config/dashboards/configmap-minikube.yaml b/argocd/manifests/grafana-config/dashboards/configmap-minikube.yaml
index 2b1956f..cbca654 100644
--- a/argocd/manifests/grafana-config/dashboards/configmap-minikube.yaml
+++ b/argocd/manifests/grafana-config/dashboards/configmap-minikube.yaml
@@ -107,6 +107,22 @@ data:
  "title": "CPU Requests (cores)",
  "type": "stat"
  },
+ {
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
+ "fieldConfig": {
+ "defaults": {
+ "color": { "mode": "thresholds" },
+ "mappings": [],
+ "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }
+ }
+ },
+ "gridPos": { "h": 4, "w": 6, "x": 0, "y": 3 },
+ "id": 13,
+ "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value" },
+ "targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(kube_pod_container_status_waiting_reason{namespace=~\"$namespace\", reason=~\"ImagePullBackOff|ErrImagePull|CrashLoopBackOff|CreateContainerError|RunContainerError\"}) or vector(0)", "refId": "A" }],
+ "title": "Unhealthy Pods",
+ "type": "stat"
+ },
  {
  "datasource": { "type": "prometheus", "uid": "prometheus" },
  "fieldConfig": {
@@ -118,7 +134,25 @@ data:
  "unit": "short"
  }
  },
- "gridPos": { "h": 8, "w": 12, "x": 0, "y": 3 },
+ "gridPos": { "h": 4, "w": 18, "x": 6, "y": 3 },
+ "id": 14,
+ "options": { "legend": { "calcs": ["lastNotNull"], "displayMode": "table", "placement": "right", "showLegend": true, "sortBy": "Last *", "sortDesc": true }, "tooltip": { "mode": "multi", "sort": "desc" } },
+ "targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (reason) (kube_pod_container_status_waiting_reason{namespace=~\"$namespace\"})", "legendFormat": "{{reason}}", "refId": "A" }],
+ "title": "Pods by Waiting Reason",
+ "type": "timeseries"
+ },
+ {
+ "datasource": { "type": "prometheus", "uid": "prometheus" },
+ "fieldConfig": {
+ "defaults": {
+ "color": { "mode": "palette-classic" },
+ "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "bars", "fillOpacity": 80, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "normal" }, "thresholdsStyle": { "mode": "off" } },
+ "mappings": [],
+ "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] },
+ "unit": "short"
+ }
+ },
+ "gridPos": { "h": 8, "w": 12, "x": 0, "y": 7 },
  "id": 9,
  "options": { "legend": { "calcs": ["lastNotNull"], "displayMode": "table", "placement": "right", "showLegend": true, "sortBy": "Last *", "sortDesc": true }, "tooltip": { "mode": "multi", "sort": "desc" } },
  "targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count by (namespace) (kube_pod_info{namespace=~\"$namespace\"})", "legendFormat": "{{namespace}}", "refId": "A" }],
@@ -136,7 +170,7 @@ data:
  "unit": "bytes"
  }
  },
- "gridPos": { "h": 8, "w": 12, "x": 12, "y": 3 },
+ "gridPos": { "h": 8, "w": 12, "x": 12, "y": 7 },
  "id": 10,
  "options": { "legend": { "calcs": ["lastNotNull"], "displayMode": "table", "placement": "right", "showLegend": true, "sortBy": "Last *", "sortDesc": true }, "tooltip": { "mode": "multi", "sort": "desc" } },
  "targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (namespace) (kube_pod_container_resource_requests{resource=\"memory\",namespace=~\"$namespace\"})", "legendFormat": "{{namespace}}", "refId": "A" }],
@@ -159,7 +193,7 @@ data:
  { "matcher": { "id": "byName", "options": "CPU Limits" }, "properties": [{ "id": "unit", "value": "short" }] }
  ]
  },
- "gridPos": { "h": 8, "w": 24, "x": 0, "y": 11 },
+ "gridPos": { "h": 8, "w": 24, "x": 0, "y": 15 },
  "id": 11,
  "options": { "cellHeight": "sm", "footer": { "countRows": false, "fields": "", "reducer": ["sum"], "show": false }, "showHeader": true, "sortBy": [{ "desc": true, "displayName": "Pods" }] },
  "targets": [
@@ -178,7 +212,7 @@ data:
  },
  {
  "datasource": { "type": "loki", "uid": "loki" },
- "gridPos": { "h": 8, "w": 24, "x": 0, "y": 19 },
+ "gridPos": { "h": 8, "w": 24, "x": 0, "y": 23 },
  "id": 12,
  "options": { "dedupStrategy": "none", "enableLogDetails": true, "prettifyLogMessage": false, "showCommonLabels": false, "showLabels": true, "showTime": true, "sortOrder": "Descending", "wrapLogMessage": false },
  "targets": [{ "datasource": { "type": "loki", "uid": "loki" }, "expr": "{namespace=~\"$namespace\"}", "refId": "A" }],
diff --git a/argocd/manifests/kiwix/cronjob-zim-watcher.yaml b/argocd/manifests/kiwix/cronjob-zim-watcher.yaml
index 3532676..5de0990 100644
--- a/argocd/manifests/kiwix/cronjob-zim-watcher.yaml
+++ b/argocd/manifests/kiwix/cronjob-zim-watcher.yaml
@@ -9,6 +9,7 @@ spec:
  concurrencyPolicy: Forbid
  jobTemplate:
  spec:
+ ttlSecondsAfterFinished: 345600 # Auto-delete after 4 days
  template:
  spec:
  serviceAccountName: zim-watcher