The count-only stat wasn't actionable. New table shows pod name, container, restart count, and memory limit for each OOMKilled container. Waiting reason panel narrowed to make room. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
288 lines
21 KiB
YAML
288 lines
21 KiB
YAML
apiVersion: v1
|
|
kind: ConfigMap
|
|
metadata:
|
|
name: grafana-dashboard-kubernetes
|
|
namespace: monitoring
|
|
labels:
|
|
grafana_dashboard: "1"
|
|
data:
|
|
kubernetes.json: |
|
|
{
|
|
"annotations": { "list": [] },
|
|
"editable": true,
|
|
"fiscalYearStartMonth": 0,
|
|
"graphTooltip": 0,
|
|
"id": null,
|
|
"links": [],
|
|
"panels": [
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
|
"fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] } } },
|
|
"gridPos": { "h": 3, "w": 4, "x": 0, "y": 0 },
|
|
"id": 1,
|
|
"options": { "colorMode": "value", "graphMode": "none", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value" },
|
|
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(kube_pod_info{cluster=~\"$cluster\", namespace=~\"$namespace\"})", "refId": "A" }],
|
|
"title": "Pods",
|
|
"type": "stat"
|
|
},
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
|
"fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] } } },
|
|
"gridPos": { "h": 3, "w": 4, "x": 4, "y": 0 },
|
|
"id": 2,
|
|
"options": { "colorMode": "value", "graphMode": "none", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value" },
|
|
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(kube_deployment_created{cluster=~\"$cluster\", namespace=~\"$namespace\"})", "refId": "A" }],
|
|
"title": "Deployments",
|
|
"type": "stat"
|
|
},
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
|
"fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] } } },
|
|
"gridPos": { "h": 3, "w": 4, "x": 8, "y": 0 },
|
|
"id": 3,
|
|
"options": { "colorMode": "value", "graphMode": "none", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value" },
|
|
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(kube_statefulset_created{cluster=~\"$cluster\", namespace=~\"$namespace\"})", "refId": "A" }],
|
|
"title": "StatefulSets",
|
|
"type": "stat"
|
|
},
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
|
"fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] } } },
|
|
"gridPos": { "h": 3, "w": 4, "x": 12, "y": 0 },
|
|
"id": 4,
|
|
"options": { "colorMode": "value", "graphMode": "none", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value" },
|
|
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(kube_namespace_created{cluster=~\"$cluster\"})", "refId": "A" }],
|
|
"title": "Namespaces",
|
|
"type": "stat"
|
|
},
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
|
"fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "unit": "bytes", "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] } } },
|
|
"gridPos": { "h": 3, "w": 4, "x": 16, "y": 0 },
|
|
"id": 5,
|
|
"options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value" },
|
|
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum(kube_pod_container_resource_requests{resource=\"memory\", cluster=~\"$cluster\", namespace=~\"$namespace\"})", "refId": "A" }],
|
|
"title": "Memory Requests",
|
|
"type": "stat"
|
|
},
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
|
"fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "unit": "short", "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] } } },
|
|
"gridPos": { "h": 3, "w": 4, "x": 20, "y": 0 },
|
|
"id": 6,
|
|
"options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value" },
|
|
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum(kube_pod_container_resource_requests{resource=\"cpu\", cluster=~\"$cluster\", namespace=~\"$namespace\"})", "refId": "A" }],
|
|
"title": "CPU Requests (cores)",
|
|
"type": "stat"
|
|
},
|
|
{
  "datasource": { "type": "prometheus", "uid": "prometheus" },
  "fieldConfig": {
    "defaults": {
      "color": { "mode": "thresholds" },
      "mappings": [],
      "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }
    }
  },
  "gridPos": { "h": 4, "w": 4, "x": 0, "y": 3 },
  "id": 7,
  "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value" },
  "targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(count by (cluster, namespace, pod) (kube_pod_container_status_waiting_reason{cluster=~\"$cluster\", namespace=~\"$namespace\", reason=~\"ImagePullBackOff|ErrImagePull|CrashLoopBackOff|CreateContainerError|RunContainerError\"} == 1)) or vector(0)", "refId": "A" }],
  "title": "Unhealthy Pods",
  "type": "stat"
},
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"color": { "mode": "thresholds" },
|
|
"custom": { "align": "auto", "cellOptions": { "type": "auto" }, "inspect": false },
|
|
"mappings": [],
|
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }
|
|
},
|
|
"overrides": [
|
|
{ "matcher": { "id": "byName", "options": "Memory Limit" }, "properties": [{ "id": "unit", "value": "bytes" }] },
|
|
{ "matcher": { "id": "byName", "options": "Restarts" }, "properties": [{ "id": "custom.cellOptions", "value": { "type": "color-text" } }, { "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "yellow", "value": null }, { "color": "red", "value": 5 }] } }] }
|
|
]
|
|
},
|
|
"gridPos": { "h": 4, "w": 12, "x": 4, "y": 3 },
|
|
"id": 13,
|
|
"options": { "cellHeight": "sm", "footer": { "countRows": false, "fields": "", "reducer": ["sum"], "show": false }, "showHeader": true, "sortBy": [{ "desc": true, "displayName": "Restarts" }] },
|
|
"targets": [
|
|
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\", cluster=~\"$cluster\", namespace=~\"$namespace\"} == 1", "format": "table", "instant": true, "refId": "oom" },
|
|
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "kube_pod_container_status_restarts_total{cluster=~\"$cluster\", namespace=~\"$namespace\"} * on(namespace, pod, container, cluster) (kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\", cluster=~\"$cluster\", namespace=~\"$namespace\"} == 1)", "format": "table", "instant": true, "refId": "restarts" },
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "kube_pod_container_resource_limits{resource=\"memory\", cluster=~\"$cluster\", namespace=~\"$namespace\"} * on(namespace, pod, container, cluster) (kube_pod_container_status_last_terminated_reason{reason=\"OOMKilled\", cluster=~\"$cluster\", namespace=~\"$namespace\"} == 1)", "format": "table", "instant": true, "refId": "memlim" }
|
|
],
|
|
"title": "OOMKilled Containers",
|
|
"transformations": [
|
|
{ "id": "merge", "options": {} },
{ "id": "organize", "options": { "excludeByName": { "Time": true, "Value #oom": true, "__name__": true, "cluster": true, "instance": true, "job": true, "reason": true, "resource": true, "uid": true, "unit": true }, "renameByName": { "namespace": "Namespace", "pod": "Pod", "container": "Container", "Value #restarts": "Restarts", "Value #memlim": "Memory Limit" } } }
|
|
],
|
|
"type": "table"
|
|
},
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"color": { "mode": "palette-classic" },
|
|
"custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "bars", "fillOpacity": 80, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "normal" }, "thresholdsStyle": { "mode": "off" } },
|
|
"mappings": [],
|
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] },
|
|
"unit": "short"
|
|
}
|
|
},
|
|
"gridPos": { "h": 4, "w": 8, "x": 16, "y": 3 },
|
|
"id": 8,
|
|
"options": { "legend": { "calcs": ["lastNotNull"], "displayMode": "table", "placement": "right", "showLegend": true, "sortBy": "Last *", "sortDesc": true }, "tooltip": { "mode": "multi", "sort": "desc" } },
|
|
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (reason) (kube_pod_container_status_waiting_reason{cluster=~\"$cluster\", namespace=~\"$namespace\"})", "legendFormat": "{{reason}}", "refId": "A" }],
|
|
"title": "Pods by Waiting Reason",
|
|
"type": "timeseries"
|
|
},
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"color": { "mode": "palette-classic" },
|
|
"custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "bars", "fillOpacity": 80, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "normal" }, "thresholdsStyle": { "mode": "off" } },
|
|
"mappings": [],
|
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] },
|
|
"unit": "short"
|
|
}
|
|
},
|
|
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 7 },
|
|
"id": 9,
|
|
"options": { "legend": { "calcs": ["lastNotNull"], "displayMode": "table", "placement": "right", "showLegend": true, "sortBy": "Last *", "sortDesc": true }, "tooltip": { "mode": "multi", "sort": "desc" } },
|
|
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count by (namespace) (kube_pod_info{cluster=~\"$cluster\", namespace=~\"$namespace\"})", "legendFormat": "{{namespace}}", "refId": "A" }],
|
|
"title": "Pods by Namespace",
|
|
"type": "timeseries"
|
|
},
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"color": { "mode": "palette-classic" },
|
|
"custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "bars", "fillOpacity": 80, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "normal" }, "thresholdsStyle": { "mode": "off" } },
|
|
"mappings": [],
|
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] },
|
|
"unit": "bytes"
|
|
}
|
|
},
|
|
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 7 },
|
|
"id": 10,
|
|
"options": { "legend": { "calcs": ["lastNotNull"], "displayMode": "table", "placement": "right", "showLegend": true, "sortBy": "Last *", "sortDesc": true }, "tooltip": { "mode": "multi", "sort": "desc" } },
|
|
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (namespace) (kube_pod_container_resource_requests{resource=\"memory\", cluster=~\"$cluster\", namespace=~\"$namespace\"})", "legendFormat": "{{namespace}}", "refId": "A" }],
|
|
"title": "Memory Requests by Namespace",
|
|
"type": "timeseries"
|
|
},
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"color": { "mode": "palette-classic" },
|
|
"custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "bars", "fillOpacity": 80, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "normal" }, "thresholdsStyle": { "mode": "off" } },
|
|
"mappings": [],
|
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] },
|
|
"unit": "short"
|
|
}
|
|
},
|
|
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 15 },
|
|
"id": 14,
|
|
"options": { "legend": { "calcs": ["lastNotNull"], "displayMode": "table", "placement": "right", "showLegend": true, "sortBy": "Last *", "sortDesc": true }, "tooltip": { "mode": "multi", "sort": "desc" } },
|
|
"targets": [
|
|
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "round(increase(kube_pod_container_status_restarts_total{cluster=~\"$cluster\", namespace=~\"$namespace\"}[$__rate_interval])) > 0", "legendFormat": "{{namespace}}/{{pod}}/{{container}}", "refId": "A" }
|
|
],
|
|
"title": "Container Restarts",
|
|
"type": "timeseries"
|
|
},
|
|
{
|
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"color": { "mode": "thresholds" },
|
|
"custom": { "align": "auto", "cellOptions": { "type": "auto" }, "inspect": false },
|
|
"mappings": [],
|
|
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }
|
|
},
|
|
"overrides": [
|
|
{ "matcher": { "id": "byName", "options": "Memory Requests" }, "properties": [{ "id": "unit", "value": "bytes" }] },
|
|
{ "matcher": { "id": "byName", "options": "Memory Limits" }, "properties": [{ "id": "unit", "value": "bytes" }] },
|
|
{ "matcher": { "id": "byName", "options": "CPU Requests" }, "properties": [{ "id": "unit", "value": "short" }] },
|
|
{ "matcher": { "id": "byName", "options": "CPU Limits" }, "properties": [{ "id": "unit", "value": "short" }] }
|
|
]
|
|
},
|
|
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 23 },
|
|
"id": 11,
|
|
"options": { "cellHeight": "sm", "footer": { "countRows": false, "fields": "", "reducer": ["sum"], "show": false }, "showHeader": true, "sortBy": [{ "desc": true, "displayName": "Pods" }] },
|
|
"targets": [
|
|
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count by (namespace) (kube_pod_info{cluster=~\"$cluster\", namespace=~\"$namespace\"})", "format": "table", "instant": true, "refId": "pods" },
|
|
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (namespace) (kube_pod_container_resource_requests{resource=\"memory\", cluster=~\"$cluster\", namespace=~\"$namespace\"})", "format": "table", "instant": true, "refId": "mem_req" },
|
|
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (namespace) (kube_pod_container_resource_limits{resource=\"memory\", cluster=~\"$cluster\", namespace=~\"$namespace\"})", "format": "table", "instant": true, "refId": "mem_lim" },
|
|
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (namespace) (kube_pod_container_resource_requests{resource=\"cpu\", cluster=~\"$cluster\", namespace=~\"$namespace\"})", "format": "table", "instant": true, "refId": "cpu_req" },
|
|
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (namespace) (kube_pod_container_resource_limits{resource=\"cpu\", cluster=~\"$cluster\", namespace=~\"$namespace\"})", "format": "table", "instant": true, "refId": "cpu_lim" }
|
|
],
|
|
"title": "Namespace Resource Summary",
|
|
"transformations": [
|
|
{ "id": "seriesToColumns", "options": { "byField": "namespace" } },
|
|
{ "id": "organize", "options": { "excludeByName": { "Time": true, "Time 1": true, "Time 2": true, "Time 3": true, "Time 4": true, "Time 5": true }, "renameByName": { "Value #pods": "Pods", "Value #mem_req": "Memory Requests", "Value #mem_lim": "Memory Limits", "Value #cpu_req": "CPU Requests", "Value #cpu_lim": "CPU Limits", "namespace": "Namespace" } } }
|
|
],
|
|
"type": "table"
|
|
},
|
|
{
|
|
"datasource": { "type": "loki", "uid": "loki" },
|
|
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 31 },
|
|
"id": 12,
|
|
"options": { "dedupStrategy": "none", "enableLogDetails": true, "prettifyLogMessage": false, "showCommonLabels": false, "showLabels": true, "showTime": true, "sortOrder": "Descending", "wrapLogMessage": false },
|
|
"targets": [{ "datasource": { "type": "loki", "uid": "loki" }, "expr": "{cluster=~\"$cluster\", namespace=~\"$namespace\"}", "refId": "A" }],
|
|
"title": "Pod Logs",
|
|
"type": "logs"
|
|
}
|
|
],
|
|
"refresh": "30s",
|
|
"schemaVersion": 38,
|
|
"tags": ["kubernetes", "k8s", "multi-cluster"],
|
|
"templating": {
|
|
"list": [
|
|
{
|
|
"current": { "selected": true, "text": "All", "value": "$__all" },
|
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
|
"definition": "label_values(kube_namespace_created, cluster)",
|
|
"hide": 0,
|
|
"includeAll": true,
|
|
"label": "Cluster",
|
|
"multi": true,
|
|
"name": "cluster",
|
|
"options": [],
|
|
"query": { "query": "label_values(kube_namespace_created, cluster)", "refId": "StandardVariableQuery" },
|
|
"refresh": 2,
|
|
"regex": "",
|
|
"skipUrlSync": false,
|
|
"sort": 1,
|
|
"type": "query"
|
|
},
|
|
{
|
|
"current": { "selected": true, "text": "All", "value": "$__all" },
|
|
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
|
"definition": "label_values(kube_namespace_created{cluster=~\"$cluster\"}, namespace)",
|
|
"hide": 0,
|
|
"includeAll": true,
|
|
"label": "Namespace",
|
|
"multi": true,
|
|
"name": "namespace",
|
|
"options": [],
|
|
"query": { "query": "label_values(kube_namespace_created{cluster=~\"$cluster\"}, namespace)", "refId": "StandardVariableQuery" },
|
|
"refresh": 2,
|
|
"regex": "",
|
|
"skipUrlSync": false,
|
|
"sort": 1,
|
|
"type": "query"
|
|
}
|
|
]
|
|
},
|
|
"time": { "from": "now-6h", "to": "now" },
|
|
"timepicker": {},
|
|
"timezone": "browser",
|
|
"title": "Kubernetes Clusters",
|
|
"uid": "kubernetes",
|
|
"version": 1,
|
|
"weekStart": ""
|
|
}
|