Add multi-cluster observability with ringtail metrics and dashboards (#270)

## Summary
- Add `cluster` label (indri/ringtail) to all Prometheus scrape jobs, Alloy k8s metrics/logs, and Alloy host metrics/logs
- Deploy kube-state-metrics on ringtail's k3s cluster (ArgoCD app + manifests)
- Deploy Alloy on ringtail to collect host metrics, kube-state-metrics, and pod logs, remote-writing metrics to indri's Prometheus and pushing logs to indri's Loki
- Replace single-cluster "Minikube Kubernetes" and "K8s Services Health" dashboards with:
  - **Kubernetes Clusters** dashboard — multi-cluster with `cluster` and `namespace` template variables
  - **Ringtail (k3s)** dashboard — dedicated ringtail view with GPU usage panels

## Deployment and Testing
1. Sync `apps` on indri ArgoCD to pick up new app definitions (`kube-state-metrics-ringtail`, `alloy-ringtail`)
2. Sync `prometheus` → verify `cluster` label on scraped metrics
3. Sync `alloy-k8s` → verify `cluster=indri` on remote-written metrics and logs
4. Run `mise run provision-indri -- --tags alloy` → verify `cluster=indri` on host Alloy metrics/logs
5. Sync `kube-state-metrics-ringtail` → verify pods running on ringtail
6. Sync `alloy-ringtail` → verify pods running, check Prometheus for `kube_pod_info{cluster="ringtail"}`
7. Sync `grafana-config` → verify dashboards appear, cluster variable populates both values
8. Check Loki for `{cluster="ringtail"}` logs from ringtail pods

## Notes
- Alloy on ringtail uses `insecure_skip_verify=true` for TLS to Prometheus/Loki (Tailscale-managed certs not in container trust store) — tighten later
- DNS resolution for `*.tail8d86e.ts.net` from ringtail pods depends on CoreDNS inheriting host's MagicDNS resolver; may need CoreDNS forwarding rules if pods can't resolve
- The old services dashboard (blackbox probes) is removed — those probes are still running in alloy-k8s and the data is still in Prometheus, just not in a dedicated dashboard

Reviewed-on: https://forge.ops.eblu.me/eblume/blumeops/pulls/270
This commit is contained in:
Erich Blume 2026-02-25 22:01:00 -08:00
commit 03d71544ec
19 changed files with 910 additions and 217 deletions

View file

@ -29,6 +29,11 @@ prometheus.relabel "instance" {
target_label = "instance"
replacement = "{{ alloy_instance_label }}"
}
rule {
target_label = "cluster"
replacement = "indri"
}
}
// Push metrics to Prometheus via remote_write
@ -110,6 +115,11 @@ loki.relabel "add_host" {
target_label = "host"
replacement = "{{ alloy_instance_label }}"
}
rule {
target_label = "cluster"
replacement = "indri"
}
}
// Write logs to Loki

View file

@ -0,0 +1,17 @@
# ArgoCD Application that deploys the Alloy manifests to the ringtail cluster.
# No automated sync policy is set here, so syncs are triggered explicitly.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: alloy-ringtail
  namespace: argocd
spec:
  project: default
  # Where the workload lands: ringtail's API server, "alloy" namespace.
  destination:
    server: https://ringtail.tail8d86e.ts.net:6443
    namespace: alloy
  # Where the manifests come from.
  source:
    repoURL: ssh://forgejo@forge.ops.eblu.me:2222/eblume/blumeops.git
    targetRevision: main
    path: argocd/manifests/alloy-ringtail
  syncPolicy:
    syncOptions:
      # Create the destination namespace on first sync if it does not exist.
      - CreateNamespace=true

View file

@ -0,0 +1,17 @@
# ArgoCD Application that deploys kube-state-metrics to the ringtail cluster.
# No automated sync policy is set here, so syncs are triggered explicitly.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: kube-state-metrics-ringtail
  namespace: argocd
spec:
  project: default
  # Where the workload lands: ringtail's API server, "monitoring" namespace.
  destination:
    server: https://ringtail.tail8d86e.ts.net:6443
    namespace: monitoring
  # Where the manifests come from.
  source:
    repoURL: ssh://forgejo@forge.ops.eblu.me:2222/eblume/blumeops.git
    targetRevision: main
    path: argocd/manifests/kube-state-metrics-ringtail
  syncPolicy:
    syncOptions:
      # Create the destination namespace on first sync if it does not exist.
      - CreateNamespace=true

View file

@ -108,6 +108,11 @@ loki.process "pods" {
repository = "",
}
}
// Add cluster label for multi-cluster identification
stage.static_labels {
values = { cluster = "indri" }
}
}
// Write logs to Loki
@ -163,6 +168,8 @@ prometheus.scrape "blackbox" {
// Push metrics to Prometheus
prometheus.remote_write "prometheus" {
external_labels = { cluster = "indri" }
endpoint {
url = "http://prometheus.monitoring.svc.cluster.local:9090/api/v1/write"
}

View file

@ -0,0 +1,165 @@
// Alloy ringtail configuration - collects host metrics, pod logs, and kube-state-metrics
// Remote-writes metrics to indri Prometheus, logs to indri Loki
// ============== HOST METRICS ==============
// Node-level exporter for the Linux host. The paths below point at the
// host's /, /proc and /sys as mounted into the container under /host
// (see the hostPath volumes in the Alloy DaemonSet).
prometheus.exporter.unix "system" {
  rootfs_path = "/host/root"
  procfs_path = "/host/proc"
  sysfs_path  = "/host/sys"
}
// Scrape the host exporter every 15s. Samples are handed to the "instance"
// relabel stage, which rewrites the instance label before shipping.
prometheus.scrape "system" {
  scrape_interval = "15s"
  targets         = prometheus.exporter.unix.system.targets
  forward_to      = [prometheus.relabel.instance.receiver]
}
// Stamp every host metric with a fixed instance name, then pass it on to
// remote_write. With no source_labels the rule unconditionally sets the
// target label to the replacement value.
prometheus.relabel "instance" {
  forward_to = [prometheus.remote_write.prometheus.receiver]

  rule {
    action       = "replace" // the default action, spelled out for clarity
    target_label = "instance"
    replacement  = "ringtail"
  }
}
// ============== KUBE-STATE-METRICS SCRAPE ==============
// Scrape the in-cluster kube-state-metrics Service every 15s. These samples
// go straight to remote_write and do not pass through the "instance"
// relabel stage used for host metrics.
// NOTE(review): this config runs in a DaemonSet, so each node's Alloy pod
// scrapes this same target; fine for a single-node cluster, duplicates
// otherwise — confirm ringtail stays single-node.
prometheus.scrape "kube_state_metrics" {
  scrape_interval = "15s"
  forward_to      = [prometheus.remote_write.prometheus.receiver]
  targets         = [
    {"__address__" = "kube-state-metrics.monitoring.svc.cluster.local:8080"},
  ]
}
// Ship all metrics to the Prometheus instance on indri.
// Every series sent from here is tagged cluster="ringtail" so it can be told
// apart from indri's own series.
prometheus.remote_write "prometheus" {
  external_labels = {
    cluster = "ringtail",
  }

  endpoint {
    url = "https://prometheus.tail8d86e.ts.net/api/v1/write"

    // Certificate verification is disabled for this endpoint.
    // TODO: tighten once the serving certificate is trusted in the container.
    tls_config {
      insecure_skip_verify = true
    }
  }
}
// ============== K8S POD LOG DISCOVERY ==============
// Enumerate pods via the Kubernetes API. No namespace or selector filter is
// configured here, so the result is not narrowed at discovery time.
discovery.kubernetes "pods" {
  role = "pod"
}
// Turn discovered pod targets into the label set attached to each log stream.
// Rules run top to bottom; later rules see the labels written by earlier ones.
discovery.relabel "pods" {
  targets = discovery.kubernetes.pods.targets

  // Keep only running pods: drop targets in any other phase.
  rule {
    source_labels = ["__meta_kubernetes_pod_phase"]
    regex         = "Pending|Succeeded|Failed|Unknown"
    action        = "drop"
  }

  // Set namespace label
  rule {
    source_labels = ["__meta_kubernetes_namespace"]
    target_label  = "namespace"
  }

  // Set pod name label
  rule {
    source_labels = ["__meta_kubernetes_pod_name"]
    target_label  = "pod"
  }

  // Set container name label
  rule {
    source_labels = ["__meta_kubernetes_pod_container_name"]
    target_label  = "container"
  }

  // Set app label from the pod's `app` label
  rule {
    source_labels = ["__meta_kubernetes_pod_label_app"]
    target_label  = "app"
  }

  // Use app.kubernetes.io/name for the app label whenever it is non-empty.
  // NOTE: the regex only requires a non-empty value, so this rule also
  // overrides an `app` value set by the rule above; it is not limited to
  // pods that lack an `app` label.
  rule {
    source_labels = ["__meta_kubernetes_pod_label_app_kubernetes_io_name"]
    target_label  = "app"
    regex         = "(.+)"
    action        = "replace"
  }

  // Set node name
  rule {
    source_labels = ["__meta_kubernetes_pod_node_name"]
    target_label  = "node"
  }

  // Build the log path glob for the pod container.
  // The two source labels are joined with "/" into one value
  // ("<pod uid>/<container>"), and the default regex captures that whole
  // value as $1 — there is no second capture group, so the replacement must
  // not reference $2 (it would expand to an empty string and leave a stray
  // "//" in the path).
  // NOTE(review): __path__ is a file-tailing convention; presumably
  // loki.source.kubernetes reads logs through the API and ignores it —
  // confirm whether this rule is still needed.
  rule {
    source_labels = ["__meta_kubernetes_pod_uid", "__meta_kubernetes_pod_container_name"]
    target_label  = "__path__"
    separator     = "/"
    replacement   = "/var/log/pods/*$1/*.log"
  }
}
// Tail container logs for every relabelled pod target and feed each line
// into the processing pipeline below.
loki.source.kubernetes "pods" {
  forward_to = [loki.process.pods.receiver]
  targets    = discovery.relabel.pods.output
}
// Log pipeline: best-effort JSON parsing, label promotion, cluster tagging.
// Stages execute in the order written.
loki.process "pods" {
  forward_to = [loki.write.loki.receiver]

  // 1. Attempt to parse each line as JSON and pull out common fields.
  stage.json {
    expressions = {
      time    = "time",
      level   = "level",
      caller  = "caller",
      msg     = "msg",
      message = "message",
    }
  }

  // 2. Lines that are not JSON are expected; remove the error labels the
  //    failed parse leaves behind.
  stage.label_drop {
    values = ["__error__", "__error_details__"]
  }

  // 3. Promote selected extracted fields to stream labels. An empty string
  //    means "use the extracted value with the same name".
  // NOTE(review): `caller` may be high-cardinality as a label — confirm this
  // is intended.
  stage.labels {
    values = {
      caller = "",
      level  = "",
    }
  }

  // 4. Tag every stream with its cluster for multi-cluster identification.
  stage.static_labels {
    values = {
      cluster = "ringtail",
    }
  }
}
// Push processed log streams to the Loki instance on indri.
loki.write "loki" {
  endpoint {
    url = "https://loki.tail8d86e.ts.net/loki/api/v1/push"

    // Certificate verification is disabled for this endpoint.
    // TODO: tighten once the serving certificate is trusted in the container.
    tls_config {
      insecure_skip_verify = true
    }
  }
}

View file

@ -0,0 +1,86 @@
# Alloy agent for the ringtail cluster, one pod per node.
# The container gets read-only views of the host's /proc, /sys, / and /var/log
# plus a writable scratch volume for Alloy's own state.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: alloy
  namespace: alloy
  labels:
    app: alloy
spec:
  selector:
    matchLabels:
      app: alloy
  template:
    metadata:
      labels:
        app: alloy
    spec:
      serviceAccountName: alloy
      securityContext:
        fsGroup: 473  # alloy user group
      # Tolerate every taint so the agent is scheduled on all nodes.
      tolerations:
        - operator: Exists
      containers:
        - name: alloy
          # Untagged on purpose: the tag is set by the kustomization's
          # `images` override.
          image: grafana/alloy
          args:
            - run
            - --server.http.listen-addr=0.0.0.0:12345
            - --storage.path=/var/lib/alloy/data
            - /etc/alloy/config.alloy
          ports:
            - name: http
              containerPort: 12345
          env:
            # Expose the node name to the process as HOSTNAME.
            - name: HOSTNAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
          resources:
            requests:
              cpu: 50m
              memory: 128Mi
            limits:
              cpu: 500m
              memory: 512Mi
          # Locked-down container: no privilege escalation, no capabilities,
          # and a read-only root filesystem (writes go to the data volume).
          securityContext:
            allowPrivilegeEscalation: false
            readOnlyRootFilesystem: true
            capabilities:
              drop:
                - ALL
          volumeMounts:
            - name: config
              mountPath: /etc/alloy
            - name: data
              mountPath: /var/lib/alloy/data
            - name: varlog
              mountPath: /var/log
              readOnly: true
            - name: proc
              mountPath: /host/proc
              readOnly: true
            - name: sys
              mountPath: /host/sys
              readOnly: true
            - name: root
              mountPath: /host/root
              mountPropagation: HostToContainer
              readOnly: true
      volumes:
        # Rendered Alloy configuration.
        - name: config
          configMap:
            name: alloy-config
        # Writable storage path for Alloy; not persisted across pod restarts.
        - name: data
          emptyDir: {}
        # Host filesystem views.
        - name: varlog
          hostPath:
            path: /var/log
        - name: proc
          hostPath:
            path: /proc
        - name: sys
          hostPath:
            path: /sys
        - name: root
          hostPath:
            path: /

View file

@ -0,0 +1,18 @@
# Kustomize entry point for the ringtail Alloy deployment.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

# Namespace applied to the resources listed below.
namespace: alloy

resources:
  - namespace.yaml
  - rbac.yaml
  - daemonset.yaml

# Pin the Alloy image tag here; the DaemonSet references the bare image name.
images:
  - name: grafana/alloy
    newTag: v1.13.1

# Generate the alloy-config ConfigMap from the Alloy configuration file.
configMapGenerator:
  - name: alloy-config
    files:
      - config.alloy

View file

@ -0,0 +1,4 @@
# Namespace that holds the Alloy ServiceAccount and DaemonSet.
apiVersion: v1
kind: Namespace
metadata:
  name: alloy

View file

@ -0,0 +1,35 @@
# Identity the Alloy pods run as (serviceAccountName in the DaemonSet).
apiVersion: v1
kind: ServiceAccount
metadata:
  name: alloy
  namespace: alloy
---
# Cluster-wide, read-only permissions for Alloy.
# NOTE(review): nodes/proxy, nodes/metrics and the /metrics URLs are granted
# here; confirm they are still required by the Alloy configuration in use.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: alloy
rules:
  # Core API objects: read and watch.
  - apiGroups:
      - ""
    resources:
      - nodes
      - nodes/proxy
      - nodes/metrics
      - services
      - endpoints
      - pods
      - pods/log
      - namespaces
    verbs:
      - get
      - list
      - watch
  # ConfigMaps: read individual objects only.
  - apiGroups:
      - ""
    resources:
      - configmaps
    verbs:
      - get
  # EndpointSlices: read and watch.
  - apiGroups:
      - discovery.k8s.io
    resources:
      - endpointslices
    verbs:
      - get
      - list
      - watch
  # Raw metrics endpoints.
  - nonResourceURLs:
      - /metrics
      - /metrics/cadvisor
    verbs:
      - get
---
# Grant the ClusterRole above to the alloy ServiceAccount.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: alloy
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: alloy
subjects:
  - kind: ServiceAccount
    name: alloy
    namespace: alloy

View file

@ -1,12 +1,12 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-dashboard-minikube
name: grafana-dashboard-kubernetes
namespace: monitoring
labels:
grafana_dashboard: "1"
data:
minikube.json: |
kubernetes.json: |
{
"annotations": { "list": [] },
"editable": true,
@ -15,95 +15,63 @@ data:
"id": null,
"links": [],
"panels": [
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [{ "options": { "0": { "color": "red", "text": "DOWN" }, "1": { "color": "green", "text": "UP" } }, "type": "value" }],
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] }
}
},
"gridPos": { "h": 3, "w": 3, "x": 0, "y": 0 },
"id": 1,
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value" },
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "minikube_up", "refId": "A" }],
"title": "Cluster",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [{ "options": { "0": { "color": "red", "text": "DOWN" }, "1": { "color": "green", "text": "UP" } }, "type": "value" }],
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] }
}
},
"gridPos": { "h": 3, "w": 3, "x": 3, "y": 0 },
"id": 2,
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value" },
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "minikube_apiserver_up", "refId": "A" }],
"title": "API Server",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] } } },
"gridPos": { "h": 3, "w": 3, "x": 6, "y": 0 },
"id": 3,
"gridPos": { "h": 3, "w": 4, "x": 0, "y": 0 },
"id": 1,
"options": { "colorMode": "value", "graphMode": "none", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value" },
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(kube_pod_info{namespace=~\"$namespace\"})", "refId": "A" }],
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(kube_pod_info{cluster=~\"$cluster\", namespace=~\"$namespace\"})", "refId": "A" }],
"title": "Pods",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] } } },
"gridPos": { "h": 3, "w": 3, "x": 9, "y": 0 },
"id": 4,
"gridPos": { "h": 3, "w": 4, "x": 4, "y": 0 },
"id": 2,
"options": { "colorMode": "value", "graphMode": "none", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value" },
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(kube_deployment_created{namespace=~\"$namespace\"})", "refId": "A" }],
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(kube_deployment_created{cluster=~\"$cluster\", namespace=~\"$namespace\"})", "refId": "A" }],
"title": "Deployments",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] } } },
"gridPos": { "h": 3, "w": 3, "x": 12, "y": 0 },
"id": 5,
"gridPos": { "h": 3, "w": 4, "x": 8, "y": 0 },
"id": 3,
"options": { "colorMode": "value", "graphMode": "none", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value" },
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(kube_statefulset_created{namespace=~\"$namespace\"})", "refId": "A" }],
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(kube_statefulset_created{cluster=~\"$cluster\", namespace=~\"$namespace\"})", "refId": "A" }],
"title": "StatefulSets",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] } } },
"gridPos": { "h": 3, "w": 3, "x": 15, "y": 0 },
"id": 6,
"gridPos": { "h": 3, "w": 4, "x": 12, "y": 0 },
"id": 4,
"options": { "colorMode": "value", "graphMode": "none", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value" },
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(kube_namespace_created)", "refId": "A" }],
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(kube_namespace_created{cluster=~\"$cluster\"})", "refId": "A" }],
"title": "Namespaces",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "unit": "bytes", "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] } } },
"gridPos": { "h": 3, "w": 3, "x": 18, "y": 0 },
"id": 7,
"gridPos": { "h": 3, "w": 4, "x": 16, "y": 0 },
"id": 5,
"options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value" },
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum(kube_pod_container_resource_requests{resource=\"memory\",namespace=~\"$namespace\"})", "refId": "A" }],
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum(kube_pod_container_resource_requests{resource=\"memory\", cluster=~\"$cluster\", namespace=~\"$namespace\"})", "refId": "A" }],
"title": "Memory Requests",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "unit": "short", "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] } } },
"gridPos": { "h": 3, "w": 3, "x": 21, "y": 0 },
"id": 8,
"gridPos": { "h": 3, "w": 4, "x": 20, "y": 0 },
"id": 6,
"options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value" },
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum(kube_pod_container_resource_requests{resource=\"cpu\",namespace=~\"$namespace\"})", "refId": "A" }],
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum(kube_pod_container_resource_requests{resource=\"cpu\", cluster=~\"$cluster\", namespace=~\"$namespace\"})", "refId": "A" }],
"title": "CPU Requests (cores)",
"type": "stat"
},
@ -117,9 +85,9 @@ data:
}
},
"gridPos": { "h": 4, "w": 6, "x": 0, "y": 3 },
"id": 13,
"id": 7,
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value" },
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(kube_pod_container_status_waiting_reason{namespace=~\"$namespace\", reason=~\"ImagePullBackOff|ErrImagePull|CrashLoopBackOff|CreateContainerError|RunContainerError\"}) or vector(0)", "refId": "A" }],
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(kube_pod_container_status_waiting_reason{cluster=~\"$cluster\", namespace=~\"$namespace\", reason=~\"ImagePullBackOff|ErrImagePull|CrashLoopBackOff|CreateContainerError|RunContainerError\"}) or vector(0)", "refId": "A" }],
"title": "Unhealthy Pods",
"type": "stat"
},
@ -135,9 +103,9 @@ data:
}
},
"gridPos": { "h": 4, "w": 18, "x": 6, "y": 3 },
"id": 14,
"id": 8,
"options": { "legend": { "calcs": ["lastNotNull"], "displayMode": "table", "placement": "right", "showLegend": true, "sortBy": "Last *", "sortDesc": true }, "tooltip": { "mode": "multi", "sort": "desc" } },
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (reason) (kube_pod_container_status_waiting_reason{namespace=~\"$namespace\"})", "legendFormat": "{{reason}}", "refId": "A" }],
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (reason) (kube_pod_container_status_waiting_reason{cluster=~\"$cluster\", namespace=~\"$namespace\"})", "legendFormat": "{{reason}}", "refId": "A" }],
"title": "Pods by Waiting Reason",
"type": "timeseries"
},
@ -155,7 +123,7 @@ data:
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 7 },
"id": 9,
"options": { "legend": { "calcs": ["lastNotNull"], "displayMode": "table", "placement": "right", "showLegend": true, "sortBy": "Last *", "sortDesc": true }, "tooltip": { "mode": "multi", "sort": "desc" } },
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count by (namespace) (kube_pod_info{namespace=~\"$namespace\"})", "legendFormat": "{{namespace}}", "refId": "A" }],
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count by (namespace) (kube_pod_info{cluster=~\"$cluster\", namespace=~\"$namespace\"})", "legendFormat": "{{namespace}}", "refId": "A" }],
"title": "Pods by Namespace",
"type": "timeseries"
},
@ -173,7 +141,7 @@ data:
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 7 },
"id": 10,
"options": { "legend": { "calcs": ["lastNotNull"], "displayMode": "table", "placement": "right", "showLegend": true, "sortBy": "Last *", "sortDesc": true }, "tooltip": { "mode": "multi", "sort": "desc" } },
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (namespace) (kube_pod_container_resource_requests{resource=\"memory\",namespace=~\"$namespace\"})", "legendFormat": "{{namespace}}", "refId": "A" }],
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (namespace) (kube_pod_container_resource_requests{resource=\"memory\", cluster=~\"$cluster\", namespace=~\"$namespace\"})", "legendFormat": "{{namespace}}", "refId": "A" }],
"title": "Memory Requests by Namespace",
"type": "timeseries"
},
@ -197,11 +165,11 @@ data:
"id": 11,
"options": { "cellHeight": "sm", "footer": { "countRows": false, "fields": "", "reducer": ["sum"], "show": false }, "showHeader": true, "sortBy": [{ "desc": true, "displayName": "Pods" }] },
"targets": [
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count by (namespace) (kube_pod_info{namespace=~\"$namespace\"})", "format": "table", "instant": true, "refId": "pods" },
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (namespace) (kube_pod_container_resource_requests{resource=\"memory\",namespace=~\"$namespace\"})", "format": "table", "instant": true, "refId": "mem_req" },
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (namespace) (kube_pod_container_resource_limits{resource=\"memory\",namespace=~\"$namespace\"})", "format": "table", "instant": true, "refId": "mem_lim" },
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (namespace) (kube_pod_container_resource_requests{resource=\"cpu\",namespace=~\"$namespace\"})", "format": "table", "instant": true, "refId": "cpu_req" },
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (namespace) (kube_pod_container_resource_limits{resource=\"cpu\",namespace=~\"$namespace\"})", "format": "table", "instant": true, "refId": "cpu_lim" }
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count by (namespace) (kube_pod_info{cluster=~\"$cluster\", namespace=~\"$namespace\"})", "format": "table", "instant": true, "refId": "pods" },
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (namespace) (kube_pod_container_resource_requests{resource=\"memory\", cluster=~\"$cluster\", namespace=~\"$namespace\"})", "format": "table", "instant": true, "refId": "mem_req" },
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (namespace) (kube_pod_container_resource_limits{resource=\"memory\", cluster=~\"$cluster\", namespace=~\"$namespace\"})", "format": "table", "instant": true, "refId": "mem_lim" },
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (namespace) (kube_pod_container_resource_requests{resource=\"cpu\", cluster=~\"$cluster\", namespace=~\"$namespace\"})", "format": "table", "instant": true, "refId": "cpu_req" },
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (namespace) (kube_pod_container_resource_limits{resource=\"cpu\", cluster=~\"$cluster\", namespace=~\"$namespace\"})", "format": "table", "instant": true, "refId": "cpu_lim" }
],
"title": "Namespace Resource Summary",
"transformations": [
@ -215,27 +183,44 @@ data:
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 23 },
"id": 12,
"options": { "dedupStrategy": "none", "enableLogDetails": true, "prettifyLogMessage": false, "showCommonLabels": false, "showLabels": true, "showTime": true, "sortOrder": "Descending", "wrapLogMessage": false },
"targets": [{ "datasource": { "type": "loki", "uid": "loki" }, "expr": "{namespace=~\"$namespace\"}", "refId": "A" }],
"targets": [{ "datasource": { "type": "loki", "uid": "loki" }, "expr": "{cluster=~\"$cluster\", namespace=~\"$namespace\"}", "refId": "A" }],
"title": "Pod Logs",
"type": "logs"
}
],
"refresh": "30s",
"schemaVersion": 38,
"tags": ["minikube", "kubernetes", "k8s"],
"tags": ["kubernetes", "k8s", "multi-cluster"],
"templating": {
"list": [
{
"current": { "selected": true, "text": "All", "value": "$__all" },
"datasource": { "type": "prometheus", "uid": "prometheus" },
"definition": "label_values(kube_namespace_created, namespace)",
"definition": "label_values(kube_namespace_created, cluster)",
"hide": 0,
"includeAll": true,
"label": "Cluster",
"multi": true,
"name": "cluster",
"options": [],
"query": { "query": "label_values(kube_namespace_created, cluster)", "refId": "StandardVariableQuery" },
"refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"type": "query"
},
{
"current": { "selected": true, "text": "All", "value": "$__all" },
"datasource": { "type": "prometheus", "uid": "prometheus" },
"definition": "label_values(kube_namespace_created{cluster=~\"$cluster\"}, namespace)",
"hide": 0,
"includeAll": true,
"label": "Namespace",
"multi": true,
"name": "namespace",
"options": [],
"query": { "query": "label_values(kube_namespace_created, namespace)", "refId": "StandardVariableQuery" },
"query": { "query": "label_values(kube_namespace_created{cluster=~\"$cluster\"}, namespace)", "refId": "StandardVariableQuery" },
"refresh": 2,
"regex": "",
"skipUrlSync": false,
@ -247,8 +232,8 @@ data:
"time": { "from": "now-6h", "to": "now" },
"timepicker": {},
"timezone": "browser",
"title": "Minikube Kubernetes",
"uid": "minikube",
"version": 2,
"title": "Kubernetes Clusters",
"uid": "kubernetes",
"version": 1,
"weekStart": ""
}

View file

@ -0,0 +1,314 @@
# ConfigMap "grafana-dashboard-ringtail" in the monitoring namespace.
# Carries a single data key, ringtail.json, whose value is the Grafana
# dashboard JSON for the ringtail host / k3s cluster (literal block scalar).
# NOTE(review): the grafana_dashboard: "1" label is presumably what a Grafana
# dashboard-discovery sidecar selects on — confirm against the Grafana deployment.
# The payload below is strict JSON: do not add comments inside it.
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-dashboard-ringtail
namespace: monitoring
labels:
grafana_dashboard: "1"
data:
ringtail.json: |
{
"annotations": { "list": [] },
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"id": null,
"links": [],
"panels": [
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
"id": 100,
"panels": [],
"title": "System Overview",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }, "unit": "dtdurations" } },
"gridPos": { "h": 4, "w": 4, "x": 0, "y": 1 },
"id": 1,
"options": { "colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" },
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "time() - node_boot_time_seconds{instance=\"ringtail\"}", "refId": "A" }],
"title": "Uptime",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] }, "unit": "decbytes" } },
"gridPos": { "h": 4, "w": 4, "x": 4, "y": 1 },
"id": 2,
"options": { "colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" },
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "node_memory_MemTotal_bytes{instance=\"ringtail\"}", "refId": "A" }],
"title": "Total Memory",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [{ "color": "purple", "value": null }] }, "unit": "short" } },
"gridPos": { "h": 4, "w": 4, "x": 8, "y": 1 },
"id": 3,
"options": { "colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" },
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(node_cpu_seconds_total{instance=\"ringtail\", mode=\"idle\"})", "refId": "A" }],
"title": "CPU Cores",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 4 }, { "color": "red", "value": 8 }] }, "unit": "short", "decimals": 2 } },
"gridPos": { "h": 4, "w": 4, "x": 12, "y": 1 },
"id": 4,
"options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" },
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "node_load1{instance=\"ringtail\"}", "refId": "A" }],
"title": "Load (1m)",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] } } },
"gridPos": { "h": 4, "w": 4, "x": 16, "y": 1 },
"id": 5,
"options": { "colorMode": "value", "graphMode": "none", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value" },
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(kube_pod_info{cluster=\"ringtail\"})", "refId": "A" }],
"title": "K8s Pods",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "unit": "percent", "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 80 }, { "color": "red", "value": 95 }] } } },
"gridPos": { "h": 4, "w": 4, "x": 20, "y": 1 },
"id": 6,
"options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value" },
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "frigate_gpu_usage_percent", "refId": "A" }],
"title": "GPU Usage %",
"type": "stat"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 },
"id": 101,
"panels": [],
"title": "CPU & Memory",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 30, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "smooth", "lineWidth": 2, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "normal" }, "thresholdsStyle": { "mode": "off" } },
"mappings": [],
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] },
"unit": "percentunit",
"max": 1
}
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 },
"id": 7,
"options": { "legend": { "calcs": ["mean", "lastNotNull"], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "desc" } },
"targets": [
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (mode) (rate(node_cpu_seconds_total{instance=\"ringtail\", mode!=\"idle\"}[5m])) / on() group_left count(node_cpu_seconds_total{instance=\"ringtail\", mode=\"idle\"})", "legendFormat": "{{mode}}", "refId": "A" }
],
"title": "CPU Usage by Mode",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 30, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "smooth", "lineWidth": 2, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } },
"mappings": [],
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] },
"unit": "bytes"
}
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 },
"id": 8,
"options": { "legend": { "calcs": ["mean", "lastNotNull"], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "desc" } },
"targets": [
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "node_memory_MemTotal_bytes{instance=\"ringtail\"} - node_memory_MemAvailable_bytes{instance=\"ringtail\"}", "legendFormat": "Used", "refId": "A" },
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "node_memory_MemAvailable_bytes{instance=\"ringtail\"}", "legendFormat": "Available", "refId": "B" },
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "node_memory_Cached_bytes{instance=\"ringtail\"}", "legendFormat": "Cached", "refId": "C" }
],
"title": "Memory Usage",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 },
"id": 102,
"panels": [],
"title": "Storage",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"custom": { "align": "auto", "cellOptions": { "type": "auto" }, "inspect": false },
"mappings": [],
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 0.8 }, { "color": "red", "value": 0.95 }] }
},
"overrides": [
{ "matcher": { "id": "byName", "options": "Size" }, "properties": [{ "id": "unit", "value": "bytes" }] },
{ "matcher": { "id": "byName", "options": "Available" }, "properties": [{ "id": "unit", "value": "bytes" }] },
{ "matcher": { "id": "byName", "options": "Used %" }, "properties": [{ "id": "unit", "value": "percentunit" }, { "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 0.8 }, { "color": "red", "value": 0.95 }] } }, { "id": "custom.cellOptions", "value": { "mode": "gradient", "type": "gauge" } }] }
]
},
"gridPos": { "h": 6, "w": 24, "x": 0, "y": 15 },
"id": 9,
"options": { "cellHeight": "sm", "footer": { "countRows": false, "fields": "", "reducer": ["sum"], "show": false }, "showHeader": true, "sortBy": [{ "desc": true, "displayName": "Size" }] },
"targets": [
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "node_filesystem_size_bytes{instance=\"ringtail\", fstype!~\"tmpfs|overlay|squashfs\"}", "format": "table", "instant": true, "refId": "size" },
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "node_filesystem_avail_bytes{instance=\"ringtail\", fstype!~\"tmpfs|overlay|squashfs\"}", "format": "table", "instant": true, "refId": "avail" },
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "1 - (node_filesystem_avail_bytes{instance=\"ringtail\", fstype!~\"tmpfs|overlay|squashfs\"} / node_filesystem_size_bytes{instance=\"ringtail\", fstype!~\"tmpfs|overlay|squashfs\"})", "format": "table", "instant": true, "refId": "pct" }
],
"title": "Filesystem Usage",
"transformations": [
{ "id": "seriesToColumns", "options": { "byField": "mountpoint" } },
{ "id": "organize", "options": { "excludeByName": { "Time": true, "Time 1": true, "Time 2": true, "Time 3": true, "__name__": true, "__name__ 1": true, "__name__ 2": true, "device": true, "device 1": true, "device 2": true, "fstype": true, "fstype 1": true, "fstype 2": true, "instance": true, "instance 1": true, "instance 2": true, "job": true, "job 1": true, "job 2": true, "cluster": true, "cluster 1": true, "cluster 2": true }, "renameByName": { "mountpoint": "Mount", "Value #size": "Size", "Value #avail": "Available", "Value #pct": "Used %" } } }
],
"type": "table"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 21 },
"id": 103,
"panels": [],
"title": "Network",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": { "axisBorderShow": false, "axisCenteredZero": true, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 20, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "smooth", "lineWidth": 2, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } },
"mappings": [],
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] },
"unit": "Bps"
}
},
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 22 },
"id": 10,
"options": { "legend": { "calcs": ["mean", "lastNotNull"], "displayMode": "table", "placement": "right", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "desc" } },
"targets": [
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "rate(node_network_receive_bytes_total{instance=\"ringtail\", device!~\"lo|veth.*|cali.*|flannel.*|cni.*\"}[5m])", "legendFormat": "{{device}} rx", "refId": "A" },
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "-rate(node_network_transmit_bytes_total{instance=\"ringtail\", device!~\"lo|veth.*|cali.*|flannel.*|cni.*\"}[5m])", "legendFormat": "{{device}} tx", "refId": "B" }
],
"title": "Network Traffic",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 30 },
"id": 104,
"panels": [],
"title": "GPU",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 2, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } },
"mappings": [],
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] },
"unit": "percent"
}
},
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 31 },
"id": 11,
"options": { "legend": { "calcs": ["mean", "lastNotNull", "max"], "displayMode": "table", "placement": "right", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "desc" } },
"targets": [
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "frigate_gpu_usage_percent", "legendFormat": "GPU Usage", "refId": "A" },
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "frigate_gpu_mem_usage_percent", "legendFormat": "GPU Memory", "refId": "B" }
],
"title": "GPU Overview",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 39 },
"id": 105,
"panels": [],
"title": "Kubernetes",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "bars", "fillOpacity": 80, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "normal" }, "thresholdsStyle": { "mode": "off" } },
"mappings": [],
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] },
"unit": "short"
}
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 40 },
"id": 12,
"options": { "legend": { "calcs": ["lastNotNull"], "displayMode": "table", "placement": "right", "showLegend": true, "sortBy": "Last *", "sortDesc": true }, "tooltip": { "mode": "multi", "sort": "desc" } },
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count by (namespace) (kube_pod_info{cluster=\"ringtail\"})", "legendFormat": "{{namespace}}", "refId": "A" }],
"title": "Pods by Namespace",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }
}
},
"gridPos": { "h": 4, "w": 6, "x": 12, "y": 40 },
"id": 13,
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value" },
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(kube_pod_container_status_waiting_reason{cluster=\"ringtail\", reason=~\"ImagePullBackOff|ErrImagePull|CrashLoopBackOff|CreateContainerError|RunContainerError\"}) or vector(0)", "refId": "A" }],
"title": "Unhealthy Pods",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] } } },
"gridPos": { "h": 4, "w": 6, "x": 18, "y": 40 },
"id": 14,
"options": { "colorMode": "value", "graphMode": "none", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value" },
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(kube_deployment_created{cluster=\"ringtail\"})", "refId": "A" }],
"title": "Deployments",
"type": "stat"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 48 },
"id": 106,
"panels": [],
"title": "Logs",
"type": "row"
},
{
"datasource": { "type": "loki", "uid": "loki" },
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 49 },
"id": 15,
"options": { "dedupStrategy": "none", "enableLogDetails": true, "prettifyLogMessage": false, "showCommonLabels": false, "showLabels": true, "showTime": true, "sortOrder": "Descending", "wrapLogMessage": false },
"targets": [{ "datasource": { "type": "loki", "uid": "loki" }, "expr": "{cluster=\"ringtail\"}", "refId": "A" }],
"title": "Pod Logs",
"type": "logs"
}
],
"refresh": "30s",
"schemaVersion": 38,
"tags": ["ringtail", "k3s", "gpu", "system"],
"templating": { "list": [] },
"time": { "from": "now-6h", "to": "now" },
"timepicker": {},
"timezone": "browser",
"title": "Ringtail",
"uid": "ringtail",
"version": 1,
"weekStart": ""
}

View file

@ -1,145 +0,0 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-dashboard-services
namespace: monitoring
labels:
grafana_dashboard: "1"
data:
services.json: |
{
"annotations": { "list": [] },
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": null,
"links": [],
"panels": [
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [
{ "options": { "0": { "color": "red", "text": "DOWN" }, "1": { "color": "green", "text": "UP" } }, "type": "value" }
],
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] },
"unit": "short"
},
"overrides": []
},
"gridPos": { "h": 6, "w": 24, "x": 0, "y": 0 },
"id": 1,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "center",
"orientation": "horizontal",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "value_and_name"
},
"pluginVersion": "11.0.0",
"targets": [
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "probe_success{job=\"integrations/blackbox/miniflux\"}", "legendFormat": "Miniflux", "refId": "A" },
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "probe_success{job=\"integrations/blackbox/kiwix\"}", "legendFormat": "Kiwix", "refId": "B" },
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "probe_success{job=\"integrations/blackbox/transmission\"}", "legendFormat": "Transmission", "refId": "C" },
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "probe_success{job=\"integrations/blackbox/devpi\"}", "legendFormat": "Devpi", "refId": "D" },
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "probe_success{job=\"integrations/blackbox/argocd\"}", "legendFormat": "ArgoCD", "refId": "E" }
],
"title": "Service Status",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "never",
"spanNulls": false,
"stacking": { "group": "A", "mode": "none" },
"thresholdsStyle": { "mode": "off" }
},
"mappings": [],
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] },
"unit": "s"
},
"overrides": []
},
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 6 },
"id": 2,
"options": {
"legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "right", "showLegend": true },
"tooltip": { "mode": "multi", "sort": "desc" }
},
"pluginVersion": "11.0.0",
"targets": [
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "probe_duration_seconds{job=\"integrations/blackbox/miniflux\"}", "legendFormat": "Miniflux", "refId": "A" },
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "probe_duration_seconds{job=\"integrations/blackbox/kiwix\"}", "legendFormat": "Kiwix", "refId": "B" },
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "probe_duration_seconds{job=\"integrations/blackbox/transmission\"}", "legendFormat": "Transmission", "refId": "C" },
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "probe_duration_seconds{job=\"integrations/blackbox/devpi\"}", "legendFormat": "Devpi", "refId": "D" },
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "probe_duration_seconds{job=\"integrations/blackbox/argocd\"}", "legendFormat": "ArgoCD", "refId": "E" }
],
"title": "Response Time",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "yellow", "value": 0.95 }, { "color": "green", "value": 0.99 }] },
"unit": "percentunit"
},
"overrides": []
},
"gridPos": { "h": 6, "w": 24, "x": 0, "y": 14 },
"id": 3,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "center",
"orientation": "horizontal",
"reduceOptions": { "calcs": ["mean"], "fields": "", "values": false },
"textMode": "value_and_name"
},
"pluginVersion": "11.0.0",
"targets": [
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "avg_over_time(probe_success{job=\"integrations/blackbox/miniflux\"}[$__range])", "legendFormat": "Miniflux", "refId": "A" },
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "avg_over_time(probe_success{job=\"integrations/blackbox/kiwix\"}[$__range])", "legendFormat": "Kiwix", "refId": "B" },
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "avg_over_time(probe_success{job=\"integrations/blackbox/transmission\"}[$__range])", "legendFormat": "Transmission", "refId": "C" },
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "avg_over_time(probe_success{job=\"integrations/blackbox/devpi\"}[$__range])", "legendFormat": "Devpi", "refId": "D" },
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "avg_over_time(probe_success{job=\"integrations/blackbox/argocd\"}[$__range])", "legendFormat": "ArgoCD", "refId": "E" }
],
"title": "Uptime (selected period)",
"type": "stat"
}
],
"refresh": "30s",
"schemaVersion": 38,
"tags": ["services", "health"],
"templating": { "list": [] },
"time": { "from": "now-24h", "to": "now" },
"timepicker": {},
"timezone": "browser",
"title": "K8s Services Health",
"uid": "k8s-services",
"version": 1,
"weekStart": ""
}

View file

@ -13,10 +13,10 @@ resources:
- dashboards/configmap-devpi.yaml
- dashboards/configmap-loki.yaml
- dashboards/configmap-macos.yaml
- dashboards/configmap-minikube.yaml
- dashboards/configmap-kubernetes.yaml
- dashboards/configmap-jellyfin.yaml
- dashboards/configmap-postgresql.yaml
- dashboards/configmap-services.yaml
- dashboards/configmap-ringtail.yaml
- dashboards/configmap-zot.yaml
- dashboards/configmap-frigate.yaml
- dashboards/configmap-cv-apm.yaml

View file

@ -0,0 +1,53 @@
# kube-state-metrics Deployment (single replica) in the monitoring namespace.
# The image reference deliberately carries no tag: the tag is pinned by the
# `images:` override in the kustomization that includes this manifest.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: kube-state-metrics
  namespace: monitoring
  labels:
    app: kube-state-metrics
spec:
  replicas: 1
  selector:
    matchLabels:
      app: kube-state-metrics
  template:
    metadata:
      labels:
        app: kube-state-metrics
    spec:
      serviceAccountName: kube-state-metrics
      containers:
        - name: kube-state-metrics
          image: registry.k8s.io/kube-state-metrics/kube-state-metrics
          ports:
            # Main exposition port (Kubernetes object metrics).
            - containerPort: 8080
              name: http-metrics
            # Self-telemetry port (metrics about kube-state-metrics itself).
            - containerPort: 8081
              name: telemetry
          # Liveness uses the dedicated /livez endpoint on the main port.
          livenessProbe:
            httpGet:
              path: /livez
              port: http-metrics
            initialDelaySeconds: 5
            timeoutSeconds: 5
          # Readiness uses /readyz on the telemetry port. The previous probe
          # hit the landing page ("/") on the metrics port, which answers 200
          # as soon as the HTTP server is up and says nothing about readiness.
          readinessProbe:
            httpGet:
              path: /readyz
              port: telemetry
            initialDelaySeconds: 5
            timeoutSeconds: 5
          resources:
            requests:
              cpu: 10m
              memory: 64Mi
            limits:
              cpu: 100m
              memory: 256Mi
          # Locked-down container: non-root (uid 65534), read-only root
          # filesystem, no privilege escalation, all capabilities dropped.
          securityContext:
            allowPrivilegeEscalation: false
            readOnlyRootFilesystem: true
            runAsNonRoot: true
            runAsUser: 65534
            capabilities:
              drop:
                - ALL

View file

@ -0,0 +1,9 @@
# Kustomization for kube-state-metrics: bundles the RBAC objects, the
# Deployment and the Service listed under `resources`.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- rbac.yaml
- deployment.yaml
- service.yaml
# Pins the tag for the kube-state-metrics image; the Deployment references
# this image name without a tag, so this is the single place to bump versions.
images:
- name: registry.k8s.io/kube-state-metrics/kube-state-metrics
newTag: v2.18.0

View file

@ -0,0 +1,79 @@
# ServiceAccount "kube-state-metrics" in the monitoring namespace.
# This is the identity bound to the kube-state-metrics ClusterRole by the
# ClusterRoleBinding later in this file.
apiVersion: v1
kind: ServiceAccount
metadata:
name: kube-state-metrics
namespace: monitoring
---
# Cluster-wide, read-only (list/watch) access for kube-state-metrics.
# Every rule grants only "list" and "watch"; nothing here can mutate objects.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: kube-state-metrics
rules:
  # Core API group.
  - apiGroups: [""]
    resources:
      - configmaps
      - secrets
      - nodes
      - pods
      - services
      - serviceaccounts
      - resourcequotas
      - replicationcontrollers
      - limitranges
      - persistentvolumeclaims
      - persistentvolumes
      - namespaces
      - endpoints
    verbs: ["list", "watch"]
  - apiGroups: ["apps"]
    resources:
      - statefulsets
      - daemonsets
      - deployments
      - replicasets
    verbs: ["list", "watch"]
  - apiGroups: ["batch"]
    resources:
      - cronjobs
      - jobs
    verbs: ["list", "watch"]
  - apiGroups: ["autoscaling"]
    resources:
      - horizontalpodautoscalers
    verbs: ["list", "watch"]
  # PodDisruptionBudgets: the Deployment sets no resource allow-list, so
  # kube-state-metrics is expected to watch PDBs as part of its default set;
  # without this rule that watch is forbidden. NOTE(review): confirm against
  # the default resource list of the pinned kube-state-metrics version.
  - apiGroups: ["policy"]
    resources:
      - poddisruptionbudgets
    verbs: ["list", "watch"]
  - apiGroups: ["networking.k8s.io"]
    resources:
      - networkpolicies
      - ingresses
    verbs: ["list", "watch"]
  - apiGroups: ["coordination.k8s.io"]
    resources:
      - leases
    verbs: ["list", "watch"]
  - apiGroups: ["certificates.k8s.io"]
    resources:
      - certificatesigningrequests
    verbs: ["list", "watch"]
  - apiGroups: ["storage.k8s.io"]
    resources:
      - storageclasses
      - volumeattachments
    verbs: ["list", "watch"]
  - apiGroups: ["admissionregistration.k8s.io"]
    resources:
      - mutatingwebhookconfigurations
      - validatingwebhookconfigurations
    verbs: ["list", "watch"]
---
# Binds the cluster-scoped "kube-state-metrics" ClusterRole to the
# "kube-state-metrics" ServiceAccount in the monitoring namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: kube-state-metrics
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: kube-state-metrics
subjects:
- kind: ServiceAccount
name: kube-state-metrics
namespace: monitoring

View file

@ -0,0 +1,17 @@
# Service "kube-state-metrics" in the monitoring namespace.
# Selects pods labeled app: kube-state-metrics and exposes two ports that
# forward to the container ports of the same names:
#   8080 -> http-metrics
#   8081 -> telemetry
apiVersion: v1
kind: Service
metadata:
name: kube-state-metrics
namespace: monitoring
labels:
app: kube-state-metrics
spec:
selector:
app: kube-state-metrics
ports:
- name: http-metrics
port: 8080
targetPort: http-metrics
- name: telemetry
port: 8081
targetPort: telemetry

View file

@ -10,11 +10,17 @@ scrape_configs:
- job_name: "node-exporter-sifaka"
  static_configs:
    - targets: ["nas.ops.eblu.me:9100"]
  # Target relabeling rather than metric_relabel_configs: metric relabeling is
  # not applied to synthetic series such as `up`, so those would otherwise be
  # missing the cluster label.
  relabel_configs:
    - target_label: cluster
      replacement: indri
- job_name: "smartctl-sifaka"
  scrape_interval: 60s
  static_configs:
    - targets: ["nas.ops.eblu.me:9633"]
  # Target relabeling rather than metric_relabel_configs: metric relabeling is
  # not applied to synthetic series such as `up`, so those would otherwise be
  # missing the cluster label.
  relabel_configs:
    - target_label: cluster
      replacement: indri
# CNPG PostgreSQL metrics (k8s internal)
- job_name: "cnpg-postgres"
@ -22,21 +28,33 @@ scrape_configs:
- targets: ["blumeops-pg-metrics-tailscale.databases.svc.cluster.local:9187"]
labels:
instance: "blumeops-pg"
metric_relabel_configs:
- target_label: cluster
replacement: indri
# Prometheus self-monitoring
- job_name: "prometheus"
  static_configs:
    - targets: ["localhost:9090"]
  # Target relabeling rather than metric_relabel_configs: metric relabeling is
  # not applied to synthetic series such as `up`, so those would otherwise be
  # missing the cluster label.
  relabel_configs:
    - target_label: cluster
      replacement: indri
# Loki metrics
- job_name: "loki"
  static_configs:
    - targets: ["loki.monitoring.svc.cluster.local:3100"]
  # Target relabeling rather than metric_relabel_configs: metric relabeling is
  # not applied to synthetic series such as `up`, so those would otherwise be
  # missing the cluster label.
  relabel_configs:
    - target_label: cluster
      replacement: indri
# Kubernetes state metrics (pods, deployments, resource usage, etc.)
- job_name: "kube-state-metrics"
  static_configs:
    - targets: ["kube-state-metrics.monitoring.svc.cluster.local:8080"]
  # Target relabeling rather than metric_relabel_configs: metric relabeling is
  # not applied to synthetic series such as `up`, so those would otherwise be
  # missing the cluster label.
  relabel_configs:
    - target_label: cluster
      replacement: indri
# Frigate NVR metrics (via Caddy on indri — Frigate runs on ringtail)
- job_name: "frigate"
@ -44,3 +62,6 @@ scrape_configs:
static_configs:
- targets: ["nvr.ops.eblu.me"]
metrics_path: /api/metrics
metric_relabel_configs:
- target_label: cluster
replacement: ringtail

View file

@ -0,0 +1 @@
Add multi-cluster Kubernetes observability: deploy kube-state-metrics and Alloy on ringtail (k3s), add `cluster` label to all metrics/logs, replace single-cluster dashboards with multi-cluster Kubernetes dashboard and dedicated Ringtail dashboard with GPU monitoring.