Add multi-cluster observability with ringtail metrics and dashboards (#270)
## Summary
- Add `cluster` label (indri/ringtail) to all Prometheus scrape jobs, Alloy k8s metrics/logs, and Alloy host metrics/logs
- Deploy kube-state-metrics on ringtail's k3s cluster (ArgoCD app + manifests)
- Deploy Alloy on ringtail to collect host metrics, kube-state-metrics, and pod logs, remote-writing to indri's Prometheus and Loki
- Replace single-cluster "Minikube Kubernetes" and "K8s Services Health" dashboards with:
  - **Kubernetes Clusters** dashboard — multi-cluster with `cluster` and `namespace` template variables
  - **Ringtail (k3s)** dashboard — dedicated ringtail view with GPU usage panels
## Deployment and Testing
1. Sync `apps` on indri ArgoCD to pick up new app definitions (`kube-state-metrics-ringtail`, `alloy-ringtail`)
2. Sync `prometheus` → verify `cluster` label on scraped metrics
3. Sync `alloy-k8s` → verify `cluster=indri` on remote-written metrics and logs
4. Run `mise run provision-indri -- --tags alloy` → verify `cluster=indri` on host Alloy metrics/logs
5. Sync `kube-state-metrics-ringtail` → verify pods running on ringtail
6. Sync `alloy-ringtail` → verify pods running, check Prometheus for `kube_pod_info{cluster="ringtail"}`
7. Sync `grafana-config` → verify the dashboards appear and the `cluster` variable populates with both values (indri, ringtail)
8. Check Loki for `{cluster="ringtail"}` logs from ringtail pods
## Notes
- Alloy on ringtail uses `insecure_skip_verify=true` for TLS to Prometheus/Loki (Tailscale-managed certs not in container trust store) — tighten later
- DNS resolution for `*.tail8d86e.ts.net` from ringtail pods depends on CoreDNS inheriting host's MagicDNS resolver; may need CoreDNS forwarding rules if pods can't resolve
- The old services dashboard (blackbox probes) is removed — those probes are still running in alloy-k8s and the data is still in Prometheus, just not in a dedicated dashboard
Reviewed-on: https://forge.ops.eblu.me/eblume/blumeops/pulls/270
This commit is contained in:
parent
2243f2e0a1
commit
03d71544ec
19 changed files with 910 additions and 217 deletions
|
|
@ -29,6 +29,11 @@ prometheus.relabel "instance" {
|
|||
target_label = "instance"
|
||||
replacement = "{{ alloy_instance_label }}"
|
||||
}
|
||||
|
||||
rule {
|
||||
target_label = "cluster"
|
||||
replacement = "indri"
|
||||
}
|
||||
}
|
||||
|
||||
// Push metrics to Prometheus via remote_write
|
||||
|
|
@ -110,6 +115,11 @@ loki.relabel "add_host" {
|
|||
target_label = "host"
|
||||
replacement = "{{ alloy_instance_label }}"
|
||||
}
|
||||
|
||||
rule {
|
||||
target_label = "cluster"
|
||||
replacement = "indri"
|
||||
}
|
||||
}
|
||||
|
||||
// Write logs to Loki
|
||||
|
|
|
|||
17
argocd/apps/alloy-ringtail.yaml
Normal file
17
argocd/apps/alloy-ringtail.yaml
Normal file
|
|
@ -0,0 +1,17 @@
|
|||
apiVersion: argoproj.io/v1alpha1
|
||||
kind: Application
|
||||
metadata:
|
||||
name: alloy-ringtail
|
||||
namespace: argocd
|
||||
spec:
|
||||
project: default
|
||||
source:
|
||||
repoURL: ssh://forgejo@forge.ops.eblu.me:2222/eblume/blumeops.git
|
||||
targetRevision: main
|
||||
path: argocd/manifests/alloy-ringtail
|
||||
destination:
|
||||
server: https://ringtail.tail8d86e.ts.net:6443
|
||||
namespace: alloy
|
||||
syncPolicy:
|
||||
syncOptions:
|
||||
- CreateNamespace=true
|
||||
17
argocd/apps/kube-state-metrics-ringtail.yaml
Normal file
17
argocd/apps/kube-state-metrics-ringtail.yaml
Normal file
|
|
@ -0,0 +1,17 @@
|
|||
apiVersion: argoproj.io/v1alpha1
|
||||
kind: Application
|
||||
metadata:
|
||||
name: kube-state-metrics-ringtail
|
||||
namespace: argocd
|
||||
spec:
|
||||
project: default
|
||||
source:
|
||||
repoURL: ssh://forgejo@forge.ops.eblu.me:2222/eblume/blumeops.git
|
||||
targetRevision: main
|
||||
path: argocd/manifests/kube-state-metrics-ringtail
|
||||
destination:
|
||||
server: https://ringtail.tail8d86e.ts.net:6443
|
||||
namespace: monitoring
|
||||
syncPolicy:
|
||||
syncOptions:
|
||||
- CreateNamespace=true
|
||||
|
|
@ -108,6 +108,11 @@ loki.process "pods" {
|
|||
repository = "",
|
||||
}
|
||||
}
|
||||
|
||||
// Add cluster label for multi-cluster identification
|
||||
stage.static_labels {
|
||||
values = { cluster = "indri" }
|
||||
}
|
||||
}
|
||||
|
||||
// Write logs to Loki
|
||||
|
|
@ -163,6 +168,8 @@ prometheus.scrape "blackbox" {
|
|||
|
||||
// Push metrics to Prometheus
|
||||
prometheus.remote_write "prometheus" {
|
||||
external_labels = { cluster = "indri" }
|
||||
|
||||
endpoint {
|
||||
url = "http://prometheus.monitoring.svc.cluster.local:9090/api/v1/write"
|
||||
}
|
||||
|
|
|
|||
165
argocd/manifests/alloy-ringtail/config.alloy
Normal file
165
argocd/manifests/alloy-ringtail/config.alloy
Normal file
|
|
@ -0,0 +1,165 @@
|
|||
// Alloy ringtail configuration - collects host metrics, pod logs, and kube-state-metrics
|
||||
// Remote-writes metrics to indri Prometheus, logs to indri Loki
|
||||
|
||||
// ============== HOST METRICS ==============
|
||||
|
||||
// System metrics exporter (Linux host via /host/proc, /host/sys mounts)
|
||||
prometheus.exporter.unix "system" {
|
||||
procfs_path = "/host/proc"
|
||||
sysfs_path = "/host/sys"
|
||||
rootfs_path = "/host/root"
|
||||
}
|
||||
|
||||
// Scrape system metrics and add instance label
|
||||
prometheus.scrape "system" {
|
||||
targets = prometheus.exporter.unix.system.targets
|
||||
forward_to = [prometheus.relabel.instance.receiver]
|
||||
scrape_interval = "15s"
|
||||
}
|
||||
|
||||
// Add instance label
|
||||
prometheus.relabel "instance" {
|
||||
forward_to = [prometheus.remote_write.prometheus.receiver]
|
||||
|
||||
rule {
|
||||
target_label = "instance"
|
||||
replacement = "ringtail"
|
||||
}
|
||||
}
|
||||
|
||||
// ============== KUBE-STATE-METRICS SCRAPE ==============
|
||||
|
||||
prometheus.scrape "kube_state_metrics" {
|
||||
targets = [{"__address__" = "kube-state-metrics.monitoring.svc.cluster.local:8080"}]
|
||||
scrape_interval = "15s"
|
||||
forward_to = [prometheus.remote_write.prometheus.receiver]
|
||||
}
|
||||
|
||||
// Push metrics to indri Prometheus
|
||||
prometheus.remote_write "prometheus" {
|
||||
external_labels = { cluster = "ringtail" }
|
||||
|
||||
endpoint {
|
||||
url = "https://prometheus.tail8d86e.ts.net/api/v1/write"
|
||||
|
||||
tls_config {
|
||||
insecure_skip_verify = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ============== K8S POD LOG DISCOVERY ==============
|
||||
|
||||
// Discover all pods in the cluster
|
||||
discovery.kubernetes "pods" {
|
||||
role = "pod"
|
||||
}
|
||||
|
||||
// Relabel to extract useful metadata
|
||||
discovery.relabel "pods" {
|
||||
targets = discovery.kubernetes.pods.targets
|
||||
|
||||
// Keep only running pods
|
||||
rule {
|
||||
source_labels = ["__meta_kubernetes_pod_phase"]
|
||||
regex = "Pending|Succeeded|Failed|Unknown"
|
||||
action = "drop"
|
||||
}
|
||||
|
||||
// Set namespace label
|
||||
rule {
|
||||
source_labels = ["__meta_kubernetes_namespace"]
|
||||
target_label = "namespace"
|
||||
}
|
||||
|
||||
// Set pod name label
|
||||
rule {
|
||||
source_labels = ["__meta_kubernetes_pod_name"]
|
||||
target_label = "pod"
|
||||
}
|
||||
|
||||
// Set container name label
|
||||
rule {
|
||||
source_labels = ["__meta_kubernetes_pod_container_name"]
|
||||
target_label = "container"
|
||||
}
|
||||
|
||||
// Set app label from pod labels
|
||||
rule {
|
||||
source_labels = ["__meta_kubernetes_pod_label_app"]
|
||||
target_label = "app"
|
||||
}
|
||||
|
||||
// Prefer app.kubernetes.io/name when set: this rule runs after the plain `app` rule and overwrites `app` whenever that label is non-empty
|
||||
rule {
|
||||
source_labels = ["__meta_kubernetes_pod_label_app_kubernetes_io_name"]
|
||||
target_label = "app"
|
||||
regex = "(.+)"
|
||||
action = "replace"
|
||||
}
|
||||
|
||||
// Set node name
|
||||
rule {
|
||||
source_labels = ["__meta_kubernetes_pod_node_name"]
|
||||
target_label = "node"
|
||||
}
|
||||
|
||||
// Build the log path for the pod container
|
||||
rule {
|
||||
source_labels = ["__meta_kubernetes_pod_uid", "__meta_kubernetes_pod_container_name"]
|
||||
target_label = "__path__"
|
||||
separator = "/"
|
||||
replacement = "/var/log/pods/*$1/$2/*.log"
|
||||
}
|
||||
}
|
||||
|
||||
// Tail pod logs
|
||||
loki.source.kubernetes "pods" {
|
||||
targets = discovery.relabel.pods.output
|
||||
forward_to = [loki.process.pods.receiver]
|
||||
}
|
||||
|
||||
// Process logs - parse JSON if present, add labels
|
||||
loki.process "pods" {
|
||||
forward_to = [loki.write.loki.receiver]
|
||||
|
||||
// Try to parse JSON logs
|
||||
stage.json {
|
||||
expressions = {
|
||||
level = "level",
|
||||
msg = "msg",
|
||||
message = "message",
|
||||
time = "time",
|
||||
caller = "caller",
|
||||
}
|
||||
}
|
||||
|
||||
// Drop JSON parsing error labels (non-JSON logs are fine)
|
||||
stage.label_drop {
|
||||
values = ["__error__", "__error_details__"]
|
||||
}
|
||||
|
||||
// Extract labels from parsed JSON data
|
||||
stage.labels {
|
||||
values = {
|
||||
level = "",
|
||||
caller = "",
|
||||
}
|
||||
}
|
||||
|
||||
// Add cluster label for multi-cluster identification
|
||||
stage.static_labels {
|
||||
values = { cluster = "ringtail" }
|
||||
}
|
||||
}
|
||||
|
||||
// Write logs to indri Loki
|
||||
loki.write "loki" {
|
||||
endpoint {
|
||||
url = "https://loki.tail8d86e.ts.net/loki/api/v1/push"
|
||||
|
||||
tls_config {
|
||||
insecure_skip_verify = true
|
||||
}
|
||||
}
|
||||
}
|
||||
86
argocd/manifests/alloy-ringtail/daemonset.yaml
Normal file
86
argocd/manifests/alloy-ringtail/daemonset.yaml
Normal file
|
|
@ -0,0 +1,86 @@
|
|||
apiVersion: apps/v1
|
||||
kind: DaemonSet
|
||||
metadata:
|
||||
name: alloy
|
||||
namespace: alloy
|
||||
labels:
|
||||
app: alloy
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app: alloy
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: alloy
|
||||
spec:
|
||||
serviceAccountName: alloy
|
||||
securityContext:
|
||||
fsGroup: 473 # alloy user group
|
||||
containers:
|
||||
- name: alloy
|
||||
image: grafana/alloy
|
||||
args:
|
||||
- run
|
||||
- --server.http.listen-addr=0.0.0.0:12345
|
||||
- --storage.path=/var/lib/alloy/data
|
||||
- /etc/alloy/config.alloy
|
||||
ports:
|
||||
- containerPort: 12345
|
||||
name: http
|
||||
env:
|
||||
- name: HOSTNAME
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: spec.nodeName
|
||||
resources:
|
||||
requests:
|
||||
cpu: 50m
|
||||
memory: 128Mi
|
||||
limits:
|
||||
cpu: 500m
|
||||
memory: 512Mi
|
||||
volumeMounts:
|
||||
- name: config
|
||||
mountPath: /etc/alloy
|
||||
- name: varlog
|
||||
mountPath: /var/log
|
||||
readOnly: true
|
||||
- name: data
|
||||
mountPath: /var/lib/alloy/data
|
||||
- name: proc
|
||||
mountPath: /host/proc
|
||||
readOnly: true
|
||||
- name: sys
|
||||
mountPath: /host/sys
|
||||
readOnly: true
|
||||
- name: root
|
||||
mountPath: /host/root
|
||||
mountPropagation: HostToContainer
|
||||
readOnly: true
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
readOnlyRootFilesystem: true
|
||||
capabilities:
|
||||
drop:
|
||||
- ALL
|
||||
tolerations:
|
||||
- operator: Exists
|
||||
volumes:
|
||||
- name: config
|
||||
configMap:
|
||||
name: alloy-config
|
||||
- name: varlog
|
||||
hostPath:
|
||||
path: /var/log
|
||||
- name: data
|
||||
emptyDir: {}
|
||||
- name: proc
|
||||
hostPath:
|
||||
path: /proc
|
||||
- name: sys
|
||||
hostPath:
|
||||
path: /sys
|
||||
- name: root
|
||||
hostPath:
|
||||
path: /
|
||||
18
argocd/manifests/alloy-ringtail/kustomization.yaml
Normal file
18
argocd/manifests/alloy-ringtail/kustomization.yaml
Normal file
|
|
@ -0,0 +1,18 @@
|
|||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
|
||||
namespace: alloy
|
||||
|
||||
resources:
|
||||
- namespace.yaml
|
||||
- rbac.yaml
|
||||
- daemonset.yaml
|
||||
|
||||
images:
|
||||
- name: grafana/alloy
|
||||
newTag: v1.13.1
|
||||
|
||||
configMapGenerator:
|
||||
- name: alloy-config
|
||||
files:
|
||||
- config.alloy
|
||||
4
argocd/manifests/alloy-ringtail/namespace.yaml
Normal file
4
argocd/manifests/alloy-ringtail/namespace.yaml
Normal file
|
|
@ -0,0 +1,4 @@
|
|||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: alloy
|
||||
35
argocd/manifests/alloy-ringtail/rbac.yaml
Normal file
35
argocd/manifests/alloy-ringtail/rbac.yaml
Normal file
|
|
@ -0,0 +1,35 @@
|
|||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: alloy
|
||||
namespace: alloy
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRole
|
||||
metadata:
|
||||
name: alloy
|
||||
rules:
|
||||
- apiGroups: [""]
|
||||
resources: ["nodes", "nodes/proxy", "nodes/metrics", "services", "endpoints", "pods", "pods/log", "namespaces"]
|
||||
verbs: ["get", "list", "watch"]
|
||||
- apiGroups: [""]
|
||||
resources: ["configmaps"]
|
||||
verbs: ["get"]
|
||||
- apiGroups: ["discovery.k8s.io"]
|
||||
resources: ["endpointslices"]
|
||||
verbs: ["get", "list", "watch"]
|
||||
- nonResourceURLs: ["/metrics", "/metrics/cadvisor"]
|
||||
verbs: ["get"]
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRoleBinding
|
||||
metadata:
|
||||
name: alloy
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: ClusterRole
|
||||
name: alloy
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: alloy
|
||||
namespace: alloy
|
||||
|
|
@ -1,12 +1,12 @@
|
|||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: grafana-dashboard-minikube
|
||||
name: grafana-dashboard-kubernetes
|
||||
namespace: monitoring
|
||||
labels:
|
||||
grafana_dashboard: "1"
|
||||
data:
|
||||
minikube.json: |
|
||||
kubernetes.json: |
|
||||
{
|
||||
"annotations": { "list": [] },
|
||||
"editable": true,
|
||||
|
|
@ -15,95 +15,63 @@ data:
|
|||
"id": null,
|
||||
"links": [],
|
||||
"panels": [
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [{ "options": { "0": { "color": "red", "text": "DOWN" }, "1": { "color": "green", "text": "UP" } }, "type": "value" }],
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] }
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 3, "w": 3, "x": 0, "y": 0 },
|
||||
"id": 1,
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value" },
|
||||
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "minikube_up", "refId": "A" }],
|
||||
"title": "Cluster",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [{ "options": { "0": { "color": "red", "text": "DOWN" }, "1": { "color": "green", "text": "UP" } }, "type": "value" }],
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] }
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 3, "w": 3, "x": 3, "y": 0 },
|
||||
"id": 2,
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value" },
|
||||
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "minikube_apiserver_up", "refId": "A" }],
|
||||
"title": "API Server",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] } } },
|
||||
"gridPos": { "h": 3, "w": 3, "x": 6, "y": 0 },
|
||||
"id": 3,
|
||||
"gridPos": { "h": 3, "w": 4, "x": 0, "y": 0 },
|
||||
"id": 1,
|
||||
"options": { "colorMode": "value", "graphMode": "none", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value" },
|
||||
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(kube_pod_info{namespace=~\"$namespace\"})", "refId": "A" }],
|
||||
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(kube_pod_info{cluster=~\"$cluster\", namespace=~\"$namespace\"})", "refId": "A" }],
|
||||
"title": "Pods",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] } } },
|
||||
"gridPos": { "h": 3, "w": 3, "x": 9, "y": 0 },
|
||||
"id": 4,
|
||||
"gridPos": { "h": 3, "w": 4, "x": 4, "y": 0 },
|
||||
"id": 2,
|
||||
"options": { "colorMode": "value", "graphMode": "none", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value" },
|
||||
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(kube_deployment_created{namespace=~\"$namespace\"})", "refId": "A" }],
|
||||
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(kube_deployment_created{cluster=~\"$cluster\", namespace=~\"$namespace\"})", "refId": "A" }],
|
||||
"title": "Deployments",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] } } },
|
||||
"gridPos": { "h": 3, "w": 3, "x": 12, "y": 0 },
|
||||
"id": 5,
|
||||
"gridPos": { "h": 3, "w": 4, "x": 8, "y": 0 },
|
||||
"id": 3,
|
||||
"options": { "colorMode": "value", "graphMode": "none", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value" },
|
||||
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(kube_statefulset_created{namespace=~\"$namespace\"})", "refId": "A" }],
|
||||
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(kube_statefulset_created{cluster=~\"$cluster\", namespace=~\"$namespace\"})", "refId": "A" }],
|
||||
"title": "StatefulSets",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] } } },
|
||||
"gridPos": { "h": 3, "w": 3, "x": 15, "y": 0 },
|
||||
"id": 6,
|
||||
"gridPos": { "h": 3, "w": 4, "x": 12, "y": 0 },
|
||||
"id": 4,
|
||||
"options": { "colorMode": "value", "graphMode": "none", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value" },
|
||||
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(kube_namespace_created)", "refId": "A" }],
|
||||
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(kube_namespace_created{cluster=~\"$cluster\"})", "refId": "A" }],
|
||||
"title": "Namespaces",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "unit": "bytes", "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] } } },
|
||||
"gridPos": { "h": 3, "w": 3, "x": 18, "y": 0 },
|
||||
"id": 7,
|
||||
"gridPos": { "h": 3, "w": 4, "x": 16, "y": 0 },
|
||||
"id": 5,
|
||||
"options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value" },
|
||||
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum(kube_pod_container_resource_requests{resource=\"memory\",namespace=~\"$namespace\"})", "refId": "A" }],
|
||||
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum(kube_pod_container_resource_requests{resource=\"memory\", cluster=~\"$cluster\", namespace=~\"$namespace\"})", "refId": "A" }],
|
||||
"title": "Memory Requests",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "unit": "short", "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] } } },
|
||||
"gridPos": { "h": 3, "w": 3, "x": 21, "y": 0 },
|
||||
"id": 8,
|
||||
"gridPos": { "h": 3, "w": 4, "x": 20, "y": 0 },
|
||||
"id": 6,
|
||||
"options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value" },
|
||||
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum(kube_pod_container_resource_requests{resource=\"cpu\",namespace=~\"$namespace\"})", "refId": "A" }],
|
||||
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum(kube_pod_container_resource_requests{resource=\"cpu\", cluster=~\"$cluster\", namespace=~\"$namespace\"})", "refId": "A" }],
|
||||
"title": "CPU Requests (cores)",
|
||||
"type": "stat"
|
||||
},
|
||||
|
|
@ -117,9 +85,9 @@ data:
|
|||
}
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 6, "x": 0, "y": 3 },
|
||||
"id": 13,
|
||||
"id": 7,
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value" },
|
||||
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(kube_pod_container_status_waiting_reason{namespace=~\"$namespace\", reason=~\"ImagePullBackOff|ErrImagePull|CrashLoopBackOff|CreateContainerError|RunContainerError\"}) or vector(0)", "refId": "A" }],
|
||||
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(kube_pod_container_status_waiting_reason{cluster=~\"$cluster\", namespace=~\"$namespace\", reason=~\"ImagePullBackOff|ErrImagePull|CrashLoopBackOff|CreateContainerError|RunContainerError\"}) or vector(0)", "refId": "A" }],
|
||||
"title": "Unhealthy Pods",
|
||||
"type": "stat"
|
||||
},
|
||||
|
|
@ -135,9 +103,9 @@ data:
|
|||
}
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 18, "x": 6, "y": 3 },
|
||||
"id": 14,
|
||||
"id": 8,
|
||||
"options": { "legend": { "calcs": ["lastNotNull"], "displayMode": "table", "placement": "right", "showLegend": true, "sortBy": "Last *", "sortDesc": true }, "tooltip": { "mode": "multi", "sort": "desc" } },
|
||||
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (reason) (kube_pod_container_status_waiting_reason{namespace=~\"$namespace\"})", "legendFormat": "{{reason}}", "refId": "A" }],
|
||||
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (reason) (kube_pod_container_status_waiting_reason{cluster=~\"$cluster\", namespace=~\"$namespace\"})", "legendFormat": "{{reason}}", "refId": "A" }],
|
||||
"title": "Pods by Waiting Reason",
|
||||
"type": "timeseries"
|
||||
},
|
||||
|
|
@ -155,7 +123,7 @@ data:
|
|||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 7 },
|
||||
"id": 9,
|
||||
"options": { "legend": { "calcs": ["lastNotNull"], "displayMode": "table", "placement": "right", "showLegend": true, "sortBy": "Last *", "sortDesc": true }, "tooltip": { "mode": "multi", "sort": "desc" } },
|
||||
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count by (namespace) (kube_pod_info{namespace=~\"$namespace\"})", "legendFormat": "{{namespace}}", "refId": "A" }],
|
||||
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count by (namespace) (kube_pod_info{cluster=~\"$cluster\", namespace=~\"$namespace\"})", "legendFormat": "{{namespace}}", "refId": "A" }],
|
||||
"title": "Pods by Namespace",
|
||||
"type": "timeseries"
|
||||
},
|
||||
|
|
@ -173,7 +141,7 @@ data:
|
|||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 7 },
|
||||
"id": 10,
|
||||
"options": { "legend": { "calcs": ["lastNotNull"], "displayMode": "table", "placement": "right", "showLegend": true, "sortBy": "Last *", "sortDesc": true }, "tooltip": { "mode": "multi", "sort": "desc" } },
|
||||
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (namespace) (kube_pod_container_resource_requests{resource=\"memory\",namespace=~\"$namespace\"})", "legendFormat": "{{namespace}}", "refId": "A" }],
|
||||
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (namespace) (kube_pod_container_resource_requests{resource=\"memory\", cluster=~\"$cluster\", namespace=~\"$namespace\"})", "legendFormat": "{{namespace}}", "refId": "A" }],
|
||||
"title": "Memory Requests by Namespace",
|
||||
"type": "timeseries"
|
||||
},
|
||||
|
|
@ -197,11 +165,11 @@ data:
|
|||
"id": 11,
|
||||
"options": { "cellHeight": "sm", "footer": { "countRows": false, "fields": "", "reducer": ["sum"], "show": false }, "showHeader": true, "sortBy": [{ "desc": true, "displayName": "Pods" }] },
|
||||
"targets": [
|
||||
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count by (namespace) (kube_pod_info{namespace=~\"$namespace\"})", "format": "table", "instant": true, "refId": "pods" },
|
||||
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (namespace) (kube_pod_container_resource_requests{resource=\"memory\",namespace=~\"$namespace\"})", "format": "table", "instant": true, "refId": "mem_req" },
|
||||
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (namespace) (kube_pod_container_resource_limits{resource=\"memory\",namespace=~\"$namespace\"})", "format": "table", "instant": true, "refId": "mem_lim" },
|
||||
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (namespace) (kube_pod_container_resource_requests{resource=\"cpu\",namespace=~\"$namespace\"})", "format": "table", "instant": true, "refId": "cpu_req" },
|
||||
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (namespace) (kube_pod_container_resource_limits{resource=\"cpu\",namespace=~\"$namespace\"})", "format": "table", "instant": true, "refId": "cpu_lim" }
|
||||
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count by (namespace) (kube_pod_info{cluster=~\"$cluster\", namespace=~\"$namespace\"})", "format": "table", "instant": true, "refId": "pods" },
|
||||
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (namespace) (kube_pod_container_resource_requests{resource=\"memory\", cluster=~\"$cluster\", namespace=~\"$namespace\"})", "format": "table", "instant": true, "refId": "mem_req" },
|
||||
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (namespace) (kube_pod_container_resource_limits{resource=\"memory\", cluster=~\"$cluster\", namespace=~\"$namespace\"})", "format": "table", "instant": true, "refId": "mem_lim" },
|
||||
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (namespace) (kube_pod_container_resource_requests{resource=\"cpu\", cluster=~\"$cluster\", namespace=~\"$namespace\"})", "format": "table", "instant": true, "refId": "cpu_req" },
|
||||
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (namespace) (kube_pod_container_resource_limits{resource=\"cpu\", cluster=~\"$cluster\", namespace=~\"$namespace\"})", "format": "table", "instant": true, "refId": "cpu_lim" }
|
||||
],
|
||||
"title": "Namespace Resource Summary",
|
||||
"transformations": [
|
||||
|
|
@ -215,27 +183,44 @@ data:
|
|||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 23 },
|
||||
"id": 12,
|
||||
"options": { "dedupStrategy": "none", "enableLogDetails": true, "prettifyLogMessage": false, "showCommonLabels": false, "showLabels": true, "showTime": true, "sortOrder": "Descending", "wrapLogMessage": false },
|
||||
"targets": [{ "datasource": { "type": "loki", "uid": "loki" }, "expr": "{namespace=~\"$namespace\"}", "refId": "A" }],
|
||||
"targets": [{ "datasource": { "type": "loki", "uid": "loki" }, "expr": "{cluster=~\"$cluster\", namespace=~\"$namespace\"}", "refId": "A" }],
|
||||
"title": "Pod Logs",
|
||||
"type": "logs"
|
||||
}
|
||||
],
|
||||
"refresh": "30s",
|
||||
"schemaVersion": 38,
|
||||
"tags": ["minikube", "kubernetes", "k8s"],
|
||||
"tags": ["kubernetes", "k8s", "multi-cluster"],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"current": { "selected": true, "text": "All", "value": "$__all" },
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"definition": "label_values(kube_namespace_created, namespace)",
|
||||
"definition": "label_values(kube_namespace_created, cluster)",
|
||||
"hide": 0,
|
||||
"includeAll": true,
|
||||
"label": "Cluster",
|
||||
"multi": true,
|
||||
"name": "cluster",
|
||||
"options": [],
|
||||
"query": { "query": "label_values(kube_namespace_created, cluster)", "refId": "StandardVariableQuery" },
|
||||
"refresh": 2,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
"sort": 1,
|
||||
"type": "query"
|
||||
},
|
||||
{
|
||||
"current": { "selected": true, "text": "All", "value": "$__all" },
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"definition": "label_values(kube_namespace_created{cluster=~\"$cluster\"}, namespace)",
|
||||
"hide": 0,
|
||||
"includeAll": true,
|
||||
"label": "Namespace",
|
||||
"multi": true,
|
||||
"name": "namespace",
|
||||
"options": [],
|
||||
"query": { "query": "label_values(kube_namespace_created, namespace)", "refId": "StandardVariableQuery" },
|
||||
"query": { "query": "label_values(kube_namespace_created{cluster=~\"$cluster\"}, namespace)", "refId": "StandardVariableQuery" },
|
||||
"refresh": 2,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
|
|
@ -247,8 +232,8 @@ data:
|
|||
"time": { "from": "now-6h", "to": "now" },
|
||||
"timepicker": {},
|
||||
"timezone": "browser",
|
||||
"title": "Minikube Kubernetes",
|
||||
"uid": "minikube",
|
||||
"version": 2,
|
||||
"title": "Kubernetes Clusters",
|
||||
"uid": "kubernetes",
|
||||
"version": 1,
|
||||
"weekStart": ""
|
||||
}
|
||||
|
|
@ -0,0 +1,314 @@
|
|||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: grafana-dashboard-ringtail
|
||||
namespace: monitoring
|
||||
labels:
|
||||
grafana_dashboard: "1"
|
||||
data:
|
||||
ringtail.json: |
|
||||
{
|
||||
"annotations": { "list": [] },
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 1,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"panels": [
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
|
||||
"id": 100,
|
||||
"panels": [],
|
||||
"title": "System Overview",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }, "unit": "dtdurations" } },
|
||||
"gridPos": { "h": 4, "w": 4, "x": 0, "y": 1 },
|
||||
"id": 1,
|
||||
"options": { "colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" },
|
||||
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "time() - node_boot_time_seconds{instance=\"ringtail\"}", "refId": "A" }],
|
||||
"title": "Uptime",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] }, "unit": "decbytes" } },
|
||||
"gridPos": { "h": 4, "w": 4, "x": 4, "y": 1 },
|
||||
"id": 2,
|
||||
"options": { "colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" },
|
||||
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "node_memory_MemTotal_bytes{instance=\"ringtail\"}", "refId": "A" }],
|
||||
"title": "Total Memory",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [{ "color": "purple", "value": null }] }, "unit": "short" } },
|
||||
"gridPos": { "h": 4, "w": 4, "x": 8, "y": 1 },
|
||||
"id": 3,
|
||||
"options": { "colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" },
|
||||
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(node_cpu_seconds_total{instance=\"ringtail\", mode=\"idle\"})", "refId": "A" }],
|
||||
"title": "CPU Cores",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 4 }, { "color": "red", "value": 8 }] }, "unit": "short", "decimals": 2 } },
|
||||
"gridPos": { "h": 4, "w": 4, "x": 12, "y": 1 },
|
||||
"id": 4,
|
||||
"options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" },
|
||||
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "node_load1{instance=\"ringtail\"}", "refId": "A" }],
|
||||
"title": "Load (1m)",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] } } },
|
||||
"gridPos": { "h": 4, "w": 4, "x": 16, "y": 1 },
|
||||
"id": 5,
|
||||
"options": { "colorMode": "value", "graphMode": "none", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value" },
|
||||
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(kube_pod_info{cluster=\"ringtail\"})", "refId": "A" }],
|
||||
"title": "K8s Pods",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "unit": "percent", "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 80 }, { "color": "red", "value": 95 }] } } },
|
||||
"gridPos": { "h": 4, "w": 4, "x": 20, "y": 1 },
|
||||
"id": 6,
|
||||
"options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value" },
|
||||
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "frigate_gpu_usage_percent", "refId": "A" }],
|
||||
"title": "GPU Usage %",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 },
|
||||
"id": 101,
|
||||
"panels": [],
|
||||
"title": "CPU & Memory",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 30, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "smooth", "lineWidth": 2, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "normal" }, "thresholdsStyle": { "mode": "off" } },
|
||||
"mappings": [],
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] },
|
||||
"unit": "percentunit",
|
||||
"max": 1
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 },
|
||||
"id": 7,
|
||||
"options": { "legend": { "calcs": ["mean", "lastNotNull"], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "desc" } },
|
||||
"targets": [
|
||||
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (mode) (rate(node_cpu_seconds_total{instance=\"ringtail\", mode!=\"idle\"}[5m])) / on() group_left count(node_cpu_seconds_total{instance=\"ringtail\", mode=\"idle\"})", "legendFormat": "{{mode}}", "refId": "A" }
|
||||
],
|
||||
"title": "CPU Usage by Mode",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 30, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "smooth", "lineWidth": 2, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } },
|
||||
"mappings": [],
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] },
|
||||
"unit": "bytes"
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 },
|
||||
"id": 8,
|
||||
"options": { "legend": { "calcs": ["mean", "lastNotNull"], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "desc" } },
|
||||
"targets": [
|
||||
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "node_memory_MemTotal_bytes{instance=\"ringtail\"} - node_memory_MemAvailable_bytes{instance=\"ringtail\"}", "legendFormat": "Used", "refId": "A" },
|
||||
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "node_memory_MemAvailable_bytes{instance=\"ringtail\"}", "legendFormat": "Available", "refId": "B" },
|
||||
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "node_memory_Cached_bytes{instance=\"ringtail\"}", "legendFormat": "Cached", "refId": "C" }
|
||||
],
|
||||
"title": "Memory Usage",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 },
|
||||
"id": 102,
|
||||
"panels": [],
|
||||
"title": "Storage",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"custom": { "align": "auto", "cellOptions": { "type": "auto" }, "inspect": false },
|
||||
"mappings": [],
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 0.8 }, { "color": "red", "value": 0.95 }] }
|
||||
},
|
||||
"overrides": [
|
||||
{ "matcher": { "id": "byName", "options": "Size" }, "properties": [{ "id": "unit", "value": "bytes" }] },
|
||||
{ "matcher": { "id": "byName", "options": "Available" }, "properties": [{ "id": "unit", "value": "bytes" }] },
|
||||
{ "matcher": { "id": "byName", "options": "Used %" }, "properties": [{ "id": "unit", "value": "percentunit" }, { "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 0.8 }, { "color": "red", "value": 0.95 }] } }, { "id": "custom.cellOptions", "value": { "mode": "gradient", "type": "gauge" } }] }
|
||||
]
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 24, "x": 0, "y": 15 },
|
||||
"id": 9,
|
||||
"options": { "cellHeight": "sm", "footer": { "countRows": false, "fields": "", "reducer": ["sum"], "show": false }, "showHeader": true, "sortBy": [{ "desc": true, "displayName": "Size" }] },
|
||||
"targets": [
|
||||
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "node_filesystem_size_bytes{instance=\"ringtail\", fstype!~\"tmpfs|overlay|squashfs\"}", "format": "table", "instant": true, "refId": "size" },
|
||||
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "node_filesystem_avail_bytes{instance=\"ringtail\", fstype!~\"tmpfs|overlay|squashfs\"}", "format": "table", "instant": true, "refId": "avail" },
|
||||
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "1 - (node_filesystem_avail_bytes{instance=\"ringtail\", fstype!~\"tmpfs|overlay|squashfs\"} / node_filesystem_size_bytes{instance=\"ringtail\", fstype!~\"tmpfs|overlay|squashfs\"})", "format": "table", "instant": true, "refId": "pct" }
|
||||
],
|
||||
"title": "Filesystem Usage",
|
||||
"transformations": [
|
||||
{ "id": "seriesToColumns", "options": { "byField": "mountpoint" } },
|
||||
{ "id": "organize", "options": { "excludeByName": { "Time": true, "Time 1": true, "Time 2": true, "Time 3": true, "__name__": true, "__name__ 1": true, "__name__ 2": true, "device": true, "device 1": true, "device 2": true, "fstype": true, "fstype 1": true, "fstype 2": true, "instance": true, "instance 1": true, "instance 2": true, "job": true, "job 1": true, "job 2": true, "cluster": true, "cluster 1": true, "cluster 2": true }, "renameByName": { "mountpoint": "Mount", "Value #size": "Size", "Value #avail": "Available", "Value #pct": "Used %" } } }
|
||||
],
|
||||
"type": "table"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 21 },
|
||||
"id": 103,
|
||||
"panels": [],
|
||||
"title": "Network",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "axisBorderShow": false, "axisCenteredZero": true, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 20, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "smooth", "lineWidth": 2, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } },
|
||||
"mappings": [],
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] },
|
||||
"unit": "Bps"
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 22 },
|
||||
"id": 10,
|
||||
"options": { "legend": { "calcs": ["mean", "lastNotNull"], "displayMode": "table", "placement": "right", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "desc" } },
|
||||
"targets": [
|
||||
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "rate(node_network_receive_bytes_total{instance=\"ringtail\", device!~\"lo|veth.*|cali.*|flannel.*|cni.*\"}[5m])", "legendFormat": "{{device}} rx", "refId": "A" },
|
||||
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "-rate(node_network_transmit_bytes_total{instance=\"ringtail\", device!~\"lo|veth.*|cali.*|flannel.*|cni.*\"}[5m])", "legendFormat": "{{device}} tx", "refId": "B" }
|
||||
],
|
||||
"title": "Network Traffic",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 30 },
|
||||
"id": 104,
|
||||
"panels": [],
|
||||
"title": "GPU",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 2, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } },
|
||||
"mappings": [],
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] },
|
||||
"unit": "percent"
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 31 },
|
||||
"id": 11,
|
||||
"options": { "legend": { "calcs": ["mean", "lastNotNull", "max"], "displayMode": "table", "placement": "right", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "desc" } },
|
||||
"targets": [
|
||||
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "frigate_gpu_usage_percent", "legendFormat": "GPU Usage", "refId": "A" },
|
||||
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "frigate_gpu_mem_usage_percent", "legendFormat": "GPU Memory", "refId": "B" }
|
||||
],
|
||||
"title": "GPU Overview",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 39 },
|
||||
"id": 105,
|
||||
"panels": [],
|
||||
"title": "Kubernetes",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "bars", "fillOpacity": 80, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "normal" }, "thresholdsStyle": { "mode": "off" } },
|
||||
"mappings": [],
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] },
|
||||
"unit": "short"
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 40 },
|
||||
"id": 12,
|
||||
"options": { "legend": { "calcs": ["lastNotNull"], "displayMode": "table", "placement": "right", "showLegend": true, "sortBy": "Last *", "sortDesc": true }, "tooltip": { "mode": "multi", "sort": "desc" } },
|
||||
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count by (namespace) (kube_pod_info{cluster=\"ringtail\"})", "legendFormat": "{{namespace}}", "refId": "A" }],
|
||||
"title": "Pods by Namespace",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [],
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] }
|
||||
}
|
||||
},
|
||||
"gridPos": { "h": 4, "w": 6, "x": 12, "y": 40 },
|
||||
"id": 13,
|
||||
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value" },
|
||||
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(kube_pod_container_status_waiting_reason{cluster=\"ringtail\", reason=~\"ImagePullBackOff|ErrImagePull|CrashLoopBackOff|CreateContainerError|RunContainerError\"}) or vector(0)", "refId": "A" }],
|
||||
"title": "Unhealthy Pods",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] } } },
|
||||
"gridPos": { "h": 4, "w": 6, "x": 18, "y": 40 },
|
||||
"id": 14,
|
||||
"options": { "colorMode": "value", "graphMode": "none", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value" },
|
||||
"targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(kube_deployment_created{cluster=\"ringtail\"})", "refId": "A" }],
|
||||
"title": "Deployments",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 48 },
|
||||
"id": 106,
|
||||
"panels": [],
|
||||
"title": "Logs",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "loki", "uid": "loki" },
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 49 },
|
||||
"id": 15,
|
||||
"options": { "dedupStrategy": "none", "enableLogDetails": true, "prettifyLogMessage": false, "showCommonLabels": false, "showLabels": true, "showTime": true, "sortOrder": "Descending", "wrapLogMessage": false },
|
||||
"targets": [{ "datasource": { "type": "loki", "uid": "loki" }, "expr": "{cluster=\"ringtail\"}", "refId": "A" }],
|
||||
"title": "Pod Logs",
|
||||
"type": "logs"
|
||||
}
|
||||
],
|
||||
"refresh": "30s",
|
||||
"schemaVersion": 38,
|
||||
"tags": ["ringtail", "k3s", "gpu", "system"],
|
||||
"templating": { "list": [] },
|
||||
"time": { "from": "now-6h", "to": "now" },
|
||||
"timepicker": {},
|
||||
"timezone": "browser",
|
||||
"title": "Ringtail",
|
||||
"uid": "ringtail",
|
||||
"version": 1,
|
||||
"weekStart": ""
|
||||
}
|
||||
|
|
@ -1,145 +0,0 @@
|
|||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: grafana-dashboard-services
|
||||
namespace: monitoring
|
||||
labels:
|
||||
grafana_dashboard: "1"
|
||||
data:
|
||||
services.json: |
|
||||
{
|
||||
"annotations": { "list": [] },
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 0,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"panels": [
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [
|
||||
{ "options": { "0": { "color": "red", "text": "DOWN" }, "1": { "color": "green", "text": "UP" } }, "type": "value" }
|
||||
],
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] },
|
||||
"unit": "short"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 24, "x": 0, "y": 0 },
|
||||
"id": 1,
|
||||
"options": {
|
||||
"colorMode": "background",
|
||||
"graphMode": "none",
|
||||
"justifyMode": "center",
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
|
||||
"textMode": "value_and_name"
|
||||
},
|
||||
"pluginVersion": "11.0.0",
|
||||
"targets": [
|
||||
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "probe_success{job=\"integrations/blackbox/miniflux\"}", "legendFormat": "Miniflux", "refId": "A" },
|
||||
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "probe_success{job=\"integrations/blackbox/kiwix\"}", "legendFormat": "Kiwix", "refId": "B" },
|
||||
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "probe_success{job=\"integrations/blackbox/transmission\"}", "legendFormat": "Transmission", "refId": "C" },
|
||||
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "probe_success{job=\"integrations/blackbox/devpi\"}", "legendFormat": "Devpi", "refId": "D" },
|
||||
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "probe_success{job=\"integrations/blackbox/argocd\"}", "legendFormat": "ArgoCD", "refId": "E" }
|
||||
],
|
||||
"title": "Service Status",
|
||||
"type": "stat"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "palette-classic" },
|
||||
"custom": {
|
||||
"axisBorderShow": false,
|
||||
"axisCenteredZero": false,
|
||||
"axisColorMode": "text",
|
||||
"axisLabel": "",
|
||||
"axisPlacement": "auto",
|
||||
"barAlignment": 0,
|
||||
"drawStyle": "line",
|
||||
"fillOpacity": 10,
|
||||
"gradientMode": "none",
|
||||
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
|
||||
"insertNulls": false,
|
||||
"lineInterpolation": "linear",
|
||||
"lineWidth": 1,
|
||||
"pointSize": 5,
|
||||
"scaleDistribution": { "type": "linear" },
|
||||
"showPoints": "never",
|
||||
"spanNulls": false,
|
||||
"stacking": { "group": "A", "mode": "none" },
|
||||
"thresholdsStyle": { "mode": "off" }
|
||||
},
|
||||
"mappings": [],
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] },
|
||||
"unit": "s"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 6 },
|
||||
"id": 2,
|
||||
"options": {
|
||||
"legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "right", "showLegend": true },
|
||||
"tooltip": { "mode": "multi", "sort": "desc" }
|
||||
},
|
||||
"pluginVersion": "11.0.0",
|
||||
"targets": [
|
||||
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "probe_duration_seconds{job=\"integrations/blackbox/miniflux\"}", "legendFormat": "Miniflux", "refId": "A" },
|
||||
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "probe_duration_seconds{job=\"integrations/blackbox/kiwix\"}", "legendFormat": "Kiwix", "refId": "B" },
|
||||
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "probe_duration_seconds{job=\"integrations/blackbox/transmission\"}", "legendFormat": "Transmission", "refId": "C" },
|
||||
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "probe_duration_seconds{job=\"integrations/blackbox/devpi\"}", "legendFormat": "Devpi", "refId": "D" },
|
||||
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "probe_duration_seconds{job=\"integrations/blackbox/argocd\"}", "legendFormat": "ArgoCD", "refId": "E" }
|
||||
],
|
||||
"title": "Response Time",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": { "type": "prometheus", "uid": "prometheus" },
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": { "mode": "thresholds" },
|
||||
"mappings": [],
|
||||
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "yellow", "value": 0.95 }, { "color": "green", "value": 0.99 }] },
|
||||
"unit": "percentunit"
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": { "h": 6, "w": 24, "x": 0, "y": 14 },
|
||||
"id": 3,
|
||||
"options": {
|
||||
"colorMode": "value",
|
||||
"graphMode": "area",
|
||||
"justifyMode": "center",
|
||||
"orientation": "horizontal",
|
||||
"reduceOptions": { "calcs": ["mean"], "fields": "", "values": false },
|
||||
"textMode": "value_and_name"
|
||||
},
|
||||
"pluginVersion": "11.0.0",
|
||||
"targets": [
|
||||
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "avg_over_time(probe_success{job=\"integrations/blackbox/miniflux\"}[$__range])", "legendFormat": "Miniflux", "refId": "A" },
|
||||
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "avg_over_time(probe_success{job=\"integrations/blackbox/kiwix\"}[$__range])", "legendFormat": "Kiwix", "refId": "B" },
|
||||
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "avg_over_time(probe_success{job=\"integrations/blackbox/transmission\"}[$__range])", "legendFormat": "Transmission", "refId": "C" },
|
||||
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "avg_over_time(probe_success{job=\"integrations/blackbox/devpi\"}[$__range])", "legendFormat": "Devpi", "refId": "D" },
|
||||
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "avg_over_time(probe_success{job=\"integrations/blackbox/argocd\"}[$__range])", "legendFormat": "ArgoCD", "refId": "E" }
|
||||
],
|
||||
"title": "Uptime (selected period)",
|
||||
"type": "stat"
|
||||
}
|
||||
],
|
||||
"refresh": "30s",
|
||||
"schemaVersion": 38,
|
||||
"tags": ["services", "health"],
|
||||
"templating": { "list": [] },
|
||||
"time": { "from": "now-24h", "to": "now" },
|
||||
"timepicker": {},
|
||||
"timezone": "browser",
|
||||
"title": "K8s Services Health",
|
||||
"uid": "k8s-services",
|
||||
"version": 1,
|
||||
"weekStart": ""
|
||||
}
|
||||
|
|
@ -13,10 +13,10 @@ resources:
|
|||
- dashboards/configmap-devpi.yaml
|
||||
- dashboards/configmap-loki.yaml
|
||||
- dashboards/configmap-macos.yaml
|
||||
- dashboards/configmap-minikube.yaml
|
||||
- dashboards/configmap-kubernetes.yaml
|
||||
- dashboards/configmap-jellyfin.yaml
|
||||
- dashboards/configmap-postgresql.yaml
|
||||
- dashboards/configmap-services.yaml
|
||||
- dashboards/configmap-ringtail.yaml
|
||||
- dashboards/configmap-zot.yaml
|
||||
- dashboards/configmap-frigate.yaml
|
||||
- dashboards/configmap-cv-apm.yaml
|
||||
|
|
|
|||
53
argocd/manifests/kube-state-metrics-ringtail/deployment.yaml
Normal file
53
argocd/manifests/kube-state-metrics-ringtail/deployment.yaml
Normal file
|
|
@ -0,0 +1,53 @@
|
|||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: kube-state-metrics
|
||||
namespace: monitoring
|
||||
labels:
|
||||
app: kube-state-metrics
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: kube-state-metrics
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: kube-state-metrics
|
||||
spec:
|
||||
serviceAccountName: kube-state-metrics
|
||||
containers:
|
||||
- name: kube-state-metrics
|
||||
image: registry.k8s.io/kube-state-metrics/kube-state-metrics
|
||||
ports:
|
||||
- containerPort: 8080
|
||||
name: http-metrics
|
||||
- containerPort: 8081
|
||||
name: telemetry
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /healthz
|
||||
port: 8080
|
||||
initialDelaySeconds: 5
|
||||
timeoutSeconds: 5
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /
|
||||
port: 8080
|
||||
initialDelaySeconds: 5
|
||||
timeoutSeconds: 5
|
||||
resources:
|
||||
requests:
|
||||
cpu: 10m
|
||||
memory: 64Mi
|
||||
limits:
|
||||
cpu: 100m
|
||||
memory: 256Mi
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
readOnlyRootFilesystem: true
|
||||
runAsNonRoot: true
|
||||
runAsUser: 65534
|
||||
capabilities:
|
||||
drop:
|
||||
- ALL
|
||||
|
|
@ -0,0 +1,9 @@
|
|||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
resources:
|
||||
- rbac.yaml
|
||||
- deployment.yaml
|
||||
- service.yaml
|
||||
images:
|
||||
- name: registry.k8s.io/kube-state-metrics/kube-state-metrics
|
||||
newTag: v2.18.0
|
||||
79
argocd/manifests/kube-state-metrics-ringtail/rbac.yaml
Normal file
79
argocd/manifests/kube-state-metrics-ringtail/rbac.yaml
Normal file
|
|
@ -0,0 +1,79 @@
|
|||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: kube-state-metrics
|
||||
namespace: monitoring
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRole
|
||||
metadata:
|
||||
name: kube-state-metrics
|
||||
rules:
|
||||
- apiGroups: [""]
|
||||
resources:
|
||||
- configmaps
|
||||
- secrets
|
||||
- nodes
|
||||
- pods
|
||||
- services
|
||||
- serviceaccounts
|
||||
- resourcequotas
|
||||
- replicationcontrollers
|
||||
- limitranges
|
||||
- persistentvolumeclaims
|
||||
- persistentvolumes
|
||||
- namespaces
|
||||
- endpoints
|
||||
verbs: ["list", "watch"]
|
||||
- apiGroups: ["apps"]
|
||||
resources:
|
||||
- statefulsets
|
||||
- daemonsets
|
||||
- deployments
|
||||
- replicasets
|
||||
verbs: ["list", "watch"]
|
||||
- apiGroups: ["batch"]
|
||||
resources:
|
||||
- cronjobs
|
||||
- jobs
|
||||
verbs: ["list", "watch"]
|
||||
- apiGroups: ["autoscaling"]
|
||||
resources:
|
||||
- horizontalpodautoscalers
|
||||
verbs: ["list", "watch"]
|
||||
- apiGroups: ["networking.k8s.io"]
|
||||
resources:
|
||||
- networkpolicies
|
||||
- ingresses
|
||||
verbs: ["list", "watch"]
|
||||
- apiGroups: ["coordination.k8s.io"]
|
||||
resources:
|
||||
- leases
|
||||
verbs: ["list", "watch"]
|
||||
- apiGroups: ["certificates.k8s.io"]
|
||||
resources:
|
||||
- certificatesigningrequests
|
||||
verbs: ["list", "watch"]
|
||||
- apiGroups: ["storage.k8s.io"]
|
||||
resources:
|
||||
- storageclasses
|
||||
- volumeattachments
|
||||
verbs: ["list", "watch"]
|
||||
- apiGroups: ["admissionregistration.k8s.io"]
|
||||
resources:
|
||||
- mutatingwebhookconfigurations
|
||||
- validatingwebhookconfigurations
|
||||
verbs: ["list", "watch"]
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRoleBinding
|
||||
metadata:
|
||||
name: kube-state-metrics
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: ClusterRole
|
||||
name: kube-state-metrics
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: kube-state-metrics
|
||||
namespace: monitoring
|
||||
17
argocd/manifests/kube-state-metrics-ringtail/service.yaml
Normal file
17
argocd/manifests/kube-state-metrics-ringtail/service.yaml
Normal file
|
|
@ -0,0 +1,17 @@
|
|||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: kube-state-metrics
|
||||
namespace: monitoring
|
||||
labels:
|
||||
app: kube-state-metrics
|
||||
spec:
|
||||
selector:
|
||||
app: kube-state-metrics
|
||||
ports:
|
||||
- name: http-metrics
|
||||
port: 8080
|
||||
targetPort: http-metrics
|
||||
- name: telemetry
|
||||
port: 8081
|
||||
targetPort: telemetry
|
||||
|
|
@@ -10,11 +10,17 @@ scrape_configs:
|
|||
- job_name: "node-exporter-sifaka"
|
||||
static_configs:
|
||||
- targets: ["nas.ops.eblu.me:9100"]
|
||||
metric_relabel_configs:
|
||||
- target_label: cluster
|
||||
replacement: indri
|
||||
|
||||
- job_name: "smartctl-sifaka"
|
||||
scrape_interval: 60s
|
||||
static_configs:
|
||||
- targets: ["nas.ops.eblu.me:9633"]
|
||||
metric_relabel_configs:
|
||||
- target_label: cluster
|
||||
replacement: indri
|
||||
|
||||
# CNPG PostgreSQL metrics (k8s internal)
|
||||
- job_name: "cnpg-postgres"
|
||||
|
|
@@ -22,21 +28,33 @@ scrape_configs:
|
|||
- targets: ["blumeops-pg-metrics-tailscale.databases.svc.cluster.local:9187"]
|
||||
labels:
|
||||
instance: "blumeops-pg"
|
||||
metric_relabel_configs:
|
||||
- target_label: cluster
|
||||
replacement: indri
|
||||
|
||||
# Prometheus self-monitoring
|
||||
- job_name: "prometheus"
|
||||
static_configs:
|
||||
- targets: ["localhost:9090"]
|
||||
metric_relabel_configs:
|
||||
- target_label: cluster
|
||||
replacement: indri
|
||||
|
||||
# Loki metrics
|
||||
- job_name: "loki"
|
||||
static_configs:
|
||||
- targets: ["loki.monitoring.svc.cluster.local:3100"]
|
||||
metric_relabel_configs:
|
||||
- target_label: cluster
|
||||
replacement: indri
|
||||
|
||||
# Kubernetes state metrics (pods, deployments, resource usage, etc.)
|
||||
- job_name: "kube-state-metrics"
|
||||
static_configs:
|
||||
- targets: ["kube-state-metrics.monitoring.svc.cluster.local:8080"]
|
||||
metric_relabel_configs:
|
||||
- target_label: cluster
|
||||
replacement: indri
|
||||
|
||||
# Frigate NVR metrics (via Caddy on indri — Frigate runs on ringtail)
|
||||
- job_name: "frigate"
|
||||
|
|
@@ -44,3 +62,6 @@ scrape_configs:
|
|||
static_configs:
|
||||
- targets: ["nvr.ops.eblu.me"]
|
||||
metrics_path: /api/metrics
|
||||
metric_relabel_configs:
|
||||
- target_label: cluster
|
||||
replacement: ringtail
|
||||
|
|
|
|||
|
|
@@ -0,0 +1 @@
|
|||
Add multi-cluster Kubernetes observability: deploy kube-state-metrics and Alloy on ringtail (k3s), add `cluster` label to all metrics/logs, replace single-cluster dashboards with multi-cluster Kubernetes dashboard and dedicated Ringtail dashboard with GPU monitoring.
|
||||
Loading…
Add table
Add a link
Reference in a new issue