Add multi-cluster observability with ringtail metrics and dashboards (#270)
## Summary
- Add `cluster` label (indri/ringtail) to all Prometheus scrape jobs, Alloy k8s metrics/logs, and Alloy host metrics/logs
- Deploy kube-state-metrics on ringtail's k3s cluster (ArgoCD app + manifests)
- Deploy Alloy on ringtail to collect pod metrics and logs, remote-writing to indri's Prometheus and Loki
- Replace single-cluster "Minikube Kubernetes" and "K8s Services Health" dashboards with:
  - **Kubernetes Clusters** dashboard — multi-cluster with `cluster` and `namespace` template variables
  - **Ringtail (k3s)** dashboard — dedicated ringtail view with GPU usage panels
## Deployment and Testing
1. Sync `apps` on indri ArgoCD to pick up new app definitions (`kube-state-metrics-ringtail`, `alloy-ringtail`)
2. Sync `prometheus` → verify `cluster` label on scraped metrics
3. Sync `alloy-k8s` → verify `cluster=indri` on remote-written metrics and logs
4. Run `mise run provision-indri -- --tags alloy` → verify `cluster=indri` on host Alloy metrics/logs
5. Sync `kube-state-metrics-ringtail` → verify pods running on ringtail
6. Sync `alloy-ringtail` → verify pods running, check Prometheus for `kube_pod_info{cluster="ringtail"}`
7. Sync `grafana-config` → verify dashboards appear, cluster variable populates both values
8. Check Loki for `{cluster="ringtail"}` logs from ringtail pods
## Notes
- Alloy on ringtail uses `insecure_skip_verify=true` for TLS to Prometheus/Loki (Tailscale-managed certs not in container trust store) — tighten later
- DNS resolution for `*.tail8d86e.ts.net` from ringtail pods depends on CoreDNS inheriting host's MagicDNS resolver; may need CoreDNS forwarding rules if pods can't resolve
- The old services dashboard (blackbox probes) is removed — those probes are still running in alloy-k8s and the data is still in Prometheus, just not in a dedicated dashboard
Reviewed-on: https://forge.ops.eblu.me/eblume/blumeops/pulls/270
This commit is contained in:
parent
2243f2e0a1
commit
03d71544ec
19 changed files with 910 additions and 217 deletions
165
argocd/manifests/alloy-ringtail/config.alloy
Normal file
165
argocd/manifests/alloy-ringtail/config.alloy
Normal file
|
|
@ -0,0 +1,165 @@
|
|||
// Alloy ringtail configuration - collects host metrics, pod logs, and kube-state-metrics
|
||||
// Remote-writes metrics to indri Prometheus, logs to indri Loki
|
||||
|
||||
// ============== HOST METRICS ==============
|
||||
|
||||
// Node-level metrics exporter for the Linux host. It reads the host's
// filesystems through the /host/* paths below rather than the container's own.
prometheus.exporter.unix "system" {
  rootfs_path = "/host/root"
  procfs_path = "/host/proc"
  sysfs_path  = "/host/sys"
}
|
||||
|
||||
// Scrape the unix exporter every 15s and pass the samples to the relabel
// component that sets the instance label.
prometheus.scrape "system" {
  scrape_interval = "15s"
  targets         = prometheus.exporter.unix.system.targets
  forward_to      = [prometheus.relabel.instance.receiver]
}
|
||||
|
||||
// Set instance="ringtail" on every host metric before it is remote-written.
prometheus.relabel "instance" {
  forward_to = [prometheus.remote_write.prometheus.receiver]

  // No source_labels: the replacement is written to the target label as-is.
  rule {
    replacement  = "ringtail"
    target_label = "instance"
  }
}
|
||||
|
||||
// ============== KUBE-STATE-METRICS SCRAPE ==============
|
||||
|
||||
// Scrape the in-cluster kube-state-metrics service (monitoring namespace,
// port 8080) every 15s and send the samples straight to remote-write.
prometheus.scrape "kube_state_metrics" {
  scrape_interval = "15s"
  forward_to      = [prometheus.remote_write.prometheus.receiver]
  targets         = [{"__address__" = "kube-state-metrics.monitoring.svc.cluster.local:8080"}]
}
|
||||
|
||||
// Remote-write every collected metric to the Prometheus instance on indri.
prometheus.remote_write "prometheus" {
  // Attached to every series so multi-cluster queries can tell sources apart.
  external_labels = { cluster = "ringtail" }

  endpoint {
    url = "https://prometheus.tail8d86e.ts.net/api/v1/write"

    // Server certificate verification is turned off for this endpoint.
    tls_config {
      insecure_skip_verify = true
    }
  }
}
|
||||
|
||||
// ============== K8S POD LOG DISCOVERY ==============
|
||||
|
||||
// Pod discovery for the whole cluster. No namespace or node selector is set,
// so each Alloy instance sees every pod.
// NOTE(review): fine on a single-node cluster; with more nodes each DaemonSet
// pod would presumably ship the same logs — confirm before adding nodes.
discovery.kubernetes "pods" {
  role = "pod"
}
|
||||
|
||||
// Turn discovered pod targets into log targets with a small, stable label set:
// namespace, pod, container, app and node.
discovery.relabel "pods" {
  targets = discovery.kubernetes.pods.targets

  // Keep only running pods by dropping every other phase.
  rule {
    source_labels = ["__meta_kubernetes_pod_phase"]
    regex         = "Pending|Succeeded|Failed|Unknown"
    action        = "drop"
  }

  // Set namespace label
  rule {
    source_labels = ["__meta_kubernetes_namespace"]
    target_label  = "namespace"
  }

  // Set pod name label
  rule {
    source_labels = ["__meta_kubernetes_pod_name"]
    target_label  = "pod"
  }

  // Set container name label
  rule {
    source_labels = ["__meta_kubernetes_pod_container_name"]
    target_label  = "container"
  }

  // app label, step 1: seed it from app.kubernetes.io/name (the fallback value).
  rule {
    source_labels = ["__meta_kubernetes_pod_label_app_kubernetes_io_name"]
    target_label  = "app"
  }

  // app label, step 2: the plain `app` pod label wins whenever it is non-empty.
  // The (.+) regex makes this rule a no-op for pods without an `app` label, so
  // the value from step 1 survives only as a fallback.
  rule {
    source_labels = ["__meta_kubernetes_pod_label_app"]
    target_label  = "app"
    regex         = "(.+)"
    action        = "replace"
  }

  // Set node name
  rule {
    source_labels = ["__meta_kubernetes_pod_node_name"]
    target_label  = "node"
  }

  // Build the on-disk log path for the pod container. The two source labels
  // are joined with "/" and matched by the default regex, which has a single
  // capture group, so $1 is already "<pod uid>/<container>".
  // NOTE(review): __path__ is consumed by file-based log sources; the
  // loki.source.kubernetes component used here presumably ignores it — confirm.
  rule {
    source_labels = ["__meta_kubernetes_pod_uid", "__meta_kubernetes_pod_container_name"]
    separator     = "/"
    target_label  = "__path__"
    replacement   = "/var/log/pods/*$1/*.log"
  }
}
|
||||
|
||||
// Stream container logs for every relabeled pod target into the processing
// pipeline.
loki.source.kubernetes "pods" {
  forward_to = [loki.process.pods.receiver]
  targets    = discovery.relabel.pods.output
}
|
||||
|
||||
// Log pipeline: best-effort JSON field extraction, promotion of selected
// fields to labels, then a static cluster label. Stages run top to bottom.
loki.process "pods" {
  forward_to = [loki.write.loki.receiver]

  // Pull well-known fields out of JSON-formatted lines.
  stage.json {
    expressions = {
      caller  = "caller",
      level   = "level",
      message = "message",
      msg     = "msg",
      time    = "time",
    }
  }

  // Lines that are not JSON are expected; remove the parser's error labels
  // instead of shipping them.
  stage.label_drop {
    values = ["__error__", "__error_details__"]
  }

  // Promote extracted fields to labels; an empty value reuses the key as the
  // name of the extracted field.
  // NOTE(review): caller may be high-cardinality as a Loki label — confirm.
  stage.labels {
    values = {
      caller = "",
      level  = "",
    }
  }

  // Tag every log line with its source cluster for multi-cluster queries.
  stage.static_labels {
    values = { cluster = "ringtail" }
  }
}
|
||||
|
||||
// Push processed logs to the Loki instance on indri.
loki.write "loki" {
  endpoint {
    url = "https://loki.tail8d86e.ts.net/loki/api/v1/push"

    // Server certificate verification is turned off for this endpoint.
    tls_config {
      insecure_skip_verify = true
    }
  }
}
|
||||
86
argocd/manifests/alloy-ringtail/daemonset.yaml
Normal file
86
argocd/manifests/alloy-ringtail/daemonset.yaml
Normal file
|
|
@ -0,0 +1,86 @@
|
|||
# Alloy agent for ringtail: one pod per node, with the host's /proc, /sys, /
# and /var/log mounted read-only.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: alloy
  namespace: alloy
  labels:
    app: alloy
spec:
  selector:
    matchLabels:
      app: alloy
  template:
    metadata:
      labels:
        app: alloy
    spec:
      serviceAccountName: alloy
      securityContext:
        fsGroup: 473 # alloy user group
      containers:
        - name: alloy
          # Untagged on purpose: the tag is set in kustomization.yaml (images:).
          image: grafana/alloy
          args:
            - run
            - --server.http.listen-addr=0.0.0.0:12345
            - --storage.path=/var/lib/alloy/data
            - /etc/alloy/config.alloy
          ports:
            - containerPort: 12345
              name: http
          env:
            # Expose the node name to the process as HOSTNAME.
            - name: HOSTNAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
          resources:
            requests:
              cpu: 50m
              memory: 128Mi
            limits:
              cpu: 500m
              memory: 512Mi
          volumeMounts:
            - name: config
              mountPath: /etc/alloy
            - name: varlog
              mountPath: /var/log
              readOnly: true
            # Writable scratch space; the root filesystem itself is read-only.
            - name: data
              mountPath: /var/lib/alloy/data
            - name: proc
              mountPath: /host/proc
              readOnly: true
            - name: sys
              mountPath: /host/sys
              readOnly: true
            - name: root
              mountPath: /host/root
              mountPropagation: HostToContainer
              readOnly: true
          securityContext:
            allowPrivilegeEscalation: false
            readOnlyRootFilesystem: true
            capabilities:
              drop:
                - ALL
      # Tolerate every taint so the agent is scheduled on all nodes.
      tolerations:
        - operator: Exists
      volumes:
        - name: config
          configMap:
            name: alloy-config
        - name: varlog
          hostPath:
            path: /var/log
        - name: data
          emptyDir: {}
        - name: proc
          hostPath:
            path: /proc
        - name: sys
          hostPath:
            path: /sys
        - name: root
          hostPath:
            path: /
|
||||
18
argocd/manifests/alloy-ringtail/kustomization.yaml
Normal file
18
argocd/manifests/alloy-ringtail/kustomization.yaml
Normal file
|
|
@ -0,0 +1,18 @@
|
|||
# Kustomize entry point for the ringtail Alloy deployment.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

namespace: alloy

resources:
  - namespace.yaml
  - rbac.yaml
  - daemonset.yaml

# Pins the tag for the untagged grafana/alloy image used in daemonset.yaml.
images:
  - name: grafana/alloy
    newTag: v1.13.1

# Builds the alloy-config ConfigMap from config.alloy.
configMapGenerator:
  - name: alloy-config
    files:
      - config.alloy
|
||||
4
argocd/manifests/alloy-ringtail/namespace.yaml
Normal file
4
argocd/manifests/alloy-ringtail/namespace.yaml
Normal file
|
|
@ -0,0 +1,4 @@
|
|||
# Namespace that holds the Alloy DaemonSet and its ServiceAccount.
apiVersion: v1
kind: Namespace
metadata:
  name: alloy
|
||||
35
argocd/manifests/alloy-ringtail/rbac.yaml
Normal file
35
argocd/manifests/alloy-ringtail/rbac.yaml
Normal file
|
|
@ -0,0 +1,35 @@
|
|||
# Identity the Alloy pods run as.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: alloy
  namespace: alloy
---
# Cluster-wide read-only access for discovery, log tailing and metrics.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: alloy
rules:
  - apiGroups: [""]
    resources: ["nodes", "nodes/proxy", "nodes/metrics", "services", "endpoints", "pods", "pods/log", "namespaces"]
    verbs: ["get", "list", "watch"]
  - apiGroups: [""]
    resources: ["configmaps"]
    verbs: ["get"]
  - apiGroups: ["discovery.k8s.io"]
    resources: ["endpointslices"]
    verbs: ["get", "list", "watch"]
  - nonResourceURLs: ["/metrics", "/metrics/cadvisor"]
    verbs: ["get"]
---
# Grants the ClusterRole above to the alloy ServiceAccount.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: alloy
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: alloy
subjects:
  - kind: ServiceAccount
    name: alloy
    namespace: alloy
|
||||
Loading…
Add table
Add a link
Reference in a new issue