Add multi-cluster observability with ringtail metrics and dashboards (#270)

## Summary

- Add `cluster` label (indri/ringtail) to all Prometheus scrape jobs, Alloy k8s metrics/logs, and Alloy host metrics/logs
- Deploy kube-state-metrics on ringtail's k3s cluster (ArgoCD app + manifests)
- Deploy Alloy on ringtail to collect pod metrics and logs, remote-writing to indri's Prometheus and Loki
- Replace single-cluster "Minikube Kubernetes" and "K8s Services Health" dashboards with:
  - **Kubernetes Clusters** dashboard — multi-cluster with `cluster` and `namespace` template variables
  - **Ringtail (k3s)** dashboard — dedicated ringtail view with GPU usage panels

## Deployment and Testing

1. Sync `apps` on indri ArgoCD to pick up new app definitions (`kube-state-metrics-ringtail`, `alloy-ringtail`)
2. Sync `prometheus` → verify `cluster` label on scraped metrics
3. Sync `alloy-k8s` → verify `cluster=indri` on remote-written metrics and logs
4. Run `mise run provision-indri -- --tags alloy` → verify `cluster=indri` on host Alloy metrics/logs
5. Sync `kube-state-metrics-ringtail` → verify pods running on ringtail
6. Sync `alloy-ringtail` → verify pods running, check Prometheus for `kube_pod_info{cluster="ringtail"}`
7. Sync `grafana-config` → verify dashboards appear, cluster variable populates both values
8. Check Loki for `{cluster="ringtail"}` logs from ringtail pods

## Notes

- Alloy on ringtail uses `insecure_skip_verify=true` for TLS to Prometheus/Loki (Tailscale-managed certs not in container trust store) — tighten later
- DNS resolution for `*.tail8d86e.ts.net` from ringtail pods depends on CoreDNS inheriting host's MagicDNS resolver; may need CoreDNS forwarding rules if pods can't resolve
- The old services dashboard (blackbox probes) is removed — those probes are still running in alloy-k8s and the data is still in Prometheus, just not in a dedicated dashboard

Reviewed-on: https://forge.ops.eblu.me/eblume/blumeops/pulls/270
2026-02-25 22:01:00 -08:00 · 2026-02-25 22:01:00 -08:00 · 03d71544ec
commit 03d71544ec
parent 2243f2e0a1
19 changed files with 910 additions and 217 deletions
--- a/argocd/manifests/alloy-ringtail/config.alloy
+++ b/argocd/manifests/alloy-ringtail/config.alloy
@ -0,0 +1,165 @@
+// Alloy ringtail configuration - collects host metrics, pod logs, and kube-state-metrics
+// Remote-writes metrics to indri Prometheus, logs to indri Loki
+
+// ============== HOST METRICS ==============
+
+// Linux host metrics, read through the /host/proc, /host/sys and /host/root paths
+prometheus.exporter.unix "system" {
+  rootfs_path = "/host/root"
+  procfs_path = "/host/proc"
+  sysfs_path  = "/host/sys"
+}
+
+// Pull samples from the unix exporter every 15s and hand them to the instance relabeler
+prometheus.scrape "system" {
+  scrape_interval = "15s"
+  targets         = prometheus.exporter.unix.system.targets
+  forward_to      = [prometheus.relabel.instance.receiver]
+}
+
+// Stamp every host sample with a fixed instance name before it is remote-written
+prometheus.relabel "instance" {
+  forward_to = [prometheus.remote_write.prometheus.receiver]
+
+  rule {
+    action       = "replace"
+    replacement  = "ringtail"
+    target_label = "instance"
+  }
+}
+
+// ============== KUBE-STATE-METRICS SCRAPE ==============
+
+prometheus.scrape "kube_state_metrics" {
+  // Single static target: the kube-state-metrics service address on port 8080
+  targets = [
+    {"__address__" = "kube-state-metrics.monitoring.svc.cluster.local:8080"},
+  ]
+  forward_to      = [prometheus.remote_write.prometheus.receiver]
+  scrape_interval = "15s"
+}
+
+// Remote-write every sample to the indri Prometheus, tagged with this cluster's name
+prometheus.remote_write "prometheus" {
+  endpoint {
+    url = "https://prometheus.tail8d86e.ts.net/api/v1/write"
+
+    // TODO: certificate verification is switched off for this endpoint; tighten later
+    tls_config {
+      insecure_skip_verify = true
+    }
+  }
+
+  external_labels = { cluster = "ringtail" }
+}
+
+// ============== K8S POD LOG DISCOVERY ==============
+
+// Discover all pods in the cluster (role "pod"; no namespace filter is configured here).
+// The resulting targets feed discovery.relabel.pods below.
+discovery.kubernetes "pods" {
+  role = "pod"
+}
+
+// Relabel discovered pod targets: filter by phase and copy Kubernetes metadata
+// into the plain labels (namespace, pod, container, app, node) attached to logs.
+discovery.relabel "pods" {
+  targets = discovery.kubernetes.pods.targets
+
+  // Keep only running pods (every other phase is dropped)
+  rule {
+    source_labels = ["__meta_kubernetes_pod_phase"]
+    regex         = "Pending|Succeeded|Failed|Unknown"
+    action        = "drop"
+  }
+
+  // Set namespace label
+  rule {
+    source_labels = ["__meta_kubernetes_namespace"]
+    target_label  = "namespace"
+  }
+
+  // Set pod name label
+  rule {
+    source_labels = ["__meta_kubernetes_pod_name"]
+    target_label  = "pod"
+  }
+
+  // Set container name label
+  rule {
+    source_labels = ["__meta_kubernetes_pod_container_name"]
+    target_label  = "container"
+  }
+
+  // Set app label from the pod's "app" label
+  rule {
+    source_labels = ["__meta_kubernetes_pod_label_app"]
+    target_label  = "app"
+  }
+
+  // Overwrite app with app.kubernetes.io/name whenever that label is non-empty.
+  // This rule runs after the one above, so app.kubernetes.io/name takes
+  // precedence when a pod carries both labels; the plain "app" label is only
+  // kept when app.kubernetes.io/name is absent.
+  rule {
+    source_labels = ["__meta_kubernetes_pod_label_app_kubernetes_io_name"]
+    target_label  = "app"
+    regex         = "(.+)"
+    action        = "replace"
+  }
+
+  // Set node name
+  rule {
+    source_labels = ["__meta_kubernetes_pod_node_name"]
+    target_label  = "node"
+  }
+
+  // Build the log path for the pod container.
+  // The default regex "(.*)" has a single capture group, so $1 already holds
+  // the joined "<pod_uid>/<container_name>" value; there is no $2 to expand.
+  // NOTE(review): loki.source.kubernetes presumably does not read __path__
+  // (it is a file-source convention) — confirm whether this rule is still needed.
+  rule {
+    source_labels = ["__meta_kubernetes_pod_uid", "__meta_kubernetes_pod_container_name"]
+    target_label  = "__path__"
+    separator     = "/"
+    replacement   = "/var/log/pods/*$1/*.log"
+  }
+}
+
+// Collect logs for the relabeled pod targets and pass them to the processing stage
+loki.source.kubernetes "pods" {
+  forward_to = [loki.process.pods.receiver]
+  targets    = discovery.relabel.pods.output
+}
+
+// Log pipeline: best-effort JSON extraction, label promotion, then cluster tagging
+loki.process "pods" {
+  forward_to = [loki.write.loki.receiver]
+
+  // Pull common fields out of JSON-formatted lines into the extracted data
+  stage.json {
+    expressions = {
+      caller  = "caller",
+      level   = "level",
+      message = "message",
+      msg     = "msg",
+      time    = "time",
+    }
+  }
+
+  // Remove the error labels so lines that are not JSON pass through untouched
+  stage.label_drop {
+    values = ["__error__", "__error_details__"]
+  }
+
+  // Promote selected extracted fields to labels (empty value = same-named field)
+  // NOTE(review): caller may be a high-cardinality label — confirm it is wanted
+  stage.labels {
+    values = {
+      caller = "",
+      level  = "",
+    }
+  }
+
+  // Tag every entry with the originating cluster for multi-cluster queries
+  stage.static_labels {
+    values = { cluster = "ringtail" }
+  }
+}
+
+// Push processed log entries to the indri Loki
+loki.write "loki" {
+  endpoint {
+    // TODO: certificate verification is switched off for this endpoint; tighten later
+    tls_config {
+      insecure_skip_verify = true
+    }
+
+    url = "https://loki.tail8d86e.ts.net/loki/api/v1/push"
+  }
+}
--- a/argocd/manifests/alloy-ringtail/daemonset.yaml
+++ b/argocd/manifests/alloy-ringtail/daemonset.yaml
@ -0,0 +1,86 @@
+# Alloy agent, one pod per node
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: alloy
+  namespace: alloy
+  labels: {app: alloy}
+spec:
+  selector:
+    matchLabels: {app: alloy}
+  template:
+    metadata:
+      labels: {app: alloy}
+    spec:
+      serviceAccountName: alloy
+      # Tolerate every taint
+      tolerations:
+        - operator: Exists
+      securityContext:
+        fsGroup: 473  # alloy user group
+      volumes:
+        # Alloy configuration file, served from the alloy-config ConfigMap
+        - name: config
+          configMap: {name: alloy-config}
+        # Scratch storage backing --storage.path
+        - name: data
+          emptyDir: {}
+        # Host paths exposed to the container
+        - name: varlog
+          hostPath: {path: /var/log}
+        - name: proc
+          hostPath: {path: /proc}
+        - name: sys
+          hostPath: {path: /sys}
+        - name: root
+          hostPath: {path: /}
+      containers:
+        - name: alloy
+          image: grafana/alloy
+          args:
+            - run
+            - --server.http.listen-addr=0.0.0.0:12345
+            - --storage.path=/var/lib/alloy/data
+            - /etc/alloy/config.alloy
+          env:
+            # Expose the node name to the process as HOSTNAME
+            - name: HOSTNAME
+              valueFrom:
+                fieldRef: {fieldPath: spec.nodeName}
+          ports:
+            - name: http
+              containerPort: 12345
+          resources:
+            limits: {cpu: 500m, memory: 512Mi}
+            requests: {cpu: 50m, memory: 128Mi}
+          securityContext:
+            readOnlyRootFilesystem: true
+            allowPrivilegeEscalation: false
+            capabilities:
+              drop: [ALL]
+          volumeMounts:
+            - name: config
+              mountPath: /etc/alloy
+            - name: data
+              mountPath: /var/lib/alloy/data
+            - name: varlog
+              mountPath: /var/log
+              readOnly: true
+            - name: proc
+              mountPath: /host/proc
+              readOnly: true
+            - name: sys
+              mountPath: /host/sys
+              readOnly: true
+            - name: root
+              mountPath: /host/root
+              readOnly: true
+              mountPropagation: HostToContainer
--- a/argocd/manifests/alloy-ringtail/kustomization.yaml
+++ b/argocd/manifests/alloy-ringtail/kustomization.yaml
@ -0,0 +1,18 @@
+# Kustomize entry point for the alloy-ringtail manifests
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+# Every resource below is placed in this namespace
+namespace: alloy
+
+# Build the alloy-config ConfigMap from the Alloy configuration file
+configMapGenerator:
+  - name: alloy-config
+    files: [config.alloy]
+
+# Pin the tag for the untagged grafana/alloy image reference
+images:
+  - name: grafana/alloy
+    newTag: v1.13.1
+
+resources:
+  - namespace.yaml
+  - rbac.yaml
+  - daemonset.yaml
--- a/argocd/manifests/alloy-ringtail/namespace.yaml
+++ b/argocd/manifests/alloy-ringtail/namespace.yaml
@ -0,0 +1,4 @@
+# Namespace named "alloy"; the other manifests in this directory set namespace: alloy
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: alloy
--- a/argocd/manifests/alloy-ringtail/rbac.yaml
+++ b/argocd/manifests/alloy-ringtail/rbac.yaml
@ -0,0 +1,35 @@
+# Identity the Alloy pods run as
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: alloy
+  namespace: alloy
+---
+# Cluster-wide read-only permissions for Alloy
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: alloy
+rules:
+  # Core API objects: read and watch
+  - apiGroups:
+      - ""
+    resources:
+      - nodes
+      - nodes/proxy
+      - nodes/metrics
+      - services
+      - endpoints
+      - pods
+      - pods/log
+      - namespaces
+    verbs:
+      - get
+      - list
+      - watch
+  # ConfigMaps: get only
+  - apiGroups:
+      - ""
+    resources:
+      - configmaps
+    verbs:
+      - get
+  # EndpointSlices: read and watch
+  - apiGroups:
+      - discovery.k8s.io
+    resources:
+      - endpointslices
+    verbs:
+      - get
+      - list
+      - watch
+  # Non-resource metrics endpoints
+  - nonResourceURLs:
+      - /metrics
+      - /metrics/cadvisor
+    verbs:
+      - get
+---
+# Grant the ClusterRole above to the alloy ServiceAccount
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: alloy
+subjects:
+  - kind: ServiceAccount
+    name: alloy
+    namespace: alloy
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: alloy