## Summary
- Add `cluster` label (indri/ringtail) to all Prometheus scrape jobs, Alloy k8s metrics/logs, and Alloy host metrics/logs
- Deploy kube-state-metrics on ringtail's k3s cluster (ArgoCD app + manifests)
- Deploy Alloy on ringtail to collect pod metrics and logs, remote-writing to indri's Prometheus and Loki
- Replace the single-cluster "Minikube Kubernetes" and "K8s Services Health" dashboards with:
  - **Kubernetes Clusters** dashboard — multi-cluster, with `cluster` and `namespace` template variables
  - **Ringtail (k3s)** dashboard — dedicated ringtail view with GPU usage panels
## Deployment and Testing
1. Sync `apps` on indri ArgoCD to pick up new app definitions (`kube-state-metrics-ringtail`, `alloy-ringtail`)
2. Sync `prometheus` → verify `cluster` label on scraped metrics
3. Sync `alloy-k8s` → verify `cluster=indri` on remote-written metrics and logs
4. Run `mise run provision-indri -- --tags alloy` → verify `cluster=indri` on host Alloy metrics/logs
5. Sync `kube-state-metrics-ringtail` → verify pods running on ringtail
6. Sync `alloy-ringtail` → verify pods running, check Prometheus for `kube_pod_info{cluster="ringtail"}`
7. Sync `grafana-config` → verify dashboards appear, cluster variable populates both values
8. Check Loki for `{cluster="ringtail"}` logs from ringtail pods
## Notes
- Alloy on ringtail uses `insecure_skip_verify=true` for TLS to Prometheus/Loki (Tailscale-managed certs not in container trust store) — tighten later
- DNS resolution for `*.tail8d86e.ts.net` from ringtail pods depends on CoreDNS inheriting host's MagicDNS resolver; may need CoreDNS forwarding rules if pods can't resolve
- The old services dashboard (blackbox probes) is removed — those probes are still running in alloy-k8s and the data is still in Prometheus, just not in a dedicated dashboard
Reviewed-on: https://forge.ops.eblu.me/eblume/blumeops/pulls/270
176 lines
4.2 KiB
Text
176 lines
4.2 KiB
Text
// Alloy k8s configuration - collects pod logs from all namespaces and runs
// HTTP health probes against in-cluster services, tagging both with cluster="indri"
|
|
|
|
// ============== K8S POD LOG DISCOVERY ==============
|
|
|
|
// Pod discovery: emits one target per pod container found through the
// Kubernetes API. No namespace selector is set, so all namespaces are covered.
discovery.kubernetes "pods" {
  role = "pod"
}
|
|
|
|
// Relabel discovered pod targets: drop pods that are not running and copy
// selected Kubernetes metadata into plain labels (namespace, pod, container,
// app, node) that end up on each log stream.
discovery.relabel "pods" {
  targets = discovery.kubernetes.pods.targets

  // Keep only running pods by dropping every other phase.
  rule {
    source_labels = ["__meta_kubernetes_pod_phase"]
    regex         = "Pending|Succeeded|Failed|Unknown"
    action        = "drop"
  }

  // Set namespace label
  rule {
    source_labels = ["__meta_kubernetes_namespace"]
    target_label  = "namespace"
  }

  // Set pod name label
  rule {
    source_labels = ["__meta_kubernetes_pod_name"]
    target_label  = "pod"
  }

  // Set container name label
  rule {
    source_labels = ["__meta_kubernetes_pod_container_name"]
    target_label  = "container"
  }

  // Set app label from the pod's `app` label (empty when the pod has none).
  rule {
    source_labels = ["__meta_kubernetes_pod_label_app"]
    target_label  = "app"
  }

  // Fallback: use app.kubernetes.io/name ONLY if no app label was set above.
  // The two source values are joined as "<app>;<name>". Relabel regexes are
  // fully anchored, so ";(.+)" matches only when <app> is empty and <name> is
  // not. Previously this rule matched on <name> alone and therefore overwrote
  // an existing `app` label instead of acting as a fallback.
  rule {
    source_labels = ["app", "__meta_kubernetes_pod_label_app_kubernetes_io_name"]
    separator     = ";"
    regex         = ";(.+)"
    target_label  = "app"
    action        = "replace"
  }

  // Set node name
  rule {
    source_labels = ["__meta_kubernetes_pod_node_name"]
    target_label  = "node"
  }

  // Build the log path for the pod container.
  // The source values are joined with "/" into "<uid>/<container>" and the
  // default regex "(.*)" captures that whole string as $1. There is no second
  // capture group, so the previous "$1/$2" replacement produced a stray empty
  // path segment ("<uid>/<container>//*.log").
  // NOTE(review): loki.source.kubernetes tails logs through the Kubernetes API,
  // so __path__ is probably unused by this pipeline — confirm before removing.
  rule {
    source_labels = ["__meta_kubernetes_pod_uid", "__meta_kubernetes_pod_container_name"]
    target_label  = "__path__"
    separator     = "/"
    replacement   = "/var/log/pods/*$1/*.log"
  }
}
|
|
|
|
// Log tailing: reads container logs for every relabeled pod target and hands
// each entry to the processing pipeline below.
loki.source.kubernetes "pods" {
  forward_to = [loki.process.pods.receiver]
  targets    = discovery.relabel.pods.output
}
|
|
|
|
// Log processing pipeline. Stages run in the order written: filter noise,
// attempt JSON extraction, promote selected fields to labels, then stamp the
// cluster name. The result is forwarded to the Loki writer.
loki.process "pods" {
  forward_to = [loki.write.loki.receiver]

  // Filter out the repetitive deprecation warning logged by the minikube
  // storage-provisioner.
  // See: https://github.com/kubernetes/minikube/issues/21009
  stage.drop {
    source     = ""
    expression = "v1 Endpoints is deprecated"
  }

  // Attempt to parse each line as JSON (structured application logs).
  // Both "msg" (common) and "message" (zot) are extracted. Only level, caller
  // and repository are promoted to labels further down; msg, message and time
  // stay in the extracted data.
  stage.json {
    expressions = {
      caller     = "caller",
      level      = "level",
      message    = "message",
      msg        = "msg",
      repository = "repository",
      time       = "time",
    }
  }

  // Remove JSON parsing error labels; plain-text lines are expected and simply
  // carry no extracted fields.
  stage.label_drop {
    values = ["__error__", "__error_details__"]
  }

  // Promote extracted fields to stream labels. An empty value takes the
  // extracted entry with the same name as the label.
  stage.labels {
    values = {
      caller     = "",
      level      = "",
      repository = "",
    }
  }

  // Stamp every entry with the originating cluster so multi-cluster queries
  // can tell the sources apart.
  stage.static_labels {
    values = {
      cluster = "indri",
    }
  }
}
|
|
|
|
// Loki sink: pushes processed log entries to the in-cluster Loki service.
loki.write "loki" {
  endpoint {
    url = "http://loki.monitoring.svc.cluster.local:3100/loki/api/v1/push"
  }
}
|
|
|
|
// ============== SERVICE HEALTH PROBES ==============
|
|
|
|
// HTTP health probes for in-cluster services, run by the embedded blackbox
// exporter. Every target uses the single "http_2xx" module defined in the
// inline config (HTTP prober, 5s timeout).
prometheus.exporter.blackbox "services" {
  config = "{ modules: { http_2xx: { prober: http, timeout: 5s } } }"

  target {
    name    = "miniflux"
    module  = "http_2xx"
    address = "http://miniflux.miniflux.svc.cluster.local:8080/healthcheck"
  }

  target {
    name    = "kiwix"
    module  = "http_2xx"
    address = "http://kiwix.kiwix.svc.cluster.local:80/"
  }

  target {
    name    = "transmission"
    module  = "http_2xx"
    address = "http://transmission.torrent.svc.cluster.local:9091/transmission/web/"
  }

  target {
    name    = "devpi"
    module  = "http_2xx"
    address = "http://devpi.devpi.svc.cluster.local:3141/+api"
  }

  target {
    name    = "argocd"
    module  = "http_2xx"
    address = "http://argocd-server.argocd.svc.cluster.local:80/healthz"
  }
}
|
|
|
|
// Collect the blackbox probe results every 30 seconds and forward them to the
// remote_write component.
prometheus.scrape "blackbox" {
  targets         = prometheus.exporter.blackbox.services.targets
  forward_to      = [prometheus.remote_write.prometheus.receiver]
  scrape_interval = "30s"
}
|
|
|
|
// Prometheus sink: remote-writes collected metrics to the in-cluster
// Prometheus, with cluster="indri" set as an external label.
prometheus.remote_write "prometheus" {
  endpoint {
    url = "http://prometheus.monitoring.svc.cluster.local:9090/api/v1/write"
  }

  external_labels = {
    cluster = "indri",
  }
}
|