blumeops/argocd/manifests/alloy-k8s/config.alloy

// Alloy k8s configuration - collects pod logs from all namespaces

// ============== K8S POD LOG DISCOVERY ==============

// Enumerate Kubernetes pods as discovery targets. No namespace filter is
// configured on this component.
discovery.kubernetes "pods" {
  role = "pod"
}

// Relabel discovered pod targets: drop pods that are not running, copy
// Kubernetes metadata into log labels (namespace, pod, container, app, node),
// and build a __path__ glob for each container's log files.
discovery.relabel "pods" {
  targets = discovery.kubernetes.pods.targets

  // Keep only running pods by dropping every other phase
  rule {
    source_labels = ["__meta_kubernetes_pod_phase"]
    regex         = "Pending|Succeeded|Failed|Unknown"
    action        = "drop"
  }

  // Set namespace label
  rule {
    source_labels = ["__meta_kubernetes_namespace"]
    target_label  = "namespace"
  }

  // Set pod name label
  rule {
    source_labels = ["__meta_kubernetes_pod_name"]
    target_label  = "pod"
  }

  // Set container name label
  rule {
    source_labels = ["__meta_kubernetes_pod_container_name"]
    target_label  = "container"
  }

  // Fallback value for the app label: app.kubernetes.io/name.
  // This rule must run BEFORE the `app` pod-label rule below so that the
  // plain `app` label takes precedence when a pod carries both labels.
  rule {
    source_labels = ["__meta_kubernetes_pod_label_app_kubernetes_io_name"]
    target_label  = "app"
  }

  // Preferred value for the app label: the pod's own `app` label.
  // "(.+)" only matches a non-empty value, so pods without an `app` label
  // keep the app.kubernetes.io/name fallback set above.
  rule {
    source_labels = ["__meta_kubernetes_pod_label_app"]
    target_label  = "app"
    regex         = "(.+)"
    action        = "replace"
  }

  // Set node name
  rule {
    source_labels = ["__meta_kubernetes_pod_node_name"]
    target_label  = "node"
  }

  // Build the log path glob for the pod container.
  // The two source labels are joined with "/" and, because no regex is set,
  // matched by the default single-group regex, so $1 already expands to
  // "<pod uid>/<container name>". There is no second capture group; a "$2"
  // reference would expand to an empty string and leave an empty path segment.
  // NOTE(review): loki.source.kubernetes is documented as tailing logs via the
  // Kubernetes API rather than from files, so __path__ is presumably only
  // relevant to file-based tailing - confirm before relying on or removing it.
  rule {
    source_labels = ["__meta_kubernetes_pod_uid", "__meta_kubernetes_pod_container_name"]
    target_label  = "__path__"
    separator     = "/"
    replacement   = "/var/log/pods/*$1/*.log"
  }
}

// Collect logs for the relabeled pod targets and hand every entry to the
// "pods" processing pipeline.
loki.source.kubernetes "pods" {
  forward_to = [loki.process.pods.receiver]
  targets    = discovery.relabel.pods.output
}

// Log pipeline for pod logs. Stages run in the order written:
//   1. drop a known-noisy line,
//   2. attempt JSON field extraction,
//   3. remove parse-error labels,
//   4. promote selected extracted fields to labels,
//   5. stamp the cluster name.
// The result is forwarded to the Loki writer.
loki.process "pods" {
  forward_to = [loki.write.loki.receiver]

  // 1. Discard the deprecation warning emitted by the minikube
  //    storage-provisioner (empty source = match against the log line).
  //    Upstream issue: https://github.com/kubernetes/minikube/issues/21009
  stage.drop {
    expression = "v1 Endpoints is deprecated"
    source     = ""
  }

  // 2. Pull well-known fields out of structured (JSON) log lines.
  //    Both "msg" (common) and "message" (used by zot) are extracted.
  stage.json {
    expressions = {
      caller     = "caller",
      level      = "level",
      message    = "message",
      msg        = "msg",
      repository = "repository",
      time       = "time",
    }
  }

  // 3. Plain-text lines are expected and fine; strip the labels that record
  //    a JSON parse failure so they do not end up on the stream.
  stage.label_drop {
    values = ["__error__", "__error_details__"]
  }

  // 4. Turn a subset of the extracted fields into stream labels.
  //    An empty value means "use the extracted field with the same name".
  stage.labels {
    values = {
      caller     = "",
      level      = "",
      repository = "",
    }
  }

  // 5. Identify which cluster the logs came from (multi-cluster setup).
  stage.static_labels {
    values = { cluster = "indri" }
  }
}

// Ship processed log entries to the Loki push API of the in-cluster
// monitoring service.
loki.write "loki" {
  endpoint {
    url = "http://loki.monitoring.svc.cluster.local:3100/loki/api/v1/push"
  }
}

// ============== SERVICE HEALTH PROBES ==============

// HTTP health probes for in-cluster services, blackbox-exporter style.
// A single module, http_2xx (HTTP prober, 5s timeout), is defined inline and
// used by every target below.
prometheus.exporter.blackbox "services" {
  config = "{ modules: { http_2xx: { prober: http, timeout: 5s } } }"

  // Miniflux health endpoint
  target {
    name    = "miniflux"
    module  = "http_2xx"
    address = "http://miniflux.miniflux.svc.cluster.local:8080/healthcheck"
  }

  // Kiwix root page
  target {
    name    = "kiwix"
    module  = "http_2xx"
    address = "http://kiwix.kiwix.svc.cluster.local:80/"
  }

  // Transmission web UI
  target {
    name    = "transmission"
    module  = "http_2xx"
    address = "http://transmission.torrent.svc.cluster.local:9091/transmission/web/"
  }

  // devpi API endpoint
  target {
    name    = "devpi"
    module  = "http_2xx"
    address = "http://devpi.devpi.svc.cluster.local:3141/+api"
  }

  // Argo CD server health endpoint
  target {
    name    = "argocd"
    module  = "http_2xx"
    address = "http://argocd-server.argocd.svc.cluster.local:80/healthz"
  }
}

// Scrape the blackbox exporter's probe targets every 30 seconds and forward
// the resulting samples to the remote-write component.
prometheus.scrape "blackbox" {
  scrape_interval = "30s"
  targets         = prometheus.exporter.blackbox.services.targets
  forward_to      = [prometheus.remote_write.prometheus.receiver]
}

// Remote-write collected metrics to the in-cluster Prometheus, tagging every
// series with the cluster name.
prometheus.remote_write "prometheus" {
  endpoint {
    url = "http://prometheus.monitoring.svc.cluster.local:9090/api/v1/write"
  }

  external_labels = { cluster = "indri" }
}
Add kustomize images: and configMapGenerator: across services (#264) ## Summary - Move hardcoded image tags to kustomization.yaml `images:` transformer across 22 services — image names in manifests become version-agnostic templates, with tags centralized in one place per service - Replace hand-written ConfigMap manifests with `configMapGenerator:` in 12 services — config data extracted to standalone files, generated ConfigMaps include content hashes that trigger automatic pod rollouts on changes - Create new `kustomization.yaml` for forgejo-runner and nvidia-device-plugin (switches ArgoCD from directory mode to kustomize mode, rendered output identical) ### Services modified Images only (8): cv, devpi, docs, kube-state-metrics, miniflux, navidrome, teslamate, torrent Images + configMapGenerator (10): alloy-k8s, forgejo-runner, frigate, grafana, homepage, kiwix, loki, mosquitto, ntfy, prometheus Images only, no configMapGenerator (4): authentik (skip blueprints — special YAML tags), tailscale-operator-base (Deployment only, CRD image fields left as-is) Skipped entirely (6): argocd (remote upstream), databases (no image fields), external-secrets, grafana-config (cross-kustomization dashboards), immich (Helm-managed), 1password-connect/cloudnative-pg (no kustomization.yaml) ### What changes at deploy time - images: — no functional diff, `kustomize build` produces identical output with tags - configMapGenerator: — ConfigMap names gain hash suffixes (e.g., `prometheus-config` → `prometheus-config-6f42fhctcb`) and all Deployment/StatefulSet/DaemonSet references are updated automatically. 
Pods will restart once per service on first sync due to the name change ## Test plan - [x] `kubectl kustomize` builds all 30 service directories successfully - [x] Image tags verified in rendered output for all modified services - [x] ConfigMap hash suffixes verified in rendered output - [x] ConfigMap references in Deployments/StatefulSets confirmed to use hashed names - [x] All pre-commit hooks pass (yamllint, shellcheck, prettier, etc.) - [ ] `argocd app diff` each service to confirm only expected ConfigMap name changes - [ ] Deploy from branch starting with a low-risk service (e.g., mosquitto) Reviewed-on: https://forge.ops.eblu.me/eblume/blumeops/pulls/264 2026-02-24 14:25:19 -08:00			`// Alloy k8s configuration - collects pod logs from all namespaces`

			`// ============== K8S POD LOG DISCOVERY ==============`

			`// Discover all pods in the cluster`
			`discovery.kubernetes "pods" {`
			`role = "pod"`
			`}`

			`// Relabel to extract useful metadata`
			`discovery.relabel "pods" {`
			`targets = discovery.kubernetes.pods.targets`

			`// Keep only running pods`
			`rule {`
			`source_labels = ["__meta_kubernetes_pod_phase"]`
			`regex = "Pending\|Succeeded\|Failed\|Unknown"`
			`action = "drop"`
			`}`

			`// Set namespace label`
			`rule {`
			`source_labels = ["__meta_kubernetes_namespace"]`
			`target_label = "namespace"`
			`}`

			`// Set pod name label`
			`rule {`
			`source_labels = ["__meta_kubernetes_pod_name"]`
			`target_label = "pod"`
			`}`

			`// Set container name label`
			`rule {`
			`source_labels = ["__meta_kubernetes_pod_container_name"]`
			`target_label = "container"`
			`}`

			`// Set app label from pod labels`
			`rule {`
			`source_labels = ["__meta_kubernetes_pod_label_app"]`
			`target_label = "app"`
			`}`

			`// Fallback: use app.kubernetes.io/name if no app label`
			`rule {`
			`source_labels = ["__meta_kubernetes_pod_label_app_kubernetes_io_name"]`
			`target_label = "app"`
			`regex = "(.+)"`
			`action = "replace"`
			`}`

			`// Set node name`
			`rule {`
			`source_labels = ["__meta_kubernetes_pod_node_name"]`
			`target_label = "node"`
			`}`

			`// Build the log path for the pod container`
			`rule {`
			`source_labels = ["__meta_kubernetes_pod_uid", "__meta_kubernetes_pod_container_name"]`
			`target_label = "__path__"`
			`separator = "/"`
			`replacement = "/var/log/pods/$1/$2/.log"`
			`}`
			`}`

			`// Tail pod logs`
			`loki.source.kubernetes "pods" {`
			`targets = discovery.relabel.pods.output`
			`forward_to = [loki.process.pods.receiver]`
			`}`

			`// Process logs - parse JSON if present, add labels`
			`loki.process "pods" {`
			`forward_to = [loki.write.loki.receiver]`

			`// Drop noisy deprecation warning from minikube storage-provisioner`
			`// See: https://github.com/kubernetes/minikube/issues/21009`
			`stage.drop {`
			`source = ""`
			`expression = "v1 Endpoints is deprecated"`
			`}`

			`// Try to parse JSON logs (e.g., structured app logs)`
			`// Handle both "msg" (common) and "message" (zot) field names`
			`stage.json {`
			`expressions = {`
			`level = "level",`
			`msg = "msg",`
			`message = "message",`
			`time = "time",`
			`caller = "caller",`
			`repository = "repository",`
			`}`
			`}`

			`// Drop JSON parsing error labels (non-JSON logs are fine, just won't have extracted fields)`
			`stage.label_drop {`
			`values = ["__error__", "__error_details__"]`
			`}`

			`// Extract labels from parsed JSON data`
			`stage.labels {`
			`values = {`
			`level = "",`
			`caller = "",`
			`repository = "",`
			`}`
			`}`
Add multi-cluster observability with ringtail metrics and dashboards (#270) ## Summary - Add `cluster` label (indri/ringtail) to all Prometheus scrape jobs, Alloy k8s metrics/logs, and Alloy host metrics/logs - Deploy kube-state-metrics on ringtail's k3s cluster (ArgoCD app + manifests) - Deploy Alloy on ringtail to collect pod metrics and logs, remote-writing to indri's Prometheus and Loki - Replace single-cluster "Minikube Kubernetes" and "K8s Services Health" dashboards with: - Kubernetes Clusters dashboard — multi-cluster with `cluster` and `namespace` template variables - Ringtail (k3s) dashboard — dedicated ringtail view with GPU usage panels ## Deployment and Testing 1. Sync `apps` on indri ArgoCD to pick up new app definitions (`kube-state-metrics-ringtail`, `alloy-ringtail`) 2. Sync `prometheus` → verify `cluster` label on scraped metrics 3. Sync `alloy-k8s` → verify `cluster=indri` on remote-written metrics and logs 4. Run `mise run provision-indri -- --tags alloy` → verify `cluster=indri` on host Alloy metrics/logs 5. Sync `kube-state-metrics-ringtail` → verify pods running on ringtail 6. Sync `alloy-ringtail` → verify pods running, check Prometheus for `kube_pod_info{cluster="ringtail"}` 7. Sync `grafana-config` → verify dashboards appear, cluster variable populates both values 8. Check Loki for `{cluster="ringtail"}` logs from ringtail pods ## Notes - Alloy on ringtail uses `insecure_skip_verify=true` for TLS to Prometheus/Loki (Tailscale-managed certs not in container trust store) — tighten later - DNS resolution for `*.tail8d86e.ts.net` from ringtail pods depends on CoreDNS inheriting host's MagicDNS resolver; may need CoreDNS forwarding rules if pods can't resolve - The old services dashboard (blackbox probes) is removed — those probes are still running in alloy-k8s and the data is still in Prometheus, just not in a dedicated dashboard Reviewed-on: https://forge.ops.eblu.me/eblume/blumeops/pulls/270 2026-02-25 22:01:00 -08:00
			`// Add cluster label for multi-cluster identification`
			`stage.static_labels {`
			`values = { cluster = "indri" }`
			`}`
Add kustomize images: and configMapGenerator: across services (#264) ## Summary - Move hardcoded image tags to kustomization.yaml `images:` transformer across 22 services — image names in manifests become version-agnostic templates, with tags centralized in one place per service - Replace hand-written ConfigMap manifests with `configMapGenerator:` in 12 services — config data extracted to standalone files, generated ConfigMaps include content hashes that trigger automatic pod rollouts on changes - Create new `kustomization.yaml` for forgejo-runner and nvidia-device-plugin (switches ArgoCD from directory mode to kustomize mode, rendered output identical) ### Services modified Images only (8): cv, devpi, docs, kube-state-metrics, miniflux, navidrome, teslamate, torrent Images + configMapGenerator (10): alloy-k8s, forgejo-runner, frigate, grafana, homepage, kiwix, loki, mosquitto, ntfy, prometheus Images only, no configMapGenerator (4): authentik (skip blueprints — special YAML tags), tailscale-operator-base (Deployment only, CRD image fields left as-is) Skipped entirely (6): argocd (remote upstream), databases (no image fields), external-secrets, grafana-config (cross-kustomization dashboards), immich (Helm-managed), 1password-connect/cloudnative-pg (no kustomization.yaml) ### What changes at deploy time - images: — no functional diff, `kustomize build` produces identical output with tags - configMapGenerator: — ConfigMap names gain hash suffixes (e.g., `prometheus-config` → `prometheus-config-6f42fhctcb`) and all Deployment/StatefulSet/DaemonSet references are updated automatically. 
Pods will restart once per service on first sync due to the name change ## Test plan - [x] `kubectl kustomize` builds all 30 service directories successfully - [x] Image tags verified in rendered output for all modified services - [x] ConfigMap hash suffixes verified in rendered output - [x] ConfigMap references in Deployments/StatefulSets confirmed to use hashed names - [x] All pre-commit hooks pass (yamllint, shellcheck, prettier, etc.) - [ ] `argocd app diff` each service to confirm only expected ConfigMap name changes - [ ] Deploy from branch starting with a low-risk service (e.g., mosquitto) Reviewed-on: https://forge.ops.eblu.me/eblume/blumeops/pulls/264 2026-02-24 14:25:19 -08:00			`}`

			`// Write logs to Loki`
			`loki.write "loki" {`
			`endpoint {`
			`url = "http://loki.monitoring.svc.cluster.local:3100/loki/api/v1/push"`
			`}`
			`}`

			`// ============== SERVICE HEALTH PROBES ==============`

			`// Blackbox-style HTTP probes for k8s services`
			`prometheus.exporter.blackbox "services" {`
			`config = "{ modules: { http_2xx: { prober: http, timeout: 5s } } }"`

			`target {`
			`name = "miniflux"`
			`address = "http://miniflux.miniflux.svc.cluster.local:8080/healthcheck"`
			`module = "http_2xx"`
			`}`

			`target {`
			`name = "kiwix"`
			`address = "http://kiwix.kiwix.svc.cluster.local:80/"`
			`module = "http_2xx"`
			`}`

			`target {`
			`name = "transmission"`
			`address = "http://transmission.torrent.svc.cluster.local:9091/transmission/web/"`
			`module = "http_2xx"`
			`}`

			`target {`
			`name = "devpi"`
			`address = "http://devpi.devpi.svc.cluster.local:3141/+api"`
			`module = "http_2xx"`
			`}`

			`target {`
			`name = "argocd"`
			`address = "http://argocd-server.argocd.svc.cluster.local:80/healthz"`
			`module = "http_2xx"`
			`}`
			`}`

			`// Scrape blackbox probe results`
			`prometheus.scrape "blackbox" {`
			`targets = prometheus.exporter.blackbox.services.targets`
			`scrape_interval = "30s"`
			`forward_to = [prometheus.remote_write.prometheus.receiver]`
			`}`

			`// Push metrics to Prometheus`
			`prometheus.remote_write "prometheus" {`
Add multi-cluster observability with ringtail metrics and dashboards (#270) ## Summary - Add `cluster` label (indri/ringtail) to all Prometheus scrape jobs, Alloy k8s metrics/logs, and Alloy host metrics/logs - Deploy kube-state-metrics on ringtail's k3s cluster (ArgoCD app + manifests) - Deploy Alloy on ringtail to collect pod metrics and logs, remote-writing to indri's Prometheus and Loki - Replace single-cluster "Minikube Kubernetes" and "K8s Services Health" dashboards with: - Kubernetes Clusters dashboard — multi-cluster with `cluster` and `namespace` template variables - Ringtail (k3s) dashboard — dedicated ringtail view with GPU usage panels ## Deployment and Testing 1. Sync `apps` on indri ArgoCD to pick up new app definitions (`kube-state-metrics-ringtail`, `alloy-ringtail`) 2. Sync `prometheus` → verify `cluster` label on scraped metrics 3. Sync `alloy-k8s` → verify `cluster=indri` on remote-written metrics and logs 4. Run `mise run provision-indri -- --tags alloy` → verify `cluster=indri` on host Alloy metrics/logs 5. Sync `kube-state-metrics-ringtail` → verify pods running on ringtail 6. Sync `alloy-ringtail` → verify pods running, check Prometheus for `kube_pod_info{cluster="ringtail"}` 7. Sync `grafana-config` → verify dashboards appear, cluster variable populates both values 8. 
Check Loki for `{cluster="ringtail"}` logs from ringtail pods ## Notes - Alloy on ringtail uses `insecure_skip_verify=true` for TLS to Prometheus/Loki (Tailscale-managed certs not in container trust store) — tighten later - DNS resolution for `*.tail8d86e.ts.net` from ringtail pods depends on CoreDNS inheriting host's MagicDNS resolver; may need CoreDNS forwarding rules if pods can't resolve - The old services dashboard (blackbox probes) is removed — those probes are still running in alloy-k8s and the data is still in Prometheus, just not in a dedicated dashboard Reviewed-on: https://forge.ops.eblu.me/eblume/blumeops/pulls/270 2026-02-25 22:01:00 -08:00			`external_labels = { cluster = "indri" }`

Add kustomize images: and configMapGenerator: across services (#264) ## Summary - Move hardcoded image tags to kustomization.yaml `images:` transformer across 22 services — image names in manifests become version-agnostic templates, with tags centralized in one place per service - Replace hand-written ConfigMap manifests with `configMapGenerator:` in 12 services — config data extracted to standalone files, generated ConfigMaps include content hashes that trigger automatic pod rollouts on changes - Create new `kustomization.yaml` for forgejo-runner and nvidia-device-plugin (switches ArgoCD from directory mode to kustomize mode, rendered output identical) ### Services modified Images only (8): cv, devpi, docs, kube-state-metrics, miniflux, navidrome, teslamate, torrent Images + configMapGenerator (10): alloy-k8s, forgejo-runner, frigate, grafana, homepage, kiwix, loki, mosquitto, ntfy, prometheus Images only, no configMapGenerator (4): authentik (skip blueprints — special YAML tags), tailscale-operator-base (Deployment only, CRD image fields left as-is) Skipped entirely (6): argocd (remote upstream), databases (no image fields), external-secrets, grafana-config (cross-kustomization dashboards), immich (Helm-managed), 1password-connect/cloudnative-pg (no kustomization.yaml) ### What changes at deploy time - images: — no functional diff, `kustomize build` produces identical output with tags - configMapGenerator: — ConfigMap names gain hash suffixes (e.g., `prometheus-config` → `prometheus-config-6f42fhctcb`) and all Deployment/StatefulSet/DaemonSet references are updated automatically. 
Pods will restart once per service on first sync due to the name change ## Test plan - [x] `kubectl kustomize` builds all 30 service directories successfully - [x] Image tags verified in rendered output for all modified services - [x] ConfigMap hash suffixes verified in rendered output - [x] ConfigMap references in Deployments/StatefulSets confirmed to use hashed names - [x] All pre-commit hooks pass (yamllint, shellcheck, prettier, etc.) - [ ] `argocd app diff` each service to confirm only expected ConfigMap name changes - [ ] Deploy from branch starting with a low-risk service (e.g., mosquitto) Reviewed-on: https://forge.ops.eblu.me/eblume/blumeops/pulls/264 2026-02-24 14:25:19 -08:00			`endpoint {`
			`url = "http://prometheus.monitoring.svc.cluster.local:9090/api/v1/write"`
			`}`
			`}`