blumeops/argocd/manifests/alloy-ringtail/config.alloy

// Alloy ringtail configuration - collects host metrics, pod logs, and kube-state-metrics
// Remote-writes metrics to indri Prometheus, logs to indri Loki

// ============== HOST METRICS ==============

// Host-level system metrics. The exporter reads the Linux host's proc, sys,
// and root filesystems through the /host/* paths configured below.
prometheus.exporter.unix "system" {
  rootfs_path = "/host/root"
  sysfs_path  = "/host/sys"
  procfs_path = "/host/proc"
}

// Collect the unix exporter's metrics every 15s and hand them to the
// "instance" relabel component before they are remote-written.
prometheus.scrape "system" {
  scrape_interval = "15s"
  targets         = prometheus.exporter.unix.system.targets
  forward_to      = [prometheus.relabel.instance.receiver]
}

// Stamp every sample that passes through with instance="ringtail", then
// forward it to the remote_write component.
prometheus.relabel "instance" {
  forward_to = [prometheus.remote_write.prometheus.receiver]

  // No source labels are given, so the replacement is applied unconditionally.
  rule {
    action       = "replace"
    replacement  = "ringtail"
    target_label = "instance"
  }
}

// ============== SNOWFLAKE PROXY METRICS ==============

// Tor Snowflake proxy (a systemd service on the host) serves metrics on port
// 9999 under /internal/metrics. The target host is taken from the HOST_IP
// environment variable, with "localhost" as the fallback.
prometheus.scrape "snowflake_proxy" {
  targets = [{
    "__address__" = coalesce(sys.env("HOST_IP"), "localhost") + ":9999",
    "job"         = "snowflake_proxy",
  }]
  scrape_interval = "30s"
  metrics_path    = "/internal/metrics"
  forward_to      = [prometheus.relabel.instance.receiver]
}

// ============== KUBE-STATE-METRICS SCRAPE ==============

// Pull kube-state-metrics from its in-cluster Service address every 15s.
// Samples go directly to remote_write, bypassing the "instance" relabel.
prometheus.scrape "kube_state_metrics" {
  targets = [{
    "__address__" = "kube-state-metrics.monitoring.svc.cluster.local:8080",
  }]
  forward_to      = [prometheus.remote_write.prometheus.receiver]
  scrape_interval = "15s"
}

// ============== SERVICE HEALTH PROBES ==============

// HTTP health probes (blackbox style) for services running inside the
// ringtail cluster. A single module, http_2xx, is defined with a 5s timeout.
prometheus.exporter.blackbox "services" {
  config = "{ modules: { http_2xx: { prober: http, timeout: 5s } } }"

  // Immich server ping endpoint.
  target {
    module  = "http_2xx"
    name    = "immich"
    address = "http://immich-server.immich.svc.cluster.local:2283/api/server/ping"
  }
}

// Collect the blackbox exporter's probe results every 30s and send them
// straight to remote_write.
prometheus.scrape "blackbox" {
  forward_to      = [prometheus.remote_write.prometheus.receiver]
  scrape_interval = "30s"
  targets         = prometheus.exporter.blackbox.services.targets
}

// Remote-write all collected metrics to the Prometheus instance on indri.
// Every series sent from here is tagged with cluster="ringtail".
prometheus.remote_write "prometheus" {
  external_labels = {
    cluster = "ringtail",
  }

  endpoint {
    url = "https://prometheus.tail8d86e.ts.net/api/v1/write"

    // NOTE(review): certificate verification is disabled for this endpoint.
    tls_config {
      insecure_skip_verify = true
    }
  }
}

// ============== K8S POD LOG DISCOVERY ==============

// Discover all pods in the cluster (role = "pod"). No namespace or selector
// restriction is configured in this block; filtering happens in the
// discovery.relabel "pods" component that consumes these targets.
discovery.kubernetes "pods" {
  role = "pod"
}

// Relabel discovered pod targets into the label set attached to pod log
// streams. Rules run top to bottom, so a later rule can overwrite a label
// written by an earlier one.
discovery.relabel "pods" {
  targets = discovery.kubernetes.pods.targets

  // Keep only running pods: drop targets in any other pod phase.
  rule {
    source_labels = ["__meta_kubernetes_pod_phase"]
    regex         = "Pending|Succeeded|Failed|Unknown"
    action        = "drop"
  }

  // Set namespace label
  rule {
    source_labels = ["__meta_kubernetes_namespace"]
    target_label  = "namespace"
  }

  // Set pod name label
  rule {
    source_labels = ["__meta_kubernetes_pod_name"]
    target_label  = "pod"
  }

  // Set container name label
  rule {
    source_labels = ["__meta_kubernetes_pod_container_name"]
    target_label  = "container"
  }

  // Seed the app label from app.kubernetes.io/name. This is the fallback
  // value: it survives only when the pod has no plain `app` label, because
  // the next rule overwrites it otherwise.
  rule {
    source_labels = ["__meta_kubernetes_pod_label_app_kubernetes_io_name"]
    target_label  = "app"
  }

  // Prefer the pod's `app` label when it is present. The "(.+)" regex only
  // matches a non-empty value, so pods without an `app` label keep the
  // fallback written by the previous rule.
  rule {
    source_labels = ["__meta_kubernetes_pod_label_app"]
    target_label  = "app"
    regex         = "(.+)"
    action        = "replace"
  }

  // Set node name
  rule {
    source_labels = ["__meta_kubernetes_pod_node_name"]
    target_label  = "node"
  }

  // Build the log path glob for the pod container. The two source labels are
  // joined with "/" and matched by the default regex as ONE capture group, so
  // $1 is already "<pod uid>/<container name>"; there is no $2 to reference.
  // NOTE: loki.source.kubernetes tails logs through the Kubernetes API, so
  // this label only matters if a file-based source consumes these targets.
  rule {
    source_labels = ["__meta_kubernetes_pod_uid", "__meta_kubernetes_pod_container_name"]
    target_label  = "__path__"
    separator     = "/"
    replacement   = "/var/log/pods/*$1/*.log"
  }
}

// Stream container logs for the relabeled pod targets and pass each entry to
// the "pods" processing pipeline.
loki.source.kubernetes "pods" {
  forward_to = [loki.process.pods.receiver]
  targets    = discovery.relabel.pods.output
}

// Log pipeline for pod logs. Stages run in the order written: extract JSON
// fields, drop parser error labels, normalize 1Password levels, promote
// selected fields to labels, then tag the cluster. Output goes to loki.write.
loki.process "pods" {
  forward_to = [loki.write.loki.receiver]

  // Best-effort JSON extraction; each key below is read from the JSON field
  // of the same name.
  stage.json {
    expressions = {
      caller  = "caller",
      level   = "level",
      message = "message",
      msg     = "msg",
      time    = "time",
    }
  }

  // Non-JSON lines are expected, so discard the JSON parsing error labels.
  stage.label_drop {
    values = ["__error__", "__error_details__"]
  }

  // 1password-connect logs numeric levels (1=error, 2=warn, 3=info, 4=debug,
  // 5=trace). Map them to their string names; any other value is passed
  // through unchanged. The selector limits this to the 1password namespace.
  // See: https://github.com/1Password/connect/issues/44
  stage.match {
    selector = "{namespace=\"1password\"}"

    stage.template {
      template = "{{ if eq .Value \"1\" }}error{{ else if eq .Value \"2\" }}warn{{ else if eq .Value \"3\" }}info{{ else if eq .Value \"4\" }}debug{{ else if eq .Value \"5\" }}trace{{ else }}{{ .Value }}{{ end }}"
      source   = "level"
    }
  }

  // Promote extracted fields to labels; the empty string means the label is
  // read from the extracted key of the same name.
  // NOTE(review): caller becomes a stream label -- confirm its cardinality is
  // acceptable for Loki.
  stage.labels {
    values = {
      caller = "",
      level  = "",
    }
  }

  // Tag every entry with the originating cluster for multi-cluster queries.
  stage.static_labels {
    values = {
      cluster = "ringtail",
    }
  }
}

// Write logs to indri Loki: pushes every entry received from the "pods"
// processing pipeline to the Loki push API at the URL below.
loki.write "loki" {
  endpoint {
    url = "https://loki.tail8d86e.ts.net/loki/api/v1/push"

    // NOTE(review): certificate verification is disabled for this endpoint.
    tls_config {
      insecure_skip_verify = true
    }
  }
}
Add multi-cluster observability with ringtail metrics and dashboards (#270) ## Summary - Add `cluster` label (indri/ringtail) to all Prometheus scrape jobs, Alloy k8s metrics/logs, and Alloy host metrics/logs - Deploy kube-state-metrics on ringtail's k3s cluster (ArgoCD app + manifests) - Deploy Alloy on ringtail to collect pod metrics and logs, remote-writing to indri's Prometheus and Loki - Replace single-cluster "Minikube Kubernetes" and "K8s Services Health" dashboards with: - Kubernetes Clusters dashboard — multi-cluster with `cluster` and `namespace` template variables - Ringtail (k3s) dashboard — dedicated ringtail view with GPU usage panels ## Deployment and Testing 1. Sync `apps` on indri ArgoCD to pick up new app definitions (`kube-state-metrics-ringtail`, `alloy-ringtail`) 2. Sync `prometheus` → verify `cluster` label on scraped metrics 3. Sync `alloy-k8s` → verify `cluster=indri` on remote-written metrics and logs 4. Run `mise run provision-indri -- --tags alloy` → verify `cluster=indri` on host Alloy metrics/logs 5. Sync `kube-state-metrics-ringtail` → verify pods running on ringtail 6. Sync `alloy-ringtail` → verify pods running, check Prometheus for `kube_pod_info{cluster="ringtail"}` 7. Sync `grafana-config` → verify dashboards appear, cluster variable populates both values 8. 
Check Loki for `{cluster="ringtail"}` logs from ringtail pods ## Notes - Alloy on ringtail uses `insecure_skip_verify=true` for TLS to Prometheus/Loki (Tailscale-managed certs not in container trust store) — tighten later - DNS resolution for `*.tail8d86e.ts.net` from ringtail pods depends on CoreDNS inheriting host's MagicDNS resolver; may need CoreDNS forwarding rules if pods can't resolve - The old services dashboard (blackbox probes) is removed — those probes are still running in alloy-k8s and the data is still in Prometheus, just not in a dedicated dashboard Reviewed-on: https://forge.ops.eblu.me/eblume/blumeops/pulls/270 2026-02-25 22:01:00 -08:00			`// Alloy ringtail configuration - collects host metrics, pod logs, and kube-state-metrics`
			`// Remote-writes metrics to indri Prometheus, logs to indri Loki`

			`// ============== HOST METRICS ==============`

			`// System metrics exporter (Linux host via /host/proc, /host/sys mounts)`
			`prometheus.exporter.unix "system" {`
			`procfs_path = "/host/proc"`
			`sysfs_path = "/host/sys"`
			`rootfs_path = "/host/root"`
			`}`

			`// Scrape system metrics and add instance label`
			`prometheus.scrape "system" {`
			`targets = prometheus.exporter.unix.system.targets`
			`forward_to = [prometheus.relabel.instance.receiver]`
			`scrape_interval = "15s"`
			`}`

			`// Add instance label`
			`prometheus.relabel "instance" {`
			`forward_to = [prometheus.remote_write.prometheus.receiver]`

			`rule {`
			`target_label = "instance"`
			`replacement = "ringtail"`
			`}`
			`}`

Deploy Tor Snowflake proxy on ringtail (#311) ## Summary - Add Snowflake proxy as a native systemd service on ringtail (NixOS) - Uses `pkgs.snowflake` from nixpkgs (v2.11.0) - Hardened systemd unit with DynamicUser, ProtectSystem=strict, 512MB memory limit - Prometheus metrics enabled on localhost:9999 ## What is Snowflake? A Tor pluggable transport that helps censored users reach the Tor network via WebRTC. This is NOT a Tor exit node — traffic exits through Tor exit nodes operated by others. The proxy operator cannot see traffic content (double-encrypted) and destination servers never see the proxy's IP. ## Changes - `nixos/ringtail/configuration.nix` — new systemd service definition - `docs/reference/services/snowflake-proxy.md` — service reference card - `docs/reference/infrastructure/ringtail.md` — updated systemd services section - `service-versions.yaml` — added entry (type: nixos) ## Deploy plan After review, deploy via `mise run provision-ringtail`. Service starts automatically. ## Test plan - [ ] `mise run provision-ringtail` succeeds - [ ] `ssh ringtail 'systemctl status snowflake-proxy'` shows active - [ ] `ssh ringtail 'journalctl -u snowflake-proxy --no-pager -n 20'` shows broker connections - [ ] `ssh ringtail 'curl -s localhost:9999/metrics'` returns Prometheus metrics Reviewed-on: https://forge.eblu.me/eblume/blumeops/pulls/311 2026-03-24 20:51:40 -07:00			`// ============== SNOWFLAKE PROXY METRICS ==============`

			`// Scrape Tor Snowflake proxy metrics from host (systemd service on port 9999)`
			`prometheus.scrape "snowflake_proxy" {`
			`targets = [{"__address__" = coalesce(sys.env("HOST_IP"), "localhost") + ":9999", "job" = "snowflake_proxy"}]`
			`metrics_path = "/internal/metrics"`
			`scrape_interval = "30s"`
			`forward_to = [prometheus.relabel.instance.receiver]`
			`}`

Add multi-cluster observability with ringtail metrics and dashboards (#270) ## Summary - Add `cluster` label (indri/ringtail) to all Prometheus scrape jobs, Alloy k8s metrics/logs, and Alloy host metrics/logs - Deploy kube-state-metrics on ringtail's k3s cluster (ArgoCD app + manifests) - Deploy Alloy on ringtail to collect pod metrics and logs, remote-writing to indri's Prometheus and Loki - Replace single-cluster "Minikube Kubernetes" and "K8s Services Health" dashboards with: - Kubernetes Clusters dashboard — multi-cluster with `cluster` and `namespace` template variables - Ringtail (k3s) dashboard — dedicated ringtail view with GPU usage panels ## Deployment and Testing 1. Sync `apps` on indri ArgoCD to pick up new app definitions (`kube-state-metrics-ringtail`, `alloy-ringtail`) 2. Sync `prometheus` → verify `cluster` label on scraped metrics 3. Sync `alloy-k8s` → verify `cluster=indri` on remote-written metrics and logs 4. Run `mise run provision-indri -- --tags alloy` → verify `cluster=indri` on host Alloy metrics/logs 5. Sync `kube-state-metrics-ringtail` → verify pods running on ringtail 6. Sync `alloy-ringtail` → verify pods running, check Prometheus for `kube_pod_info{cluster="ringtail"}` 7. Sync `grafana-config` → verify dashboards appear, cluster variable populates both values 8. 
Check Loki for `{cluster="ringtail"}` logs from ringtail pods ## Notes - Alloy on ringtail uses `insecure_skip_verify=true` for TLS to Prometheus/Loki (Tailscale-managed certs not in container trust store) — tighten later - DNS resolution for `*.tail8d86e.ts.net` from ringtail pods depends on CoreDNS inheriting host's MagicDNS resolver; may need CoreDNS forwarding rules if pods can't resolve - The old services dashboard (blackbox probes) is removed — those probes are still running in alloy-k8s and the data is still in Prometheus, just not in a dedicated dashboard Reviewed-on: https://forge.ops.eblu.me/eblume/blumeops/pulls/270 2026-02-25 22:01:00 -08:00			`// ============== KUBE-STATE-METRICS SCRAPE ==============`

			`prometheus.scrape "kube_state_metrics" {`
			`targets = [{"__address__" = "kube-state-metrics.monitoring.svc.cluster.local:8080"}]`
			`scrape_interval = "15s"`
			`forward_to = [prometheus.remote_write.prometheus.receiver]`
			`}`

C0: move immich blackbox probe to ringtail alloy Immich migrated to ringtail's k3s cluster but the probe still targeted the in-cluster service DNS on indri's minikube, firing ServiceProbeFailure indefinitely. Moved the target into alloy-ringtail's config so the probe runs in the cluster where immich actually lives. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> 2026-05-17 08:46:22 -07:00			`// ============== SERVICE HEALTH PROBES ==============`

			`// Blackbox-style HTTP probes for in-cluster services on ringtail`
			`prometheus.exporter.blackbox "services" {`
			`config = "{ modules: { http_2xx: { prober: http, timeout: 5s } } }"`

			`target {`
			`name = "immich"`
			`address = "http://immich-server.immich.svc.cluster.local:2283/api/server/ping"`
			`module = "http_2xx"`
			`}`
			`}`

			`// Scrape blackbox probe results`
			`prometheus.scrape "blackbox" {`
			`targets = prometheus.exporter.blackbox.services.targets`
			`scrape_interval = "30s"`
			`forward_to = [prometheus.remote_write.prometheus.receiver]`
			`}`

Add multi-cluster observability with ringtail metrics and dashboards (#270) ## Summary - Add `cluster` label (indri/ringtail) to all Prometheus scrape jobs, Alloy k8s metrics/logs, and Alloy host metrics/logs - Deploy kube-state-metrics on ringtail's k3s cluster (ArgoCD app + manifests) - Deploy Alloy on ringtail to collect pod metrics and logs, remote-writing to indri's Prometheus and Loki - Replace single-cluster "Minikube Kubernetes" and "K8s Services Health" dashboards with: - Kubernetes Clusters dashboard — multi-cluster with `cluster` and `namespace` template variables - Ringtail (k3s) dashboard — dedicated ringtail view with GPU usage panels ## Deployment and Testing 1. Sync `apps` on indri ArgoCD to pick up new app definitions (`kube-state-metrics-ringtail`, `alloy-ringtail`) 2. Sync `prometheus` → verify `cluster` label on scraped metrics 3. Sync `alloy-k8s` → verify `cluster=indri` on remote-written metrics and logs 4. Run `mise run provision-indri -- --tags alloy` → verify `cluster=indri` on host Alloy metrics/logs 5. Sync `kube-state-metrics-ringtail` → verify pods running on ringtail 6. Sync `alloy-ringtail` → verify pods running, check Prometheus for `kube_pod_info{cluster="ringtail"}` 7. Sync `grafana-config` → verify dashboards appear, cluster variable populates both values 8. 
Check Loki for `{cluster="ringtail"}` logs from ringtail pods ## Notes - Alloy on ringtail uses `insecure_skip_verify=true` for TLS to Prometheus/Loki (Tailscale-managed certs not in container trust store) — tighten later - DNS resolution for `*.tail8d86e.ts.net` from ringtail pods depends on CoreDNS inheriting host's MagicDNS resolver; may need CoreDNS forwarding rules if pods can't resolve - The old services dashboard (blackbox probes) is removed — those probes are still running in alloy-k8s and the data is still in Prometheus, just not in a dedicated dashboard Reviewed-on: https://forge.ops.eblu.me/eblume/blumeops/pulls/270 2026-02-25 22:01:00 -08:00			`// Push metrics to indri Prometheus`
			`prometheus.remote_write "prometheus" {`
			`external_labels = { cluster = "ringtail" }`

			`endpoint {`
			`url = "https://prometheus.tail8d86e.ts.net/api/v1/write"`

			`tls_config {`
			`insecure_skip_verify = true`
			`}`
			`}`
			`}`

			`// ============== K8S POD LOG DISCOVERY ==============`

			`// Discover all pods in the cluster`
			`discovery.kubernetes "pods" {`
			`role = "pod"`
			`}`

			`// Relabel to extract useful metadata`
			`discovery.relabel "pods" {`
			`targets = discovery.kubernetes.pods.targets`

			`// Keep only running pods`
			`rule {`
			`source_labels = ["__meta_kubernetes_pod_phase"]`
			`regex = "Pending|Succeeded|Failed|Unknown"`
			`action = "drop"`
			`}`

			`// Set namespace label`
			`rule {`
			`source_labels = ["__meta_kubernetes_namespace"]`
			`target_label = "namespace"`
			`}`

			`// Set pod name label`
			`rule {`
			`source_labels = ["__meta_kubernetes_pod_name"]`
			`target_label = "pod"`
			`}`

			`// Set container name label`
			`rule {`
			`source_labels = ["__meta_kubernetes_pod_container_name"]`
			`target_label = "container"`
			`}`

			`// Set app label from pod labels`
			`rule {`
			`source_labels = ["__meta_kubernetes_pod_label_app"]`
			`target_label = "app"`
			`}`

			`// Fallback: use app.kubernetes.io/name if no app label`
			`rule {`
			`source_labels = ["__meta_kubernetes_pod_label_app_kubernetes_io_name"]`
			`target_label = "app"`
			`regex = "(.+)"`
			`action = "replace"`
			`}`

			`// Set node name`
			`rule {`
			`source_labels = ["__meta_kubernetes_pod_node_name"]`
			`target_label = "node"`
			`}`

			`// Build the log path for the pod container`
			`rule {`
			`source_labels = ["__meta_kubernetes_pod_uid", "__meta_kubernetes_pod_container_name"]`
			`target_label = "__path__"`
			`separator = "/"`
			`replacement = "/var/log/pods/*$1/$2/*.log"`
			`}`
			`}`

			`// Tail pod logs`
			`loki.source.kubernetes "pods" {`
			`targets = discovery.relabel.pods.output`
			`forward_to = [loki.process.pods.receiver]`
			`}`

			`// Process logs - parse JSON if present, add labels`
			`loki.process "pods" {`
			`forward_to = [loki.write.loki.receiver]`

			`// Try to parse JSON logs`
			`stage.json {`
			`expressions = {`
			`level = "level",`
			`msg = "msg",`
			`message = "message",`
			`time = "time",`
			`caller = "caller",`
			`}`
			`}`

			`// Drop JSON parsing error labels (non-JSON logs are fine)`
			`stage.label_drop {`
			`values = ["__error__", "__error_details__"]`
			`}`

Fix 1Password Connect numeric log levels misclassified in Grafana (#287) ## Summary - 1Password Connect uses non-standard numeric log levels (`1`=error, `2`=warn, `3`=info, `4`=debug, `5`=trace) per [1Password/connect#44](https://github.com/1Password/connect/issues/44) - Alloy extracts the `level` JSON field as-is, so info-level health checks get `level="3"` in Loki - Grafana expects string level labels — numeric values are unrecognized, causing misclassified log severity/coloring - Adds a `stage.match` + `stage.template` in the Alloy pipeline scoped to `{namespace="1password"}` to normalize numeric levels to standard strings - Other services are completely unaffected (scoped by namespace, not global) ## Deployment and Testing - [ ] Sync alloy-k8s from branch: `argocd app set alloy-k8s --revision fix/onepassword-numeric-log-levels && argocd app sync alloy-k8s` - [ ] Wait ~2 minutes for new logs to flow - [ ] Verify level labels: `curl -sG "http://localhost:3100/loki/api/v1/label/level/values" --data-urlencode 'query={namespace="1password"}'` should show `"info"` and `"warn"` instead of `"3"` and `"2"` - [ ] Check Grafana log panel for 1password namespace — logs should no longer appear as errors - [ ] After merge: `argocd app set alloy-k8s --revision main && argocd app sync alloy-k8s` Reviewed-on: https://forge.eblu.me/eblume/blumeops/pulls/287 2026-03-07 13:57:04 -08:00			`// Normalize 1password-connect numeric log levels to strings (1=error..5=trace)`
			`// Scoped to the 1password namespace so other services are unaffected.`
			`// See: https://github.com/1Password/connect/issues/44`
			`stage.match {`
			`selector = "{namespace=\"1password\"}"`

			`stage.template {`
			`source = "level"`
			`template = "{{ if eq .Value \"1\" }}error{{ else if eq .Value \"2\" }}warn{{ else if eq .Value \"3\" }}info{{ else if eq .Value \"4\" }}debug{{ else if eq .Value \"5\" }}trace{{ else }}{{ .Value }}{{ end }}"`
			`}`
			`}`

Add multi-cluster observability with ringtail metrics and dashboards (#270) ## Summary - Add `cluster` label (indri/ringtail) to all Prometheus scrape jobs, Alloy k8s metrics/logs, and Alloy host metrics/logs - Deploy kube-state-metrics on ringtail's k3s cluster (ArgoCD app + manifests) - Deploy Alloy on ringtail to collect pod metrics and logs, remote-writing to indri's Prometheus and Loki - Replace single-cluster "Minikube Kubernetes" and "K8s Services Health" dashboards with: - Kubernetes Clusters dashboard — multi-cluster with `cluster` and `namespace` template variables - Ringtail (k3s) dashboard — dedicated ringtail view with GPU usage panels ## Deployment and Testing 1. Sync `apps` on indri ArgoCD to pick up new app definitions (`kube-state-metrics-ringtail`, `alloy-ringtail`) 2. Sync `prometheus` → verify `cluster` label on scraped metrics 3. Sync `alloy-k8s` → verify `cluster=indri` on remote-written metrics and logs 4. Run `mise run provision-indri -- --tags alloy` → verify `cluster=indri` on host Alloy metrics/logs 5. Sync `kube-state-metrics-ringtail` → verify pods running on ringtail 6. Sync `alloy-ringtail` → verify pods running, check Prometheus for `kube_pod_info{cluster="ringtail"}` 7. Sync `grafana-config` → verify dashboards appear, cluster variable populates both values 8. 
Check Loki for `{cluster="ringtail"}` logs from ringtail pods ## Notes - Alloy on ringtail uses `insecure_skip_verify=true` for TLS to Prometheus/Loki (Tailscale-managed certs not in container trust store) — tighten later - DNS resolution for `*.tail8d86e.ts.net` from ringtail pods depends on CoreDNS inheriting host's MagicDNS resolver; may need CoreDNS forwarding rules if pods can't resolve - The old services dashboard (blackbox probes) is removed — those probes are still running in alloy-k8s and the data is still in Prometheus, just not in a dedicated dashboard Reviewed-on: https://forge.ops.eblu.me/eblume/blumeops/pulls/270 2026-02-25 22:01:00 -08:00			`// Extract labels from parsed JSON data`
			`stage.labels {`
			`values = {`
			`level = "",`
			`caller = "",`
			`}`
			`}`

			`// Add cluster label for multi-cluster identification`
			`stage.static_labels {`
			`values = { cluster = "ringtail" }`
			`}`
			`}`

			`// Write logs to indri Loki`
			`loki.write "loki" {`
			`endpoint {`
			`url = "https://loki.tail8d86e.ts.net/loki/api/v1/push"`

			`tls_config {`
			`insecure_skip_verify = true`
			`}`
			`}`
			`}`