Immich migrated to ringtail's k3s cluster but the probe still targeted the in-cluster service DNS on indri's minikube, firing ServiceProbeFailure indefinitely. Moved the target into alloy-ringtail's config so the probe runs in the cluster where immich actually lives. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
207 lines
5.4 KiB
Text
207 lines
5.4 KiB
Text
// Alloy ringtail configuration - collects host metrics, Snowflake proxy metrics, kube-state-metrics, service health probes, and pod logs
|
|
// Remote-writes metrics to indri Prometheus, logs to indri Loki
|
|
|
|
// ============== HOST METRICS ==============
|
|
|
|
// Linux host metrics exporter. The proc, sys and root filesystems are read
// from the /host/* paths configured below (host mounts).
prometheus.exporter.unix "system" {
    rootfs_path = "/host/root"
    procfs_path = "/host/proc"
    sysfs_path  = "/host/sys"
}
|
|
|
|
// Collect the unix exporter's metrics every 15s and route them through the
// "instance" relabel component before they are shipped.
prometheus.scrape "system" {
    scrape_interval = "15s"
    targets         = prometheus.exporter.unix.system.targets
    forward_to      = [prometheus.relabel.instance.receiver]
}
|
|
|
|
// Stamp every sample passing through with instance="ringtail", then hand it
// to the remote_write component.
prometheus.relabel "instance" {
    forward_to = [prometheus.remote_write.prometheus.receiver]

    // No source labels: the target label is always set to the fixed value.
    rule {
        action       = "replace"
        target_label = "instance"
        replacement  = "ringtail"
    }
}
|
|
|
|
// ============== SNOWFLAKE PROXY METRICS ==============
|
|
|
|
// Tor Snowflake proxy metrics (systemd service on the host, port 9999).
// The scrape address uses HOST_IP from the environment when it is set and
// falls back to localhost otherwise.
prometheus.scrape "snowflake_proxy" {
    scrape_interval = "30s"
    metrics_path    = "/internal/metrics"
    targets         = [{
        "__address__" = coalesce(sys.env("HOST_IP"), "localhost") + ":9999",
        "job"         = "snowflake_proxy",
    }]
    forward_to = [prometheus.relabel.instance.receiver]
}
|
|
|
|
// ============== KUBE-STATE-METRICS SCRAPE ==============
|
|
|
|
// Scrape kube-state-metrics through its in-cluster service DNS name every 15s.
// Samples go straight to remote_write and bypass the "instance" relabel component.
prometheus.scrape "kube_state_metrics" {
    scrape_interval = "15s"
    targets         = [{"__address__" = "kube-state-metrics.monitoring.svc.cluster.local:8080"}]
    forward_to      = [prometheus.remote_write.prometheus.receiver]
}
|
|
|
|
// ============== SERVICE HEALTH PROBES ==============
|
|
|
|
// HTTP health probes (blackbox-style) for services running inside the
// ringtail cluster.
prometheus.exporter.blackbox "services" {
    // Inline module definition: a single HTTP prober with a 5s timeout.
    config = "{ modules: { http_2xx: { prober: http, timeout: 5s } } }"

    // Immich server ping endpoint, addressed via in-cluster service DNS.
    target {
        name    = "immich"
        module  = "http_2xx"
        address = "http://immich-server.immich.svc.cluster.local:2283/api/server/ping"
    }
}
|
|
|
|
// Collect the blackbox probe results every 30s and send them directly to
// remote_write.
prometheus.scrape "blackbox" {
    scrape_interval = "30s"
    targets         = prometheus.exporter.blackbox.services.targets
    forward_to      = [prometheus.remote_write.prometheus.receiver]
}
|
|
|
|
// Ship all collected metrics to the Prometheus instance on indri.
prometheus.remote_write "prometheus" {
    // Attached to every series sent by this Alloy instance so the source
    // cluster can be told apart on the receiving side.
    external_labels = { cluster = "ringtail" }

    endpoint {
        url = "https://prometheus.tail8d86e.ts.net/api/v1/write"

        // NOTE(review): TLS certificate verification is disabled for this
        // endpoint. Confirm this is still required; if the endpoint serves a
        // certificate Alloy can validate, this block should be removed.
        tls_config {
            insecure_skip_verify = true
        }
    }
}
|
|
|
|
// ============== K8S POD LOG DISCOVERY ==============
|
|
|
|
// Kubernetes service discovery with the "pod" role; its targets feed the
// log relabeling pipeline.
discovery.kubernetes "pods" {
    role = "pod"
}
|
|
|
|
// Turn the discovered pod targets into log targets carrying readable labels
// (namespace, pod, container, app, node). Rules run in order; later rules see
// the labels written by earlier ones.
discovery.relabel "pods" {
    targets = discovery.kubernetes.pods.targets

    // Keep only running pods by dropping every other pod phase.
    rule {
        source_labels = ["__meta_kubernetes_pod_phase"]
        regex         = "Pending|Succeeded|Failed|Unknown"
        action        = "drop"
    }

    // Set namespace label
    rule {
        source_labels = ["__meta_kubernetes_namespace"]
        target_label  = "namespace"
    }

    // Set pod name label
    rule {
        source_labels = ["__meta_kubernetes_pod_name"]
        target_label  = "pod"
    }

    // Set container name label
    rule {
        source_labels = ["__meta_kubernetes_pod_container_name"]
        target_label  = "container"
    }

    // Set app label from the pod's "app" label (left unset when the pod has none).
    rule {
        source_labels = ["__meta_kubernetes_pod_label_app"]
        target_label  = "app"
    }

    // Fallback: use app.kubernetes.io/name only if no app label was set above.
    // The current "app" value is joined in front of the candidate value; the
    // (fully anchored) regex matches only when that prefix is empty, so an
    // existing "app" label is never overwritten.
    rule {
        source_labels = ["app", "__meta_kubernetes_pod_label_app_kubernetes_io_name"]
        separator     = ";"
        regex         = ";(.+)"
        target_label  = "app"
        replacement   = "$1"
        action        = "replace"
    }

    // Set node name
    rule {
        source_labels = ["__meta_kubernetes_pod_node_name"]
        target_label  = "node"
    }

    // Build the log path for the pod container. The two source labels are
    // joined as "<uid>/<container>" and captured as $1 by the default regex,
    // giving /var/log/pods/*<uid>/<container>/*.log. (The default regex has a
    // single capture group, so only $1 may be referenced here.)
    // NOTE(review): loki.source.kubernetes streams logs through the Kubernetes
    // API and, per its documentation, does not appear to read __path__; this
    // label likely matters only if these targets are also fed to a file-based
    // source. Confirm before relying on it.
    rule {
        source_labels = ["__meta_kubernetes_pod_uid", "__meta_kubernetes_pod_container_name"]
        separator     = "/"
        target_label  = "__path__"
        replacement   = "/var/log/pods/*$1/*.log"
    }
}
|
|
|
|
// Tail logs for the relabeled pod targets and pass each entry to the
// processing pipeline.
loki.source.kubernetes "pods" {
    forward_to = [loki.process.pods.receiver]
    targets    = discovery.relabel.pods.output
}
|
|
|
|
// Log processing pipeline: attempt JSON extraction, normalize 1password
// levels, promote selected fields to labels, tag with the cluster name, then
// forward to the Loki writer. Stages run in the order written.
loki.process "pods" {
    forward_to = [loki.write.loki.receiver]

    // Try to pull common fields out of JSON-formatted lines.
    stage.json {
        expressions = {
            caller  = "caller",
            level   = "level",
            message = "message",
            msg     = "msg",
            time    = "time",
        }
    }

    // Drop JSON parsing error labels (non-JSON logs are fine).
    stage.label_drop {
        values = ["__error__", "__error_details__"]
    }

    // 1password-connect logs numeric levels; map them to names
    // (1=error, 2=warn, 3=info, 4=debug, 5=trace) and leave any other value
    // unchanged. Limited to the 1password namespace so other services are
    // unaffected.
    // See: https://github.com/1Password/connect/issues/44
    stage.match {
        selector = "{namespace=\"1password\"}"

        stage.template {
            source   = "level"
            template = "{{ if eq .Value \"1\" }}error{{ else if eq .Value \"2\" }}warn{{ else if eq .Value \"3\" }}info{{ else if eq .Value \"4\" }}debug{{ else if eq .Value \"5\" }}trace{{ else }}{{ .Value }}{{ end }}"
        }
    }

    // Promote extracted fields to labels; an empty value means the label
    // takes the extracted field of the same name.
    // NOTE(review): "caller" may be high-cardinality as a label — confirm
    // this is intended.
    stage.labels {
        values = {
            level  = "",
            caller = "",
        }
    }

    // Tag every entry with the originating cluster for multi-cluster queries.
    stage.static_labels {
        values = { cluster = "ringtail" }
    }
}
|
|
|
|
// Push processed log entries to the Loki instance on indri.
loki.write "loki" {
    endpoint {
        url = "https://loki.tail8d86e.ts.net/loki/api/v1/push"

        // NOTE(review): TLS certificate verification is disabled for this
        // endpoint. Confirm this is still required; if the endpoint serves a
        // certificate Alloy can validate, this block should be removed.
        tls_config {
            insecure_skip_verify = true
        }
    }
}
|