## Summary
- Add `cluster` label (indri/ringtail) to all Prometheus scrape jobs, Alloy k8s metrics/logs, and Alloy host metrics/logs
- Deploy kube-state-metrics on ringtail's k3s cluster (ArgoCD app + manifests)
- Deploy Alloy on ringtail to collect pod metrics and logs, remote-writing to indri's Prometheus and Loki
- Replace single-cluster "Minikube Kubernetes" and "K8s Services Health" dashboards with:
  - **Kubernetes Clusters** dashboard — multi-cluster with `cluster` and `namespace` template variables
  - **Ringtail (k3s)** dashboard — dedicated ringtail view with GPU usage panels
## Deployment and Testing
1. Sync `apps` on indri ArgoCD to pick up new app definitions (`kube-state-metrics-ringtail`, `alloy-ringtail`)
2. Sync `prometheus` → verify `cluster` label on scraped metrics
3. Sync `alloy-k8s` → verify `cluster=indri` on remote-written metrics and logs
4. Run `mise run provision-indri -- --tags alloy` → verify `cluster=indri` on host Alloy metrics/logs
5. Sync `kube-state-metrics-ringtail` → verify pods running on ringtail
6. Sync `alloy-ringtail` → verify pods running, check Prometheus for `kube_pod_info{cluster="ringtail"}`
7. Sync `grafana-config` → verify dashboards appear, cluster variable populates both values
8. Check Loki for `{cluster="ringtail"}` logs from ringtail pods
## Notes
- Alloy on ringtail uses `insecure_skip_verify=true` for TLS to Prometheus/Loki (Tailscale-managed certs not in container trust store) — tighten later
- DNS resolution for `*.tail8d86e.ts.net` from ringtail pods depends on CoreDNS inheriting the host's MagicDNS resolver; CoreDNS forwarding rules may be needed if pods can't resolve those names
- The old services dashboard (blackbox probes) is removed — those probes are still running in alloy-k8s and the data is still in Prometheus, just not in a dedicated dashboard
Reviewed-on: https://forge.ops.eblu.me/eblume/blumeops/pulls/270
131 lines
3.5 KiB
Django/Jinja
131 lines
3.5 KiB
Django/Jinja
// {{ ansible_managed }}
|
|
// Grafana Alloy configuration for {{ alloy_instance_label }}
|
|
// Collects system metrics (replacing node_exporter) and logs
|
|
|
|
// ============== METRICS COLLECTION ==============
|
|
|
|
// Built-in host metrics exporter; stands in for a separate node_exporter.
prometheus.exporter.unix "system" {
  // Textfile collector input directory, supplied by the Ansible role.
  textfile {
    directory = "{{ alloy_textfile_dir }}"
  }

  // The "thermal" collector does not work on macOS, so it is switched off.
  disable_collectors = ["thermal"]
}
|
|
|
|
// Scrape the unix exporter's targets on the configured interval and pass
// every sample through the shared "instance" relabel stage.
prometheus.scrape "system" {
  scrape_interval = "{{ alloy_scrape_interval }}"
  targets         = prometheus.exporter.unix.system.targets
  forward_to      = [prometheus.relabel.instance.receiver]
}
|
|
|
|
// Stamp every metric with static identity labels before it is pushed to
// Prometheus. Each rule names only a target label and a replacement, so the
// label is set to that fixed value on every sample.
prometheus.relabel "instance" {
  forward_to = [prometheus.remote_write.prometheus.receiver]

  // Host identity, kept compatible with the existing "instance" label scheme.
  rule {
    target_label = "instance"
    replacement  = "{{ alloy_instance_label }}"
  }

  // Cluster identity. Falls back to "indri" (the previously hard-coded value)
  // when alloy_cluster_label is not set, so existing hosts render unchanged
  // while other hosts can override it.
  rule {
    target_label = "cluster"
    replacement  = "{{ alloy_cluster_label | default('indri') }}"
  }
}
|
|
|
|
// Final stage of the metrics pipeline: remote_write every received sample
// to the Prometheus endpoint configured by the role.
prometheus.remote_write "prometheus" {
  endpoint {
    url = "{{ alloy_prometheus_url }}"
  }
}
|
|
|
|
{% if alloy_collect_postgres | default(false) %}
|
|
// ============== POSTGRESQL METRICS ==============
|
|
|
|
// PostgreSQL exporter (read-only metrics via pg_monitor role).
prometheus.exporter.postgres "postgresql" {
  // Connection DSN assembled from role variables. User, password and database
  // are all URL-encoded so reserved characters (e.g. "@", ":", "?") in any of
  // them cannot corrupt the URL. TLS is disabled via sslmode=disable.
  data_source_names = ["postgresql://{{ alloy_postgres_user | urlencode }}:{{ alloy_postgres_password | urlencode }}@{{ alloy_postgres_host }}:{{ alloy_postgres_port }}/{{ alloy_postgres_database | urlencode }}?sslmode=disable"]

  // Custom queries for vacuum and XID monitoring.
  custom_queries_config_path = "{{ alloy_config_dir }}/postgres_queries.yaml"
}
|
|
|
|
// Scrape the PostgreSQL exporter's targets and route the samples through the
// shared "instance" relabel stage, same as the system metrics.
prometheus.scrape "postgresql" {
  scrape_interval = "{{ alloy_scrape_interval }}"
  targets         = prometheus.exporter.postgres.postgresql.targets
  forward_to      = [prometheus.relabel.instance.receiver]
}
|
|
{% endif %}
|
|
|
|
{% if alloy_collect_zot | default(false) %}
|
|
// ============== ZOT REGISTRY METRICS ==============
|
|
|
|
// Zot exposes its own metrics endpoint; scrape it directly on localhost:5050
// and send the samples through the shared "instance" relabel stage.
prometheus.scrape "zot" {
  scrape_interval = "{{ alloy_scrape_interval }}"
  metrics_path    = "/metrics"
  targets         = [{"__address__" = "localhost:5050"}]
  forward_to      = [prometheus.relabel.instance.receiver]
}
|
|
{% endif %}
|
|
|
|
{% if alloy_collect_logs %}
|
|
// ============== LOG COLLECTION ==============
|
|
|
|
// File discovery for brew services: one target per alloy_brew_logs entry,
// carrying the file path plus "service" and "stream" labels.
local.file_match "brew_logs" {
  path_targets = [
    {% for entry in alloy_brew_logs %}
    {__path__ = "{{ entry.path }}", service = "{{ entry.service }}", stream = "{{ entry.stream }}"},
    {% endfor %}
  ]
}
|
|
|
|
// File discovery for mcquack LaunchAgents: one target per alloy_mcquack_logs
// entry, carrying the file path plus "service" and "stream" labels.
local.file_match "mcquack_logs" {
  path_targets = [
    {% for entry in alloy_mcquack_logs %}
    {__path__ = "{{ entry.path }}", service = "{{ entry.service }}", stream = "{{ entry.stream }}"},
    {% endfor %}
  ]
}
|
|
|
|
// Read the discovered brew service log files and hand each entry to the
// add_host relabel stage.
loki.source.file "brew_logs" {
  forward_to = [loki.relabel.add_host.receiver]
  targets    = local.file_match.brew_logs.targets
}
|
|
|
|
// Read the discovered mcquack log files and hand each entry to the
// add_host relabel stage.
loki.source.file "mcquack_logs" {
  forward_to = [loki.relabel.add_host.receiver]
  targets    = local.file_match.mcquack_logs.targets
}
|
|
|
|
// Stamp every log entry with static identity labels before it is written to
// Loki. Each rule names only a target label and a replacement, so the label
// is set to that fixed value on every entry.
loki.relabel "add_host" {
  forward_to = [loki.write.loki.receiver]

  // Host the log line originated from.
  rule {
    target_label = "host"
    replacement  = "{{ alloy_instance_label }}"
  }

  // Cluster identity. Falls back to "indri" (the previously hard-coded value)
  // when alloy_cluster_label is not set, so existing hosts render unchanged
  // while other hosts can override it.
  rule {
    target_label = "cluster"
    replacement  = "{{ alloy_cluster_label | default('indri') }}"
  }
}
|
|
|
|
// Final stage of the log pipeline: push every received entry to the Loki
// endpoint configured by the role.
loki.write "loki" {
  endpoint {
    url = "{{ alloy_loki_url }}"
  }
}
|
|
{% endif %}
|