## Summary

Mikado chain to replace `mise run services-check` with Grafana Unified Alerting backed by ntfy push notifications.

**Design:**

- Grafana Unified Alerting evaluates rules against Prometheus/Loki
- ntfy webhook contact point delivers iOS notifications
- Anti-noise policy: page once per 24h per alert group
- Every alert links to a runbook in `docs/how-to/alerts/`
- services-check eventually queries the alerting API instead of doing its own probes

**Chain (bottom-up):**

1. `configure-grafana-alerting-pipeline` — enable alerting, ntfy contact point, notification policy
2. `first-alert-and-runbook` — end-to-end proof of concept with blackbox probe failure
3. `port-services-check-alerts` — migrate all services-check probes to alert rules + runbooks
4. `refactor-services-check-to-query-alerts` — rewrite services-check to query Grafana API
5. `deploy-infra-alerting` — goal card

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Reviewed-on: #303
99 lines
2.8 KiB
YAML
99 lines
2.8 KiB
YAML
# Defaults shared by all jobs; a job may set its own scrape_interval.
global:
  # How often targets are scraped.
  scrape_interval: 15s
  # How often rules are evaluated.
  evaluation_interval: 15s
|
|
|
|
# Indri system metrics are pushed via Alloy remote_write
|
|
# K8s services are scraped directly
|
|
|
|
scrape_configs:
# Sifaka NAS exporters, reached through the Caddy L4 TCP proxy on indri
- job_name: 'node-exporter-sifaka'
  static_configs:
  - targets:
    - 'nas.ops.eblu.me:9100'
  metric_relabel_configs:
  # Set cluster=indri on the series scraped by this job.
  - target_label: cluster
    replacement: indri
|
|
|
|
# smartctl exporter on the Sifaka NAS; scraped every 60s rather than
# the global 15s.
- job_name: 'smartctl-sifaka'
  scrape_interval: 60s
  static_configs:
  - targets:
    - 'nas.ops.eblu.me:9633'
  metric_relabel_configs:
  # Set cluster=indri on the series scraped by this job.
  - target_label: cluster
    replacement: indri
|
|
|
|
# CNPG PostgreSQL metrics, scraped from inside the k8s cluster
- job_name: 'cnpg-postgres'
  static_configs:
  - targets:
    - 'blumeops-pg-metrics-tailscale.databases.svc.cluster.local:9187'
    # Fixed instance name attached to this target.
    labels:
      instance: 'blumeops-pg'
  metric_relabel_configs:
  # Set cluster=indri on the series scraped by this job.
  - target_label: cluster
    replacement: indri
|
|
|
|
# Prometheus scraping its own metrics endpoint
- job_name: 'prometheus'
  static_configs:
  - targets:
    - 'localhost:9090'
  metric_relabel_configs:
  # Set cluster=indri on the series scraped by this job.
  - target_label: cluster
    replacement: indri
|
|
|
|
# Metrics exposed by Loki
- job_name: 'loki'
  static_configs:
  - targets:
    - 'loki.monitoring.svc.cluster.local:3100'
  metric_relabel_configs:
  # Set cluster=indri on the series scraped by this job.
  - target_label: cluster
    replacement: indri
|
|
|
|
# kube-state-metrics: Kubernetes object state (pods, deployments,
# resource usage, etc.)
- job_name: 'kube-state-metrics'
  static_configs:
  - targets:
    - 'kube-state-metrics.monitoring.svc.cluster.local:8080'
  metric_relabel_configs:
  # Set cluster=indri on the series scraped by this job.
  - target_label: cluster
    replacement: indri
|
|
|
|
# Transmission BitTorrent metrics, served by a sidecar exporter
- job_name: 'transmission'
  static_configs:
  - targets:
    - 'transmission.torrent.svc.cluster.local:19091'
  metric_relabel_configs:
  # Set cluster=indri on the series scraped by this job.
  - target_label: cluster
    replacement: indri
|
|
|
|
# Operational metrics from Tempo
- job_name: 'tempo'
  static_configs:
  - targets:
    - 'tempo.monitoring.svc.cluster.local:3200'
  metric_relabel_configs:
  # Set cluster=indri on the series scraped by this job.
  - target_label: cluster
    replacement: indri
|
|
|
|
# UniFi network metrics, exported by UnPoller
- job_name: 'unpoller'
  static_configs:
  - targets:
    - 'unpoller.monitoring.svc.cluster.local:9130'
  metric_relabel_configs:
  # Set cluster=indri on the series scraped by this job.
  - target_label: cluster
    replacement: indri
|
|
|
|
# Application metrics from ArgoCD
- job_name: 'argocd'
  static_configs:
  - targets:
    - 'argocd-metrics.argocd.svc.cluster.local:8082'
  metric_relabel_configs:
  # Set cluster=indri on the series scraped by this job.
  - target_label: cluster
    replacement: indri
|
|
|
|
# Frigate NVR metrics, fetched over HTTPS via Caddy on indri.
# Frigate itself runs on ringtail.
- job_name: 'frigate'
  scheme: https
  metrics_path: /api/metrics
  static_configs:
  - targets:
    - 'nvr.ops.eblu.me'
  metric_relabel_configs:
  # Series from this job are labelled cluster=ringtail rather than indri.
  - target_label: cluster
    replacement: ringtail
|