C2: Deploy infrastructure alerting pipeline (#303)
## Summary Mikado chain to replace `mise run services-check` with Grafana Unified Alerting backed by ntfy push notifications. **Design:** - Grafana Unified Alerting evaluates rules against Prometheus/Loki - ntfy webhook contact point delivers iOS notifications - Anti-noise policy: page once per 24h per alert group - Every alert links to a runbook in `docs/how-to/alerts/` - services-check eventually queries the alerting API instead of doing its own probes **Chain (bottom-up):** 1. `configure-grafana-alerting-pipeline` — enable alerting, ntfy contact point, notification policy 2. `first-alert-and-runbook` — end-to-end proof of concept with blackbox probe failure 3. `port-services-check-alerts` — migrate all services-check probes to alert rules + runbooks 4. `refactor-services-check-to-query-alerts` — rewrite services-check to query Grafana API 5. `deploy-infra-alerting` — goal card 🤖 Generated with [Claude Code](https://claude.com/claude-code) Reviewed-on: #303
This commit is contained in:
parent
f1620abb17
commit
6d65e6928c
20 changed files with 1259 additions and 46 deletions
|
|
@ -6,6 +6,7 @@ set -euo pipefail
|
|||
# Colors for output
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
YELLOW='\033[0;33m'
|
||||
NC='\033[0m' # No Color
|
||||
|
||||
FAILED=0
|
||||
|
|
@ -36,11 +37,88 @@ check_http() {
|
|||
fi
|
||||
}
|
||||
|
||||
# ============== Grafana Alerting API ==============
|
||||
|
||||
# Base URL of the Grafana instance whose alerting API fetch_alerts queries.
GRAFANA_URL="https://grafana.ops.eblu.me"
|
||||
# Base64-encoded "admin:<password>" pair used for HTTP Basic auth.
# Starts empty; fetch_alerts fills it in on first use.
GRAFANA_CREDS=""
|
||||
|
||||
#######################################
# Fetch the current alert list from Grafana's alerting API.
# Globals:
#   GRAFANA_URL   (read)    base URL of the Grafana instance
#   GRAFANA_CREDS (written) base64 "admin:<password>" pair, built on first
#                           use from the password returned by `op read`.
#                           When this function runs inside $(...), the
#                           assignment stays in that subshell.
# Arguments: none
# Outputs:   the raw API response on stdout, or an empty line when no
#            credentials could be obtained or the request failed
# Returns:   0 in every case; callers detect failure by the empty output
#######################################
fetch_alerts() {
  if [ -z "$GRAFANA_CREDS" ]; then
    local pass
    # Best effort: if the password cannot be read we fall through to the
    # "no data" path below instead of aborting the whole script.
    pass=$(op read 'op://vg6xf6vvfmoh5hqjjhlhbeoaie/oxkcr3xtxnewy7noep2izvyr6y/password' 2>/dev/null) || true
    if [ -n "$pass" ]; then
      # printf avoids echo's option/backslash handling. The tr strips the
      # line wrapping GNU base64 applies at 76 columns, which would
      # otherwise put a newline in the middle of the Authorization header.
      GRAFANA_CREDS=$(printf '%s' "admin:$pass" | base64 | tr -d '\n')
    fi
  fi

  if [ -z "$GRAFANA_CREDS" ]; then
    echo ""
    return
  fi

  # -f turns HTTP errors into a non-zero exit so they also yield the empty
  # "no data" output rather than an error page.
  curl -sf --max-time 10 \
    -H "Authorization: Basic $GRAFANA_CREDS" \
    "$GRAFANA_URL/api/prometheus/grafana/api/v1/alerts" 2>/dev/null || echo ""
}
|
||||
|
||||
# Fetch all alerts once; every check_alert call reads this snapshot from
# ALERTS_JSON instead of querying Grafana again.
|
||||
ALERTS_JSON=$(fetch_alerts)
|
||||
|
||||
#######################################
# Report whether a Grafana alert is currently firing, using the alert
# snapshot held in ALERTS_JSON.
# Globals:
#   ALERTS_JSON (read)     JSON document with alerts under .data.alerts
#   RED, GREEN, YELLOW, NC (read) colour escape sequences
#   FAILED      (written)  set to 1 when a matching alert is firing
# Arguments:
#   $1 - display name printed in the left-hand column
#   $2 - value the alert's "alertname" label must equal
#   $3 - optional label key to filter on
#   $4 - optional value that label must equal (used only when $3 is set)
# Outputs:
#   One status line (OK / FIRING / NO DATA) on stdout, followed by the
#   first matching alert's summary and runbook URL when it is firing.
# Returns:
#   0 in every case; failure is signalled through FAILED.
#######################################
check_alert() {
  local name="$1"
  local alertname="$2"
  # Optional: filter by a label key=value
  local filter_key="${3:-}"
  local filter_value="${4:-}"

  printf "%-24s " "$name..."

  if [ -z "$ALERTS_JSON" ]; then
    echo -e "${YELLOW}NO DATA${NC} (can't reach Grafana alerting API)"
    return
  fi

  # The filter values are handed to Python through argv instead of being
  # spliced into the program text, so quotes or backslashes in an alert
  # name cannot break (or inject into) the Python source. The program is
  # single-quoted for the shell, hence double quotes only inside it.
  local py_filter='
import json, sys

alertname, filter_key, filter_value = sys.argv[1:4]
try:
    alerts = json.load(sys.stdin).get("data", {}).get("alerts", [])
except Exception:
    sys.exit(1)

for a in alerts:
    labels = a.get("labels") or {}
    if labels.get("alertname") != alertname:
        continue
    if filter_key and labels.get(filter_key) != filter_value:
        continue
    if a.get("state") in ("Alerting", "Pending"):
        notes = a.get("annotations") or {}
        # Collapse whitespace so the tab separator below stays unambiguous.
        summary = " ".join(str(notes.get("summary", "")).split())
        url = str(notes.get("runbook_url", "")).strip()
        print(summary + "\t" + url)
        break
'

  local firing
  # Testing the assignment inside "if" keeps a Python failure from tripping
  # "set -e" and from being misread as "nothing firing".
  if ! firing=$(python3 -c "$py_filter" \
      "$alertname" "$filter_key" "$filter_value" \
      <<<"$ALERTS_JSON" 2>/dev/null); then
    echo -e "${YELLOW}NO DATA${NC} (can't evaluate Grafana alerting API response)"
    return
  fi

  if [ -z "$firing" ]; then
    echo -e "${GREEN}OK${NC}"
    return
  fi

  # Output is "<summary>TAB<runbook_url>" for the first matching alert.
  local summary="${firing%%$'\t'*}"
  local runbook="${firing#*$'\t'}"

  echo -e "${RED}FIRING${NC}"
  # printf '%s' prints alert-supplied text verbatim; echo -e would
  # interpret backslash sequences contained in it.
  if [ -n "$summary" ]; then
    printf ' %s\n' "$summary"
  fi
  if [ -n "$runbook" ]; then
    printf ' Runbook: %s\n' "$runbook"
  fi
  FAILED=1
}
|
||||
|
||||
echo "Checking services..."
|
||||
echo "===================="
|
||||
echo ""
|
||||
|
||||
# Local services on indri
|
||||
# Local services on indri (not yet covered by alerting)
|
||||
echo "Local services on indri:"
|
||||
check_service "forgejo (brew)" "ssh indri 'brew services list | grep forgejo | grep started'"
|
||||
check_service "alloy" "ssh indri 'launchctl list mcquack.eblume.alloy | grep -v \"^-\"'"
|
||||
|
|
@ -52,43 +130,47 @@ check_service "minikube-metrics" "ssh indri 'launchctl list mcquack.minikube-met
|
|||
check_service "jellyfin-metrics" "ssh indri 'launchctl list mcquack.eblume.jellyfin-metrics | grep -v \"^-\"'"
|
||||
|
||||
echo ""
|
||||
echo "Metrics textfiles:"
|
||||
check_service "borgmatic.prom" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/borgmatic.prom'"
|
||||
check_service "zot.prom" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/zot.prom'"
|
||||
check_service "minikube.prom" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/minikube.prom'"
|
||||
check_service "jellyfin.prom" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/jellyfin.prom'"
|
||||
echo "Metrics textfiles (via alerting):"
|
||||
check_alert "textfile-freshness" "TextfileStale"
|
||||
|
||||
echo ""
|
||||
echo "Kubernetes cluster:"
|
||||
echo "Kubernetes cluster (not yet covered by alerting):"
|
||||
check_service "minikube" "ssh indri 'minikube status --format={{.Host}} | grep -q Running'"
|
||||
check_service "k8s-apiserver (indri)" "ssh indri 'kubectl get --raw /healthz'"
|
||||
check_service "k8s-apiserver (remote)" "kubectl --kubeconfig=$HOME/.kube/minikube-indri/config.yml --context=minikube-indri get --raw /healthz"
|
||||
|
||||
echo ""
|
||||
echo "HTTP endpoints (via Caddy):"
|
||||
check_http "Prometheus" "https://prometheus.ops.eblu.me/-/healthy"
|
||||
check_http "Loki" "https://loki.ops.eblu.me/ready"
|
||||
check_http "Grafana" "https://grafana.ops.eblu.me/api/health"
|
||||
check_http "ArgoCD" "https://argocd.ops.eblu.me/healthz"
|
||||
echo "HTTP endpoints (via alerting):"
|
||||
check_alert "Prometheus" "ServiceProbeFailure" "service" "prometheus"
|
||||
check_alert "Loki" "ServiceProbeFailure" "service" "loki"
|
||||
check_alert "Grafana" "ServiceProbeFailure" "service" "grafana"
|
||||
check_alert "ArgoCD" "ServiceProbeFailure" "service" "argocd"
|
||||
check_alert "Kiwix" "ServiceProbeFailure" "service" "kiwix"
|
||||
check_alert "Miniflux" "ServiceProbeFailure" "service" "miniflux"
|
||||
check_alert "TeslaMate" "ServiceProbeFailure" "service" "teslamate"
|
||||
check_alert "Devpi" "ServiceProbeFailure" "service" "devpi"
|
||||
check_alert "Transmission" "ServiceProbeFailure" "service" "transmission"
|
||||
check_alert "Immich" "ServiceProbeFailure" "service" "immich"
|
||||
check_alert "Navidrome" "ServiceProbeFailure" "service" "navidrome"
|
||||
|
||||
echo ""
|
||||
echo "HTTP endpoints (not yet covered by alerting):"
|
||||
check_http "Forgejo" "https://forge.eblu.me/"
|
||||
check_http "Zot Registry" "https://registry.ops.eblu.me/v2/_catalog"
|
||||
check_http "Kiwix" "https://kiwix.ops.eblu.me/"
|
||||
check_http "Miniflux" "https://feed.ops.eblu.me/healthcheck"
|
||||
check_http "TeslaMate" "https://tesla.ops.eblu.me/"
|
||||
check_http "Devpi" "https://pypi.ops.eblu.me/+api"
|
||||
check_http "Transmission" "https://torrent.ops.eblu.me/"
|
||||
check_http "Immich" "https://photos.ops.eblu.me/"
|
||||
check_http "Navidrome" "https://dj.ops.eblu.me/"
|
||||
check_http "CV" "https://cv.ops.eblu.me/"
|
||||
check_http "Ntfy" "https://ntfy.ops.eblu.me/v1/health"
|
||||
check_http "Authentik" "https://authentik.ops.eblu.me/-/health/live/"
|
||||
check_http "Frigate" "https://nvr.ops.eblu.me/api/version"
|
||||
check_service "frigate-camera-fps" "curl -sf --max-time 5 https://nvr.ops.eblu.me/api/stats | jq -e '.cameras | to_entries | all(.value.camera_fps > 0)'"
|
||||
check_service "frigate-storage" "curl -sf --max-time 5 https://nvr.ops.eblu.me/api/stats | jq -e '.service.storage | to_entries | map(select(.key | startswith(\"/media\"))) | length > 0 and all(.[]; .value.free > 0)'"
|
||||
check_http "JobSync" "https://jobsync.ops.eblu.me/"
|
||||
|
||||
echo ""
|
||||
echo "Ringtail (NixOS):"
|
||||
echo "Frigate (via alerting):"
|
||||
check_alert "camera-fps" "FrigateCameraDown"
|
||||
echo "Frigate (not yet covered by alerting):"
|
||||
check_service "frigate-storage" "curl -sf --max-time 5 https://nvr.ops.eblu.me/api/stats | jq -e '.service.storage | to_entries | map(select(.key | startswith(\"/media\"))) | length > 0 and all(.[]; .value.free > 0)'"
|
||||
|
||||
echo ""
|
||||
echo "Ringtail (not yet covered by alerting):"
|
||||
check_service "ssh" "ssh -o ConnectTimeout=5 ringtail true"
|
||||
check_service "tailscale" "ssh ringtail 'tailscale status --self --json' | jq -e '.Self.Online' > /dev/null"
|
||||
check_service "k3s" "ssh ringtail 'KUBECONFIG=/etc/rancher/k3s/k3s.yaml k3s kubectl get nodes --no-headers | grep -q Ready'"
|
||||
|
|
@ -96,43 +178,30 @@ check_service "k3s-apiserver (remote)" "kubectl --context=k3s-ringtail get --raw
|
|||
check_service "forgejo-runner" "ssh ringtail 'systemctl is-active gitea-runner-nix_container_builder.service'"
|
||||
|
||||
echo ""
|
||||
echo "Ringtail k3s pods:"
|
||||
check_service "ntfy" "kubectl --context=k3s-ringtail -n ntfy get pods -l app=ntfy -o jsonpath='{.items[0].status.phase}' | grep -q Running"
|
||||
check_service "authentik" "kubectl --context=k3s-ringtail -n authentik get pods -l component=server -o jsonpath='{.items[0].status.phase}' | grep -q Running"
|
||||
check_service "frigate" "kubectl --context=k3s-ringtail -n frigate get pods -l app=frigate -o jsonpath='{.items[0].status.phase}' | grep -q Running"
|
||||
check_service "frigate-notify" "kubectl --context=k3s-ringtail -n frigate get pods -l app=frigate-notify -o jsonpath='{.items[0].status.phase}' | grep -q Running"
|
||||
check_service "nvidia-device-plugin" "kubectl --context=k3s-ringtail -n nvidia-device-plugin get pods -l app=nvidia-device-plugin -o jsonpath='{.items[0].status.phase}' | grep -q Running"
|
||||
check_service "jobsync" "kubectl --context=k3s-ringtail -n jobsync get pods -l app=jobsync -o jsonpath='{.items[0].status.phase}' | grep -q Running"
|
||||
echo "Pod health (via alerting):"
|
||||
check_alert "pod-readiness" "PodNotReady"
|
||||
|
||||
echo ""
|
||||
echo "Public services (via Fly.io):"
|
||||
echo "Database (via alerting):"
|
||||
check_alert "PostgreSQL" "PostgresClusterUnhealthy"
|
||||
|
||||
echo ""
|
||||
echo "Public services (not yet covered by alerting):"
|
||||
check_http "Docs (public)" "https://docs.eblu.me/"
|
||||
check_http "CV (public)" "https://cv.eblu.me/"
|
||||
check_http "Forge (public)" "https://forge.eblu.me/"
|
||||
check_http "Fly.io healthz" "https://blumeops-proxy.fly.dev/healthz"
|
||||
|
||||
echo ""
|
||||
echo "Database:"
|
||||
check_service "PostgreSQL (k8s)" "pg_isready -h pg.ops.eblu.me -p 5432"
|
||||
|
||||
echo ""
|
||||
echo "Indri minikube pods:"
|
||||
check_service "prometheus-0" "kubectl --context=minikube-indri -n monitoring get pod prometheus-0 -o jsonpath='{.status.phase}' | grep -q Running"
|
||||
check_service "loki-0" "kubectl --context=minikube-indri -n monitoring get pod loki-0 -o jsonpath='{.status.phase}' | grep -q Running"
|
||||
check_service "grafana" "kubectl --context=minikube-indri -n monitoring get pods -l app.kubernetes.io/name=grafana -o jsonpath='{.items[0].status.phase}' | grep -q Running"
|
||||
check_service "miniflux" "kubectl --context=minikube-indri -n miniflux get pods -l app=miniflux -o jsonpath='{.items[0].status.phase}' | grep -q Running"
|
||||
check_service "teslamate" "kubectl --context=minikube-indri -n teslamate get pods -l app=teslamate -o jsonpath='{.items[0].status.phase}' | grep -q Running"
|
||||
check_service "blumeops-pg" "kubectl --context=minikube-indri -n databases get pods -l cnpg.io/cluster=blumeops-pg -o jsonpath='{.items[0].status.phase}' | grep -q Running"
|
||||
|
||||
echo ""
|
||||
echo "ArgoCD app sync status:"
|
||||
echo "ArgoCD app sync status (via alerting):"
|
||||
check_alert "argocd-sync" "ArgoCDAppOutOfSync"
|
||||
# Keep the detailed table as a summary view
|
||||
printf "%-20s %-12s %-12s %s\n" "NAME" "SYNC" "HEALTH" "TARGET"
|
||||
while read -r name sync health target; do
|
||||
if [[ "$sync" == "Synced" ]]; then
|
||||
printf "%-20s ${GREEN}%-12s${NC} %-12s %s\n" "$name" "$sync" "$health" "$target"
|
||||
elif [[ "$sync" == "OutOfSync" ]]; then
|
||||
printf "%-20s ${RED}%-12s${NC} %-12s %s\n" "$name" "$sync" "$health" "$target"
|
||||
FAILED=1
|
||||
else
|
||||
printf "%-20s %-12s %-12s %s\n" "$name" "$sync" "$health" "$target"
|
||||
fi
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue