From 52eed44542de0f73efee759b6b609014a3d869ed Mon Sep 17 00:00:00 2001
From: Erich Blume
Date: Sun, 22 Mar 2026 14:21:42 -0700
Subject: [PATCH] C2(deploy-infra-alerting): impl refactor services-check to
 query alerts

Replace covered checks with Grafana alerting API queries:
- ServiceProbeFailure: 11 HTTP endpoints
- TextfileStale: metrics textfile freshness
- FrigateCameraDown: camera FPS
- PodNotReady: pod readiness (both clusters)
- PostgresClusterUnhealthy: database health
- ArgoCDAppOutOfSync: ArgoCD sync status

Uncovered checks remain as direct probes (SSH, launchctl, public
endpoints, k8s API, frigate storage, some HTTP endpoints).

Firing alerts display summary and clickable runbook link.
Grafana credentials fetched from 1Password; graceful fallback
if unavailable.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 mise-tasks/services-check | 180 ++++++++++++++++++++++++++++++----------
 1 file changed, 135 insertions(+), 45 deletions(-)

diff --git a/mise-tasks/services-check b/mise-tasks/services-check
index 94ced03..9ba2c8e 100755
--- a/mise-tasks/services-check
+++ b/mise-tasks/services-check
@@ -6,6 +6,7 @@ set -euo pipefail
 # Colors for output
 RED='\033[0;31m'
 GREEN='\033[0;32m'
+YELLOW='\033[0;33m'
 NC='\033[0m' # No Color
 
 FAILED=0
@@ -36,11 +37,109 @@ check_http() {
     fi
 }
 
+# ============== Grafana Alerting API ==============
+
+GRAFANA_URL="https://grafana.ops.eblu.me"
+GRAFANA_CREDS=""
+
+# Print the Grafana-managed alert list as JSON on stdout.
+# Prints nothing when credentials cannot be read from 1Password or the
+# request fails; callers treat empty output as "no data".
+fetch_alerts() {
+    if [ -z "$GRAFANA_CREDS" ]; then
+        local pass
+        pass=$(op read 'op://vg6xf6vvfmoh5hqjjhlhbeoaie/oxkcr3xtxnewy7noep2izvyr6y/password' 2>/dev/null) || true
+        if [ -n "$pass" ]; then
+            # Strip newlines: GNU base64 wraps its output at 76 columns,
+            # which would split the Authorization header.
+            GRAFANA_CREDS=$(printf '%s' "admin:$pass" | base64 | tr -d '\n')
+        fi
+    fi
+
+    if [ -z "$GRAFANA_CREDS" ]; then
+        return 0
+    fi
+
+    # A failed request yields empty output instead of aborting under set -e.
+    curl -sf --max-time 10 \
+        -H "Authorization: Basic $GRAFANA_CREDS" \
+        "$GRAFANA_URL/api/prometheus/grafana/api/v1/alerts" 2>/dev/null || true
+}
+
+# Fetch all alerts once
+ALERTS_JSON=$(fetch_alerts)
+
+# Report one alert rule as OK, FIRING or NO DATA.
+#   $1 - display name
+#   $2 - value of the alertname label to look for
+#   $3, $4 - optional label key and value that an alert must also carry
+# Sets FAILED=1 when a matching alert is Alerting or Pending.
+check_alert() {
+    local name="$1"
+    local alertname="$2"
+    local filter_key="${3:-}"
+    local filter_value="${4:-}"
+
+    printf "%-24s " "$name..."
+
+    if [ -z "$ALERTS_JSON" ]; then
+        echo -e "${YELLOW}NO DATA${NC} (can't reach Grafana alerting API)"
+        return
+    fi
+
+    # Names and filters go in via argv, never spliced into the Python
+    # source. Output: one "summary<US>runbook_url" line per firing alert.
+    local firing status=0
+    firing=$(printf '%s' "$ALERTS_JSON" | python3 -c '
+import json, sys
+
+alertname, key, value = sys.argv[1:4]
+try:
+    alerts = json.load(sys.stdin).get("data", {}).get("alerts") or []
+except (ValueError, AttributeError):
+    sys.exit(1)
+for a in alerts:
+    labels = a.get("labels", {})
+    if labels.get("alertname") != alertname:
+        continue
+    if key and labels.get(key) != value:
+        continue
+    # Match on the leading word in case Grafana appends a reason suffix.
+    if a.get("state", "").split(" ")[0] in ("Alerting", "Pending"):
+        notes = a.get("annotations", {})
+        fields = (notes.get("summary", ""), notes.get("runbook_url", ""))
+        print("\x1f".join(" ".join(f.split()) for f in fields))
+' "$alertname" "$filter_key" "$filter_value" 2>/dev/null) || status=$?
+
+    # A parse failure must not abort the whole run (set -e) or read as OK.
+    if [ "$status" -ne 0 ]; then
+        echo -e "${YELLOW}NO DATA${NC} (can't parse Grafana alerting API response)"
+        return
+    fi
+
+    if [ -z "$firing" ]; then
+        echo -e "${GREEN}OK${NC}"
+        return
+    fi
+
+    echo -e "${RED}FIRING${NC}"
+    local summary runbook
+    while IFS=$'\x1f' read -r summary runbook; do
+        if [ -n "$summary" ]; then
+            printf '  %s\n' "$summary"
+        fi
+        if [ -n "$runbook" ]; then
+            printf '  Runbook: %s\n' "$runbook"
+        fi
+    done <<< "$firing"
+    FAILED=1
+}
+
 echo "Checking services..."
 echo "===================="
 echo ""
 
-# Local services on indri
+# Local services on indri (not yet covered by alerting)
 echo "Local services on indri:"
 check_service "forgejo (brew)" "ssh indri 'brew services list | grep forgejo | grep started'"
 check_service "alloy" "ssh indri 'launchctl list mcquack.eblume.alloy | grep -v \"^-\"'"
@@ -52,43 +151,47 @@ check_service "minikube-metrics" "ssh indri 'launchctl list mcquack.minikube-met
 check_service "jellyfin-metrics" "ssh indri 'launchctl list mcquack.eblume.jellyfin-metrics | grep -v \"^-\"'"
 
 echo ""
-echo "Metrics textfiles:"
-check_service "borgmatic.prom" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/borgmatic.prom'"
-check_service "zot.prom" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/zot.prom'"
-check_service "minikube.prom" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/minikube.prom'"
-check_service "jellyfin.prom" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/jellyfin.prom'"
+echo "Metrics textfiles (via alerting):"
+check_alert "textfile-freshness" "TextfileStale"
 
 echo ""
-echo "Kubernetes cluster:"
+echo "Kubernetes cluster (not yet covered by alerting):"
 check_service "minikube" "ssh indri 'minikube status --format={{.Host}} | grep -q Running'"
 check_service "k8s-apiserver (indri)" "ssh indri 'kubectl get --raw /healthz'"
 check_service "k8s-apiserver (remote)" "kubectl --kubeconfig=$HOME/.kube/minikube-indri/config.yml --context=minikube-indri get --raw /healthz"
 
 echo ""
-echo "HTTP endpoints (via Caddy):"
-check_http "Prometheus" "https://prometheus.ops.eblu.me/-/healthy"
-check_http "Loki" "https://loki.ops.eblu.me/ready"
-check_http "Grafana" "https://grafana.ops.eblu.me/api/health"
-check_http "ArgoCD" "https://argocd.ops.eblu.me/healthz"
+echo "HTTP endpoints (via alerting):"
+check_alert "Prometheus" "ServiceProbeFailure" "service" "prometheus"
+check_alert "Loki" "ServiceProbeFailure" "service" "loki"
+check_alert "Grafana" "ServiceProbeFailure" "service" "grafana"
+check_alert "ArgoCD" "ServiceProbeFailure" "service" "argocd"
+check_alert "Kiwix" "ServiceProbeFailure" "service" "kiwix"
+check_alert "Miniflux" "ServiceProbeFailure" "service" "miniflux"
+check_alert "TeslaMate" "ServiceProbeFailure" "service" "teslamate"
+check_alert "Devpi" "ServiceProbeFailure" "service" "devpi"
+check_alert "Transmission" "ServiceProbeFailure" "service" "transmission"
+check_alert "Immich" "ServiceProbeFailure" "service" "immich"
+check_alert "Navidrome" "ServiceProbeFailure" "service" "navidrome"
+
+echo ""
+echo "HTTP endpoints (not yet covered by alerting):"
 check_http "Forgejo" "https://forge.eblu.me/"
 check_http "Zot Registry" "https://registry.ops.eblu.me/v2/_catalog"
-check_http "Kiwix" "https://kiwix.ops.eblu.me/"
-check_http "Miniflux" "https://feed.ops.eblu.me/healthcheck"
-check_http "TeslaMate" "https://tesla.ops.eblu.me/"
-check_http "Devpi" "https://pypi.ops.eblu.me/+api"
-check_http "Transmission" "https://torrent.ops.eblu.me/"
-check_http "Immich" "https://photos.ops.eblu.me/"
-check_http "Navidrome" "https://dj.ops.eblu.me/"
 check_http "CV" "https://cv.ops.eblu.me/"
 check_http "Ntfy" "https://ntfy.ops.eblu.me/v1/health"
 check_http "Authentik" "https://authentik.ops.eblu.me/-/health/live/"
 check_http "Frigate" "https://nvr.ops.eblu.me/api/version"
-check_service "frigate-camera-fps" "curl -sf --max-time 5 https://nvr.ops.eblu.me/api/stats | jq -e '.cameras | to_entries | all(.value.camera_fps > 0)'"
-check_service "frigate-storage" "curl -sf --max-time 5 https://nvr.ops.eblu.me/api/stats | jq -e '.service.storage | to_entries | map(select(.key | startswith(\"/media\"))) | length > 0 and all(.[]; .value.free > 0)'"
 check_http "JobSync" "https://jobsync.ops.eblu.me/"
 
 echo ""
-echo "Ringtail (NixOS):"
+echo "Frigate (via alerting):"
+check_alert "camera-fps" "FrigateCameraDown"
+echo "Frigate (not yet covered by alerting):"
+check_service "frigate-storage" "curl -sf --max-time 5 https://nvr.ops.eblu.me/api/stats | jq -e '.service.storage | to_entries | map(select(.key | startswith(\"/media\"))) | length > 0 and all(.[]; .value.free > 0)'"
+
+echo ""
+echo "Ringtail (not yet covered by alerting):"
 check_service "ssh" "ssh -o ConnectTimeout=5 ringtail true"
 check_service "tailscale" "ssh ringtail 'tailscale status --self --json' | jq -e '.Self.Online' > /dev/null"
 check_service "k3s" "ssh ringtail 'KUBECONFIG=/etc/rancher/k3s/k3s.yaml k3s kubectl get nodes --no-headers | grep -q Ready'"
@@ -96,43 +199,30 @@ check_service "k3s-apiserver (remote)" "kubectl --context=k3s-ringtail get --raw
 check_service "forgejo-runner" "ssh ringtail 'systemctl is-active gitea-runner-nix_container_builder.service'"
 
 echo ""
-echo "Ringtail k3s pods:"
-check_service "ntfy" "kubectl --context=k3s-ringtail -n ntfy get pods -l app=ntfy -o jsonpath='{.items[0].status.phase}' | grep -q Running"
-check_service "authentik" "kubectl --context=k3s-ringtail -n authentik get pods -l component=server -o jsonpath='{.items[0].status.phase}' | grep -q Running"
-check_service "frigate" "kubectl --context=k3s-ringtail -n frigate get pods -l app=frigate -o jsonpath='{.items[0].status.phase}' | grep -q Running"
-check_service "frigate-notify" "kubectl --context=k3s-ringtail -n frigate get pods -l app=frigate-notify -o jsonpath='{.items[0].status.phase}' | grep -q Running"
-check_service "nvidia-device-plugin" "kubectl --context=k3s-ringtail -n nvidia-device-plugin get pods -l app=nvidia-device-plugin -o jsonpath='{.items[0].status.phase}' | grep -q Running"
-check_service "jobsync" "kubectl --context=k3s-ringtail -n jobsync get pods -l app=jobsync -o jsonpath='{.items[0].status.phase}' | grep -q Running"
+echo "Pod health (via alerting):"
+check_alert "pod-readiness" "PodNotReady"
 
 echo ""
-echo "Public services (via Fly.io):"
+echo "Database (via alerting):"
+check_alert "PostgreSQL" "PostgresClusterUnhealthy"
+
+echo ""
+echo "Public services (not yet covered by alerting):"
 check_http "Docs (public)" "https://docs.eblu.me/"
 check_http "CV (public)" "https://cv.eblu.me/"
 check_http "Forge (public)" "https://forge.eblu.me/"
 check_http "Fly.io healthz" "https://blumeops-proxy.fly.dev/healthz"
 
 echo ""
-echo "Database:"
-check_service "PostgreSQL (k8s)" "pg_isready -h pg.ops.eblu.me -p 5432"
-
-echo ""
-echo "Indri minikube pods:"
-check_service "prometheus-0" "kubectl --context=minikube-indri -n monitoring get pod prometheus-0 -o jsonpath='{.status.phase}' | grep -q Running"
-check_service "loki-0" "kubectl --context=minikube-indri -n monitoring get pod loki-0 -o jsonpath='{.status.phase}' | grep -q Running"
-check_service "grafana" "kubectl --context=minikube-indri -n monitoring get pods -l app.kubernetes.io/name=grafana -o jsonpath='{.items[0].status.phase}' | grep -q Running"
-check_service "miniflux" "kubectl --context=minikube-indri -n miniflux get pods -l app=miniflux -o jsonpath='{.items[0].status.phase}' | grep -q Running"
-check_service "teslamate" "kubectl --context=minikube-indri -n teslamate get pods -l app=teslamate -o jsonpath='{.items[0].status.phase}' | grep -q Running"
-check_service "blumeops-pg" "kubectl --context=minikube-indri -n databases get pods -l cnpg.io/cluster=blumeops-pg -o jsonpath='{.items[0].status.phase}' | grep -q Running"
-
-echo ""
-echo "ArgoCD app sync status:"
+echo "ArgoCD app sync status (via alerting):"
+check_alert "argocd-sync" "ArgoCDAppOutOfSync"
+# Keep the detailed table as a summary view
 printf "%-20s %-12s %-12s %s\n" "NAME" "SYNC" "HEALTH" "TARGET"
 while read -r name sync health target; do
     if [[ "$sync" == "Synced" ]]; then
         printf "%-20s ${GREEN}%-12s${NC} %-12s %s\n" "$name" "$sync" "$health" "$target"
     elif [[ "$sync" == "OutOfSync" ]]; then
         printf "%-20s ${RED}%-12s${NC} %-12s %s\n" "$name" "$sync" "$health" "$target"
-        FAILED=1
     else
         printf "%-20s %-12s %-12s %s\n" "$name" "$sync" "$health" "$target"
     fi