C2(deploy-infra-alerting): impl refactor services-check to query alerts

Replace covered checks with Grafana alerting API queries:
- ServiceProbeFailure: 11 HTTP endpoints
- TextfileStale: metrics textfile freshness
- FrigateCameraDown: camera FPS
- PodNotReady: pod readiness (both clusters)
- PostgresClusterUnhealthy: database health
- ArgoCDAppOutOfSync: ArgoCD sync status

Uncovered checks remain as direct probes (SSH, launchctl, public
endpoints, k8s API, frigate storage, some HTTP endpoints).

Firing alerts display summary and clickable runbook link.
Grafana credentials fetched from 1Password; graceful fallback
if unavailable.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Erich Blume 2026-03-22 14:21:42 -07:00
commit 52eed44542

View file

@ -6,6 +6,7 @@ set -euo pipefail
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
NC='\033[0m' # No Color
FAILED=0
@ -36,11 +37,88 @@ check_http() {
fi
}
# ============== Grafana Alerting API ==============
# Base URL of the Grafana instance whose alerting API is queried.
GRAFANA_URL="https://grafana.ops.eblu.me"
# Base64 "user:password" token for HTTP Basic auth; filled in lazily by
# fetch_alerts. May be pre-set by the caller to skip the 1Password lookup.
GRAFANA_CREDS=""

#######################################
# Print the raw JSON returned by Grafana's Prometheus-compatible alerts
# endpoint, or an empty line when credentials or the request are unavailable.
# Globals:   GRAFANA_URL (read), GRAFANA_CREDS (read; written when empty)
# Arguments: none
# Outputs:   alert JSON on stdout; a bare newline on any failure
# Returns:   0 in every case, so callers running under `set -e` keep going
#
# NOTE: when this is invoked through command substitution, the assignment to
# GRAFANA_CREDS happens in a subshell and is not visible to the parent shell,
# so the credential lookup is repeated on every such call.
#######################################
fetch_alerts() {
  if [ -z "$GRAFANA_CREDS" ]; then
    local pass
    # Best effort: a missing/locked `op` CLI must not abort the whole script.
    pass=$(op read 'op://vg6xf6vvfmoh5hqjjhlhbeoaie/oxkcr3xtxnewy7noep2izvyr6y/password' 2>/dev/null) || true
    if [ -n "$pass" ]; then
      # printf instead of `echo -n` (not portable across shells), and strip
      # newlines because GNU base64 wraps its output at 76 columns, which
      # would put a line break inside the Authorization header for long
      # passwords.
      GRAFANA_CREDS=$(printf '%s' "admin:$pass" | base64 | tr -d '\n')
    fi
  fi

  if [ -z "$GRAFANA_CREDS" ]; then
    echo ""
    return 0
  fi

  curl -sf --max-time 10 \
    -H "Authorization: Basic $GRAFANA_CREDS" \
    "$GRAFANA_URL/api/prometheus/grafana/api/v1/alerts" 2>/dev/null || echo ""
}
# Fetch all alerts once up front; check_alert reads this cached JSON rather
# than querying the API per check. The value is empty when fetch_alerts had no
# credentials or the request failed, which check_alert reports as NO DATA.
ALERTS_JSON=$(fetch_alerts)
#######################################
# Report whether a Grafana alert is currently firing, using the JSON cached
# in ALERTS_JSON.
# Globals:   ALERTS_JSON (read), RED/GREEN/YELLOW/NC (read), FAILED (set to 1
#            when a matching alert is firing)
# Arguments: $1 - display name printed in the left column
#            $2 - value of the `alertname` label to look for
#            $3 - optional label key to filter on
#            $4 - optional label value required for that key
# Outputs:   one status line (OK / FIRING / NO DATA); for FIRING, the summary
#            and runbook URL of the first matching alert, when present
# Returns:   0 always; failure is signalled through FAILED
#######################################
check_alert() {
  local name="$1"
  local alertname="$2"
  # Optional: filter by a label key=value
  local filter_key="${3:-}"
  local filter_value="${4:-}"

  printf "%-24s " "$name..."

  if [ -z "$ALERTS_JSON" ]; then
    echo -e "${YELLOW}NO DATA${NC} (can't reach Grafana alerting API)"
    return
  fi

  # The alert name and label filter are handed to Python as argv rather than
  # spliced into the program text, so quotes or backslashes in them cannot
  # break (or inject into) the Python source. The exit status is captured
  # explicitly: under `set -e`/`pipefail` an unguarded failure here would
  # abort the whole script in the middle of a status line.
  local firing parse_status=0
  firing=$(python3 -c '
import json, sys

alertname, filter_key, filter_value = sys.argv[1:4]
try:
    alerts = json.load(sys.stdin).get("data", {}).get("alerts", [])
    for alert in alerts:
        labels = alert.get("labels") or {}
        if labels.get("alertname") != alertname:
            continue
        if filter_key and labels.get(filter_key) != filter_value:
            continue
        # Prefix match: plain "Alerting"/"Pending" as before, and also a state
        # that carries a trailing reason such as "Alerting (NoData)".
        # NOTE(review): confirm the exact state strings Grafana emits.
        state = str(alert.get("state", ""))
        if not state.startswith(("Alerting", "Pending")):
            continue
        notes = alert.get("annotations") or {}
        # Collapse whitespace so the summary holds no tab or newline; the tab
        # written below is then an unambiguous field separator.
        summary = " ".join(str(notes.get("summary", "")).split())
        runbook = str(notes.get("runbook_url", "")).strip()
        print(summary + "\t" + runbook)
except Exception:
    sys.exit(1)
' "$alertname" "$filter_key" "$filter_value" <<<"$ALERTS_JSON" 2>/dev/null) || parse_status=$?

  if [ "$parse_status" -ne 0 ]; then
    # Unusable alert data is not proof of health; do not print a green OK.
    echo -e "${YELLOW}NO DATA${NC} (can't parse Grafana alerting API response)"
    return
  fi

  if [ -z "$firing" ]; then
    echo -e "${GREEN}OK${NC}"
    return
  fi

  # Only the first firing instance is described, matching prior behaviour.
  local first summary runbook
  first=${firing%%$'\n'*}
  summary=${first%%$'\t'*}
  runbook=${first#*$'\t'}

  echo -e "${RED}FIRING${NC}"
  # printf '%s' keeps backslashes in alert text literal (echo -e would not).
  if [ -n "$summary" ]; then
    printf ' %s\n' "$summary"
  fi
  if [ -n "$runbook" ]; then
    printf ' Runbook: %s\n' "$runbook"
  fi
  FAILED=1
}
echo "Checking services..."
echo "===================="
echo ""
# Local services on indri
# Local services on indri (not yet covered by alerting)
echo "Local services on indri:"
check_service "forgejo (brew)" "ssh indri 'brew services list | grep forgejo | grep started'"
check_service "alloy" "ssh indri 'launchctl list mcquack.eblume.alloy | grep -v \"^-\"'"
@ -52,43 +130,47 @@ check_service "minikube-metrics" "ssh indri 'launchctl list mcquack.minikube-met
check_service "jellyfin-metrics" "ssh indri 'launchctl list mcquack.eblume.jellyfin-metrics | grep -v \"^-\"'"
echo ""
echo "Metrics textfiles:"
check_service "borgmatic.prom" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/borgmatic.prom'"
check_service "zot.prom" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/zot.prom'"
check_service "minikube.prom" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/minikube.prom'"
check_service "jellyfin.prom" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/jellyfin.prom'"
echo "Metrics textfiles (via alerting):"
check_alert "textfile-freshness" "TextfileStale"
echo ""
echo "Kubernetes cluster:"
echo "Kubernetes cluster (not yet covered by alerting):"
check_service "minikube" "ssh indri 'minikube status --format={{.Host}} | grep -q Running'"
check_service "k8s-apiserver (indri)" "ssh indri 'kubectl get --raw /healthz'"
check_service "k8s-apiserver (remote)" "kubectl --kubeconfig=$HOME/.kube/minikube-indri/config.yml --context=minikube-indri get --raw /healthz"
echo ""
echo "HTTP endpoints (via Caddy):"
check_http "Prometheus" "https://prometheus.ops.eblu.me/-/healthy"
check_http "Loki" "https://loki.ops.eblu.me/ready"
check_http "Grafana" "https://grafana.ops.eblu.me/api/health"
check_http "ArgoCD" "https://argocd.ops.eblu.me/healthz"
echo "HTTP endpoints (via alerting):"
check_alert "Prometheus" "ServiceProbeFailure" "service" "prometheus"
check_alert "Loki" "ServiceProbeFailure" "service" "loki"
check_alert "Grafana" "ServiceProbeFailure" "service" "grafana"
check_alert "ArgoCD" "ServiceProbeFailure" "service" "argocd"
check_alert "Kiwix" "ServiceProbeFailure" "service" "kiwix"
check_alert "Miniflux" "ServiceProbeFailure" "service" "miniflux"
check_alert "TeslaMate" "ServiceProbeFailure" "service" "teslamate"
check_alert "Devpi" "ServiceProbeFailure" "service" "devpi"
check_alert "Transmission" "ServiceProbeFailure" "service" "transmission"
check_alert "Immich" "ServiceProbeFailure" "service" "immich"
check_alert "Navidrome" "ServiceProbeFailure" "service" "navidrome"
echo ""
echo "HTTP endpoints (not yet covered by alerting):"
check_http "Forgejo" "https://forge.eblu.me/"
check_http "Zot Registry" "https://registry.ops.eblu.me/v2/_catalog"
check_http "Kiwix" "https://kiwix.ops.eblu.me/"
check_http "Miniflux" "https://feed.ops.eblu.me/healthcheck"
check_http "TeslaMate" "https://tesla.ops.eblu.me/"
check_http "Devpi" "https://pypi.ops.eblu.me/+api"
check_http "Transmission" "https://torrent.ops.eblu.me/"
check_http "Immich" "https://photos.ops.eblu.me/"
check_http "Navidrome" "https://dj.ops.eblu.me/"
check_http "CV" "https://cv.ops.eblu.me/"
check_http "Ntfy" "https://ntfy.ops.eblu.me/v1/health"
check_http "Authentik" "https://authentik.ops.eblu.me/-/health/live/"
check_http "Frigate" "https://nvr.ops.eblu.me/api/version"
check_service "frigate-camera-fps" "curl -sf --max-time 5 https://nvr.ops.eblu.me/api/stats | jq -e '.cameras | to_entries | all(.value.camera_fps > 0)'"
check_service "frigate-storage" "curl -sf --max-time 5 https://nvr.ops.eblu.me/api/stats | jq -e '.service.storage | to_entries | map(select(.key | startswith(\"/media\"))) | length > 0 and all(.[]; .value.free > 0)'"
check_http "JobSync" "https://jobsync.ops.eblu.me/"
echo ""
echo "Ringtail (NixOS):"
echo "Frigate (via alerting):"
check_alert "camera-fps" "FrigateCameraDown"
echo "Frigate (not yet covered by alerting):"
check_service "frigate-storage" "curl -sf --max-time 5 https://nvr.ops.eblu.me/api/stats | jq -e '.service.storage | to_entries | map(select(.key | startswith(\"/media\"))) | length > 0 and all(.[]; .value.free > 0)'"
echo ""
echo "Ringtail (not yet covered by alerting):"
check_service "ssh" "ssh -o ConnectTimeout=5 ringtail true"
check_service "tailscale" "ssh ringtail 'tailscale status --self --json' | jq -e '.Self.Online' > /dev/null"
check_service "k3s" "ssh ringtail 'KUBECONFIG=/etc/rancher/k3s/k3s.yaml k3s kubectl get nodes --no-headers | grep -q Ready'"
@ -96,43 +178,30 @@ check_service "k3s-apiserver (remote)" "kubectl --context=k3s-ringtail get --raw
check_service "forgejo-runner" "ssh ringtail 'systemctl is-active gitea-runner-nix_container_builder.service'"
echo ""
echo "Ringtail k3s pods:"
check_service "ntfy" "kubectl --context=k3s-ringtail -n ntfy get pods -l app=ntfy -o jsonpath='{.items[0].status.phase}' | grep -q Running"
check_service "authentik" "kubectl --context=k3s-ringtail -n authentik get pods -l component=server -o jsonpath='{.items[0].status.phase}' | grep -q Running"
check_service "frigate" "kubectl --context=k3s-ringtail -n frigate get pods -l app=frigate -o jsonpath='{.items[0].status.phase}' | grep -q Running"
check_service "frigate-notify" "kubectl --context=k3s-ringtail -n frigate get pods -l app=frigate-notify -o jsonpath='{.items[0].status.phase}' | grep -q Running"
check_service "nvidia-device-plugin" "kubectl --context=k3s-ringtail -n nvidia-device-plugin get pods -l app=nvidia-device-plugin -o jsonpath='{.items[0].status.phase}' | grep -q Running"
check_service "jobsync" "kubectl --context=k3s-ringtail -n jobsync get pods -l app=jobsync -o jsonpath='{.items[0].status.phase}' | grep -q Running"
echo "Pod health (via alerting):"
check_alert "pod-readiness" "PodNotReady"
echo ""
echo "Public services (via Fly.io):"
echo "Database (via alerting):"
check_alert "PostgreSQL" "PostgresClusterUnhealthy"
echo ""
echo "Public services (not yet covered by alerting):"
check_http "Docs (public)" "https://docs.eblu.me/"
check_http "CV (public)" "https://cv.eblu.me/"
check_http "Forge (public)" "https://forge.eblu.me/"
check_http "Fly.io healthz" "https://blumeops-proxy.fly.dev/healthz"
echo ""
echo "Database:"
check_service "PostgreSQL (k8s)" "pg_isready -h pg.ops.eblu.me -p 5432"
echo ""
echo "Indri minikube pods:"
check_service "prometheus-0" "kubectl --context=minikube-indri -n monitoring get pod prometheus-0 -o jsonpath='{.status.phase}' | grep -q Running"
check_service "loki-0" "kubectl --context=minikube-indri -n monitoring get pod loki-0 -o jsonpath='{.status.phase}' | grep -q Running"
check_service "grafana" "kubectl --context=minikube-indri -n monitoring get pods -l app.kubernetes.io/name=grafana -o jsonpath='{.items[0].status.phase}' | grep -q Running"
check_service "miniflux" "kubectl --context=minikube-indri -n miniflux get pods -l app=miniflux -o jsonpath='{.items[0].status.phase}' | grep -q Running"
check_service "teslamate" "kubectl --context=minikube-indri -n teslamate get pods -l app=teslamate -o jsonpath='{.items[0].status.phase}' | grep -q Running"
check_service "blumeops-pg" "kubectl --context=minikube-indri -n databases get pods -l cnpg.io/cluster=blumeops-pg -o jsonpath='{.items[0].status.phase}' | grep -q Running"
echo ""
echo "ArgoCD app sync status:"
echo "ArgoCD app sync status (via alerting):"
check_alert "argocd-sync" "ArgoCDAppOutOfSync"
# Keep the detailed table as a summary view
printf "%-20s %-12s %-12s %s\n" "NAME" "SYNC" "HEALTH" "TARGET"
while read -r name sync health target; do
if [[ "$sync" == "Synced" ]]; then
printf "%-20s ${GREEN}%-12s${NC} %-12s %s\n" "$name" "$sync" "$health" "$target"
elif [[ "$sync" == "OutOfSync" ]]; then
printf "%-20s ${RED}%-12s${NC} %-12s %s\n" "$name" "$sync" "$health" "$target"
FAILED=1
else
printf "%-20s %-12s %-12s %s\n" "$name" "$sync" "$health" "$target"
fi