# Excerpt of mise-tasks/services-check as of patch 94ced03..9ba2c8e
# (re-flowed from a whitespace-collapsed diff; new-side content only).
set -euo pipefail

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
NC='\033[0m' # No Color

FAILED=0

# NOTE(review): the patch context shows only the closing lines of
# check_http here; check_service / check_http themselves are outside this
# excerpt and are intentionally not reproduced.

# ============== Grafana Alerting API ==============

GRAFANA_URL="https://grafana.ops.eblu.me"
GRAFANA_CREDS=""

# Print the raw JSON from Grafana's Prometheus-compatible alerts endpoint.
# Globals:   GRAFANA_URL (read), GRAFANA_CREDS (read; filled from `op` if empty)
# Outputs:   response body on stdout, or nothing when credentials cannot be
#            read or the request fails
# Returns:   always 0 - failure is signalled by empty output so that a
#            caller running under `set -e` is never aborted
fetch_alerts() {
  if [ -z "$GRAFANA_CREDS" ]; then
    local pass
    # Best effort: a missing/locked `op` CLI simply leaves us without creds.
    pass=$(op read 'op://vg6xf6vvfmoh5hqjjhlhbeoaie/oxkcr3xtxnewy7noep2izvyr6y/password' 2>/dev/null) || true
    if [ -n "$pass" ]; then
      # printf instead of `echo -n` (portable), and strip newlines because
      # some base64 implementations wrap long output, which would corrupt
      # the Authorization header.
      GRAFANA_CREDS=$(printf '%s' "admin:$pass" | base64 | tr -d '\n')
    fi
  fi

  if [ -z "$GRAFANA_CREDS" ]; then
    return 0
  fi

  curl -sf --max-time 10 \
    -H "Authorization: Basic $GRAFANA_CREDS" \
    "$GRAFANA_URL/api/prometheus/grafana/api/v1/alerts" 2>/dev/null || true
}

# Fetch all alerts once
ALERTS_JSON=$(fetch_alerts)

# Report whether a Grafana alert is currently firing.
# Globals:   ALERTS_JSON (read), FAILED (set to 1 when the alert is firing),
#            RED / GREEN / YELLOW / NC (read)
# Arguments: $1 - display name printed in the left-hand column
#            $2 - value of the `alertname` label to look for
#            $3 - optional label key to filter on
#            $4 - optional value the label in $3 must have
# Outputs:   one status line (OK / FIRING / NO DATA); for FIRING, the first
#            matching alert's summary and runbook URL when present
# Returns:   always 0
check_alert() {
  local name="$1"
  local alertname="$2"
  # Optional: filter by a label key=value
  local filter_key="${3:-}"
  local filter_value="${4:-}"

  printf "%-24s " "$name..."

  if [ -z "$ALERTS_JSON" ]; then
    echo -e "${YELLOW}NO DATA${NC} (can't reach Grafana alerting API)"
    return 0
  fi

  # The filter values travel as argv, never as Python source text, so quotes
  # or backslashes in them cannot break (or inject into) the script.
  # Output is "<runbook_url>|<summary>" for the first matching alert; the URL
  # goes first so a '|' inside the summary survives the split below.
  local firing
  if ! firing=$(python3 -c '
import json
import sys

alertname, filter_key, filter_value = sys.argv[1:4]
try:
    payload = json.load(sys.stdin)
except ValueError:
    sys.exit(1)
alerts = ((payload or {}).get("data") or {}).get("alerts") or []
for alert in alerts:
    labels = alert.get("labels") or {}
    if labels.get("alertname") != alertname:
        continue
    if filter_key and labels.get(filter_key) != filter_value:
        continue
    if alert.get("state") in ("Alerting", "Pending"):
        notes = alert.get("annotations") or {}
        summary = " ".join(str(notes.get("summary", "")).splitlines())
        print("{}|{}".format(notes.get("runbook_url", ""), summary))
        break
' "$alertname" "$filter_key" "$filter_value" <<<"$ALERTS_JSON" 2>/dev/null); then
    # A failed parse must not abort the whole run under `set -e`; surface it
    # as missing data for this check instead.
    echo -e "${YELLOW}NO DATA${NC} (can't parse Grafana alerting API response)"
    return 0
  fi

  if [ -z "$firing" ]; then
    echo -e "${GREEN}OK${NC}"
    return 0
  fi

  local first runbook summary
  first=${firing%%$'\n'*}
  runbook=${first%%|*}
  summary=${first#*|}

  echo -e "${RED}FIRING${NC}"
  # printf '%s' keeps backslashes in alert text literal (echo -e would not).
  if [ -n "$summary" ]; then
    printf ' %s\n' "$summary"
  fi
  if [ -n "$runbook" ]; then
    printf ' Runbook: %s\n' "$runbook"
  fi
  FAILED=1
}

echo "Checking services..."
echo "====================" echo "" -# Local services on indri +# Local services on indri (not yet covered by alerting) echo "Local services on indri:" check_service "forgejo (brew)" "ssh indri 'brew services list | grep forgejo | grep started'" check_service "alloy" "ssh indri 'launchctl list mcquack.eblume.alloy | grep -v \"^-\"'" @@ -52,43 +130,47 @@ check_service "minikube-metrics" "ssh indri 'launchctl list mcquack.minikube-met check_service "jellyfin-metrics" "ssh indri 'launchctl list mcquack.eblume.jellyfin-metrics | grep -v \"^-\"'" echo "" -echo "Metrics textfiles:" -check_service "borgmatic.prom" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/borgmatic.prom'" -check_service "zot.prom" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/zot.prom'" -check_service "minikube.prom" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/minikube.prom'" -check_service "jellyfin.prom" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/jellyfin.prom'" +echo "Metrics textfiles (via alerting):" +check_alert "textfile-freshness" "TextfileStale" echo "" -echo "Kubernetes cluster:" +echo "Kubernetes cluster (not yet covered by alerting):" check_service "minikube" "ssh indri 'minikube status --format={{.Host}} | grep -q Running'" check_service "k8s-apiserver (indri)" "ssh indri 'kubectl get --raw /healthz'" check_service "k8s-apiserver (remote)" "kubectl --kubeconfig=$HOME/.kube/minikube-indri/config.yml --context=minikube-indri get --raw /healthz" echo "" -echo "HTTP endpoints (via Caddy):" -check_http "Prometheus" "https://prometheus.ops.eblu.me/-/healthy" -check_http "Loki" "https://loki.ops.eblu.me/ready" -check_http "Grafana" "https://grafana.ops.eblu.me/api/health" -check_http "ArgoCD" "https://argocd.ops.eblu.me/healthz" +echo "HTTP endpoints (via alerting):" +check_alert "Prometheus" "ServiceProbeFailure" "service" "prometheus" +check_alert "Loki" "ServiceProbeFailure" "service" "loki" +check_alert "Grafana" 
"ServiceProbeFailure" "service" "grafana" +check_alert "ArgoCD" "ServiceProbeFailure" "service" "argocd" +check_alert "Kiwix" "ServiceProbeFailure" "service" "kiwix" +check_alert "Miniflux" "ServiceProbeFailure" "service" "miniflux" +check_alert "TeslaMate" "ServiceProbeFailure" "service" "teslamate" +check_alert "Devpi" "ServiceProbeFailure" "service" "devpi" +check_alert "Transmission" "ServiceProbeFailure" "service" "transmission" +check_alert "Immich" "ServiceProbeFailure" "service" "immich" +check_alert "Navidrome" "ServiceProbeFailure" "service" "navidrome" + +echo "" +echo "HTTP endpoints (not yet covered by alerting):" check_http "Forgejo" "https://forge.eblu.me/" check_http "Zot Registry" "https://registry.ops.eblu.me/v2/_catalog" -check_http "Kiwix" "https://kiwix.ops.eblu.me/" -check_http "Miniflux" "https://feed.ops.eblu.me/healthcheck" -check_http "TeslaMate" "https://tesla.ops.eblu.me/" -check_http "Devpi" "https://pypi.ops.eblu.me/+api" -check_http "Transmission" "https://torrent.ops.eblu.me/" -check_http "Immich" "https://photos.ops.eblu.me/" -check_http "Navidrome" "https://dj.ops.eblu.me/" check_http "CV" "https://cv.ops.eblu.me/" check_http "Ntfy" "https://ntfy.ops.eblu.me/v1/health" check_http "Authentik" "https://authentik.ops.eblu.me/-/health/live/" check_http "Frigate" "https://nvr.ops.eblu.me/api/version" -check_service "frigate-camera-fps" "curl -sf --max-time 5 https://nvr.ops.eblu.me/api/stats | jq -e '.cameras | to_entries | all(.value.camera_fps > 0)'" -check_service "frigate-storage" "curl -sf --max-time 5 https://nvr.ops.eblu.me/api/stats | jq -e '.service.storage | to_entries | map(select(.key | startswith(\"/media\"))) | length > 0 and all(.[]; .value.free > 0)'" check_http "JobSync" "https://jobsync.ops.eblu.me/" echo "" -echo "Ringtail (NixOS):" +echo "Frigate (via alerting):" +check_alert "camera-fps" "FrigateCameraDown" +echo "Frigate (not yet covered by alerting):" +check_service "frigate-storage" "curl -sf --max-time 5 
https://nvr.ops.eblu.me/api/stats | jq -e '.service.storage | to_entries | map(select(.key | startswith(\"/media\"))) | length > 0 and all(.[]; .value.free > 0)'" + +echo "" +echo "Ringtail (not yet covered by alerting):" check_service "ssh" "ssh -o ConnectTimeout=5 ringtail true" check_service "tailscale" "ssh ringtail 'tailscale status --self --json' | jq -e '.Self.Online' > /dev/null" check_service "k3s" "ssh ringtail 'KUBECONFIG=/etc/rancher/k3s/k3s.yaml k3s kubectl get nodes --no-headers | grep -q Ready'" @@ -96,43 +178,30 @@ check_service "k3s-apiserver (remote)" "kubectl --context=k3s-ringtail get --raw check_service "forgejo-runner" "ssh ringtail 'systemctl is-active gitea-runner-nix_container_builder.service'" echo "" -echo "Ringtail k3s pods:" -check_service "ntfy" "kubectl --context=k3s-ringtail -n ntfy get pods -l app=ntfy -o jsonpath='{.items[0].status.phase}' | grep -q Running" -check_service "authentik" "kubectl --context=k3s-ringtail -n authentik get pods -l component=server -o jsonpath='{.items[0].status.phase}' | grep -q Running" -check_service "frigate" "kubectl --context=k3s-ringtail -n frigate get pods -l app=frigate -o jsonpath='{.items[0].status.phase}' | grep -q Running" -check_service "frigate-notify" "kubectl --context=k3s-ringtail -n frigate get pods -l app=frigate-notify -o jsonpath='{.items[0].status.phase}' | grep -q Running" -check_service "nvidia-device-plugin" "kubectl --context=k3s-ringtail -n nvidia-device-plugin get pods -l app=nvidia-device-plugin -o jsonpath='{.items[0].status.phase}' | grep -q Running" -check_service "jobsync" "kubectl --context=k3s-ringtail -n jobsync get pods -l app=jobsync -o jsonpath='{.items[0].status.phase}' | grep -q Running" +echo "Pod health (via alerting):" +check_alert "pod-readiness" "PodNotReady" echo "" -echo "Public services (via Fly.io):" +echo "Database (via alerting):" +check_alert "PostgreSQL" "PostgresClusterUnhealthy" + +echo "" +echo "Public services (not yet covered by alerting):" 
check_http "Docs (public)" "https://docs.eblu.me/" check_http "CV (public)" "https://cv.eblu.me/" check_http "Forge (public)" "https://forge.eblu.me/" check_http "Fly.io healthz" "https://blumeops-proxy.fly.dev/healthz" echo "" -echo "Database:" -check_service "PostgreSQL (k8s)" "pg_isready -h pg.ops.eblu.me -p 5432" - -echo "" -echo "Indri minikube pods:" -check_service "prometheus-0" "kubectl --context=minikube-indri -n monitoring get pod prometheus-0 -o jsonpath='{.status.phase}' | grep -q Running" -check_service "loki-0" "kubectl --context=minikube-indri -n monitoring get pod loki-0 -o jsonpath='{.status.phase}' | grep -q Running" -check_service "grafana" "kubectl --context=minikube-indri -n monitoring get pods -l app.kubernetes.io/name=grafana -o jsonpath='{.items[0].status.phase}' | grep -q Running" -check_service "miniflux" "kubectl --context=minikube-indri -n miniflux get pods -l app=miniflux -o jsonpath='{.items[0].status.phase}' | grep -q Running" -check_service "teslamate" "kubectl --context=minikube-indri -n teslamate get pods -l app=teslamate -o jsonpath='{.items[0].status.phase}' | grep -q Running" -check_service "blumeops-pg" "kubectl --context=minikube-indri -n databases get pods -l cnpg.io/cluster=blumeops-pg -o jsonpath='{.items[0].status.phase}' | grep -q Running" - -echo "" -echo "ArgoCD app sync status:" +echo "ArgoCD app sync status (via alerting):" +check_alert "argocd-sync" "ArgoCDAppOutOfSync" +# Keep the detailed table as a summary view printf "%-20s %-12s %-12s %s\n" "NAME" "SYNC" "HEALTH" "TARGET" while read -r name sync health target; do if [[ "$sync" == "Synced" ]]; then printf "%-20s ${GREEN}%-12s${NC} %-12s %s\n" "$name" "$sync" "$health" "$target" elif [[ "$sync" == "OutOfSync" ]]; then printf "%-20s ${RED}%-12s${NC} %-12s %s\n" "$name" "$sync" "$health" "$target" - FAILED=1 else printf "%-20s %-12s %-12s %s\n" "$name" "$sync" "$health" "$target" fi