#!/usr/bin/env bash
#MISE description="Check that all services are online and responding"
#
# Runs a series of health checks and prints one status line per service.
# Checks are either direct probes (ssh / kubectl / curl) or lookups against
# the alert list fetched once from the Grafana alerting API.
#
# Exit status: 0 when every check passed, 1 when at least one check failed.
# A "NO DATA" result (alert list unavailable) is reported but does not count
# as a failure.
#
# External tools used: ssh, curl, jq, kubectl, python3, base64, op.

set -euo pipefail

# Colors for output
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[0;33m'
readonly NC='\033[0m' # No Color

# Set to 1 by any failing check; decides the final exit status.
FAILED=0

#######################################
# Run a shell command string and report OK / FAILED for it.
# Globals:   FAILED (written), GREEN, RED, NC (read)
# Arguments: $1 - display name
#            $2 - command line to evaluate; may contain pipes and quoting
# Outputs:   one status line on stdout; the command's own output is discarded
#######################################
check_service() {
  local name="$1"
  local check_cmd="$2"

  printf "%-24s " "$name..."
  # eval is intentional: callers pass complete pipelines as a single string.
  # Only literal strings written in this file are passed here; never feed it
  # untrusted input.
  if eval "$check_cmd" > /dev/null 2>&1; then
    echo -e "${GREEN}OK${NC}"
  else
    echo -e "${RED}FAILED${NC}"
    FAILED=1
  fi
}

#######################################
# Probe a URL with curl and report OK / FAILED.
# curl runs with -f, so HTTP error statuses count as failure, and with a
# 5 second overall timeout.
# Globals:   FAILED (written), GREEN, RED, NC (read)
# Arguments: $1 - display name
#            $2 - URL to fetch
# Outputs:   one status line on stdout
#######################################
check_http() {
  local name="$1"
  local url="$2"

  printf "%-24s " "$name..."
  if curl -sf --max-time 5 "$url" > /dev/null 2>&1; then
    echo -e "${GREEN}OK${NC}"
  else
    echo -e "${RED}FAILED${NC}"
    FAILED=1
  fi
}

# ============== Grafana Alerting API ==============
readonly GRAFANA_URL="https://grafana.ops.eblu.me"
GRAFANA_CREDS=""

#######################################
# Fetch the current alert list from the Grafana alerting API.
# The admin password is read with `op read`; if it cannot be read, or the
# request fails, nothing is printed and the function still returns 0 so the
# caller can treat empty output as "no data".
# Globals:   GRAFANA_CREDS (read/written), GRAFANA_URL (read)
# Outputs:   raw JSON response on stdout, or nothing on failure
#######################################
fetch_alerts() {
  if [[ -z "$GRAFANA_CREDS" ]]; then
    local pass
    # Best effort: a missing or locked `op` session just means no alert data.
    pass=$(op read 'op://vg6xf6vvfmoh5hqjjhlhbeoaie/oxkcr3xtxnewy7noep2izvyr6y/password' 2>/dev/null) || true
    if [[ -n "$pass" ]]; then
      # Some base64 implementations wrap long output; strip newlines so the
      # Authorization header always stays on a single line.
      GRAFANA_CREDS=$(printf '%s' "admin:$pass" | base64 | tr -d '\n')
    fi
  fi

  if [[ -z "$GRAFANA_CREDS" ]]; then
    return 0
  fi

  curl -sf --max-time 10 \
    -H "Authorization: Basic $GRAFANA_CREDS" \
    "$GRAFANA_URL/api/prometheus/grafana/api/v1/alerts" 2>/dev/null || true
}

# Fetch all alerts once
ALERTS_JSON=$(fetch_alerts)

#######################################
# Report whether a named alert is firing according to ALERTS_JSON.
# An alert counts as firing when its state is "Alerting" or "Pending". Only
# the first matching alert's summary and runbook URL are shown.
# Globals:   ALERTS_JSON (read), FAILED (written), color constants (read)
# Arguments: $1 - display name
#            $2 - value the alert's "alertname" label must equal
#            $3 - optional label key to filter on
#            $4 - optional value that label must equal
# Outputs:   status line on stdout, plus summary / runbook lines when firing
#######################################
check_alert() {
  local name="$1"
  local alertname="$2"
  # Optional: filter by a label key=value
  local filter_key="${3:-}"
  local filter_value="${4:-}"

  printf "%-24s " "$name..."

  if [[ -z "$ALERTS_JSON" ]]; then
    echo -e "${YELLOW}NO DATA${NC} (can't reach Grafana alerting API)"
    return
  fi

  # The filter values are passed as arguments rather than spliced into the
  # Python source, so quotes or other special characters in them cannot alter
  # the program. The assignment sits inside `if !` so that a parse failure is
  # reported here instead of silently terminating the script under `set -e`.
  local firing
  if ! firing=$(
    printf '%s' "$ALERTS_JSON" | python3 -c '
import json
import sys

alertname, filter_key, filter_value = sys.argv[1:4]

try:
    data = json.load(sys.stdin)
except Exception:
    sys.exit(1)

alerts = data.get("data", {}).get("alerts", [])
for a in alerts:
    labels = a.get("labels") or {}
    if labels.get("alertname") != alertname:
        continue
    if filter_key and labels.get(filter_key) != filter_value:
        continue
    if a.get("state") in ("Alerting", "Pending"):
        annotations = a.get("annotations") or {}
        url = annotations.get("runbook_url", "")
        summary = annotations.get("summary", "")
        print(f"{summary}|{url}")
' "$alertname" "$filter_key" "$filter_value" 2>/dev/null
  ); then
    echo -e "${YELLOW}NO DATA${NC} (can't parse Grafana alerting API response)"
    return
  fi

  if [[ -z "$firing" ]]; then
    echo -e "${GREEN}OK${NC}"
    return
  fi

  # Only the first firing alert is reported. Split on the LAST '|' so a
  # summary that itself contains '|' is not truncated.
  local first_line="${firing%%$'\n'*}"
  local summary="${first_line%|*}"
  local runbook="${first_line##*|}"

  echo -e "${RED}FIRING${NC}"
  # printf '%s' keeps backslashes in alert text literal.
  if [[ -n "$summary" ]]; then
    printf ' %s\n' "$summary"
  fi
  if [[ -n "$runbook" ]]; then
    printf ' Runbook: %s\n' "$runbook"
  fi
  FAILED=1
}

echo "Checking services..."
echo "===================="
echo ""

# Local services on indri (not yet covered by alerting)
echo "Local services on indri:"
check_service "forgejo" "ssh indri 'launchctl list mcquack.eblume.forgejo | grep -v \"^-\"'"
check_service "alloy" "ssh indri 'launchctl list mcquack.eblume.alloy | grep -v \"^-\"'"
check_service "borgmatic" "ssh indri 'launchctl list mcquack.eblume.borgmatic | grep -v \"^-\"'"
check_service "borgmatic-metrics" "ssh indri 'launchctl list mcquack.borgmatic-metrics | grep -v \"^-\"'"
check_service "zot" "ssh indri 'launchctl list mcquack.eblume.zot | grep -v \"^-\"'"
check_service "zot-metrics" "ssh indri 'launchctl list mcquack.zot-metrics | grep -v \"^-\"'"
check_service "minikube-metrics" "ssh indri 'launchctl list mcquack.minikube-metrics | grep -v \"^-\"'"
check_service "jellyfin-metrics" "ssh indri 'launchctl list mcquack.eblume.jellyfin-metrics | grep -v \"^-\"'"

echo ""
echo "Metrics textfiles (via alerting):"
check_alert "textfile-freshness" "TextfileStale"

echo ""
echo "Kubernetes cluster (not yet covered by alerting):"
check_service "minikube" "ssh indri 'minikube status --format={{.Host}} | grep -q Running'"
check_service "k8s-apiserver (indri)" "ssh indri 'kubectl get --raw /healthz'"
check_service "k8s-apiserver (remote)" "kubectl --kubeconfig=$HOME/.kube/minikube-indri/config.yml --context=minikube-indri get --raw /healthz"

echo ""
echo "HTTP endpoints (via alerting):"
check_alert "Prometheus" "ServiceProbeFailure" "service" "prometheus"
check_alert "Loki" "ServiceProbeFailure" "service" "loki"
check_alert "Grafana" "ServiceProbeFailure" "service" "grafana"
check_alert "ArgoCD" "ServiceProbeFailure" "service" "argocd"
check_alert "Kiwix" "ServiceProbeFailure" "service" "kiwix"
check_alert "Miniflux" "ServiceProbeFailure" "service" "miniflux"
check_alert "TeslaMate" "ServiceProbeFailure" "service" "teslamate"
check_alert "Devpi" "ServiceProbeFailure" "service" "devpi"
check_alert "Transmission" "ServiceProbeFailure" "service" "transmission"
check_alert "Immich" "ServiceProbeFailure" "service" "immich"
check_alert "Navidrome" "ServiceProbeFailure" "service" "navidrome"

echo ""
echo "HTTP endpoints (not yet covered by alerting):"
check_http "Forgejo" "https://forge.eblu.me/"
check_http "Zot Registry" "https://registry.ops.eblu.me/v2/_catalog"
check_http "CV" "https://cv.ops.eblu.me/"
check_http "Ntfy" "https://ntfy.ops.eblu.me/v1/health"
check_http "Authentik" "https://authentik.ops.eblu.me/-/health/live/"
check_http "Frigate" "https://nvr.ops.eblu.me/api/version"

echo ""
echo "Frigate (via alerting):"
check_alert "camera-fps" "FrigateCameraDown"
echo "Frigate (not yet covered by alerting):"
check_service "frigate-storage" "curl -sf --max-time 5 https://nvr.ops.eblu.me/api/stats | jq -e '.service.storage | to_entries | map(select(.key | startswith(\"/media\"))) | length > 0 and all(.[]; .value.free > 0)'"

echo ""
echo "Ringtail (not yet covered by alerting):"
check_service "ssh" "ssh -o ConnectTimeout=5 ringtail true"
check_service "tailscale" "ssh ringtail 'tailscale status --self --json' | jq -e '.Self.Online' > /dev/null"
check_service "k3s" "ssh ringtail 'KUBECONFIG=/etc/rancher/k3s/k3s.yaml k3s kubectl get nodes --no-headers | grep -q Ready'"
check_service "k3s-apiserver (remote)" "kubectl --context=k3s-ringtail get --raw /healthz"
check_service "forgejo-runner" "ssh ringtail 'systemctl is-active gitea-runner-nix_container_builder.service'"

echo ""
echo "Pod health (via alerting):"
check_alert "pod-readiness" "PodNotReady"

echo ""
echo "Database (via alerting):"
check_alert "PostgreSQL" "PostgresClusterUnhealthy"

echo ""
echo "Public services (not yet covered by alerting):"
check_http "Docs (public)" "https://docs.eblu.me/"
check_http "CV (public)" "https://cv.eblu.me/"
check_http "Forge (public)" "https://forge.eblu.me/"
check_http "Fly.io healthz" "https://blumeops-proxy.fly.dev/healthz"

echo ""
echo "ArgoCD app sync status (via alerting):"
check_alert "argocd-sync" "ArgoCDAppOutOfSync"

# Keep the detailed table as a summary view
printf "%-20s %-12s %-12s %s\n" "NAME" "SYNC" "HEALTH" "TARGET"
# Process substitution keeps the loop in the current shell; kubectl errors are
# hidden, so an unreachable cluster simply yields an empty table.
while read -r name sync health target; do
  if [[ "$sync" == "Synced" ]]; then
    printf "%-20s ${GREEN}%-12s${NC} %-12s %s\n" "$name" "$sync" "$health" "$target"
  elif [[ "$sync" == "OutOfSync" ]]; then
    printf "%-20s ${RED}%-12s${NC} %-12s %s\n" "$name" "$sync" "$health" "$target"
  else
    printf "%-20s %-12s %-12s %s\n" "$name" "$sync" "$health" "$target"
  fi
done < <(kubectl --context=minikube-indri get applications -n argocd --no-headers -o custom-columns='NAME:.metadata.name,SYNC:.status.sync.status,HEALTH:.status.health.status,TARGET:.spec.source.targetRevision' 2>/dev/null)

echo ""
if ((FAILED == 0)); then
  echo -e "${GREEN}All services healthy!${NC}"
  exit 0
else
  echo -e "${RED}Some services failed health check${NC}"
  exit 1
fi