Replace covered checks with Grafana alerting API queries:
- ServiceProbeFailure: 11 HTTP endpoints
- TextfileStale: metrics textfile freshness
- FrigateCameraDown: camera FPS
- PodNotReady: pod readiness (both clusters)
- PostgresClusterUnhealthy: database health
- ArgoCDAppOutOfSync: ArgoCD sync status

Uncovered checks remain as direct probes (SSH, launchctl, public endpoints, k8s API, frigate storage, some HTTP endpoints).

Firing alerts display their summary and a clickable runbook link. Grafana credentials are fetched from 1Password, with a graceful fallback if they are unavailable.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
217 lines
7.7 KiB
Bash
Executable file
217 lines
7.7 KiB
Bash
Executable file
#!/usr/bin/env bash
|
|
#MISE description="Check that all services are online and responding"
|
|
|
|
# Strict mode: abort on unhandled command failures, on use of unset variables,
# and when any stage of a pipeline fails.
set -euo pipefail

# Colors for output
# Stored as literal backslash sequences; they only turn into real escape
# characters when printed through `echo -e` or used inside a printf format.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[0;33m'
NC='\033[0m' # No Color

# Overall result flag: the check helpers set it to 1 on any failure and the
# final summary uses it to choose the script's exit status.
FAILED=0
|
|
|
|
#######################################
# Run an arbitrary shell command and report whether it succeeded.
# Globals:   GREEN, RED, NC (read); FAILED (set to 1 on failure)
# Arguments: $1 - label printed in the left-hand status column
#            $2 - command string; executed with eval so it may contain
#                 pipelines and nested quoting. Pass trusted text only.
# Outputs:   "<label>... OK" or "<label>... FAILED" on stdout; the command's
#            own stdout and stderr are discarded.
#######################################
check_service() {
  # NOTE: the eval'd command runs in this function's scope and can see these
  # locals, so their names are part of the de-facto contract.
  local name="$1"
  local check_cmd="$2"

  printf "%-24s " "$name..."
  # Running the command as an `if` condition means a non-zero status is
  # reported here instead of tripping `set -e`.
  if eval "$check_cmd" > /dev/null 2>&1; then
    echo -e "${GREEN}OK${NC}"
  else
    echo -e "${RED}FAILED${NC}"
    FAILED=1
  fi
}
|
|
|
|
#######################################
# Probe an HTTP(S) URL and report whether it answered successfully.
# Globals:   GREEN, RED, NC (read); FAILED (set to 1 on failure)
# Arguments: $1 - label printed in the left-hand status column
#            $2 - URL to request
#            $3 - optional overall timeout in seconds (default: 5)
# Outputs:   "<label>... OK" or "<label>... FAILED" on stdout; the response
#            body and curl diagnostics are discarded.
#######################################
check_http() {
  local name="$1"
  local url="$2"
  local timeout="${3:-5}"

  printf "%-24s " "$name..."
  # -f makes curl exit non-zero on HTTP error statuses, so those count as
  # failures as well as connection errors and timeouts.
  if curl -sf --max-time "$timeout" "$url" > /dev/null 2>&1; then
    echo -e "${GREEN}OK${NC}"
  else
    echo -e "${RED}FAILED${NC}"
    FAILED=1
  fi
}
|
|
|
|
# ============== Grafana Alerting API ==============
|
|
|
|
# Base URL of the Grafana instance whose alerting API is queried.
GRAFANA_URL="https://grafana.ops.eblu.me"
# Base64-encoded "user:password" for HTTP Basic auth. Starts empty and is
# filled in by fetch_alerts once the password has been read.
GRAFANA_CREDS=""
|
|
|
|
#######################################
# Download the current alert list from Grafana's alerting API.
# Globals:   GRAFANA_URL (read); GRAFANA_CREDS (read, populated when empty)
# Arguments: none
# Outputs:   the raw JSON response on stdout, or an empty line when no
#            credentials could be obtained or the request failed
# Returns:   0 in every case, so callers under `set -e` degrade gracefully
#######################################
fetch_alerts() {
  if [ -z "$GRAFANA_CREDS" ]; then
    local pass
    # Best effort: a missing or locked 1Password CLI must not abort the script.
    pass=$(op read 'op://vg6xf6vvfmoh5hqjjhlhbeoaie/oxkcr3xtxnewy7noep2izvyr6y/password' 2>/dev/null) || true
    if [ -n "$pass" ]; then
      # GNU base64 wraps its output at 76 columns; strip the newlines so a
      # long password cannot split the Authorization header.
      GRAFANA_CREDS=$(printf '%s' "admin:$pass" | base64 | tr -d '\n')
    fi
  fi

  if [ -z "$GRAFANA_CREDS" ]; then
    echo ""
    return
  fi

  # Any curl failure (HTTP error via -f, timeout, DNS) becomes empty output.
  curl -sf --max-time 10 \
    -H "Authorization: Basic $GRAFANA_CREDS" \
    "$GRAFANA_URL/api/prometheus/grafana/api/v1/alerts" 2>/dev/null || echo ""
}
|
|
|
|
# Fetch all alerts once
# Every check_alert call reads this snapshot instead of querying Grafana again;
# it is empty when fetch_alerts could not produce a response.
ALERTS_JSON=$(fetch_alerts)
|
|
|
|
#######################################
# Report whether a Grafana alert is currently firing, using the snapshot in
# ALERTS_JSON.
# Globals:   ALERTS_JSON (read); GREEN, YELLOW, RED, NC (read);
#            FAILED (set to 1 when a matching alert is firing)
# Arguments: $1 - label printed in the left-hand status column
#            $2 - value of the alert's "alertname" label to match
#            $3 - optional label key to filter on
#            $4 - optional value required for that label
# Outputs:   "<label>... OK", "... NO DATA ..." or "... FIRING" on stdout;
#            for a firing alert, the first match's summary and runbook URL
#            follow on their own lines when present.
# Requires:  python3
#######################################
check_alert() {
  local name="$1"
  local alertname="$2"
  # Optional: filter by a label key=value
  local filter_key="${3:-}"
  local filter_value="${4:-}"

  printf "%-24s " "$name..."

  if [ -z "$ALERTS_JSON" ]; then
    echo -e "${YELLOW}NO DATA${NC} (can't reach Grafana alerting API)"
    return
  fi

  # The match criteria are passed as argv rather than spliced into the Python
  # source, so quotes or backslashes in them cannot break or alter the
  # program. Output is one "summary<TAB>runbook_url" line per firing alert;
  # whitespace inside the summary is collapsed so it cannot contain the
  # separator.
  local parser='
import json, sys

alertname, filter_key, filter_value = sys.argv[1:4]
try:
    payload = json.load(sys.stdin)
    alerts = (payload.get("data") or {}).get("alerts") or []
except Exception:
    sys.exit(1)

for alert in alerts:
    labels = alert.get("labels") or {}
    if labels.get("alertname") != alertname:
        continue
    if filter_key and labels.get(filter_key) != filter_value:
        continue
    if alert.get("state") in ("Alerting", "Pending"):
        notes = alert.get("annotations") or {}
        summary = " ".join(str(notes.get("summary") or "").split())
        url = "".join(str(notes.get("runbook_url") or "").split())
        print(summary + "\t" + url)
'

  local firing
  # A parser failure (malformed JSON, unexpected shape, python3 missing) is
  # reported as NO DATA. Testing the assignment in an `if` also keeps it from
  # aborting the whole script under `set -e`.
  if ! firing=$(printf '%s' "$ALERTS_JSON" \
    | python3 -c "$parser" "$alertname" "$filter_key" "$filter_value" 2>/dev/null); then
    echo -e "${YELLOW}NO DATA${NC} (unexpected response from Grafana alerting API)"
    return
  fi

  if [ -z "$firing" ]; then
    echo -e "${GREEN}OK${NC}"
    return
  fi

  # Only the first firing instance is shown in detail.
  local first summary runbook
  first=${firing%%$'\n'*}
  summary=${first%%$'\t'*}
  runbook=${first#*$'\t'}

  echo -e "${RED}FIRING${NC}"
  # printf '%s' prints alert text verbatim (no backslash interpretation).
  if [ -n "$summary" ]; then
    printf ' %s\n' "$summary"
  fi
  if [ -n "$runbook" ]; then
    printf ' Runbook: %s\n' "$runbook"
  fi
  FAILED=1
}
|
|
|
|
echo "Checking services..."
echo "===================="
echo ""

# Local services on indri (not yet covered by alerting)
# Every probe goes over SSH; ConnectTimeout bounds the connection attempt so
# an unreachable host fails each check quickly instead of stalling the run.
echo "Local services on indri:"
check_service "forgejo (brew)" "ssh -o ConnectTimeout=5 indri 'brew services list | grep forgejo | grep started'"
check_service "alloy" "ssh -o ConnectTimeout=5 indri 'launchctl list mcquack.eblume.alloy | grep -v \"^-\"'"
check_service "borgmatic" "ssh -o ConnectTimeout=5 indri 'launchctl list mcquack.eblume.borgmatic | grep -v \"^-\"'"
check_service "borgmatic-metrics" "ssh -o ConnectTimeout=5 indri 'launchctl list mcquack.borgmatic-metrics | grep -v \"^-\"'"
check_service "zot" "ssh -o ConnectTimeout=5 indri 'launchctl list mcquack.eblume.zot | grep -v \"^-\"'"
check_service "zot-metrics" "ssh -o ConnectTimeout=5 indri 'launchctl list mcquack.zot-metrics | grep -v \"^-\"'"
check_service "minikube-metrics" "ssh -o ConnectTimeout=5 indri 'launchctl list mcquack.minikube-metrics | grep -v \"^-\"'"
check_service "jellyfin-metrics" "ssh -o ConnectTimeout=5 indri 'launchctl list mcquack.eblume.jellyfin-metrics | grep -v \"^-\"'"

echo ""
echo "Metrics textfiles (via alerting):"
check_alert "textfile-freshness" "TextfileStale"
|
|
|
|
echo ""
echo "Kubernetes cluster (not yet covered by alerting):"
check_service "minikube" "ssh -o ConnectTimeout=5 indri 'minikube status --format={{.Host}} | grep -q Running'"
check_service "k8s-apiserver (indri)" "ssh -o ConnectTimeout=5 indri 'kubectl get --raw /healthz'"
# Single quotes are deliberate: $HOME is expanded (inside double quotes) when
# check_service evals the string, so a home path with spaces stays one word.
# shellcheck disable=SC2016
check_service "k8s-apiserver (remote)" 'kubectl --kubeconfig="$HOME/.kube/minikube-indri/config.yml" --context=minikube-indri get --raw /healthz'
|
|
|
|
echo ""
echo "HTTP endpoints (via alerting):"
# One ServiceProbeFailure alert instance per service, selected by its
# "service" label.
check_alert "Prometheus" "ServiceProbeFailure" "service" "prometheus"
check_alert "Loki" "ServiceProbeFailure" "service" "loki"
check_alert "Grafana" "ServiceProbeFailure" "service" "grafana"
check_alert "ArgoCD" "ServiceProbeFailure" "service" "argocd"
check_alert "Kiwix" "ServiceProbeFailure" "service" "kiwix"
check_alert "Miniflux" "ServiceProbeFailure" "service" "miniflux"
check_alert "TeslaMate" "ServiceProbeFailure" "service" "teslamate"
check_alert "Devpi" "ServiceProbeFailure" "service" "devpi"
check_alert "Transmission" "ServiceProbeFailure" "service" "transmission"
check_alert "Immich" "ServiceProbeFailure" "service" "immich"
check_alert "Navidrome" "ServiceProbeFailure" "service" "navidrome"

echo ""
echo "HTTP endpoints (not yet covered by alerting):"
# These are still probed directly from the machine running this script.
check_http "Forgejo" "https://forge.eblu.me/"
check_http "Zot Registry" "https://registry.ops.eblu.me/v2/_catalog"
check_http "CV" "https://cv.ops.eblu.me/"
check_http "Ntfy" "https://ntfy.ops.eblu.me/v1/health"
check_http "Authentik" "https://authentik.ops.eblu.me/-/health/live/"
check_http "Frigate" "https://nvr.ops.eblu.me/api/version"
check_http "JobSync" "https://jobsync.ops.eblu.me/"
|
|
|
|
echo ""
echo "Frigate (via alerting):"
check_alert "camera-fps" "FrigateCameraDown"
echo "Frigate (not yet covered by alerting):"
# Passes only when the stats report at least one storage entry whose key
# starts with /media and every such entry has free space above zero.
check_service "frigate-storage" "curl -sf --max-time 5 https://nvr.ops.eblu.me/api/stats | jq -e '.service.storage | to_entries | map(select(.key | startswith(\"/media\"))) | length > 0 and all(.[]; .value.free > 0)'"
|
|
|
|
echo ""
echo "Ringtail (not yet covered by alerting):"
# ConnectTimeout is applied to every SSH probe, not only the first, so each
# one fails fast when the host is unreachable.
check_service "ssh" "ssh -o ConnectTimeout=5 ringtail true"
check_service "tailscale" "ssh -o ConnectTimeout=5 ringtail 'tailscale status --self --json' | jq -e '.Self.Online' > /dev/null"
# -w matches "Ready" only as a whole word, so a "NotReady" status no longer
# counts as healthy.
check_service "k3s" "ssh -o ConnectTimeout=5 ringtail 'KUBECONFIG=/etc/rancher/k3s/k3s.yaml k3s kubectl get nodes --no-headers | grep -qw Ready'"
check_service "k3s-apiserver (remote)" "kubectl --context=k3s-ringtail get --raw /healthz"
check_service "forgejo-runner" "ssh -o ConnectTimeout=5 ringtail 'systemctl is-active gitea-runner-nix_container_builder.service'"
|
|
|
|
echo ""
echo "Pod health (via alerting):"
check_alert "pod-readiness" "PodNotReady"

echo ""
echo "Database (via alerting):"
check_alert "PostgreSQL" "PostgresClusterUnhealthy"

echo ""
echo "Public services (not yet covered by alerting):"
check_http "Docs (public)" "https://docs.eblu.me/"
check_http "CV (public)" "https://cv.eblu.me/"
check_http "Forge (public)" "https://forge.eblu.me/"
check_http "Fly.io healthz" "https://blumeops-proxy.fly.dev/healthz"
|
|
|
|
echo ""
echo "ArgoCD app sync status (via alerting):"
check_alert "argocd-sync" "ArgoCDAppOutOfSync"
# Keep the detailed table as a summary view
# The table is informational only: it never touches FAILED, and it is simply
# empty when kubectl produces no output (its stderr is discarded).
printf "%-20s %-12s %-12s %s\n" "NAME" "SYNC" "HEALTH" "TARGET"
while read -r name sync health target; do
  # Pick the color for the SYNC column; any other status is printed plain.
  case "$sync" in
    Synced) sync_color="$GREEN" ;;
    OutOfSync) sync_color="$RED" ;;
    *) sync_color="" ;;
  esac
  if [ -n "$sync_color" ]; then
    # The color variables sit in the format string on purpose: that is what
    # turns their backslash sequences into real escapes.
    # shellcheck disable=SC2059
    printf "%-20s ${sync_color}%-12s${NC} %-12s %s\n" "$name" "$sync" "$health" "$target"
  else
    printf "%-20s %-12s %-12s %s\n" "$name" "$sync" "$health" "$target"
  fi
done < <(kubectl --context=minikube-indri get applications -n argocd --no-headers -o custom-columns='NAME:.metadata.name,SYNC:.status.sync.status,HEALTH:.status.health.status,TARGET:.spec.source.targetRevision' 2>/dev/null)
|
|
|
|
echo ""
# Turn the accumulated FAILED flag into the final message and exit status.
# The expansion is quoted so an unexpected empty value makes the test fail
# (reported as unhealthy) rather than producing a malformed `[` expression.
if [ "$FAILED" -eq 0 ]; then
  echo -e "${GREEN}All services healthy!${NC}"
  exit 0
else
  echo -e "${RED}Some services failed health check${NC}"
  exit 1
fi
|