C2: Deploy infrastructure alerting pipeline #303
1 changed files with 114 additions and 45 deletions
C2(deploy-infra-alerting): impl refactor services-check to query alerts
Replace covered checks with Grafana alerting API queries: - ServiceProbeFailure: 11 HTTP endpoints - TextfileStale: metrics textfile freshness - FrigateCameraDown: camera FPS - PodNotReady: pod readiness (both clusters) - PostgresClusterUnhealthy: database health - ArgoCDAppOutOfSync: ArgoCD sync status Uncovered checks remain as direct probes (SSH, launchctl, public endpoints, k8s API, frigate storage, some HTTP endpoints). Firing alerts display summary and clickable runbook link. Grafana credentials fetched from 1Password; graceful fallback if unavailable. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
commit
52eed44542
|
|
@ -6,6 +6,7 @@ set -euo pipefail
|
|||
# Colors for output
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
YELLOW='\033[0;33m'
|
||||
NC='\033[0m' # No Color
|
||||
|
||||
FAILED=0
|
||||
|
|
@ -36,11 +37,88 @@ check_http() {
|
|||
fi
|
||||
}
|
||||
|
||||
# ============== Grafana Alerting API ==============
|
||||
|
||||
# Base URL of the Grafana instance; fetch_alerts appends the alerting API path to it.
GRAFANA_URL="https://grafana.ops.eblu.me"
|
||||
# Base64-encoded "admin:<password>" sent as the Basic Authorization header.
# Starts empty; fetch_alerts fills it in from 1Password (`op read`) on first use.
GRAFANA_CREDS=""
|
||||
|
||||
#######################################
# Fetch the current alert list from the Grafana alerting API.
# Globals:
#   GRAFANA_URL   (read)  base URL of the Grafana instance
#   GRAFANA_CREDS (read/written) base64 "admin:<password>"; filled in from
#                 1Password via `op read` when empty
# Arguments: none
# Outputs:   the raw API response on stdout, or an empty line when the
#            credentials cannot be read or the request fails
# Returns:   always 0 (failures are reported as empty output, best effort)
#######################################
fetch_alerts() {
  if [ -z "$GRAFANA_CREDS" ]; then
    local pass
    # Best effort: a missing/locked 1Password CLI must not abort the script
    # under `set -e`; callers treat empty output as "no data".
    pass=$(op read 'op://vg6xf6vvfmoh5hqjjhlhbeoaie/oxkcr3xtxnewy7noep2izvyr6y/password' 2>/dev/null) || true
    if [ -n "$pass" ]; then
      # printf avoids the non-portable `echo -n`. GNU base64 wraps its output
      # at 76 columns, which would put a newline inside the HTTP header for
      # long passwords, so strip newlines from the encoded value.
      GRAFANA_CREDS=$(printf 'admin:%s' "$pass" | base64 | tr -d '\n')
    fi
  fi

  if [ -z "$GRAFANA_CREDS" ]; then
    echo ""
    return
  fi

  # -f makes curl fail on HTTP errors so an error page is never mistaken
  # for alert JSON; any failure collapses to empty output.
  curl -sf --max-time 10 \
    -H "Authorization: Basic $GRAFANA_CREDS" \
    "$GRAFANA_URL/api/prometheus/grafana/api/v1/alerts" 2>/dev/null || echo ""
}
|
||||
|
||||
# Fetch all alerts once
|
||||
# check_alert reads this global for every check. The command substitution runs
# fetch_alerts in a subshell, so only its stdout is kept here; the GRAFANA_CREDS
# value it assigns is not visible to the rest of the script.
ALERTS_JSON=$(fetch_alerts)
|
||||
|
||||
#######################################
# Report whether a Grafana alert is currently firing or pending.
# Globals:
#   ALERTS_JSON (read)  response body captured from fetch_alerts
#   RED, GREEN, YELLOW, NC (read)  colour escape sequences
#   FAILED (written)  set to 1 when a matching alert is firing/pending
# Arguments:
#   $1 - display name printed in the status column
#   $2 - value of the `alertname` label to look for
#   $3 - optional label key to filter on
#   $4 - optional label value that $3 must equal
# Outputs:   one status line (OK / FIRING / NO DATA) on stdout, followed by
#            the summary and runbook URL of the first matching alert
# Returns:   always 0, so a firing alert never aborts the script under set -e
#######################################
check_alert() {
  local name="$1"
  local alertname="$2"
  # Optional: filter by a label key=value
  local filter_key="${3:-}"
  local filter_value="${4:-}"

  printf "%-24s " "$name..."

  if [ -z "$ALERTS_JSON" ]; then
    echo -e "${YELLOW}NO DATA${NC} (can't reach Grafana alerting API)"
    return
  fi

  local firing
  # The lookup values are handed to Python through sys.argv rather than being
  # spliced into the program text, so quotes in a name cannot break or alter
  # the script. Running the substitution inside `if !` keeps a parser failure
  # from killing the whole run under `set -e`.
  if ! firing=$(printf '%s' "$ALERTS_JSON" | python3 -c '
import json, sys

alertname, filter_key, filter_value = sys.argv[1:4]
try:
    payload = json.load(sys.stdin)
    alerts = payload.get("data", {}).get("alerts") or []
except (ValueError, AttributeError):
    sys.exit(1)

def flat(text):
    # Collapse tabs/newlines so the tab-separated output line stays parseable.
    return " ".join(str(text).split())

for alert in alerts:
    labels = alert.get("labels") or {}
    if labels.get("alertname") != alertname:
        continue
    if filter_key and labels.get(filter_key) != filter_value:
        continue
    if alert.get("state") in ("Alerting", "Pending"):
        notes = alert.get("annotations") or {}
        # URL first: the free-text summary may contain any punctuation.
        print(flat(notes.get("runbook_url", "")) + "\t" + flat(notes.get("summary", "")))
        break
' "$alertname" "$filter_key" "$filter_value" 2>/dev/null); then
    echo -e "${YELLOW}NO DATA${NC} (unparseable response from Grafana alerting API)"
    return
  fi

  if [ -z "$firing" ]; then
    echo -e "${GREEN}OK${NC}"
    return
  fi

  local tab=$'\t'
  local runbook="${firing%%"$tab"*}"
  local summary="${firing#*"$tab"}"

  echo -e "${RED}FIRING${NC}"
  # printf '%s' prints alert text verbatim; echo -e would interpret any
  # backslash sequences contained in the remote data.
  if [ -n "$summary" ]; then
    printf ' %s\n' "$summary"
  fi
  if [ -n "$runbook" ]; then
    printf ' Runbook: %s\n' "$runbook"
  fi
  FAILED=1
}
|
||||
|
||||
echo "Checking services..."
|
||||
echo "===================="
|
||||
echo ""
|
||||
|
||||
# Local services on indri
|
||||
# Local services on indri (not yet covered by alerting)
|
||||
echo "Local services on indri:"
|
||||
check_service "forgejo (brew)" "ssh indri 'brew services list | grep forgejo | grep started'"
|
||||
check_service "alloy" "ssh indri 'launchctl list mcquack.eblume.alloy | grep -v \"^-\"'"
|
||||
|
|
@ -52,43 +130,47 @@ check_service "minikube-metrics" "ssh indri 'launchctl list mcquack.minikube-met
|
|||
check_service "jellyfin-metrics" "ssh indri 'launchctl list mcquack.eblume.jellyfin-metrics | grep -v \"^-\"'"
|
||||
|
||||
echo ""
|
||||
echo "Metrics textfiles:"
|
||||
check_service "borgmatic.prom" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/borgmatic.prom'"
|
||||
check_service "zot.prom" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/zot.prom'"
|
||||
check_service "minikube.prom" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/minikube.prom'"
|
||||
check_service "jellyfin.prom" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/jellyfin.prom'"
|
||||
echo "Metrics textfiles (via alerting):"
|
||||
check_alert "textfile-freshness" "TextfileStale"
|
||||
|
||||
echo ""
|
||||
echo "Kubernetes cluster:"
|
||||
echo "Kubernetes cluster (not yet covered by alerting):"
|
||||
check_service "minikube" "ssh indri 'minikube status --format={{.Host}} | grep -q Running'"
|
||||
check_service "k8s-apiserver (indri)" "ssh indri 'kubectl get --raw /healthz'"
|
||||
check_service "k8s-apiserver (remote)" "kubectl --kubeconfig=$HOME/.kube/minikube-indri/config.yml --context=minikube-indri get --raw /healthz"
|
||||
|
||||
echo ""
|
||||
echo "HTTP endpoints (via Caddy):"
|
||||
check_http "Prometheus" "https://prometheus.ops.eblu.me/-/healthy"
|
||||
check_http "Loki" "https://loki.ops.eblu.me/ready"
|
||||
check_http "Grafana" "https://grafana.ops.eblu.me/api/health"
|
||||
check_http "ArgoCD" "https://argocd.ops.eblu.me/healthz"
|
||||
echo "HTTP endpoints (via alerting):"
|
||||
check_alert "Prometheus" "ServiceProbeFailure" "service" "prometheus"
|
||||
check_alert "Loki" "ServiceProbeFailure" "service" "loki"
|
||||
check_alert "Grafana" "ServiceProbeFailure" "service" "grafana"
|
||||
check_alert "ArgoCD" "ServiceProbeFailure" "service" "argocd"
|
||||
check_alert "Kiwix" "ServiceProbeFailure" "service" "kiwix"
|
||||
check_alert "Miniflux" "ServiceProbeFailure" "service" "miniflux"
|
||||
check_alert "TeslaMate" "ServiceProbeFailure" "service" "teslamate"
|
||||
check_alert "Devpi" "ServiceProbeFailure" "service" "devpi"
|
||||
check_alert "Transmission" "ServiceProbeFailure" "service" "transmission"
|
||||
check_alert "Immich" "ServiceProbeFailure" "service" "immich"
|
||||
check_alert "Navidrome" "ServiceProbeFailure" "service" "navidrome"
|
||||
|
||||
echo ""
|
||||
echo "HTTP endpoints (not yet covered by alerting):"
|
||||
check_http "Forgejo" "https://forge.eblu.me/"
|
||||
check_http "Zot Registry" "https://registry.ops.eblu.me/v2/_catalog"
|
||||
check_http "Kiwix" "https://kiwix.ops.eblu.me/"
|
||||
check_http "Miniflux" "https://feed.ops.eblu.me/healthcheck"
|
||||
check_http "TeslaMate" "https://tesla.ops.eblu.me/"
|
||||
check_http "Devpi" "https://pypi.ops.eblu.me/+api"
|
||||
check_http "Transmission" "https://torrent.ops.eblu.me/"
|
||||
check_http "Immich" "https://photos.ops.eblu.me/"
|
||||
check_http "Navidrome" "https://dj.ops.eblu.me/"
|
||||
check_http "CV" "https://cv.ops.eblu.me/"
|
||||
check_http "Ntfy" "https://ntfy.ops.eblu.me/v1/health"
|
||||
check_http "Authentik" "https://authentik.ops.eblu.me/-/health/live/"
|
||||
check_http "Frigate" "https://nvr.ops.eblu.me/api/version"
|
||||
check_service "frigate-camera-fps" "curl -sf --max-time 5 https://nvr.ops.eblu.me/api/stats | jq -e '.cameras | to_entries | all(.value.camera_fps > 0)'"
|
||||
check_service "frigate-storage" "curl -sf --max-time 5 https://nvr.ops.eblu.me/api/stats | jq -e '.service.storage | to_entries | map(select(.key | startswith(\"/media\"))) | length > 0 and all(.[]; .value.free > 0)'"
|
||||
check_http "JobSync" "https://jobsync.ops.eblu.me/"
|
||||
|
||||
echo ""
|
||||
echo "Ringtail (NixOS):"
|
||||
echo "Frigate (via alerting):"
|
||||
check_alert "camera-fps" "FrigateCameraDown"
|
||||
echo "Frigate (not yet covered by alerting):"
|
||||
check_service "frigate-storage" "curl -sf --max-time 5 https://nvr.ops.eblu.me/api/stats | jq -e '.service.storage | to_entries | map(select(.key | startswith(\"/media\"))) | length > 0 and all(.[]; .value.free > 0)'"
|
||||
|
||||
echo ""
|
||||
echo "Ringtail (not yet covered by alerting):"
|
||||
check_service "ssh" "ssh -o ConnectTimeout=5 ringtail true"
|
||||
check_service "tailscale" "ssh ringtail 'tailscale status --self --json' | jq -e '.Self.Online' > /dev/null"
|
||||
check_service "k3s" "ssh ringtail 'KUBECONFIG=/etc/rancher/k3s/k3s.yaml k3s kubectl get nodes --no-headers | grep -q Ready'"
|
||||
|
|
@ -96,43 +178,30 @@ check_service "k3s-apiserver (remote)" "kubectl --context=k3s-ringtail get --raw
|
|||
check_service "forgejo-runner" "ssh ringtail 'systemctl is-active gitea-runner-nix_container_builder.service'"
|
||||
|
||||
echo ""
|
||||
echo "Ringtail k3s pods:"
|
||||
check_service "ntfy" "kubectl --context=k3s-ringtail -n ntfy get pods -l app=ntfy -o jsonpath='{.items[0].status.phase}' | grep -q Running"
|
||||
check_service "authentik" "kubectl --context=k3s-ringtail -n authentik get pods -l component=server -o jsonpath='{.items[0].status.phase}' | grep -q Running"
|
||||
check_service "frigate" "kubectl --context=k3s-ringtail -n frigate get pods -l app=frigate -o jsonpath='{.items[0].status.phase}' | grep -q Running"
|
||||
check_service "frigate-notify" "kubectl --context=k3s-ringtail -n frigate get pods -l app=frigate-notify -o jsonpath='{.items[0].status.phase}' | grep -q Running"
|
||||
check_service "nvidia-device-plugin" "kubectl --context=k3s-ringtail -n nvidia-device-plugin get pods -l app=nvidia-device-plugin -o jsonpath='{.items[0].status.phase}' | grep -q Running"
|
||||
check_service "jobsync" "kubectl --context=k3s-ringtail -n jobsync get pods -l app=jobsync -o jsonpath='{.items[0].status.phase}' | grep -q Running"
|
||||
echo "Pod health (via alerting):"
|
||||
check_alert "pod-readiness" "PodNotReady"
|
||||
|
||||
echo ""
|
||||
echo "Public services (via Fly.io):"
|
||||
echo "Database (via alerting):"
|
||||
check_alert "PostgreSQL" "PostgresClusterUnhealthy"
|
||||
|
||||
echo ""
|
||||
echo "Public services (not yet covered by alerting):"
|
||||
check_http "Docs (public)" "https://docs.eblu.me/"
|
||||
check_http "CV (public)" "https://cv.eblu.me/"
|
||||
check_http "Forge (public)" "https://forge.eblu.me/"
|
||||
check_http "Fly.io healthz" "https://blumeops-proxy.fly.dev/healthz"
|
||||
|
||||
echo ""
|
||||
echo "Database:"
|
||||
check_service "PostgreSQL (k8s)" "pg_isready -h pg.ops.eblu.me -p 5432"
|
||||
|
||||
echo ""
|
||||
echo "Indri minikube pods:"
|
||||
check_service "prometheus-0" "kubectl --context=minikube-indri -n monitoring get pod prometheus-0 -o jsonpath='{.status.phase}' | grep -q Running"
|
||||
check_service "loki-0" "kubectl --context=minikube-indri -n monitoring get pod loki-0 -o jsonpath='{.status.phase}' | grep -q Running"
|
||||
check_service "grafana" "kubectl --context=minikube-indri -n monitoring get pods -l app.kubernetes.io/name=grafana -o jsonpath='{.items[0].status.phase}' | grep -q Running"
|
||||
check_service "miniflux" "kubectl --context=minikube-indri -n miniflux get pods -l app=miniflux -o jsonpath='{.items[0].status.phase}' | grep -q Running"
|
||||
check_service "teslamate" "kubectl --context=minikube-indri -n teslamate get pods -l app=teslamate -o jsonpath='{.items[0].status.phase}' | grep -q Running"
|
||||
check_service "blumeops-pg" "kubectl --context=minikube-indri -n databases get pods -l cnpg.io/cluster=blumeops-pg -o jsonpath='{.items[0].status.phase}' | grep -q Running"
|
||||
|
||||
echo ""
|
||||
echo "ArgoCD app sync status:"
|
||||
echo "ArgoCD app sync status (via alerting):"
|
||||
check_alert "argocd-sync" "ArgoCDAppOutOfSync"
|
||||
# Keep the detailed table as a summary view
|
||||
printf "%-20s %-12s %-12s %s\n" "NAME" "SYNC" "HEALTH" "TARGET"
|
||||
while read -r name sync health target; do
|
||||
if [[ "$sync" == "Synced" ]]; then
|
||||
printf "%-20s ${GREEN}%-12s${NC} %-12s %s\n" "$name" "$sync" "$health" "$target"
|
||||
elif [[ "$sync" == "OutOfSync" ]]; then
|
||||
printf "%-20s ${RED}%-12s${NC} %-12s %s\n" "$name" "$sync" "$health" "$target"
|
||||
FAILED=1
|
||||
else
|
||||
printf "%-20s %-12s %-12s %s\n" "$name" "$sync" "$health" "$target"
|
||||
fi
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue