From 6d65e6928cb6f6326a669630aa6f72e0e0af4115 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Sun, 22 Mar 2026 14:52:56 -0700 Subject: [PATCH] C2: Deploy infrastructure alerting pipeline (#303) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary Mikado chain to replace `mise run services-check` with Grafana Unified Alerting backed by ntfy push notifications. **Design:** - Grafana Unified Alerting evaluates rules against Prometheus/Loki - ntfy webhook contact point delivers iOS notifications - Anti-noise policy: page once per 24h per alert group - Every alert links to a runbook in `docs/how-to/runbooks/` - services-check eventually queries the alerting API instead of doing its own probes **Chain (bottom-up):** 1. `configure-grafana-alerting-pipeline` — enable alerting, ntfy contact point, notification policy 2. `first-alert-and-runbook` — end-to-end proof of concept with blackbox probe failure 3. `port-services-check-alerts` — migrate all services-check probes to alert rules + runbooks 4. `refactor-services-check-to-query-alerts` — rewrite services-check to query Grafana API 5. 
`deploy-infra-alerting` — goal card 🤖 Generated with [Claude Code](https://claude.com/claude-code) Reviewed-on: https://forge.eblu.me/eblume/blumeops/pulls/303 --- argocd/manifests/alloy-k8s/config.alloy | 37 ++ argocd/manifests/grafana/alerting.yaml | 393 ++++++++++++++++++ argocd/manifests/grafana/deployment.yaml | 3 + argocd/manifests/grafana/grafana.ini | 5 + argocd/manifests/grafana/kustomization.yaml | 1 + argocd/manifests/prometheus/prometheus.yml | 8 + .../mikado-deploy-infra-alerting.feature.md | 1 + .../configure-grafana-alerting-pipeline.md | 59 +++ docs/how-to/runbooks/deploy-infra-alerting.md | 77 ++++ .../runbooks/first-alert-and-runbook.md | 68 +++ .../runbooks/port-services-check-alerts.md | 74 ++++ ...refactor-services-check-to-query-alerts.md | 53 +++ .../runbooks/runbook-argocd-out-of-sync.md | 65 +++ .../runbooks/runbook-frigate-camera-down.md | 39 ++ docs/how-to/runbooks/runbook-pod-not-ready.md | 55 +++ .../runbooks/runbook-postgres-unhealthy.md | 63 +++ .../runbooks/runbook-service-probe-failure.md | 75 ++++ .../how-to/runbooks/runbook-textfile-stale.md | 58 +++ docs/reference/operations/observability.md | 12 +- mise-tasks/services-check | 159 +++++-- 20 files changed, 1259 insertions(+), 46 deletions(-) create mode 100644 argocd/manifests/grafana/alerting.yaml create mode 100644 docs/changelog.d/mikado-deploy-infra-alerting.feature.md create mode 100644 docs/how-to/runbooks/configure-grafana-alerting-pipeline.md create mode 100644 docs/how-to/runbooks/deploy-infra-alerting.md create mode 100644 docs/how-to/runbooks/first-alert-and-runbook.md create mode 100644 docs/how-to/runbooks/port-services-check-alerts.md create mode 100644 docs/how-to/runbooks/refactor-services-check-to-query-alerts.md create mode 100644 docs/how-to/runbooks/runbook-argocd-out-of-sync.md create mode 100644 docs/how-to/runbooks/runbook-frigate-camera-down.md create mode 100644 docs/how-to/runbooks/runbook-pod-not-ready.md create mode 100644 
docs/how-to/runbooks/runbook-postgres-unhealthy.md create mode 100644 docs/how-to/runbooks/runbook-service-probe-failure.md create mode 100644 docs/how-to/runbooks/runbook-textfile-stale.md diff --git a/argocd/manifests/alloy-k8s/config.alloy b/argocd/manifests/alloy-k8s/config.alloy index c169c93..a716ddc 100644 --- a/argocd/manifests/alloy-k8s/config.alloy +++ b/argocd/manifests/alloy-k8s/config.alloy @@ -169,6 +169,43 @@ prometheus.exporter.blackbox "services" { address = "http://argocd-server.argocd.svc.cluster.local:80/healthz" module = "http_2xx" } + + target { + name = "prometheus" + address = "http://prometheus.monitoring.svc.cluster.local:9090/-/healthy" + module = "http_2xx" + } + + target { + name = "loki" + address = "http://loki.monitoring.svc.cluster.local:3100/ready" + module = "http_2xx" + } + + target { + name = "grafana" + address = "http://grafana.monitoring.svc.cluster.local:80/api/health" + module = "http_2xx" + } + + target { + name = "teslamate" + address = "http://teslamate.teslamate.svc.cluster.local:4000/" + module = "http_2xx" + } + + target { + name = "immich" + address = "http://immich-server.immich.svc.cluster.local:2283/api/server/ping" + module = "http_2xx" + } + + target { + name = "navidrome" + address = "http://navidrome.navidrome.svc.cluster.local:4533/" + module = "http_2xx" + } + } // Scrape blackbox probe results diff --git a/argocd/manifests/grafana/alerting.yaml b/argocd/manifests/grafana/alerting.yaml new file mode 100644 index 0000000..abc4c0f --- /dev/null +++ b/argocd/manifests/grafana/alerting.yaml @@ -0,0 +1,393 @@ +apiVersion: 1 + +contactPoints: + - orgId: 1 + name: ntfy-infra + receivers: + - uid: ntfy-infra-webhook + type: webhook + settings: + url: https://ntfy.ops.eblu.me + httpMethod: POST + maxAlerts: "0" + payload: + template: >- + {{ template "ntfy-infra.payload" . 
}} + disableResolveMessage: false + +policies: + - orgId: 1 + receiver: ntfy-infra + group_by: + - alertname + - service + group_wait: 1m + group_interval: 12h + repeat_interval: 24h + +groups: + - orgId: 1 + name: service-health + folder: Infrastructure Alerts + interval: 30s + rules: + - uid: service-probe-failure + title: ServiceProbeFailure + condition: C + for: 2m + noDataState: Alerting + execErrState: Alerting + annotations: + summary: >- + {{ index $labels "service" }} health check is failing + runbook_url: https://docs.eblu.me/how-to/runbooks/runbook-service-probe-failure + labels: + severity: warning + data: + - refId: A + datasourceUid: prometheus + relativeTimeRange: + from: 300 + to: 0 + model: + expr: >- + label_replace(probe_success, "service", + "$1", "job", "integrations/blackbox/(.*)") + interval: "" + refId: A + - refId: B + datasourceUid: "__expr__" + relativeTimeRange: + from: 0 + to: 0 + model: + type: reduce + expression: A + reducer: last + settings: + mode: dropNN + refId: B + - refId: C + datasourceUid: "__expr__" + relativeTimeRange: + from: 0 + to: 0 + model: + type: threshold + expression: B + conditions: + - evaluator: + type: lt + params: + - 1 + operator: + type: and + refId: C + + - orgId: 1 + name: textfile-freshness + folder: Infrastructure Alerts + interval: 60s + rules: + - uid: textfile-stale + title: TextfileStale + condition: C + for: 15m + noDataState: Alerting + execErrState: Alerting + annotations: + summary: >- + Metrics textfile {{ index $labels "file" }} has not been updated in over 1 hour + runbook_url: https://docs.eblu.me/how-to/runbooks/runbook-textfile-stale + labels: + severity: warning + service: indri-metrics + data: + - refId: A + datasourceUid: prometheus + relativeTimeRange: + from: 300 + to: 0 + model: + expr: >- + time() - node_textfile_mtime_seconds + interval: "" + refId: A + - refId: B + datasourceUid: "__expr__" + relativeTimeRange: + from: 0 + to: 0 + model: + type: reduce + expression: A + reducer: 
last + settings: + mode: dropNN + refId: B + - refId: C + datasourceUid: "__expr__" + relativeTimeRange: + from: 0 + to: 0 + model: + type: threshold + expression: B + conditions: + - evaluator: + type: gt + params: + - 3600 + operator: + type: and + refId: C + + - orgId: 1 + name: frigate-health + folder: Infrastructure Alerts + interval: 60s + rules: + - uid: frigate-camera-down + title: FrigateCameraDown + condition: C + for: 5m + noDataState: Alerting + execErrState: Alerting + annotations: + summary: >- + Frigate camera {{ index $labels "camera_name" }} has 0 FPS + runbook_url: https://docs.eblu.me/how-to/runbooks/runbook-frigate-camera-down + labels: + severity: warning + service: frigate + data: + - refId: A + datasourceUid: prometheus + relativeTimeRange: + from: 300 + to: 0 + model: + expr: frigate_camera_fps + interval: "" + refId: A + - refId: B + datasourceUid: "__expr__" + relativeTimeRange: + from: 0 + to: 0 + model: + type: reduce + expression: A + reducer: last + settings: + mode: dropNN + refId: B + - refId: C + datasourceUid: "__expr__" + relativeTimeRange: + from: 0 + to: 0 + model: + type: threshold + expression: B + conditions: + - evaluator: + type: lt + params: + - 1 + operator: + type: and + refId: C + + - orgId: 1 + name: database-health + folder: Infrastructure Alerts + interval: 60s + rules: + - uid: postgres-cluster-unhealthy + title: PostgresClusterUnhealthy + condition: C + for: 3m + noDataState: Alerting + execErrState: Alerting + annotations: + summary: >- + PostgreSQL cluster {{ index $labels "cluster" }} is unhealthy + runbook_url: https://docs.eblu.me/how-to/runbooks/runbook-postgres-unhealthy + labels: + severity: critical + service: postgresql + data: + - refId: A + datasourceUid: prometheus + relativeTimeRange: + from: 300 + to: 0 + model: + expr: cnpg_collector_up + interval: "" + refId: A + - refId: B + datasourceUid: "__expr__" + relativeTimeRange: + from: 0 + to: 0 + model: + type: reduce + expression: A + reducer: last + 
settings: + mode: dropNN + refId: B + - refId: C + datasourceUid: "__expr__" + relativeTimeRange: + from: 0 + to: 0 + model: + type: threshold + expression: B + conditions: + - evaluator: + type: lt + params: + - 1 + operator: + type: and + refId: C + + - orgId: 1 + name: pod-health + folder: Infrastructure Alerts + interval: 60s + rules: + - uid: pod-not-ready + title: PodNotReady + condition: C + for: 5m + noDataState: OK + execErrState: Alerting + annotations: + summary: >- + Pod {{ index $labels "pod" }} in {{ index $labels "namespace" }} is not ready + runbook_url: https://docs.eblu.me/how-to/runbooks/runbook-pod-not-ready + labels: + severity: warning + data: + - refId: A + datasourceUid: prometheus + relativeTimeRange: + from: 300 + to: 0 + model: + expr: >- + kube_pod_status_ready{condition="true"} == 0 + unless on (namespace, pod) + kube_pod_owner{owner_kind="Job"} + interval: "" + refId: A + - refId: B + datasourceUid: "__expr__" + relativeTimeRange: + from: 0 + to: 0 + model: + type: reduce + expression: A + reducer: last + settings: + mode: dropNN + refId: B + - refId: C + datasourceUid: "__expr__" + relativeTimeRange: + from: 0 + to: 0 + model: + type: threshold + expression: B + conditions: + - evaluator: + type: lt + params: + - 1 + operator: + type: and + refId: C + + - orgId: 1 + name: argocd-health + folder: Infrastructure Alerts + interval: 60s + rules: + - uid: argocd-app-out-of-sync + title: ArgoCDAppOutOfSync + condition: C + for: 30m + noDataState: OK + execErrState: Alerting + annotations: + summary: >- + ArgoCD app {{ index $labels "name" }} is {{ index $labels "sync_status" }} + runbook_url: https://docs.eblu.me/how-to/runbooks/runbook-argocd-out-of-sync + labels: + severity: warning + service: argocd + data: + - refId: A + datasourceUid: prometheus + relativeTimeRange: + from: 300 + to: 0 + model: + expr: >- + argocd_app_info{sync_status!="Synced"} + interval: "" + refId: A + - refId: B + datasourceUid: "__expr__" + relativeTimeRange: + 
from: 0 + to: 0 + model: + type: reduce + expression: A + reducer: last + settings: + mode: dropNN + refId: B + - refId: C + datasourceUid: "__expr__" + relativeTimeRange: + from: 0 + to: 0 + model: + type: threshold + expression: B + conditions: + - evaluator: + type: gt + params: + - 0 + operator: + type: and + refId: C + +templates: + - orgId: 1 + name: ntfy-infra + template: | + {{ define "ntfy-infra.payload" -}} + {{- $msg := "" -}} + {{- range .Alerts -}} + {{- $msg = (printf "%s%s\n" $msg .Annotations.summary) -}} + {{- end -}} + {{- $title := (printf "[%s] %s" (.Status | toUpper) .CommonLabels.alertname) -}} + {{- $actions := coll.Slice -}} + {{- range .Alerts -}} + {{- if .Annotations.runbook_url -}} + {{- $actions = coll.Append (coll.Dict "action" "view" "label" "Open Runbook" "url" .Annotations.runbook_url) $actions -}} + {{- end -}} + {{- end -}} + {{- coll.Dict "topic" "infra-alerts" "title" $title "message" $msg "priority" 3 "actions" $actions | data.ToJSON -}} + {{- end }} diff --git a/argocd/manifests/grafana/deployment.yaml b/argocd/manifests/grafana/deployment.yaml index 61a2f88..5fbb8eb 100644 --- a/argocd/manifests/grafana/deployment.yaml +++ b/argocd/manifests/grafana/deployment.yaml @@ -277,6 +277,9 @@ spec: - name: config mountPath: /etc/grafana/provisioning/datasources/datasources.yaml subPath: datasources.yaml + - name: config + mountPath: /etc/grafana/provisioning/alerting/alerting.yaml + subPath: alerting.yaml - name: storage mountPath: /var/lib/grafana - name: sc-dashboard-volume diff --git a/argocd/manifests/grafana/grafana.ini b/argocd/manifests/grafana/grafana.ini index 61cdd7e..a0a6db8 100644 --- a/argocd/manifests/grafana/grafana.ini +++ b/argocd/manifests/grafana/grafana.ini @@ -30,3 +30,8 @@ allow_embedding = false [server] root_url = https://grafana.ops.eblu.me + +[unified_alerting] +enabled = true +evaluation_timeout = 30s +min_interval = 10s diff --git a/argocd/manifests/grafana/kustomization.yaml 
b/argocd/manifests/grafana/kustomization.yaml index c052bf9..3aeaa26 100644 --- a/argocd/manifests/grafana/kustomization.yaml +++ b/argocd/manifests/grafana/kustomization.yaml @@ -25,6 +25,7 @@ configMapGenerator: files: - grafana.ini - datasources.yaml + - alerting.yaml options: labels: app.kubernetes.io/name: grafana diff --git a/argocd/manifests/prometheus/prometheus.yml b/argocd/manifests/prometheus/prometheus.yml index 2d2dbcf..f96ce12 100644 --- a/argocd/manifests/prometheus/prometheus.yml +++ b/argocd/manifests/prometheus/prometheus.yml @@ -80,6 +80,14 @@ scrape_configs: - target_label: cluster replacement: indri + # ArgoCD application metrics + - job_name: "argocd" + static_configs: + - targets: ["argocd-metrics.argocd.svc.cluster.local:8082"] + metric_relabel_configs: + - target_label: cluster + replacement: indri + # Frigate NVR metrics (via Caddy on indri — Frigate runs on ringtail) - job_name: "frigate" scheme: https diff --git a/docs/changelog.d/mikado-deploy-infra-alerting.feature.md b/docs/changelog.d/mikado-deploy-infra-alerting.feature.md new file mode 100644 index 0000000..7106014 --- /dev/null +++ b/docs/changelog.d/mikado-deploy-infra-alerting.feature.md @@ -0,0 +1 @@ +Deploy infrastructure alerting pipeline using Grafana Unified Alerting with ntfy push notifications. 6 alert rules with runbooks covering service health, pod readiness, PostgreSQL, textfile freshness, Frigate cameras, and ArgoCD sync status. services-check now queries the alerting API for covered checks. 
diff --git a/docs/how-to/runbooks/configure-grafana-alerting-pipeline.md b/docs/how-to/runbooks/configure-grafana-alerting-pipeline.md new file mode 100644 index 0000000..eb90128 --- /dev/null +++ b/docs/how-to/runbooks/configure-grafana-alerting-pipeline.md @@ -0,0 +1,59 @@ +--- +title: Configure Grafana Alerting Pipeline +modified: 2026-03-22 +tags: + - how-to + - alerting + - grafana +--- + +# Configure Grafana Alerting Pipeline + +Enable Grafana Unified Alerting, create an ntfy webhook contact point, configure the notification policy with anti-noise settings, and set up a message template with runbook links. + +## What to Do + +### 1. Enable Unified Alerting in grafana.ini + +Add the `[unified_alerting]` section to the Grafana ConfigMap. Grafana 11+ has unified alerting enabled by default, but we should be explicit and configure the evaluation interval. + +### 2. Create Alerting Provisioning Files + +Grafana supports provisioning alert resources via YAML files in `/etc/grafana/provisioning/alerting/`. Create: + +- **Contact point** — ntfy webhook targeting `ntfy.ops.eblu.me` via Caddy; the cluster-internal address `http://ntfy.ntfy.svc.cluster.local:80/infra-alerts` cannot be used because Grafana and ntfy run on different clusters +- **Notification policy** — root policy with `group_wait: 1m`, `group_interval: 12h`, `repeat_interval: 24h`, grouped by `alertname` and `service` +- **Message template** — format that includes alert name, summary, and a clickable runbook URL as an ntfy action button + +### 3. Mount Provisioning into Grafana + +Add the alerting provisioning ConfigMap to the Grafana deployment, mounted at `/etc/grafana/provisioning/alerting/`. + +### 4. Create the `infra-alerts` Topic + +ntfy topics are created on first publish — no explicit setup needed. But verify that the topic works by sending a test notification. + +### 5. 
Verify End-to-End + +- Grafana UI shows the ntfy contact point under Alerting → Contact Points +- Notification policy shows the anti-noise settings +- Test notification from Grafana reaches the ntfy iOS app + +## Key Details + +- Grafana runs on minikube (indri), ntfy runs on k3s (ringtail). The contact point URL must go through Caddy: `https://ntfy.ops.eblu.me` (the `infra-alerts` topic is set in the JSON payload, not in the URL path) +- ntfy action buttons use the `X-Actions` header or JSON body format: `view, Open Runbook, <runbook-url>` +- Grafana provisioning files are applied on startup and cannot be edited from the UI (which is what we want for GitOps) + +## Verification + +- [ ] Grafana starts with unified alerting enabled +- [ ] Contact point `ntfy-infra` visible in Grafana UI +- [ ] Notification policy shows correct group/repeat intervals +- [ ] Test notification arrives on iOS via ntfy app +- [ ] Test notification includes a clickable runbook link + +## Related + +- [[deploy-infra-alerting]] — Parent goal +- [[first-alert-and-runbook]] — Next: create the first real alert diff --git a/docs/how-to/runbooks/deploy-infra-alerting.md b/docs/how-to/runbooks/deploy-infra-alerting.md new file mode 100644 index 0000000..e02523d --- /dev/null +++ b/docs/how-to/runbooks/deploy-infra-alerting.md @@ -0,0 +1,77 @@ +--- +title: Deploy Infrastructure Alerting Pipeline +modified: 2026-03-22 +tags: + - how-to + - alerting + - observability +--- + +# Deploy Infrastructure Alerting Pipeline + +Replace the manual `mise run services-check` approach with Grafana Unified Alerting backed by ntfy push notifications, so infrastructure problems page once and include actionable runbook links. 
+ +## Architecture + +``` +Prometheus (metrics) ──┐ + ├──▶ Grafana Alert Rules ──▶ ntfy webhook ──▶ iOS push +Loki (logs) ──────────┘ │ + │ + Notification Policy + (group_wait: 1m, + group_interval: 12h, + repeat_interval: 24h) +``` + +## Design Decisions + +| Decision | Choice | Rationale | +|----------|--------|-----------| +| **Alert engine** | Grafana Unified Alerting | Already deployed, no new service needed | +| **Notification** | ntfy webhook contact point | Already deployed on ringtail, iOS app works | +| **Anti-noise** | 24h repeat interval | Page once per day max per alert group | +| **Runbooks** | `docs/how-to/runbooks/.md` | Clickable link in every notification | +| **Provisioning** | Grafana provisioning YAML (GitOps) | Alerts defined in repo, not just UI | +| **Topic** | `infra-alerts` (separate from `frigate-alerts`) | Different severity/audience | + +## Alerting Policy + +- Each alert fires **once** and does not re-notify for 24 hours +- A "resolved" notification is sent when the condition clears +- Every alert annotation includes `runbook_url` linking to its how-to doc +- The ntfy message template renders the runbook URL as a clickable action button +- Alerts are grouped by service to avoid notification storms + +## Migration Path + +1. Stand up the pipeline: Grafana alerting config, ntfy contact point, notification policy, message template +2. Create the first alert + runbook as proof of concept (e.g., a blackbox probe failure) +3. Port services-check health checks to Grafana alert rules, one by one, each with a runbook +4. 
Refactor services-check to query the Grafana alerting API instead of doing its own probes + +## What services-check Covers Today + +These checks will be migrated to alert rules: + +| Category | Checks | Data Source | +|----------|--------|-------------| +| Local services (indri) | forgejo, alloy, borgmatic, zot via brew/launchctl | Need new probes or textfile metrics | +| Metrics textfiles | freshness of `.prom` files | Existing node_textfile metrics | +| K8s cluster health | minikube API, k3s API | kube-state-metrics | +| HTTP endpoints | ~12 services via Caddy | Alloy blackbox exporter (already exists) | +| Ringtail | SSH, tailscale, k3s health | Need new probes | +| K3s pods | ntfy, authentik, frigate, etc. | kube-state-metrics on ringtail | +| Public services | docs, cv, forge via Fly.io | Alloy on Fly.io or external probe | +| PostgreSQL | CNPG readiness | CNPG metrics (already scraped) | +| ArgoCD sync | app sync/health status | ArgoCD metrics or API | + +## Related + +- [[configure-grafana-alerting-pipeline]] — Foundation: contact point, policy, template +- [[first-alert-and-runbook]] — Proof of concept alert +- [[port-services-check-alerts]] — Systematic migration +- [[refactor-services-check-to-query-alerts]] — Final integration +- [[observability]] — Current observability stack +- [[ntfy]] — Push notification service +- [[grafana]] — Dashboard and alerting platform diff --git a/docs/how-to/runbooks/first-alert-and-runbook.md b/docs/how-to/runbooks/first-alert-and-runbook.md new file mode 100644 index 0000000..6ce13bf --- /dev/null +++ b/docs/how-to/runbooks/first-alert-and-runbook.md @@ -0,0 +1,68 @@ +--- +title: First Alert and Runbook +modified: 2026-03-22 +tags: + - how-to + - alerting +--- + +# First Alert and Runbook + +Create one end-to-end alert as proof of concept — an alert rule that fires, delivers a notification to ntfy with a runbook link, and has a corresponding runbook doc. + +## What to Do + +### 1. 
Choose the First Alert + +The best candidate is a **blackbox probe failure** because: +- Alloy's blackbox exporter already probes 5 services (miniflux, kiwix, transmission, devpi, argocd) at 30s intervals +- The metric `probe_success` is already in Prometheus +- It maps directly to what services-check does (HTTP health checks) +- A single alert rule with a `service` label can cover all probed services + +### 2. Create the Alert Rule + +Provision via YAML in the alerting provisioning ConfigMap. The rule should: +- Query `probe_success == 0` from Prometheus +- Fire after the condition persists for 2 minutes (avoid flapping) +- Include labels: `severity: warning`, `service: {{ $labels.instance }}` +- Include annotations: `summary`, `runbook_url` pointing to the runbook doc + +### 3. Create the Runbook + +Write `docs/how-to/runbooks/runbook-service-probe-failure.md` as a how-to doc explaining: +- What the alert means +- How to check which service is down +- Common causes and resolution steps +- How to silence the alert if the downtime is planned + +### 4. 
Verify End-to-End + +- Stop one of the probed services (e.g., scale miniflux to 0) +- Wait for the alert to fire (~2 minutes) +- Confirm ntfy notification arrives with correct summary and runbook link +- Click the runbook link and verify it reaches docs.eblu.me +- Scale the service back up +- Confirm "resolved" notification arrives +- Confirm no repeat notification during the 24h window + +## Key Details + +- Grafana alert rules can be provisioned as YAML files alongside contact points and notification policies +- The blackbox probe metrics from Alloy carry a job label of the form `integrations/blackbox/<target-name>`; the alert rule extracts the service name from that label with `label_replace` +- The runbook URL format: `https://docs.eblu.me/how-to/runbooks/runbook-service-probe-failure` + +## Verification + +- [ ] Alert rule appears in Grafana UI under Alerting → Alert Rules +- [ ] Simulated failure triggers ntfy notification within ~3 minutes +- [ ] Notification includes service name, summary, and clickable runbook link +- [ ] Resolution triggers a "resolved" notification +- [ ] No repeat notification within 24h window + +## Related + +- [[configure-grafana-alerting-pipeline]] — Prerequisite: pipeline must be working +- [[deploy-infra-alerting]] — Parent goal +- [[port-services-check-alerts]] — Next: port remaining checks +- [[runbook-service-probe-failure]] — The runbook created for this alert diff --git a/docs/how-to/runbooks/port-services-check-alerts.md b/docs/how-to/runbooks/port-services-check-alerts.md new file mode 100644 index 0000000..4420f58 --- /dev/null +++ b/docs/how-to/runbooks/port-services-check-alerts.md @@ -0,0 +1,74 @@ +--- +title: Port services-check Alerts to Grafana +modified: 2026-03-22 +tags: + - how-to + - alerting +--- + +# Port services-check Alerts to Grafana + +Systematically migrate the health checks from `mise run services-check` to Grafana alert rules, each with a corresponding runbook. After this card, the alerting system covers everything services-check does today. 
+ +## What to Do + +### 1. Inventory and Prioritize + +Map each services-check probe to a data source and alert rule. Some checks already have metrics in Prometheus; others need new instrumentation. + +**Already have metrics (easy):** +- HTTP endpoint probes → Alloy blackbox exporter (`probe_success`) +- PostgreSQL health → CNPG metrics (`cnpg_pg_replication_streaming`, `cnpg_collector_up`) +- K8s pod health → kube-state-metrics (`kube_pod_status_phase`) +- ArgoCD sync status → ArgoCD metrics (`argocd_app_info` with sync/health labels) + +**Need new probes or metrics:** +- Local indri services (forgejo, alloy, borgmatic, zot via brew/launchctl) → Alloy host textfile or new probes +- Metrics textfile freshness → `node_textfile_mtime_seconds` (already collected by Alloy on indri) +- Ringtail SSH/tailscale health → Alloy blackbox on ringtail or cross-cluster probe +- Public services (docs, cv, forge via Fly.io) → Alloy on Fly.io or Grafana synthetic monitoring + +### 2. Add Missing Probes + +Extend Alloy configurations where needed: +- **Alloy on indri:** Add blackbox targets for forgejo, zot (local HTTP endpoints) +- **Alloy on ringtail:** Add blackbox targets for ringtail-local services +- **Consider:** Whether public endpoint probing belongs in Fly.io Alloy or a separate prober + +### 3. Create Alert Rules + +For each check category, create provisioned Grafana alert rules. Group related checks into alert rule groups (e.g., "indri-services", "k8s-health", "public-endpoints"). + +### 4. Create Runbooks + +One runbook per alert type in `docs/how-to/runbooks/runbook-.md`. Each runbook should cover: +- What the alert means +- Diagnostic steps +- Common fixes +- How to silence for planned maintenance + +### 5. Remove from services-check + +As each check is ported, remove it from the services-check script (or mark it as "now handled by alerting"). The goal is that services-check shrinks as alerting grows. 
+ +## Key Details + +- Don't try to port everything in one session — this card may span multiple work cycles within the C2 chain +- Prioritize checks that have caught real problems in the past +- Some checks (like ArgoCD sync status table) may remain in services-check as a human-readable summary even after alerting covers the failure cases +- The Alloy blackbox exporter on k8s already covers 5 services; extending it to more is straightforward + +## Verification + +- [ ] All HTTP endpoint checks from services-check have corresponding alert rules +- [ ] Pod health checks have corresponding alert rules +- [ ] PostgreSQL health has a corresponding alert rule +- [ ] Each alert rule has a runbook doc in `docs/how-to/runbooks/` +- [ ] Test at least 2-3 failure scenarios end-to-end +- [ ] services-check script has been updated to reflect ported checks + +## Related + +- [[first-alert-and-runbook]] — Prerequisite: established the pattern +- [[deploy-infra-alerting]] — Parent goal +- [[refactor-services-check-to-query-alerts]] — Next: make services-check query alerts diff --git a/docs/how-to/runbooks/refactor-services-check-to-query-alerts.md b/docs/how-to/runbooks/refactor-services-check-to-query-alerts.md new file mode 100644 index 0000000..244be1f --- /dev/null +++ b/docs/how-to/runbooks/refactor-services-check-to-query-alerts.md @@ -0,0 +1,53 @@ +--- +title: Refactor services-check to Query Alerts +modified: 2026-03-22 +tags: + - how-to + - alerting +--- + +# Refactor services-check to Query Alerts + +Change `mise run services-check` from doing its own health probes to querying the Grafana alerting API for currently firing alerts. The script becomes a CLI view into the same alerting system that sends ntfy notifications. + +## What to Do + +### 1. 
Query the Grafana Alerting API + +Grafana exposes alert state via: +- `GET /api/v1/provisioning/alert-rules` — all configured rules +- `GET /api/prometheus/grafana/api/v1/alerts` — currently firing alerts (Prometheus-compatible format) + +The second endpoint is simpler — it returns only active alerts with labels and annotations, similar to Alertmanager's `/api/v1/alerts`. + +### 2. Rewrite services-check + +The new services-check should: +1. Query the Grafana alerting API for firing alerts +2. Display them in a table with service name, alert name, duration, and runbook link +3. If no alerts are firing, print a green "all clear" message +4. Exit 0 if no alerts, exit 1 if any are firing +5. Optionally keep a few checks that don't map to alerting (e.g., the ArgoCD sync status table as a summary view) + +### 3. Handle Authentication + +services-check will need a Grafana API token or service account token. Options: +- Use the existing Grafana admin credentials from 1Password (`op read`) +- Create a dedicated read-only service account in Grafana + +### 4. Preserve the ArgoCD Summary + +The ArgoCD sync/health table in services-check is a useful quick view even when nothing is alerting. Consider keeping it as a separate section that always displays, independent of the alert query. 
+ +## Verification + +- [ ] `mise run services-check` queries Grafana instead of doing direct probes +- [ ] Firing alerts are displayed with service name, alert name, and runbook link +- [ ] Exit code reflects alert state (0 = clear, 1 = firing) +- [ ] Works when Grafana is unreachable (graceful error, not a crash) +- [ ] ArgoCD summary table still works + +## Related + +- [[port-services-check-alerts]] — Prerequisite: alerts must exist to query +- [[deploy-infra-alerting]] — Parent goal diff --git a/docs/how-to/runbooks/runbook-argocd-out-of-sync.md b/docs/how-to/runbooks/runbook-argocd-out-of-sync.md new file mode 100644 index 0000000..753b336 --- /dev/null +++ b/docs/how-to/runbooks/runbook-argocd-out-of-sync.md @@ -0,0 +1,65 @@ +--- +title: "Runbook: ArgoCD App Out of Sync" +modified: 2026-03-22 +tags: + - how-to + - alerting + - runbook +--- + +# Runbook: ArgoCD App Out of Sync + +**Alert name:** `ArgoCDAppOutOfSync` + +An ArgoCD application has been out of sync for 30+ minutes. This means the live state in Kubernetes differs from what's declared in Git. + +## Diagnostic Steps + +1. **Check which app is out of sync** — the `name` label in the alert tells you: + ```fish + argocd app get + ``` + +2. **View the diff**: + ```fish + argocd app diff + ``` + +3. **Check if it's a branch revision issue** — during C1/C2 work, apps may be pointed at a feature branch. After merge, they need to be reset to main: + ```fish + argocd app get -o json | python3 -c "import json,sys; print(json.load(sys.stdin)['spec']['source']['targetRevision'])" + ``` + +4. **Check ArgoCD UI** — https://argocd.ops.eblu.me — look for sync errors or degraded status. 
+ +## Common Causes + +- **Forgot to sync after push** — ArgoCD uses manual sync; changes require explicit `argocd app sync` +- **Branch revision not reset after PR merge** — app still points at a deleted branch +- **Kustomize/manifest error** — invalid YAML or unsatisfiable resource requirements +- **Pruning needed** — old ConfigMaps from `configMapGenerator` need pruning + +## Resolution + +```fish +# Simple sync +argocd app sync + +# If pruning is needed +argocd app sync --prune + +# If stuck on a deleted branch +argocd app set --revision main +argocd app sync +``` + +## Silencing + +During active C1/C2 development, apps may intentionally be out of sync: +1. Grafana → Alerting → Silences → Create Silence +2. Match `alertname = ArgoCDAppOutOfSync` and `name = ` + +## Related + +- [[argocd]] — ArgoCD reference +- [[deploy-infra-alerting]] — Alerting pipeline overview diff --git a/docs/how-to/runbooks/runbook-frigate-camera-down.md b/docs/how-to/runbooks/runbook-frigate-camera-down.md new file mode 100644 index 0000000..ea04e79 --- /dev/null +++ b/docs/how-to/runbooks/runbook-frigate-camera-down.md @@ -0,0 +1,39 @@ +--- +title: "Runbook: Frigate Camera Down" +modified: 2026-03-22 +tags: + - how-to + - alerting + - runbook +--- + +# Runbook: Frigate Camera Down + +**Alert name:** `FrigateCameraDown` + +A Frigate camera has reported 0 FPS for 5+ minutes, meaning the camera feed is not being received. + +## Diagnostic Steps + +1. **Check Frigate UI** — https://nvr.ops.eblu.me — look at the camera thumbnail and status +2. **Check Frigate API stats**: + ```fish + curl -s https://nvr.ops.eblu.me/api/stats | python3 -m json.tool + ``` +3. **Check Frigate pod logs** on ringtail: + ```fish + kubectl logs -n frigate -l app=frigate --context=k3s-ringtail --tail=30 + ``` +4. **Check the camera itself** — verify it's powered on and network-connected. Try accessing the RTSP stream directly. 
+ +## Common Causes + +- **Camera offline** — power outage, network issue, or camera crash +- **NFS mount lost** — Frigate storage on sifaka; if the NFS mount drops, recording stops and FPS may drop +- **Frigate pod restart** — during restart, camera FPS briefly drops to 0 +- **RTSP stream timeout** — camera firmware issue; power cycle the camera + +## Related + +- [[frigate]] — Frigate NVR reference +- [[deploy-infra-alerting]] — Alerting pipeline overview diff --git a/docs/how-to/runbooks/runbook-pod-not-ready.md b/docs/how-to/runbooks/runbook-pod-not-ready.md new file mode 100644 index 0000000..49dd35e --- /dev/null +++ b/docs/how-to/runbooks/runbook-pod-not-ready.md @@ -0,0 +1,55 @@ +--- +title: "Runbook: Pod Not Ready" +modified: 2026-03-22 +tags: + - how-to + - alerting + - runbook +--- + +# Runbook: Pod Not Ready + +**Alert name:** `PodNotReady` + +A Kubernetes pod has been in a not-ready state for 5+ minutes. + +## Diagnostic Steps + +1. **Identify the pod** from the alert labels (`pod`, `namespace`): + ```fish + kubectl describe pod <pod> -n <namespace> --context=minikube-indri + ``` + +2. **Check events** — look for scheduling failures, image pull errors, or probe failures: + ```fish + kubectl get events -n <namespace> --context=minikube-indri --sort-by='.lastTimestamp' | tail -20 + ``` + +3. **Check logs**: + ```fish + kubectl logs <pod> -n <namespace> --context=minikube-indri --tail=50 + ``` + +4. **Check node resources**: + ```fish + kubectl top nodes --context=minikube-indri + kubectl top pods -n <namespace> --context=minikube-indri + ``` + +## Common Causes + +- **CrashLoopBackOff** — app is crashing on startup, check logs +- **ImagePullBackOff** — container image not found or registry unreachable +- **Pending** — insufficient resources (CPU/memory), or PVC not bound +- **Readiness probe failing** — service is running but not healthy +- **NFS mount issue** — services depending on sifaka (kiwix, transmission, navidrome, jellyfin) will fail if NFS is down + +## Silencing + +1. 
Grafana → Alerting → Silences → Create Silence +2. Match `alertname = PodNotReady` +3. Optionally match `namespace = <namespace>` to silence a specific service + +## Related + +- [[deploy-infra-alerting]] — Alerting pipeline overview diff --git a/docs/how-to/runbooks/runbook-postgres-unhealthy.md b/docs/how-to/runbooks/runbook-postgres-unhealthy.md new file mode 100644 index 0000000..2910851 --- /dev/null +++ b/docs/how-to/runbooks/runbook-postgres-unhealthy.md @@ -0,0 +1,63 @@ +--- +title: "Runbook: PostgreSQL Cluster Unhealthy" +modified: 2026-03-22 +tags: + - how-to + - alerting + - runbook +--- + +# Runbook: PostgreSQL Cluster Unhealthy + +**Alert name:** `PostgresClusterUnhealthy` + +The CNPG collector metrics endpoint is down, indicating the PostgreSQL cluster is not responding. + +## Affected Services + +The `blumeops-pg` CNPG cluster on indri's minikube runs databases for: +- TeslaMate +- Authentik (cross-cluster from ringtail) +- Immich +- Grafana dashboards (TeslaMate datasource) + +## Diagnostic Steps + +1. **Check CNPG cluster status**: + ```fish + kubectl get cluster blumeops-pg -n databases --context=minikube-indri + kubectl get pods -n databases -l cnpg.io/cluster=blumeops-pg --context=minikube-indri + ``` + +2. **Check pod logs**: + ```fish + kubectl logs -n databases -l cnpg.io/cluster=blumeops-pg --context=minikube-indri --tail=30 + ``` + +3. **Check whether PostgreSQL accepts connections** (`pg_isready`): + ```fish + pg_isready -h pg.ops.eblu.me -p 5432 + ``` + +4. **Check PVC storage**: + ```fish + kubectl get pvc -n databases --context=minikube-indri + ``` + +## Common Causes + +- **Pod crash** — OOM, disk full, or configuration error +- **PVC storage full** — check with `kubectl exec` into the pod and `df -h` +- **Minikube issue** — if the node is under memory pressure, CNPG pods may be evicted +- **Network** — Caddy L4 proxy (`pg.ops.eblu.me`) may be misconfigured + +## Silencing + +For planned database maintenance: +1. Grafana → Alerting → Silences → Create Silence +2. 
Match `alertname = PostgresClusterUnhealthy` + +## Related + +- [[postgresql]] — CNPG cluster reference +- [[deploy-infra-alerting]] — Alerting pipeline overview diff --git a/docs/how-to/runbooks/runbook-service-probe-failure.md b/docs/how-to/runbooks/runbook-service-probe-failure.md new file mode 100644 index 0000000..575606e --- /dev/null +++ b/docs/how-to/runbooks/runbook-service-probe-failure.md @@ -0,0 +1,75 @@ +--- +title: "Runbook: Service Probe Failure" +modified: 2026-03-22 +tags: + - how-to + - alerting + - runbook +--- + +# Runbook: Service Probe Failure + +**Alert name:** `ServiceProbeFailure` + +A blackbox HTTP health check has failed for 2+ minutes, meaning a service is not responding to its health endpoint. + +## Affected Services + +This alert covers services probed by the Alloy blackbox exporter on indri's minikube cluster: + +| Service | Health Endpoint | +|---------|----------------| +| miniflux | `/healthcheck` | +| kiwix | `/` | +| transmission | `/transmission/web/` | +| devpi | `/+api` | +| argocd | `/healthz` | + +The failing service is identified by the `service` label in the alert (extracted from the `job` label). + +## Diagnostic Steps + +1. **Check which service is down** — the alert label `service` tells you. You can also run: + ```fish + kubectl get pods -n <namespace> --context=minikube-indri + ``` + +2. **Check pod status** — look for CrashLoopBackOff, OOMKilled, or pending pods: + ```fish + kubectl describe pod <pod> -n <namespace> --context=minikube-indri + ``` + +3. **Check pod logs**: + ```fish + kubectl logs <pod> -n <namespace> --context=minikube-indri --tail=50 + ``` + +4. **Check if minikube itself is healthy**: + ```fish + ssh indri 'minikube status' + ``` + +5. **Check NFS mounts** (kiwix, transmission depend on sifaka NFS): + ```fish + ssh indri 'df -h | grep Volumes' + ``` + +## Common Causes + +- **Pod crashed** — check logs, restart with `kubectl delete pod` +- **NFS mount lost** — sifaka offline or AutoMounter not running. 
SSH to indri and check `/Volumes/` +- **Resource exhaustion** — check `kubectl top pods -n <namespace>` for memory/CPU pressure +- **Minikube paused/stopped** — `ssh indri 'minikube status'`, restart if needed + +## Silencing + +For planned maintenance, silence this alert in Grafana: +1. Go to Alerting → Silences → Create Silence +2. Match label `alertname = ServiceProbeFailure` +3. Optionally match `service = <service>` to silence only one service +4. Set duration for your maintenance window + +## Related + +- [[deploy-infra-alerting]] — Alerting pipeline overview +- [[configure-grafana-alerting-pipeline]] — Pipeline configuration diff --git a/docs/how-to/runbooks/runbook-textfile-stale.md b/docs/how-to/runbooks/runbook-textfile-stale.md new file mode 100644 index 0000000..2a70adf --- /dev/null +++ b/docs/how-to/runbooks/runbook-textfile-stale.md @@ -0,0 +1,58 @@ +--- +title: "Runbook: Textfile Stale" +modified: 2026-03-22 +tags: + - how-to + - alerting + - runbook +--- + +# Runbook: Textfile Stale + +**Alert name:** `TextfileStale` + +A Prometheus textfile collector `.prom` file on indri has not been updated for over 1 hour, indicating the metrics exporter script has stopped running. + +## Affected Textfiles + +| File | LaunchAgent | What it monitors | +|------|-------------|------------------| +| `borgmatic.prom` | `mcquack.eblume.borgmatic` | Backup status | +| `zot.prom` | `mcquack.eblume.zot` | Container registry | +| `minikube.prom` | `mcquack.minikube-metrics` | Minikube cluster status | +| `jellyfin.prom` | `mcquack.eblume.jellyfin-metrics` | Media server | + +## Diagnostic Steps + +1. **Check which file is stale** — the `file` label in the alert tells you. Verify on indri: + ```fish + ssh indri 'ls -la /opt/homebrew/var/node_exporter/textfile/' + ``` + +2. **Check if the LaunchAgent is running**: + ```fish + ssh indri 'launchctl list | grep mcquack' + ``` + +3. 
**Check LaunchAgent logs** (plist defines stdout/stderr paths): + ```fish + ssh indri 'cat ~/Library/Logs/mcquack/<name>.log' + ``` + +4. **Try running the exporter manually**: + ```fish + ssh indri 'cat ~/Library/LaunchAgents/mcquack.<name>.plist' + # Find the ProgramArguments, run them manually + ``` + +## Common Causes + +- **LaunchAgent not loaded** — `launchctl load ~/Library/LaunchAgents/mcquack.<name>.plist` +- **Script error** — the exporter script crashed; check logs +- **Permissions** — the textfile directory is not writable +- **Indri reboot** — some LaunchAgents may not auto-start + +## Related + +- [[alloy]] — Collects textfile metrics via `prometheus.exporter.unix` +- [[deploy-infra-alerting]] — Alerting pipeline overview diff --git a/docs/reference/operations/observability.md b/docs/reference/operations/observability.md index 5890147..35136d5 100644 --- a/docs/reference/operations/observability.md +++ b/docs/reference/operations/observability.md @@ -1,6 +1,6 @@ --- title: Observability -modified: 2026-02-07 +modified: 2026-03-22 tags: - operations --- @@ -16,3 +16,13 @@ Metrics, logs, traces, and dashboards for BlumeOps infrastructure. 
- [[tempo]] - Distributed tracing - [[alloy|Alloy]] - Metrics, log, and trace collection - [[grafana]] - Dashboards and visualization + +## Alerting + +- [[deploy-infra-alerting]] - Alerting pipeline (Grafana Unified Alerting → ntfy) +- [[runbook-service-probe-failure]] - Service health check failure runbook +- [[runbook-postgres-unhealthy]] - PostgreSQL cluster health runbook +- [[runbook-pod-not-ready]] - Pod not ready runbook +- [[runbook-textfile-stale]] - Metrics textfile freshness runbook +- [[runbook-frigate-camera-down]] - Frigate camera health runbook +- [[runbook-argocd-out-of-sync]] - ArgoCD sync status runbook diff --git a/mise-tasks/services-check b/mise-tasks/services-check index 94ced03..9ba2c8e 100755 --- a/mise-tasks/services-check +++ b/mise-tasks/services-check @@ -6,6 +6,7 @@ set -euo pipefail # Colors for output RED='\033[0;31m' GREEN='\033[0;32m' +YELLOW='\033[0;33m' NC='\033[0m' # No Color FAILED=0 @@ -36,11 +37,88 @@ check_http() { fi } +# ============== Grafana Alerting API ============== + +GRAFANA_URL="https://grafana.ops.eblu.me" +GRAFANA_CREDS="" + +fetch_alerts() { + if [ -z "$GRAFANA_CREDS" ]; then + local pass + pass=$(op read 'op://vg6xf6vvfmoh5hqjjhlhbeoaie/oxkcr3xtxnewy7noep2izvyr6y/password' 2>/dev/null) || true + if [ -n "$pass" ]; then + GRAFANA_CREDS=$(echo -n "admin:$pass" | base64) + fi + fi + + if [ -z "$GRAFANA_CREDS" ]; then + echo "" + return + fi + + curl -sf --max-time 10 \ + -H "Authorization: Basic $GRAFANA_CREDS" \ + "$GRAFANA_URL/api/prometheus/grafana/api/v1/alerts" 2>/dev/null || echo "" +} + +# Fetch all alerts once +ALERTS_JSON=$(fetch_alerts) + +check_alert() { + local name="$1" + local alertname="$2" + # Optional: filter by a label key=value + local filter_key="${3:-}" + local filter_value="${4:-}" + + printf "%-24s " "$name..." 
+ + if [ -z "$ALERTS_JSON" ]; then + echo -e "${YELLOW}NO DATA${NC} (can't reach Grafana alerting API)" + return + fi + + local firing + firing=$(echo "$ALERTS_JSON" | python3 -c " +import json, sys +try: + data = json.load(sys.stdin) +except: + sys.exit(1) +alerts = data.get('data', {}).get('alerts', []) +for a in alerts: + if a['labels'].get('alertname') != '$alertname': + continue + if '$filter_key' and a['labels'].get('$filter_key') != '$filter_value': + continue + if a['state'] in ('Alerting', 'Pending'): + url = a.get('annotations', {}).get('runbook_url', '') + summary = a.get('annotations', {}).get('summary', '') + print(f'{summary}|{url}') +" 2>/dev/null) + + if [ -z "$firing" ]; then + echo -e "${GREEN}OK${NC}" + else + local summary runbook + summary=$(echo "$firing" | head -1 | cut -d'|' -f1) + runbook=$(echo "$firing" | head -1 | cut -d'|' -f2) + echo -e "${RED}FIRING${NC}" + if [ -n "$summary" ]; then + echo -e " $summary" + fi + if [ -n "$runbook" ]; then + echo -e " Runbook: $runbook" + fi + FAILED=1 + fi +} + echo "Checking services..." 
echo "====================" echo "" -# Local services on indri +# Local services on indri (not yet covered by alerting) echo "Local services on indri:" check_service "forgejo (brew)" "ssh indri 'brew services list | grep forgejo | grep started'" check_service "alloy" "ssh indri 'launchctl list mcquack.eblume.alloy | grep -v \"^-\"'" @@ -52,43 +130,47 @@ check_service "minikube-metrics" "ssh indri 'launchctl list mcquack.minikube-met check_service "jellyfin-metrics" "ssh indri 'launchctl list mcquack.eblume.jellyfin-metrics | grep -v \"^-\"'" echo "" -echo "Metrics textfiles:" -check_service "borgmatic.prom" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/borgmatic.prom'" -check_service "zot.prom" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/zot.prom'" -check_service "minikube.prom" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/minikube.prom'" -check_service "jellyfin.prom" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/jellyfin.prom'" +echo "Metrics textfiles (via alerting):" +check_alert "textfile-freshness" "TextfileStale" echo "" -echo "Kubernetes cluster:" +echo "Kubernetes cluster (not yet covered by alerting):" check_service "minikube" "ssh indri 'minikube status --format={{.Host}} | grep -q Running'" check_service "k8s-apiserver (indri)" "ssh indri 'kubectl get --raw /healthz'" check_service "k8s-apiserver (remote)" "kubectl --kubeconfig=$HOME/.kube/minikube-indri/config.yml --context=minikube-indri get --raw /healthz" echo "" -echo "HTTP endpoints (via Caddy):" -check_http "Prometheus" "https://prometheus.ops.eblu.me/-/healthy" -check_http "Loki" "https://loki.ops.eblu.me/ready" -check_http "Grafana" "https://grafana.ops.eblu.me/api/health" -check_http "ArgoCD" "https://argocd.ops.eblu.me/healthz" +echo "HTTP endpoints (via alerting):" +check_alert "Prometheus" "ServiceProbeFailure" "service" "prometheus" +check_alert "Loki" "ServiceProbeFailure" "service" "loki" +check_alert "Grafana" 
"ServiceProbeFailure" "service" "grafana" +check_alert "ArgoCD" "ServiceProbeFailure" "service" "argocd" +check_alert "Kiwix" "ServiceProbeFailure" "service" "kiwix" +check_alert "Miniflux" "ServiceProbeFailure" "service" "miniflux" +check_alert "TeslaMate" "ServiceProbeFailure" "service" "teslamate" +check_alert "Devpi" "ServiceProbeFailure" "service" "devpi" +check_alert "Transmission" "ServiceProbeFailure" "service" "transmission" +check_alert "Immich" "ServiceProbeFailure" "service" "immich" +check_alert "Navidrome" "ServiceProbeFailure" "service" "navidrome" + +echo "" +echo "HTTP endpoints (not yet covered by alerting):" check_http "Forgejo" "https://forge.eblu.me/" check_http "Zot Registry" "https://registry.ops.eblu.me/v2/_catalog" -check_http "Kiwix" "https://kiwix.ops.eblu.me/" -check_http "Miniflux" "https://feed.ops.eblu.me/healthcheck" -check_http "TeslaMate" "https://tesla.ops.eblu.me/" -check_http "Devpi" "https://pypi.ops.eblu.me/+api" -check_http "Transmission" "https://torrent.ops.eblu.me/" -check_http "Immich" "https://photos.ops.eblu.me/" -check_http "Navidrome" "https://dj.ops.eblu.me/" check_http "CV" "https://cv.ops.eblu.me/" check_http "Ntfy" "https://ntfy.ops.eblu.me/v1/health" check_http "Authentik" "https://authentik.ops.eblu.me/-/health/live/" check_http "Frigate" "https://nvr.ops.eblu.me/api/version" -check_service "frigate-camera-fps" "curl -sf --max-time 5 https://nvr.ops.eblu.me/api/stats | jq -e '.cameras | to_entries | all(.value.camera_fps > 0)'" -check_service "frigate-storage" "curl -sf --max-time 5 https://nvr.ops.eblu.me/api/stats | jq -e '.service.storage | to_entries | map(select(.key | startswith(\"/media\"))) | length > 0 and all(.[]; .value.free > 0)'" check_http "JobSync" "https://jobsync.ops.eblu.me/" echo "" -echo "Ringtail (NixOS):" +echo "Frigate (via alerting):" +check_alert "camera-fps" "FrigateCameraDown" +echo "Frigate (not yet covered by alerting):" +check_service "frigate-storage" "curl -sf --max-time 5 
https://nvr.ops.eblu.me/api/stats | jq -e '.service.storage | to_entries | map(select(.key | startswith(\"/media\"))) | length > 0 and all(.[]; .value.free > 0)'" + +echo "" +echo "Ringtail (not yet covered by alerting):" check_service "ssh" "ssh -o ConnectTimeout=5 ringtail true" check_service "tailscale" "ssh ringtail 'tailscale status --self --json' | jq -e '.Self.Online' > /dev/null" check_service "k3s" "ssh ringtail 'KUBECONFIG=/etc/rancher/k3s/k3s.yaml k3s kubectl get nodes --no-headers | grep -q Ready'" @@ -96,43 +178,30 @@ check_service "k3s-apiserver (remote)" "kubectl --context=k3s-ringtail get --raw check_service "forgejo-runner" "ssh ringtail 'systemctl is-active gitea-runner-nix_container_builder.service'" echo "" -echo "Ringtail k3s pods:" -check_service "ntfy" "kubectl --context=k3s-ringtail -n ntfy get pods -l app=ntfy -o jsonpath='{.items[0].status.phase}' | grep -q Running" -check_service "authentik" "kubectl --context=k3s-ringtail -n authentik get pods -l component=server -o jsonpath='{.items[0].status.phase}' | grep -q Running" -check_service "frigate" "kubectl --context=k3s-ringtail -n frigate get pods -l app=frigate -o jsonpath='{.items[0].status.phase}' | grep -q Running" -check_service "frigate-notify" "kubectl --context=k3s-ringtail -n frigate get pods -l app=frigate-notify -o jsonpath='{.items[0].status.phase}' | grep -q Running" -check_service "nvidia-device-plugin" "kubectl --context=k3s-ringtail -n nvidia-device-plugin get pods -l app=nvidia-device-plugin -o jsonpath='{.items[0].status.phase}' | grep -q Running" -check_service "jobsync" "kubectl --context=k3s-ringtail -n jobsync get pods -l app=jobsync -o jsonpath='{.items[0].status.phase}' | grep -q Running" +echo "Pod health (via alerting):" +check_alert "pod-readiness" "PodNotReady" echo "" -echo "Public services (via Fly.io):" +echo "Database (via alerting):" +check_alert "PostgreSQL" "PostgresClusterUnhealthy" + +echo "" +echo "Public services (not yet covered by alerting):" 
check_http "Docs (public)" "https://docs.eblu.me/" check_http "CV (public)" "https://cv.eblu.me/" check_http "Forge (public)" "https://forge.eblu.me/" check_http "Fly.io healthz" "https://blumeops-proxy.fly.dev/healthz" echo "" -echo "Database:" -check_service "PostgreSQL (k8s)" "pg_isready -h pg.ops.eblu.me -p 5432" - -echo "" -echo "Indri minikube pods:" -check_service "prometheus-0" "kubectl --context=minikube-indri -n monitoring get pod prometheus-0 -o jsonpath='{.status.phase}' | grep -q Running" -check_service "loki-0" "kubectl --context=minikube-indri -n monitoring get pod loki-0 -o jsonpath='{.status.phase}' | grep -q Running" -check_service "grafana" "kubectl --context=minikube-indri -n monitoring get pods -l app.kubernetes.io/name=grafana -o jsonpath='{.items[0].status.phase}' | grep -q Running" -check_service "miniflux" "kubectl --context=minikube-indri -n miniflux get pods -l app=miniflux -o jsonpath='{.items[0].status.phase}' | grep -q Running" -check_service "teslamate" "kubectl --context=minikube-indri -n teslamate get pods -l app=teslamate -o jsonpath='{.items[0].status.phase}' | grep -q Running" -check_service "blumeops-pg" "kubectl --context=minikube-indri -n databases get pods -l cnpg.io/cluster=blumeops-pg -o jsonpath='{.items[0].status.phase}' | grep -q Running" - -echo "" -echo "ArgoCD app sync status:" +echo "ArgoCD app sync status (via alerting):" +check_alert "argocd-sync" "ArgoCDAppOutOfSync" +# Keep the detailed table as a summary view printf "%-20s %-12s %-12s %s\n" "NAME" "SYNC" "HEALTH" "TARGET" while read -r name sync health target; do if [[ "$sync" == "Synced" ]]; then printf "%-20s ${GREEN}%-12s${NC} %-12s %s\n" "$name" "$sync" "$health" "$target" elif [[ "$sync" == "OutOfSync" ]]; then printf "%-20s ${RED}%-12s${NC} %-12s %s\n" "$name" "$sync" "$health" "$target" - FAILED=1 else printf "%-20s %-12s %-12s %s\n" "$name" "$sync" "$health" "$target" fi