diff --git a/argocd/manifests/grafana/alerting.yaml b/argocd/manifests/grafana/alerting.yaml index c0f0496..abc4c0f 100644 --- a/argocd/manifests/grafana/alerting.yaml +++ b/argocd/manifests/grafana/alerting.yaml @@ -40,7 +40,7 @@ groups: annotations: summary: >- {{ index $labels "service" }} health check is failing - runbook_url: https://docs.eblu.me/how-to/alerts/runbook-service-probe-failure + runbook_url: https://docs.eblu.me/how-to/runbooks/runbook-service-probe-failure labels: severity: warning data: @@ -98,7 +98,7 @@ groups: annotations: summary: >- Metrics textfile {{ index $labels "file" }} has not been updated in over 1 hour - runbook_url: https://docs.eblu.me/how-to/alerts/runbook-textfile-stale + runbook_url: https://docs.eblu.me/how-to/runbooks/runbook-textfile-stale labels: severity: warning service: indri-metrics @@ -156,7 +156,7 @@ groups: annotations: summary: >- Frigate camera {{ index $labels "camera_name" }} has 0 FPS - runbook_url: https://docs.eblu.me/how-to/alerts/runbook-frigate-camera-down + runbook_url: https://docs.eblu.me/how-to/runbooks/runbook-frigate-camera-down labels: severity: warning service: frigate @@ -213,7 +213,7 @@ groups: annotations: summary: >- PostgreSQL cluster {{ index $labels "cluster" }} is unhealthy - runbook_url: https://docs.eblu.me/how-to/alerts/runbook-postgres-unhealthy + runbook_url: https://docs.eblu.me/how-to/runbooks/runbook-postgres-unhealthy labels: severity: critical service: postgresql @@ -270,7 +270,7 @@ groups: annotations: summary: >- Pod {{ index $labels "pod" }} in {{ index $labels "namespace" }} is not ready - runbook_url: https://docs.eblu.me/how-to/alerts/runbook-pod-not-ready + runbook_url: https://docs.eblu.me/how-to/runbooks/runbook-pod-not-ready labels: severity: warning data: @@ -329,7 +329,7 @@ groups: annotations: summary: >- ArgoCD app {{ index $labels "name" }} is {{ index $labels "sync_status" }} - runbook_url: https://docs.eblu.me/how-to/alerts/runbook-argocd-out-of-sync + runbook_url: 
https://docs.eblu.me/how-to/runbooks/runbook-argocd-out-of-sync labels: severity: warning service: argocd diff --git a/docs/changelog.d/mikado-deploy-infra-alerting.feature.md b/docs/changelog.d/mikado-deploy-infra-alerting.feature.md new file mode 100644 index 0000000..7106014 --- /dev/null +++ b/docs/changelog.d/mikado-deploy-infra-alerting.feature.md @@ -0,0 +1 @@ +Deploy infrastructure alerting pipeline using Grafana Unified Alerting with ntfy push notifications. 7 alert rules with runbooks covering service health, pod readiness, PostgreSQL, textfile freshness, Frigate cameras, and ArgoCD sync status. services-check now queries the alerting API for covered checks. diff --git a/docs/how-to/alerts/configure-grafana-alerting-pipeline.md b/docs/how-to/runbooks/configure-grafana-alerting-pipeline.md similarity index 100% rename from docs/how-to/alerts/configure-grafana-alerting-pipeline.md rename to docs/how-to/runbooks/configure-grafana-alerting-pipeline.md diff --git a/docs/how-to/alerts/deploy-infra-alerting.md b/docs/how-to/runbooks/deploy-infra-alerting.md similarity index 94% rename from docs/how-to/alerts/deploy-infra-alerting.md rename to docs/how-to/runbooks/deploy-infra-alerting.md index 7c2e7f0..e02523d 100644 --- a/docs/how-to/alerts/deploy-infra-alerting.md +++ b/docs/how-to/runbooks/deploy-infra-alerting.md @@ -1,10 +1,6 @@ --- title: Deploy Infrastructure Alerting Pipeline modified: 2026-03-22 -status: active -branch: mikado/deploy-infra-alerting -requires: - - refactor-services-check-to-query-alerts tags: - how-to - alerting @@ -35,7 +31,7 @@ Loki (logs) ──────────┘ │ | **Alert engine** | Grafana Unified Alerting | Already deployed, no new service needed | | **Notification** | ntfy webhook contact point | Already deployed on ringtail, iOS app works | | **Anti-noise** | 24h repeat interval | Page once per day max per alert group | -| **Runbooks** | `docs/how-to/alerts/.md` | Clickable link in every notification | +| **Runbooks** | 
`docs/how-to/runbooks/<alert-name>.md` | Clickable link in every notification | | **Provisioning** | Grafana provisioning YAML (GitOps) | Alerts defined in repo, not just UI | | **Topic** | `infra-alerts` (separate from `frigate-alerts`) | Different severity/audience | diff --git a/docs/how-to/alerts/first-alert-and-runbook.md b/docs/how-to/runbooks/first-alert-and-runbook.md similarity index 91% rename from docs/how-to/alerts/first-alert-and-runbook.md rename to docs/how-to/runbooks/first-alert-and-runbook.md index 71b86bf..6ce13bf 100644 --- a/docs/how-to/alerts/first-alert-and-runbook.md +++ b/docs/how-to/runbooks/first-alert-and-runbook.md @@ -1,8 +1,6 @@ --- title: First Alert and Runbook modified: 2026-03-22 -requires: - - configure-grafana-alerting-pipeline tags: - how-to - alerting @@ -32,7 +30,7 @@ Provision via YAML in the alerting provisioning ConfigMap. The rule should: ### 3. Create the Runbook -Write `docs/how-to/alerts/runbook-service-probe-failure.md` as a how-to doc explaining: +Write `docs/how-to/runbooks/runbook-service-probe-failure.md` as a how-to doc explaining: - What the alert means - How to check which service is down - Common causes and resolution steps @@ -52,7 +50,7 @@ Write `docs/how-to/alerts/runbook-service-probe-failure.md` as a how-to doc expl - Grafana alert rules can be provisioned as YAML files alongside contact points and notification policies - The blackbox probe metrics from Alloy use the job name `blackbox` and include an `instance` label with the service name -- The runbook URL format: `https://docs.eblu.me/how-to/alerts/runbook-service-probe-failure` +- The runbook URL format: `https://docs.eblu.me/how-to/runbooks/runbook-service-probe-failure` ## Verification diff --git a/docs/how-to/alerts/port-services-check-alerts.md b/docs/how-to/runbooks/port-services-check-alerts.md similarity index 94% rename from docs/how-to/alerts/port-services-check-alerts.md rename to docs/how-to/runbooks/port-services-check-alerts.md index c2ea6ad..4420f58 
100644 --- a/docs/how-to/alerts/port-services-check-alerts.md +++ b/docs/how-to/runbooks/port-services-check-alerts.md @@ -1,8 +1,6 @@ --- title: Port services-check Alerts to Grafana modified: 2026-03-22 -requires: - - first-alert-and-runbook tags: - how-to - alerting @@ -43,7 +41,7 @@ For each check category, create provisioned Grafana alert rules. Group related c ### 4. Create Runbooks -One runbook per alert type in `docs/how-to/alerts/runbook-.md`. Each runbook should cover: +One runbook per alert type in `docs/how-to/runbooks/runbook-<alert-name>.md`. Each runbook should cover: - What the alert means - Diagnostic steps - Common fixes @@ -65,7 +63,7 @@ As each check is ported, remove it from the services-check script (or mark it as - [ ] All HTTP endpoint checks from services-check have corresponding alert rules - [ ] Pod health checks have corresponding alert rules - [ ] PostgreSQL health has a corresponding alert rule -- [ ] Each alert rule has a runbook doc in `docs/how-to/alerts/` +- [ ] Each alert rule has a runbook doc in `docs/how-to/runbooks/` - [ ] Test at least 2-3 failure scenarios end-to-end - [ ] services-check script has been updated to reflect ported checks diff --git a/docs/how-to/alerts/refactor-services-check-to-query-alerts.md b/docs/how-to/runbooks/refactor-services-check-to-query-alerts.md similarity index 97% rename from docs/how-to/alerts/refactor-services-check-to-query-alerts.md rename to docs/how-to/runbooks/refactor-services-check-to-query-alerts.md index 640bcff..244be1f 100644 --- a/docs/how-to/alerts/refactor-services-check-to-query-alerts.md +++ b/docs/how-to/runbooks/refactor-services-check-to-query-alerts.md @@ -1,9 +1,6 @@ --- title: Refactor services-check to Query Alerts modified: 2026-03-22 -status: active -requires: - - port-services-check-alerts tags: - how-to - alerting diff --git a/docs/how-to/alerts/runbook-argocd-out-of-sync.md b/docs/how-to/runbooks/runbook-argocd-out-of-sync.md similarity index 100% rename from 
docs/how-to/alerts/runbook-argocd-out-of-sync.md rename to docs/how-to/runbooks/runbook-argocd-out-of-sync.md diff --git a/docs/how-to/alerts/runbook-frigate-camera-down.md b/docs/how-to/runbooks/runbook-frigate-camera-down.md similarity index 100% rename from docs/how-to/alerts/runbook-frigate-camera-down.md rename to docs/how-to/runbooks/runbook-frigate-camera-down.md diff --git a/docs/how-to/alerts/runbook-pod-not-ready.md b/docs/how-to/runbooks/runbook-pod-not-ready.md similarity index 100% rename from docs/how-to/alerts/runbook-pod-not-ready.md rename to docs/how-to/runbooks/runbook-pod-not-ready.md diff --git a/docs/how-to/alerts/runbook-postgres-unhealthy.md b/docs/how-to/runbooks/runbook-postgres-unhealthy.md similarity index 100% rename from docs/how-to/alerts/runbook-postgres-unhealthy.md rename to docs/how-to/runbooks/runbook-postgres-unhealthy.md diff --git a/docs/how-to/alerts/runbook-service-probe-failure.md b/docs/how-to/runbooks/runbook-service-probe-failure.md similarity index 100% rename from docs/how-to/alerts/runbook-service-probe-failure.md rename to docs/how-to/runbooks/runbook-service-probe-failure.md diff --git a/docs/how-to/alerts/runbook-textfile-stale.md b/docs/how-to/runbooks/runbook-textfile-stale.md similarity index 100% rename from docs/how-to/alerts/runbook-textfile-stale.md rename to docs/how-to/runbooks/runbook-textfile-stale.md