From 549c57ab82127aaa1e6ee4b6db57f5e8bf504355 Mon Sep 17 00:00:00 2001
From: Erich Blume <blume.erich@gmail.com>
Date: Sun, 22 Mar 2026 10:57:23 -0700
Subject: [PATCH] C2(deploy-infra-alerting): impl add first alert rule and
 runbook

- Add ServiceProbeFailure alert rule to Grafana alerting provisioning
  - Queries probe_success metric from Alloy blackbox exporter
  - Extracts service name from job label via label_replace
  - Fires after 2 minutes of failure, noDataState=Alerting
  - Annotations include summary with service name and runbook URL
- Add runbook at docs/how-to/alerts/runbook-service-probe-failure.md
  - Covers all 5 probed services (miniflux, kiwix, transmission, devpi, argocd)
  - Diagnostic steps, common causes, silencing instructions
- Add alerting section to observability.md reference doc

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 argocd/manifests/grafana/alerting.yaml        | 49 ++++++++++++
 .../alerts/runbook-service-probe-failure.md   | 75 +++++++++++++++++++
 docs/reference/operations/observability.md    |  7 +-
 3 files changed, 130 insertions(+), 1 deletion(-)
 create mode 100644 docs/how-to/alerts/runbook-service-probe-failure.md

diff --git a/argocd/manifests/grafana/alerting.yaml b/argocd/manifests/grafana/alerting.yaml
index 3ac33b0..3fe4b1c 100644
--- a/argocd/manifests/grafana/alerting.yaml
+++ b/argocd/manifests/grafana/alerting.yaml
@@ -26,6 +26,55 @@ policies:
     group_interval: 12h
     repeat_interval: 24h
 
+# Alert rule groups provisioned into Grafana unified alerting.
+groups:
+  - orgId: 1
+    name: service-health
+    folder: Infrastructure Alerts
+    interval: 30s
+    rules:
+      # Fires once a blackbox probe has been failing for 2m; steps in runbook_url.
+      - uid: service-probe-failure
+        title: ServiceProbeFailure
+        condition: B
+        for: 2m
+        noDataState: Alerting
+        execErrState: Alerting
+        annotations:
+          summary: '{{ index $labels "service" }} health check is failing'
+          runbook_url: https://docs.eblu.me/how-to/alerts/runbook-service-probe-failure
+        labels:
+          severity: warning
+        data:
+          - refId: A
+            datasourceUid: prometheus
+            relativeTimeRange:
+              from: 300
+              to: 0
+            model:
+              expr: >-
+                label_replace(probe_success, "service",
+                "$1", "job", "integrations/blackbox/(.*)")
+              # Instant query: threshold B needs one value per series, not a range.
+              instant: true
+              range: false
+              interval: ""
+              refId: A
+          - refId: B
+            datasourceUid: "__expr__"
+            relativeTimeRange: {from: 0, to: 0}
+            model:
+              type: threshold
+              expression: A
+              conditions:
+                # probe_success is 1 on success, so any value below 1 is a failure.
+                - evaluator:
+                    type: lt
+                    params: [1]
+                  operator: {type: and}
+                  reducer: {type: last}
+              refId: B
+
 templates:
   - orgId: 1
     name: ntfy-infra
diff --git a/docs/how-to/alerts/runbook-service-probe-failure.md b/docs/how-to/alerts/runbook-service-probe-failure.md
new file mode 100644
index 0000000..575606e
--- /dev/null
+++ b/docs/how-to/alerts/runbook-service-probe-failure.md
@@ -0,0 +1,75 @@
+---
+title: "Runbook: Service Probe Failure"
+modified: 2026-03-22
+tags:
+  - how-to
+  - alerting
+  - runbook
+---
+
+# Runbook: Service Probe Failure
+
+**Alert name:** `ServiceProbeFailure`
+
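+A blackbox HTTP health check has failed for 2+ minutes, meaning a service is not responding to its health endpoint. Because the rule sets both `noDataState` and `execErrState` to `Alerting`, it also fires when the `probe_success` query returns no data or errors.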
+
+## Affected Services
+
+This alert covers services probed by the Alloy blackbox exporter on indri's minikube cluster:
+
+| Service | Health Endpoint |
+|---------|----------------|
+| miniflux | `/healthcheck` |
+| kiwix | `/` |
+| transmission | `/transmission/web/` |
+| devpi | `/+api` |
+| argocd | `/healthz` |
+
+The failing service is identified by the `service` label in the alert (extracted from the `job` label).
+
+## Diagnostic Steps
+
+1. **Identify the failing service** — read the `service` label on the alert, then list the pods in that service's namespace:
+   ```fish
+   kubectl get pods -n <namespace> --context=minikube-indri
+   ```
+
+2. **Check pod status** — look for CrashLoopBackOff, OOMKilled, or pending pods:
+   ```fish
+   kubectl describe pod -n <namespace> <pod-name> --context=minikube-indri
+   ```
+
+3. **Check pod logs**:
+   ```fish
+   kubectl logs -n <namespace> <pod-name> --context=minikube-indri --tail=50
+   ```
+
+4. **Check if minikube itself is healthy**:
+   ```fish
+   ssh indri 'minikube status'
+   ```
+
+5. **Check NFS mounts** (kiwix, transmission depend on sifaka NFS):
+   ```fish
+   ssh indri 'df -h | grep Volumes'
+   ```
+
+## Common Causes
+
+- **Pod crashed** — check logs, then restart it with `kubectl delete pod -n <namespace> <pod-name> --context=minikube-indri`
+- **NFS mount lost** — sifaka offline or AutoMounter not running. SSH to indri and check `/Volumes/`
+- **Resource exhaustion** — check `kubectl top pods -n <namespace>` for memory/CPU pressure
+- **Minikube paused/stopped** — `ssh indri 'minikube status'`, restart if needed
+
+## Silencing
+
+For planned maintenance, silence this alert in Grafana:
+1. Go to Alerting → Silences → Create Silence
+2. Match label `alertname = ServiceProbeFailure`
+3. Optionally match `service = <specific-service>` to silence only one
+4. Set duration for your maintenance window
+
+## Related
+
+- [[deploy-infra-alerting]] — Alerting pipeline overview
+- [[configure-grafana-alerting-pipeline]] — Pipeline configuration
diff --git a/docs/reference/operations/observability.md b/docs/reference/operations/observability.md
index 5890147..852f5d3 100644
--- a/docs/reference/operations/observability.md
+++ b/docs/reference/operations/observability.md
@@ -1,6 +1,6 @@
 ---
 title: Observability
-modified: 2026-02-07
+modified: 2026-03-22
 tags:
   - operations
 ---
@@ -16,3 +16,8 @@ Metrics, logs, traces, and dashboards for BlumeOps infrastructure.
 - [[tempo]] - Distributed tracing
 - [[alloy|Alloy]] - Metrics, log, and trace collection
 - [[grafana]] - Dashboards and visualization
+
+## Alerting
+
+- [[deploy-infra-alerting]] - Alerting pipeline (Grafana Unified Alerting → ntfy)
+- [[runbook-service-probe-failure]] - Service health check failure runbook