diff --git a/argocd/manifests/alloy-k8s/config.alloy b/argocd/manifests/alloy-k8s/config.alloy index c169c93..667f735 100644 --- a/argocd/manifests/alloy-k8s/config.alloy +++ b/argocd/manifests/alloy-k8s/config.alloy @@ -169,6 +169,43 @@ prometheus.exporter.blackbox "services" { address = "http://argocd-server.argocd.svc.cluster.local:80/healthz" module = "http_2xx" } + + target { + name = "prometheus" + address = "http://prometheus.monitoring.svc.cluster.local:9090/-/healthy" + module = "http_2xx" + } + + target { + name = "loki" + address = "http://loki.monitoring.svc.cluster.local:3100/ready" + module = "http_2xx" + } + + target { + name = "grafana" + address = "http://grafana.monitoring.svc.cluster.local:3000/api/health" + module = "http_2xx" + } + + target { + name = "teslamate" + address = "http://teslamate.teslamate.svc.cluster.local:4000/" + module = "http_2xx" + } + + target { + name = "immich" + address = "http://immich-server.immich.svc.cluster.local:2283/api/server/ping" + module = "http_2xx" + } + + target { + name = "navidrome" + address = "http://navidrome.navidrome.svc.cluster.local:4533/" + module = "http_2xx" + } + } // Scrape blackbox probe results diff --git a/argocd/manifests/grafana/alerting.yaml b/argocd/manifests/grafana/alerting.yaml index a190039..47f2ec6 100644 --- a/argocd/manifests/grafana/alerting.yaml +++ b/argocd/manifests/grafana/alerting.yaml @@ -84,6 +84,120 @@ groups: type: and refId: C + - orgId: 1 + name: database-health + folder: Infrastructure Alerts + interval: 60s + rules: + - uid: postgres-cluster-unhealthy + title: PostgresClusterUnhealthy + condition: C + for: 3m + noDataState: Alerting + execErrState: Alerting + annotations: + summary: >- + PostgreSQL cluster {{ index $labels "cluster" }} is unhealthy + runbook_url: https://docs.eblu.me/how-to/alerts/runbook-postgres-unhealthy + labels: + severity: critical + service: postgresql + data: + - refId: A + datasourceUid: prometheus + relativeTimeRange: + from: 300 + to: 0 + 
model: + expr: cnpg_collector_up + interval: "" + refId: A + - refId: B + datasourceUid: "__expr__" + relativeTimeRange: + from: 0 + to: 0 + model: + type: reduce + expression: A + reducer: last + settings: + mode: dropNN + refId: B + - refId: C + datasourceUid: "__expr__" + relativeTimeRange: + from: 0 + to: 0 + model: + type: threshold + expression: B + conditions: + - evaluator: + type: lt + params: + - 1 + operator: + type: and + refId: C + + - orgId: 1 + name: pod-health + folder: Infrastructure Alerts + interval: 60s + rules: + - uid: pod-not-ready + title: PodNotReady + condition: C + for: 5m + noDataState: OK # the "== 0" filter returns no series when every pod is ready + execErrState: Alerting + annotations: + summary: >- + Pod {{ index $labels "pod" }} in {{ index $labels "namespace" }} is not ready + runbook_url: https://docs.eblu.me/how-to/alerts/runbook-pod-not-ready + labels: + severity: warning + data: + - refId: A + datasourceUid: prometheus + relativeTimeRange: + from: 300 + to: 0 + model: + expr: >- + kube_pod_status_ready{condition="true"} == 0 + interval: "" + refId: A + - refId: B + datasourceUid: "__expr__" + relativeTimeRange: + from: 0 + to: 0 + model: + type: reduce + expression: A + reducer: last + settings: + mode: dropNN + refId: B + - refId: C + datasourceUid: "__expr__" + relativeTimeRange: + from: 0 + to: 0 + model: + type: threshold + expression: B + conditions: + - evaluator: + type: lt + params: + - 1 + operator: + type: and + refId: C + templates: - orgId: 1 name: ntfy-infra diff --git a/docs/how-to/alerts/runbook-pod-not-ready.md b/docs/how-to/alerts/runbook-pod-not-ready.md new file mode 100644 index 0000000..49dd35e --- /dev/null +++ b/docs/how-to/alerts/runbook-pod-not-ready.md @@ -0,0 +1,55 @@ +--- +title: "Runbook: Pod Not Ready" +modified: 2026-03-22 +tags: + - how-to + - alerting + - runbook +--- + +# Runbook: Pod Not Ready + +**Alert name:** `PodNotReady` + +A Kubernetes pod has been in a not-ready state for 5+ minutes. + +## Diagnostic Steps + +1. 
**Identify the pod** from the alert labels (`pod`, `namespace`): + ```fish + kubectl describe pod <pod> -n <namespace> --context=minikube-indri + ``` + +2. **Check events** — look for scheduling failures, image pull errors, or probe failures: + ```fish + kubectl get events -n <namespace> --context=minikube-indri --sort-by='.lastTimestamp' | tail -20 + ``` + +3. **Check logs**: + ```fish + kubectl logs <pod> -n <namespace> --context=minikube-indri --tail=50 + ``` + +4. **Check node resources**: + ```fish + kubectl top nodes --context=minikube-indri + kubectl top pods -n <namespace> --context=minikube-indri + ``` + +## Common Causes + +- **CrashLoopBackOff** — app is crashing on startup, check logs +- **ImagePullBackOff** — container image not found or registry unreachable +- **Pending** — insufficient resources (CPU/memory), or PVC not bound +- **Readiness probe failing** — service is running but not healthy +- **NFS mount issue** — services depending on sifaka (kiwix, transmission, navidrome, jellyfin) will fail if NFS is down + +## Silencing + +1. Grafana → Alerting → Silences → Create Silence +2. Match `alertname = PodNotReady` +3. Optionally match `namespace = <namespace>` to silence a specific service + +## Related + +- [[deploy-infra-alerting]] — Alerting pipeline overview diff --git a/docs/how-to/alerts/runbook-postgres-unhealthy.md b/docs/how-to/alerts/runbook-postgres-unhealthy.md new file mode 100644 index 0000000..2910851 --- /dev/null +++ b/docs/how-to/alerts/runbook-postgres-unhealthy.md @@ -0,0 +1,63 @@ +--- +title: "Runbook: PostgreSQL Cluster Unhealthy" +modified: 2026-03-22 +tags: + - how-to + - alerting + - runbook +--- + +# Runbook: PostgreSQL Cluster Unhealthy + +**Alert name:** `PostgresClusterUnhealthy` + +The CNPG collector metrics endpoint is down, indicating the PostgreSQL cluster is not responding. 
+ +## Affected Services + +The `blumeops-pg` CNPG cluster on indri's minikube runs databases for: +- TeslaMate +- Authentik (cross-cluster from ringtail) +- Immich +- Grafana dashboards (TeslaMate datasource) + +## Diagnostic Steps + +1. **Check CNPG cluster status**: + ```fish + kubectl get cluster blumeops-pg -n databases --context=minikube-indri + kubectl get pods -n databases -l cnpg.io/cluster=blumeops-pg --context=minikube-indri + ``` + +2. **Check pod logs**: + ```fish + kubectl logs -n databases -l cnpg.io/cluster=blumeops-pg --context=minikube-indri --tail=30 + ``` + +3. **Check whether PostgreSQL is accepting connections** (`pg_isready`): + ```fish + pg_isready -h pg.ops.eblu.me -p 5432 + ``` + +4. **Check PVC storage**: + ```fish + kubectl get pvc -n databases --context=minikube-indri + ``` + +## Common Causes + +- **Pod crash** — OOM, disk full, or configuration error +- **PVC storage full** — check with `kubectl exec` into the pod and `df -h` +- **Minikube issue** — if the node is under memory pressure, CNPG pods may be evicted +- **Network** — Caddy L4 proxy (`pg.ops.eblu.me`) may be misconfigured + +## Silencing + +For planned database maintenance: +1. Grafana → Alerting → Silences → Create Silence +2. Match `alertname = PostgresClusterUnhealthy` + +## Related + +- [[postgresql]] — CNPG cluster reference +- [[deploy-infra-alerting]] — Alerting pipeline overview diff --git a/docs/reference/operations/observability.md b/docs/reference/operations/observability.md index 852f5d3..1aae7b9 100644 --- a/docs/reference/operations/observability.md +++ b/docs/reference/operations/observability.md @@ -21,3 +21,5 @@ Metrics, logs, traces, and dashboards for BlumeOps infrastructure. - [[deploy-infra-alerting]] - Alerting pipeline (Grafana Unified Alerting → ntfy) - [[runbook-service-probe-failure]] - Service health check failure runbook +- [[runbook-postgres-unhealthy]] - PostgreSQL cluster health runbook +- [[runbook-pod-not-ready]] - Pod not ready runbook