C2: Deploy infrastructure alerting pipeline (#303)
## Summary Mikado chain to replace `mise run services-check` with Grafana Unified Alerting backed by ntfy push notifications. **Design:** - Grafana Unified Alerting evaluates rules against Prometheus/Loki - ntfy webhook contact point delivers iOS notifications - Anti-noise policy: page once per 24h per alert group - Every alert links to a runbook in `docs/how-to/runbooks/` - services-check eventually queries the alerting API instead of doing its own probes **Chain (bottom-up):** 1. `configure-grafana-alerting-pipeline` — enable alerting, ntfy contact point, notification policy 2. `first-alert-and-runbook` — end-to-end proof of concept with blackbox probe failure 3. `port-services-check-alerts` — migrate all services-check probes to alert rules + runbooks 4. `refactor-services-check-to-query-alerts` — rewrite services-check to query Grafana API 5. `deploy-infra-alerting` — goal card 🤖 Generated with [Claude Code](https://claude.com/claude-code) Reviewed-on: #303
This commit is contained in:
parent
f1620abb17
commit
6d65e6928c
20 changed files with 1259 additions and 46 deletions
|
|
@ -169,6 +169,43 @@ prometheus.exporter.blackbox "services" {
|
|||
address = "http://argocd-server.argocd.svc.cluster.local:80/healthz"
|
||||
module = "http_2xx"
|
||||
}
|
||||
|
||||
target {
|
||||
name = "prometheus"
|
||||
address = "http://prometheus.monitoring.svc.cluster.local:9090/-/healthy"
|
||||
module = "http_2xx"
|
||||
}
|
||||
|
||||
target {
|
||||
name = "loki"
|
||||
address = "http://loki.monitoring.svc.cluster.local:3100/ready"
|
||||
module = "http_2xx"
|
||||
}
|
||||
|
||||
target {
|
||||
name = "grafana"
|
||||
address = "http://grafana.monitoring.svc.cluster.local:80/api/health"
|
||||
module = "http_2xx"
|
||||
}
|
||||
|
||||
target {
|
||||
name = "teslamate"
|
||||
address = "http://teslamate.teslamate.svc.cluster.local:4000/"
|
||||
module = "http_2xx"
|
||||
}
|
||||
|
||||
target {
|
||||
name = "immich"
|
||||
address = "http://immich-server.immich.svc.cluster.local:2283/api/server/ping"
|
||||
module = "http_2xx"
|
||||
}
|
||||
|
||||
target {
|
||||
name = "navidrome"
|
||||
address = "http://navidrome.navidrome.svc.cluster.local:4533/"
|
||||
module = "http_2xx"
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// Scrape blackbox probe results
|
||||
|
|
|
|||
393
argocd/manifests/grafana/alerting.yaml
Normal file
393
argocd/manifests/grafana/alerting.yaml
Normal file
|
|
@ -0,0 +1,393 @@
|
|||
apiVersion: 1
|
||||
|
||||
contactPoints:
|
||||
- orgId: 1
|
||||
name: ntfy-infra
|
||||
receivers:
|
||||
- uid: ntfy-infra-webhook
|
||||
type: webhook
|
||||
settings:
|
||||
url: https://ntfy.ops.eblu.me
|
||||
httpMethod: POST
|
||||
maxAlerts: "0"
|
||||
payload:
|
||||
template: >-
|
||||
{{ template "ntfy-infra.payload" . }}
|
||||
disableResolveMessage: false
|
||||
|
||||
policies:
|
||||
- orgId: 1
|
||||
receiver: ntfy-infra
|
||||
group_by:
|
||||
- alertname
|
||||
- service
|
||||
group_wait: 1m
|
||||
group_interval: 12h
|
||||
repeat_interval: 24h
|
||||
|
||||
groups:
|
||||
- orgId: 1
|
||||
name: service-health
|
||||
folder: Infrastructure Alerts
|
||||
interval: 30s
|
||||
rules:
|
||||
- uid: service-probe-failure
|
||||
title: ServiceProbeFailure
|
||||
condition: C
|
||||
for: 2m
|
||||
noDataState: Alerting
|
||||
execErrState: Alerting
|
||||
annotations:
|
||||
summary: >-
|
||||
{{ index $labels "service" }} health check is failing
|
||||
runbook_url: https://docs.eblu.me/how-to/runbooks/runbook-service-probe-failure
|
||||
labels:
|
||||
severity: warning
|
||||
data:
|
||||
- refId: A
|
||||
datasourceUid: prometheus
|
||||
relativeTimeRange:
|
||||
from: 300
|
||||
to: 0
|
||||
model:
|
||||
expr: >-
|
||||
label_replace(probe_success, "service",
|
||||
"$1", "job", "integrations/blackbox/(.*)")
|
||||
interval: ""
|
||||
refId: A
|
||||
- refId: B
|
||||
datasourceUid: "__expr__"
|
||||
relativeTimeRange:
|
||||
from: 0
|
||||
to: 0
|
||||
model:
|
||||
type: reduce
|
||||
expression: A
|
||||
reducer: last
|
||||
settings:
|
||||
mode: dropNN
|
||||
refId: B
|
||||
- refId: C
|
||||
datasourceUid: "__expr__"
|
||||
relativeTimeRange:
|
||||
from: 0
|
||||
to: 0
|
||||
model:
|
||||
type: threshold
|
||||
expression: B
|
||||
conditions:
|
||||
- evaluator:
|
||||
type: lt
|
||||
params:
|
||||
- 1
|
||||
operator:
|
||||
type: and
|
||||
refId: C
|
||||
|
||||
- orgId: 1
|
||||
name: textfile-freshness
|
||||
folder: Infrastructure Alerts
|
||||
interval: 60s
|
||||
rules:
|
||||
- uid: textfile-stale
|
||||
title: TextfileStale
|
||||
condition: C
|
||||
for: 15m
|
||||
noDataState: Alerting
|
||||
execErrState: Alerting
|
||||
annotations:
|
||||
summary: >-
|
||||
Metrics textfile {{ index $labels "file" }} has not been updated in over 1 hour
|
||||
runbook_url: https://docs.eblu.me/how-to/runbooks/runbook-textfile-stale
|
||||
labels:
|
||||
severity: warning
|
||||
service: indri-metrics
|
||||
data:
|
||||
- refId: A
|
||||
datasourceUid: prometheus
|
||||
relativeTimeRange:
|
||||
from: 300
|
||||
to: 0
|
||||
model:
|
||||
expr: >-
|
||||
time() - node_textfile_mtime_seconds
|
||||
interval: ""
|
||||
refId: A
|
||||
- refId: B
|
||||
datasourceUid: "__expr__"
|
||||
relativeTimeRange:
|
||||
from: 0
|
||||
to: 0
|
||||
model:
|
||||
type: reduce
|
||||
expression: A
|
||||
reducer: last
|
||||
settings:
|
||||
mode: dropNN
|
||||
refId: B
|
||||
- refId: C
|
||||
datasourceUid: "__expr__"
|
||||
relativeTimeRange:
|
||||
from: 0
|
||||
to: 0
|
||||
model:
|
||||
type: threshold
|
||||
expression: B
|
||||
conditions:
|
||||
- evaluator:
|
||||
type: gt
|
||||
params:
|
||||
- 3600
|
||||
operator:
|
||||
type: and
|
||||
refId: C
|
||||
|
||||
- orgId: 1
|
||||
name: frigate-health
|
||||
folder: Infrastructure Alerts
|
||||
interval: 60s
|
||||
rules:
|
||||
- uid: frigate-camera-down
|
||||
title: FrigateCameraDown
|
||||
condition: C
|
||||
for: 5m
|
||||
noDataState: Alerting
|
||||
execErrState: Alerting
|
||||
annotations:
|
||||
summary: >-
|
||||
Frigate camera {{ index $labels "camera_name" }} has 0 FPS
|
||||
runbook_url: https://docs.eblu.me/how-to/runbooks/runbook-frigate-camera-down
|
||||
labels:
|
||||
severity: warning
|
||||
service: frigate
|
||||
data:
|
||||
- refId: A
|
||||
datasourceUid: prometheus
|
||||
relativeTimeRange:
|
||||
from: 300
|
||||
to: 0
|
||||
model:
|
||||
expr: frigate_camera_fps
|
||||
interval: ""
|
||||
refId: A
|
||||
- refId: B
|
||||
datasourceUid: "__expr__"
|
||||
relativeTimeRange:
|
||||
from: 0
|
||||
to: 0
|
||||
model:
|
||||
type: reduce
|
||||
expression: A
|
||||
reducer: last
|
||||
settings:
|
||||
mode: dropNN
|
||||
refId: B
|
||||
- refId: C
|
||||
datasourceUid: "__expr__"
|
||||
relativeTimeRange:
|
||||
from: 0
|
||||
to: 0
|
||||
model:
|
||||
type: threshold
|
||||
expression: B
|
||||
conditions:
|
||||
- evaluator:
|
||||
type: lt
|
||||
params:
|
||||
- 1
|
||||
operator:
|
||||
type: and
|
||||
refId: C
|
||||
|
||||
- orgId: 1
|
||||
name: database-health
|
||||
folder: Infrastructure Alerts
|
||||
interval: 60s
|
||||
rules:
|
||||
- uid: postgres-cluster-unhealthy
|
||||
title: PostgresClusterUnhealthy
|
||||
condition: C
|
||||
for: 3m
|
||||
noDataState: Alerting
|
||||
execErrState: Alerting
|
||||
annotations:
|
||||
summary: >-
|
||||
PostgreSQL cluster {{ index $labels "cluster" }} is unhealthy
|
||||
runbook_url: https://docs.eblu.me/how-to/runbooks/runbook-postgres-unhealthy
|
||||
labels:
|
||||
severity: critical
|
||||
service: postgresql
|
||||
data:
|
||||
- refId: A
|
||||
datasourceUid: prometheus
|
||||
relativeTimeRange:
|
||||
from: 300
|
||||
to: 0
|
||||
model:
|
||||
expr: cnpg_collector_up
|
||||
interval: ""
|
||||
refId: A
|
||||
- refId: B
|
||||
datasourceUid: "__expr__"
|
||||
relativeTimeRange:
|
||||
from: 0
|
||||
to: 0
|
||||
model:
|
||||
type: reduce
|
||||
expression: A
|
||||
reducer: last
|
||||
settings:
|
||||
mode: dropNN
|
||||
refId: B
|
||||
- refId: C
|
||||
datasourceUid: "__expr__"
|
||||
relativeTimeRange:
|
||||
from: 0
|
||||
to: 0
|
||||
model:
|
||||
type: threshold
|
||||
expression: B
|
||||
conditions:
|
||||
- evaluator:
|
||||
type: lt
|
||||
params:
|
||||
- 1
|
||||
operator:
|
||||
type: and
|
||||
refId: C
|
||||
|
||||
- orgId: 1
|
||||
name: pod-health
|
||||
folder: Infrastructure Alerts
|
||||
interval: 60s
|
||||
rules:
|
||||
- uid: pod-not-ready
|
||||
title: PodNotReady
|
||||
condition: C
|
||||
for: 5m
|
||||
noDataState: OK
|
||||
execErrState: Alerting
|
||||
annotations:
|
||||
summary: >-
|
||||
Pod {{ index $labels "pod" }} in {{ index $labels "namespace" }} is not ready
|
||||
runbook_url: https://docs.eblu.me/how-to/runbooks/runbook-pod-not-ready
|
||||
labels:
|
||||
severity: warning
|
||||
data:
|
||||
- refId: A
|
||||
datasourceUid: prometheus
|
||||
relativeTimeRange:
|
||||
from: 300
|
||||
to: 0
|
||||
model:
|
||||
expr: >-
|
||||
kube_pod_status_ready{condition="true"} == 0
|
||||
unless on (namespace, pod)
|
||||
kube_pod_owner{owner_kind="Job"}
|
||||
interval: ""
|
||||
refId: A
|
||||
- refId: B
|
||||
datasourceUid: "__expr__"
|
||||
relativeTimeRange:
|
||||
from: 0
|
||||
to: 0
|
||||
model:
|
||||
type: reduce
|
||||
expression: A
|
||||
reducer: last
|
||||
settings:
|
||||
mode: dropNN
|
||||
refId: B
|
||||
- refId: C
|
||||
datasourceUid: "__expr__"
|
||||
relativeTimeRange:
|
||||
from: 0
|
||||
to: 0
|
||||
model:
|
||||
type: threshold
|
||||
expression: B
|
||||
conditions:
|
||||
- evaluator:
|
||||
type: lt
|
||||
params:
|
||||
- 1
|
||||
operator:
|
||||
type: and
|
||||
refId: C
|
||||
|
||||
- orgId: 1
|
||||
name: argocd-health
|
||||
folder: Infrastructure Alerts
|
||||
interval: 60s
|
||||
rules:
|
||||
- uid: argocd-app-out-of-sync
|
||||
title: ArgoCDAppOutOfSync
|
||||
condition: C
|
||||
for: 30m
|
||||
noDataState: OK
|
||||
execErrState: Alerting
|
||||
annotations:
|
||||
summary: >-
|
||||
ArgoCD app {{ index $labels "name" }} is {{ index $labels "sync_status" }}
|
||||
runbook_url: https://docs.eblu.me/how-to/runbooks/runbook-argocd-out-of-sync
|
||||
labels:
|
||||
severity: warning
|
||||
service: argocd
|
||||
data:
|
||||
- refId: A
|
||||
datasourceUid: prometheus
|
||||
relativeTimeRange:
|
||||
from: 300
|
||||
to: 0
|
||||
model:
|
||||
expr: >-
|
||||
argocd_app_info{sync_status!="Synced"}
|
||||
interval: ""
|
||||
refId: A
|
||||
- refId: B
|
||||
datasourceUid: "__expr__"
|
||||
relativeTimeRange:
|
||||
from: 0
|
||||
to: 0
|
||||
model:
|
||||
type: reduce
|
||||
expression: A
|
||||
reducer: last
|
||||
settings:
|
||||
mode: dropNN
|
||||
refId: B
|
||||
- refId: C
|
||||
datasourceUid: "__expr__"
|
||||
relativeTimeRange:
|
||||
from: 0
|
||||
to: 0
|
||||
model:
|
||||
type: threshold
|
||||
expression: B
|
||||
conditions:
|
||||
- evaluator:
|
||||
type: gt
|
||||
params:
|
||||
- 0
|
||||
operator:
|
||||
type: and
|
||||
refId: C
|
||||
|
||||
templates:
|
||||
- orgId: 1
|
||||
name: ntfy-infra
|
||||
template: |
|
||||
{{ define "ntfy-infra.payload" -}}
|
||||
{{- $msg := "" -}}
|
||||
{{- range .Alerts -}}
|
||||
{{- $msg = (printf "%s%s\n" $msg .Annotations.summary) -}}
|
||||
{{- end -}}
|
||||
{{- $title := (printf "[%s] %s" (.Status | toUpper) .CommonLabels.alertname) -}}
|
||||
{{- $actions := coll.Slice -}}
|
||||
{{- range .Alerts -}}
|
||||
{{- if .Annotations.runbook_url -}}
|
||||
{{- $actions = coll.Append (coll.Dict "action" "view" "label" "Open Runbook" "url" .Annotations.runbook_url) $actions -}}
|
||||
{{- end -}}
|
||||
{{- end -}}
|
||||
{{- coll.Dict "topic" "infra-alerts" "title" $title "message" $msg "priority" 3 "actions" $actions | data.ToJSON -}}
|
||||
{{- end }}
|
||||
|
|
@ -277,6 +277,9 @@ spec:
|
|||
- name: config
|
||||
mountPath: /etc/grafana/provisioning/datasources/datasources.yaml
|
||||
subPath: datasources.yaml
|
||||
- name: config
|
||||
mountPath: /etc/grafana/provisioning/alerting/alerting.yaml
|
||||
subPath: alerting.yaml
|
||||
- name: storage
|
||||
mountPath: /var/lib/grafana
|
||||
- name: sc-dashboard-volume
|
||||
|
|
|
|||
|
|
@ -30,3 +30,8 @@ allow_embedding = false
|
|||
|
||||
[server]
|
||||
root_url = https://grafana.ops.eblu.me
|
||||
|
||||
[unified_alerting]
|
||||
enabled = true
|
||||
evaluation_timeout = 30s
|
||||
min_interval = 10s
|
||||
|
|
|
|||
|
|
@ -25,6 +25,7 @@ configMapGenerator:
|
|||
files:
|
||||
- grafana.ini
|
||||
- datasources.yaml
|
||||
- alerting.yaml
|
||||
options:
|
||||
labels:
|
||||
app.kubernetes.io/name: grafana
|
||||
|
|
|
|||
|
|
@ -80,6 +80,14 @@ scrape_configs:
|
|||
- target_label: cluster
|
||||
replacement: indri
|
||||
|
||||
# ArgoCD application metrics
|
||||
- job_name: "argocd"
|
||||
static_configs:
|
||||
- targets: ["argocd-metrics.argocd.svc.cluster.local:8082"]
|
||||
metric_relabel_configs:
|
||||
- target_label: cluster
|
||||
replacement: indri
|
||||
|
||||
# Frigate NVR metrics (via Caddy on indri — Frigate runs on ringtail)
|
||||
- job_name: "frigate"
|
||||
scheme: https
|
||||
|
|
|
|||
1
docs/changelog.d/mikado-deploy-infra-alerting.feature.md
Normal file
1
docs/changelog.d/mikado-deploy-infra-alerting.feature.md
Normal file
|
|
@ -0,0 +1 @@
|
|||
Deploy infrastructure alerting pipeline using Grafana Unified Alerting with ntfy push notifications. 6 alert rules with runbooks covering service health, pod readiness, PostgreSQL, textfile freshness, Frigate cameras, and ArgoCD sync status. services-check now queries the alerting API for covered checks.
|
||||
59
docs/how-to/runbooks/configure-grafana-alerting-pipeline.md
Normal file
59
docs/how-to/runbooks/configure-grafana-alerting-pipeline.md
Normal file
|
|
@ -0,0 +1,59 @@
|
|||
---
|
||||
title: Configure Grafana Alerting Pipeline
|
||||
modified: 2026-03-22
|
||||
tags:
|
||||
- how-to
|
||||
- alerting
|
||||
- grafana
|
||||
---
|
||||
|
||||
# Configure Grafana Alerting Pipeline
|
||||
|
||||
Enable Grafana Unified Alerting, create an ntfy webhook contact point, configure the notification policy with anti-noise settings, and set up a message template with runbook links.
|
||||
|
||||
## What to Do
|
||||
|
||||
### 1. Enable Unified Alerting in grafana.ini
|
||||
|
||||
Add the `[unified_alerting]` section to the Grafana ConfigMap. Grafana 11+ has unified alerting enabled by default, but we should be explicit and configure the evaluation interval.
|
||||
|
||||
### 2. Create Alerting Provisioning Files
|
||||
|
||||
Grafana supports provisioning alert resources via YAML files in `/etc/grafana/provisioning/alerting/`. Create:
|
||||
|
||||
- **Contact point** — ntfy webhook targeting `ntfy.ops.eblu.me` via Caddy. The cluster-internal address `http://ntfy.ntfy.svc.cluster.local:80/infra-alerts` cannot be used because Grafana and ntfy are on different clusters.
|
||||
- **Notification policy** — root policy with `group_wait: 1m`, `group_interval: 12h`, `repeat_interval: 24h`, grouped by `alertname` and `service`
|
||||
- **Message template** — format that includes alert name, summary, and a clickable runbook URL as an ntfy action button
|
||||
|
||||
### 3. Mount Provisioning into Grafana
|
||||
|
||||
Add the alerting provisioning ConfigMap to the Grafana deployment, mounted at `/etc/grafana/provisioning/alerting/`.
|
||||
|
||||
### 4. Create the `infra-alerts` Topic
|
||||
|
||||
ntfy topics are created on first publish — no explicit setup needed. But verify that the topic works by sending a test notification.
|
||||
|
||||
### 5. Verify End-to-End
|
||||
|
||||
- Grafana UI shows the ntfy contact point under Alerting → Contact Points
|
||||
- Notification policy shows the anti-noise settings
|
||||
- Test notification from Grafana reaches the ntfy iOS app
|
||||
|
||||
## Key Details
|
||||
|
||||
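- Grafana runs on minikube (indri), ntfy runs on k3s (ringtail). The contact point URL must go through Caddy: `https://ntfy.ops.eblu.me` (the provisioned template publishes a JSON body whose `topic` field is `infra-alerts`, so the webhook posts to the root URL rather than to `/infra-alerts`)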
|
||||
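- ntfy action buttons can be set either with the `X-Actions` header (short format: `view, Open Runbook, <url>`) or with an `actions` array in the JSON body (objects with `action`, `label`, and `url` keys, which is what the provisioned template emits)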
|
||||
- Grafana provisioning files are applied on startup and cannot be edited from the UI (which is what we want for GitOps)
|
||||
|
||||
## Verification
|
||||
|
||||
- [ ] Grafana starts with unified alerting enabled
|
||||
- [ ] Contact point `ntfy-infra` visible in Grafana UI
|
||||
- [ ] Notification policy shows correct group/repeat intervals
|
||||
- [ ] Test notification arrives on iOS via ntfy app
|
||||
- [ ] Test notification includes a clickable runbook link
|
||||
|
||||
## Related
|
||||
|
||||
- [[deploy-infra-alerting]] — Parent goal
|
||||
- [[first-alert-and-runbook]] — Next: create the first real alert
|
||||
77
docs/how-to/runbooks/deploy-infra-alerting.md
Normal file
77
docs/how-to/runbooks/deploy-infra-alerting.md
Normal file
|
|
@ -0,0 +1,77 @@
|
|||
---
|
||||
title: Deploy Infrastructure Alerting Pipeline
|
||||
modified: 2026-03-22
|
||||
tags:
|
||||
- how-to
|
||||
- alerting
|
||||
- observability
|
||||
---
|
||||
|
||||
# Deploy Infrastructure Alerting Pipeline
|
||||
|
||||
Replace the manual `mise run services-check` approach with Grafana Unified Alerting backed by ntfy push notifications, so infrastructure problems page once and include actionable runbook links.
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
Prometheus (metrics) ──┐
|
||||
├──▶ Grafana Alert Rules ──▶ ntfy webhook ──▶ iOS push
|
||||
Loki (logs) ──────────┘ │
|
||||
│
|
||||
Notification Policy
|
||||
(group_wait: 1m,
|
||||
group_interval: 12h,
|
||||
repeat_interval: 24h)
|
||||
```
|
||||
|
||||
## Design Decisions
|
||||
|
||||
| Decision | Choice | Rationale |
|
||||
|----------|--------|-----------|
|
||||
| **Alert engine** | Grafana Unified Alerting | Already deployed, no new service needed |
|
||||
| **Notification** | ntfy webhook contact point | Already deployed on ringtail, iOS app works |
|
||||
| **Anti-noise** | 24h repeat interval | Page once per day max per alert group |
|
||||
| **Runbooks** | `docs/how-to/runbooks/<name>.md` | Clickable link in every notification |
|
||||
| **Provisioning** | Grafana provisioning YAML (GitOps) | Alerts defined in repo, not just UI |
|
||||
| **Topic** | `infra-alerts` (separate from `frigate-alerts`) | Different severity/audience |
|
||||
|
||||
## Alerting Policy
|
||||
|
||||
- Each alert fires **once** and does not re-notify for 24 hours
|
||||
- A "resolved" notification is sent when the condition clears
|
||||
- Every alert annotation includes `runbook_url` linking to its how-to doc
|
||||
- The ntfy message template renders the runbook URL as a clickable action button
|
||||
- Alerts are grouped by `alertname` and `service` to avoid notification storms
|
||||
|
||||
## Migration Path
|
||||
|
||||
1. Stand up the pipeline: Grafana alerting config, ntfy contact point, notification policy, message template
|
||||
2. Create the first alert + runbook as proof of concept (e.g., a blackbox probe failure)
|
||||
3. Port services-check health checks to Grafana alert rules, one by one, each with a runbook
|
||||
4. Refactor services-check to query the Grafana alerting API instead of doing its own probes
|
||||
|
||||
## What services-check Covers Today
|
||||
|
||||
These checks will be migrated to alert rules:
|
||||
|
||||
| Category | Checks | Data Source |
|
||||
|----------|--------|-------------|
|
||||
| Local services (indri) | forgejo, alloy, borgmatic, zot via brew/launchctl | Need new probes or textfile metrics |
|
||||
| Metrics textfiles | freshness of `.prom` files | Existing node_textfile metrics |
|
||||
| K8s cluster health | minikube API, k3s API | kube-state-metrics |
|
||||
| HTTP endpoints | ~12 services via Caddy | Alloy blackbox exporter (already exists) |
|
||||
| Ringtail | SSH, tailscale, k3s health | Need new probes |
|
||||
| K3s pods | ntfy, authentik, frigate, etc. | kube-state-metrics on ringtail |
|
||||
| Public services | docs, cv, forge via Fly.io | Alloy on Fly.io or external probe |
|
||||
| PostgreSQL | CNPG readiness | CNPG metrics (already scraped) |
|
||||
| ArgoCD sync | app sync/health status | ArgoCD metrics or API |
|
||||
|
||||
## Related
|
||||
|
||||
- [[configure-grafana-alerting-pipeline]] — Foundation: contact point, policy, template
|
||||
- [[first-alert-and-runbook]] — Proof of concept alert
|
||||
- [[port-services-check-alerts]] — Systematic migration
|
||||
- [[refactor-services-check-to-query-alerts]] — Final integration
|
||||
- [[observability]] — Current observability stack
|
||||
- [[ntfy]] — Push notification service
|
||||
- [[grafana]] — Dashboard and alerting platform
|
||||
68
docs/how-to/runbooks/first-alert-and-runbook.md
Normal file
68
docs/how-to/runbooks/first-alert-and-runbook.md
Normal file
|
|
@ -0,0 +1,68 @@
|
|||
---
|
||||
title: First Alert and Runbook
|
||||
modified: 2026-03-22
|
||||
tags:
|
||||
- how-to
|
||||
- alerting
|
||||
---
|
||||
|
||||
# First Alert and Runbook
|
||||
|
||||
Create one end-to-end alert as proof of concept — an alert rule that fires, delivers a notification to ntfy with a runbook link, and has a corresponding runbook doc.
|
||||
|
||||
## What to Do
|
||||
|
||||
### 1. Choose the First Alert
|
||||
|
||||
The best candidate is a **blackbox probe failure** because:
|
||||
- Alloy's blackbox exporter already probes 5 services (miniflux, kiwix, transmission, devpi, argocd) at 30s intervals
|
||||
- The metric `probe_success` is already in Prometheus
|
||||
- It maps directly to what services-check does (HTTP health checks)
|
||||
- A single alert rule with a `service` label can cover all probed services
|
||||
|
||||
### 2. Create the Alert Rule
|
||||
|
||||
Provision via YAML in the alerting provisioning ConfigMap. The rule should:
|
||||
- Query `probe_success == 0` from Prometheus
|
||||
- Fire after the condition persists for 2 minutes (avoid flapping)
|
||||
- Include labels: `severity: warning`, `service: {{ $labels.instance }}`
|
||||
- Include annotations: `summary`, `runbook_url` pointing to the runbook doc
|
||||
|
||||
### 3. Create the Runbook
|
||||
|
||||
Write `docs/how-to/runbooks/runbook-service-probe-failure.md` as a how-to doc explaining:
|
||||
- What the alert means
|
||||
- How to check which service is down
|
||||
- Common causes and resolution steps
|
||||
- How to silence the alert if the downtime is planned
|
||||
|
||||
### 4. Verify End-to-End
|
||||
|
||||
- Stop one of the probed services (e.g., scale miniflux to 0)
|
||||
- Wait for the alert to fire (~2 minutes)
|
||||
- Confirm ntfy notification arrives with correct summary and runbook link
|
||||
- Click the runbook link and verify it reaches docs.eblu.me
|
||||
- Scale the service back up
|
||||
- Confirm "resolved" notification arrives
|
||||
- Confirm no repeat notification during the 24h window
|
||||
|
||||
## Key Details
|
||||
|
||||
- Grafana alert rules can be provisioned as YAML files alongside contact points and notification policies
|
||||
- The provisioned alert rule expects the blackbox probe metrics from Alloy to carry job names of the form `integrations/blackbox/<target>`; it extracts the target name into a `service` label with `label_replace`
|
||||
- The runbook URL format: `https://docs.eblu.me/how-to/runbooks/runbook-service-probe-failure`
|
||||
|
||||
## Verification
|
||||
|
||||
- [ ] Alert rule appears in Grafana UI under Alerting → Alert Rules
|
||||
- [ ] Simulated failure triggers ntfy notification within ~3 minutes
|
||||
- [ ] Notification includes service name, summary, and clickable runbook link
|
||||
- [ ] Resolution triggers a "resolved" notification
|
||||
- [ ] No repeat notification within 24h window
|
||||
|
||||
## Related
|
||||
|
||||
- [[configure-grafana-alerting-pipeline]] — Prerequisite: pipeline must be working
|
||||
- [[deploy-infra-alerting]] — Parent goal
|
||||
- [[port-services-check-alerts]] — Next: port remaining checks
|
||||
- [[runbook-service-probe-failure]] — The runbook created for this alert
|
||||
74
docs/how-to/runbooks/port-services-check-alerts.md
Normal file
74
docs/how-to/runbooks/port-services-check-alerts.md
Normal file
|
|
@ -0,0 +1,74 @@
|
|||
---
|
||||
title: Port services-check Alerts to Grafana
|
||||
modified: 2026-03-22
|
||||
tags:
|
||||
- how-to
|
||||
- alerting
|
||||
---
|
||||
|
||||
# Port services-check Alerts to Grafana
|
||||
|
||||
Systematically migrate the health checks from `mise run services-check` to Grafana alert rules, each with a corresponding runbook. After this card, the alerting system covers everything services-check does today.
|
||||
|
||||
## What to Do
|
||||
|
||||
### 1. Inventory and Prioritize
|
||||
|
||||
Map each services-check probe to a data source and alert rule. Some checks already have metrics in Prometheus; others need new instrumentation.
|
||||
|
||||
**Already have metrics (easy):**
|
||||
- HTTP endpoint probes → Alloy blackbox exporter (`probe_success`)
|
||||
- PostgreSQL health → CNPG metrics (`cnpg_pg_replication_streaming`, `cnpg_collector_up`)
|
||||
- K8s pod health → kube-state-metrics (`kube_pod_status_phase`)
|
||||
- ArgoCD sync status → ArgoCD metrics (`argocd_app_info` with sync/health labels)
|
||||
|
||||
**Need new probes or metrics:**
|
||||
- Local indri services (forgejo, alloy, borgmatic, zot via brew/launchctl) → Alloy host textfile or new probes
|
||||
- Metrics textfile freshness → `node_textfile_mtime_seconds` (already collected by Alloy on indri)
|
||||
- Ringtail SSH/tailscale health → Alloy blackbox on ringtail or cross-cluster probe
|
||||
- Public services (docs, cv, forge via Fly.io) → Alloy on Fly.io or Grafana synthetic monitoring
|
||||
|
||||
### 2. Add Missing Probes
|
||||
|
||||
Extend Alloy configurations where needed:
|
||||
- **Alloy on indri:** Add blackbox targets for forgejo, zot (local HTTP endpoints)
|
||||
- **Alloy on ringtail:** Add blackbox targets for ringtail-local services
|
||||
- **Consider:** Whether public endpoint probing belongs in Fly.io Alloy or a separate prober
|
||||
|
||||
### 3. Create Alert Rules
|
||||
|
||||
For each check category, create provisioned Grafana alert rules. Group related checks into alert rule groups (e.g., "indri-services", "k8s-health", "public-endpoints").
|
||||
|
||||
### 4. Create Runbooks
|
||||
|
||||
One runbook per alert type in `docs/how-to/runbooks/runbook-<name>.md`. Each runbook should cover:
|
||||
- What the alert means
|
||||
- Diagnostic steps
|
||||
- Common fixes
|
||||
- How to silence for planned maintenance
|
||||
|
||||
### 5. Remove from services-check
|
||||
|
||||
As each check is ported, remove it from the services-check script (or mark it as "now handled by alerting"). The goal is that services-check shrinks as alerting grows.
|
||||
|
||||
## Key Details
|
||||
|
||||
- Don't try to port everything in one session — this card may span multiple work cycles within the C2 chain
|
||||
- Prioritize checks that have caught real problems in the past
|
||||
- Some checks (like ArgoCD sync status table) may remain in services-check as a human-readable summary even after alerting covers the failure cases
|
||||
- The Alloy blackbox exporter on k8s already covers 5 services; extending it to more is straightforward
|
||||
|
||||
## Verification
|
||||
|
||||
- [ ] All HTTP endpoint checks from services-check have corresponding alert rules
|
||||
- [ ] Pod health checks have corresponding alert rules
|
||||
- [ ] PostgreSQL health has a corresponding alert rule
|
||||
- [ ] Each alert rule has a runbook doc in `docs/how-to/runbooks/`
|
||||
- [ ] Test at least 2-3 failure scenarios end-to-end
|
||||
- [ ] services-check script has been updated to reflect ported checks
|
||||
|
||||
## Related
|
||||
|
||||
- [[first-alert-and-runbook]] — Prerequisite: established the pattern
|
||||
- [[deploy-infra-alerting]] — Parent goal
|
||||
- [[refactor-services-check-to-query-alerts]] — Next: make services-check query alerts
|
||||
|
|
@ -0,0 +1,53 @@
|
|||
---
|
||||
title: Refactor services-check to Query Alerts
|
||||
modified: 2026-03-22
|
||||
tags:
|
||||
- how-to
|
||||
- alerting
|
||||
---
|
||||
|
||||
# Refactor services-check to Query Alerts
|
||||
|
||||
Change `mise run services-check` from doing its own health probes to querying the Grafana alerting API for currently firing alerts. The script becomes a CLI view into the same alerting system that sends ntfy notifications.
|
||||
|
||||
## What to Do
|
||||
|
||||
### 1. Query the Grafana Alerting API
|
||||
|
||||
Grafana exposes alert state via:
|
||||
- `GET /api/v1/provisioning/alert-rules` — all configured rules
|
||||
- `GET /api/prometheus/grafana/api/v1/alerts` — currently firing alerts (Prometheus-compatible format)
|
||||
|
||||
The second endpoint is simpler — it returns only active alerts with labels and annotations, similar to Alertmanager's `/api/v1/alerts`.
|
||||
|
||||
### 2. Rewrite services-check
|
||||
|
||||
The new services-check should:
|
||||
1. Query the Grafana alerting API for firing alerts
|
||||
2. Display them in a table with service name, alert name, duration, and runbook link
|
||||
3. If no alerts are firing, print a green "all clear" message
|
||||
4. Exit 0 if no alerts, exit 1 if any are firing
|
||||
5. Optionally keep a few checks that don't map to alerting (e.g., the ArgoCD sync status table as a summary view)
|
||||
|
||||
### 3. Handle Authentication
|
||||
|
||||
services-check will need a Grafana API token or service account token. Options:
|
||||
- Use the existing Grafana admin credentials from 1Password (`op read`)
|
||||
- Create a dedicated read-only service account in Grafana
|
||||
|
||||
### 4. Preserve the ArgoCD Summary
|
||||
|
||||
The ArgoCD sync/health table in services-check is a useful quick view even when nothing is alerting. Consider keeping it as a separate section that always displays, independent of the alert query.
|
||||
|
||||
## Verification
|
||||
|
||||
- [ ] `mise run services-check` queries Grafana instead of doing direct probes
|
||||
- [ ] Firing alerts are displayed with service name, alert name, and runbook link
|
||||
- [ ] Exit code reflects alert state (0 = clear, 1 = firing)
|
||||
- [ ] Works when Grafana is unreachable (graceful error, not a crash)
|
||||
- [ ] ArgoCD summary table still works
|
||||
|
||||
## Related
|
||||
|
||||
- [[port-services-check-alerts]] — Prerequisite: alerts must exist to query
|
||||
- [[deploy-infra-alerting]] — Parent goal
|
||||
65
docs/how-to/runbooks/runbook-argocd-out-of-sync.md
Normal file
65
docs/how-to/runbooks/runbook-argocd-out-of-sync.md
Normal file
|
|
@ -0,0 +1,65 @@
|
|||
---
|
||||
title: "Runbook: ArgoCD App Out of Sync"
|
||||
modified: 2026-03-22
|
||||
tags:
|
||||
- how-to
|
||||
- alerting
|
||||
- runbook
|
||||
---
|
||||
|
||||
# Runbook: ArgoCD App Out of Sync
|
||||
|
||||
**Alert name:** `ArgoCDAppOutOfSync`
|
||||
|
||||
An ArgoCD application has been out of sync for 30+ minutes. This means the live state in Kubernetes differs from what's declared in Git.
|
||||
|
||||
## Diagnostic Steps
|
||||
|
||||
1. **Check which app is out of sync** — the `name` label in the alert tells you:
|
||||
```fish
|
||||
argocd app get <app-name>
|
||||
```
|
||||
|
||||
2. **View the diff**:
|
||||
```fish
|
||||
argocd app diff <app-name>
|
||||
```
|
||||
|
||||
3. **Check if it's a branch revision issue** — during C1/C2 work, apps may be pointed at a feature branch. After merge, they need to be reset to main:
|
||||
```fish
|
||||
argocd app get <app-name> -o json | python3 -c "import json,sys; print(json.load(sys.stdin)['spec']['source']['targetRevision'])"
|
||||
```
|
||||
|
||||
4. **Check ArgoCD UI** — https://argocd.ops.eblu.me — look for sync errors or degraded status.
|
||||
|
||||
## Common Causes
|
||||
|
||||
- **Forgot to sync after push** — ArgoCD uses manual sync; changes require explicit `argocd app sync`
|
||||
- **Branch revision not reset after PR merge** — app still points at a deleted branch
|
||||
- **Kustomize/manifest error** — invalid YAML or unsatisfiable resource requirements
|
||||
- **Pruning needed** — old ConfigMaps from `configMapGenerator` need pruning
|
||||
|
||||
## Resolution
|
||||
|
||||
```fish
|
||||
# Simple sync
|
||||
argocd app sync <app-name>
|
||||
|
||||
# If pruning is needed
|
||||
argocd app sync <app-name> --prune
|
||||
|
||||
# If stuck on a deleted branch
|
||||
argocd app set <app-name> --revision main
|
||||
argocd app sync <app-name>
|
||||
```
|
||||
|
||||
## Silencing
|
||||
|
||||
During active C1/C2 development, apps may intentionally be out of sync:
|
||||
1. Grafana → Alerting → Silences → Create Silence
|
||||
2. Match `alertname = ArgoCDAppOutOfSync` and `name = <app-name>`
|
||||
|
||||
## Related
|
||||
|
||||
- [[argocd]] — ArgoCD reference
|
||||
- [[deploy-infra-alerting]] — Alerting pipeline overview
|
||||
39
docs/how-to/runbooks/runbook-frigate-camera-down.md
Normal file
39
docs/how-to/runbooks/runbook-frigate-camera-down.md
Normal file
|
|
@ -0,0 +1,39 @@
|
|||
---
|
||||
title: "Runbook: Frigate Camera Down"
|
||||
modified: 2026-03-22
|
||||
tags:
|
||||
- how-to
|
||||
- alerting
|
||||
- runbook
|
||||
---
|
||||
|
||||
# Runbook: Frigate Camera Down
|
||||
|
||||
**Alert name:** `FrigateCameraDown`
|
||||
|
||||
A Frigate camera has reported 0 FPS for 5+ minutes, meaning the camera feed is not being received.
|
||||
|
||||
## Diagnostic Steps
|
||||
|
||||
1. **Check Frigate UI** — https://nvr.ops.eblu.me — look at the camera thumbnail and status
|
||||
2. **Check Frigate API stats**:
|
||||
```fish
|
||||
curl -s https://nvr.ops.eblu.me/api/stats | python3 -m json.tool
|
||||
```
|
||||
3. **Check Frigate pod logs** on ringtail:
|
||||
```fish
|
||||
kubectl logs -n frigate -l app=frigate --context=k3s-ringtail --tail=30
|
||||
```
|
||||
4. **Check the camera itself** — verify it's powered on and network-connected. Try accessing the RTSP stream directly.
|
||||
|
||||
## Common Causes
|
||||
|
||||
- **Camera offline** — power outage, network issue, or camera crash
|
||||
- **NFS mount lost** — Frigate's storage lives on sifaka; if the NFS mount drops, recording stops and FPS may fall to 0
|
||||
- **Frigate pod restart** — during restart, camera FPS briefly drops to 0
|
||||
- **RTSP stream timeout** — camera firmware issue; power cycle the camera
|
||||
|
||||
## Related
|
||||
|
||||
- [[frigate]] — Frigate NVR reference
|
||||
- [[deploy-infra-alerting]] — Alerting pipeline overview
|
||||
55
docs/how-to/runbooks/runbook-pod-not-ready.md
Normal file
55
docs/how-to/runbooks/runbook-pod-not-ready.md
Normal file
|
|
@ -0,0 +1,55 @@
|
|||
---
|
||||
title: "Runbook: Pod Not Ready"
|
||||
modified: 2026-03-22
|
||||
tags:
|
||||
- how-to
|
||||
- alerting
|
||||
- runbook
|
||||
---
|
||||
|
||||
# Runbook: Pod Not Ready
|
||||
|
||||
**Alert name:** `PodNotReady`
|
||||
|
||||
A Kubernetes pod has been in a not-ready state for 5+ minutes.
|
||||
|
||||
## Diagnostic Steps
|
||||
|
||||
1. **Identify the pod** from the alert labels (`pod`, `namespace`):
|
||||
```fish
|
||||
kubectl describe pod <pod> -n <namespace> --context=minikube-indri
|
||||
```
|
||||
|
||||
2. **Check events** — look for scheduling failures, image pull errors, or probe failures:
|
||||
```fish
|
||||
kubectl get events -n <namespace> --context=minikube-indri --sort-by='.lastTimestamp' | tail -20
|
||||
```
|
||||
|
||||
3. **Check logs**:
|
||||
```fish
|
||||
kubectl logs <pod> -n <namespace> --context=minikube-indri --tail=50
|
||||
```
|
||||
|
||||
4. **Check node resources**:
|
||||
```fish
|
||||
kubectl top nodes --context=minikube-indri
|
||||
kubectl top pods -n <namespace> --context=minikube-indri
|
||||
```
|
||||
|
||||
## Common Causes
|
||||
|
||||
- **CrashLoopBackOff** — app is crashing on startup, check logs
|
||||
- **ImagePullBackOff** — container image not found or registry unreachable
|
||||
- **Pending** — insufficient resources (CPU/memory), or PVC not bound
|
||||
- **Readiness probe failing** — service is running but not healthy
|
||||
- **NFS mount issue** — services depending on sifaka (kiwix, transmission, navidrome, jellyfin) will fail if NFS is down
|
||||
|
||||
## Silencing
|
||||
|
||||
1. Grafana → Alerting → Silences → Create Silence
|
||||
2. Match `alertname = PodNotReady`
|
||||
3. Optionally match `namespace = <namespace>` to silence a specific service
|
||||
|
||||
## Related
|
||||
|
||||
- [[deploy-infra-alerting]] — Alerting pipeline overview
|
||||
63
docs/how-to/runbooks/runbook-postgres-unhealthy.md
Normal file
63
docs/how-to/runbooks/runbook-postgres-unhealthy.md
Normal file
|
|
@ -0,0 +1,63 @@
|
|||
---
|
||||
title: "Runbook: PostgreSQL Cluster Unhealthy"
|
||||
modified: 2026-03-22
|
||||
tags:
|
||||
- how-to
|
||||
- alerting
|
||||
- runbook
|
||||
---
|
||||
|
||||
# Runbook: PostgreSQL Cluster Unhealthy
|
||||
|
||||
**Alert name:** `PostgresClusterUnhealthy`
|
||||
|
||||
The CNPG collector metrics endpoint is down, indicating the PostgreSQL cluster is not responding.
|
||||
|
||||
## Affected Services
|
||||
|
||||
The `blumeops-pg` CNPG cluster on indri's minikube runs databases for:
|
||||
- TeslaMate
|
||||
- Authentik (cross-cluster from ringtail)
|
||||
- Immich
|
||||
- Grafana dashboards (TeslaMate datasource)
|
||||
|
||||
## Diagnostic Steps
|
||||
|
||||
1. **Check CNPG cluster status**:
|
||||
```fish
|
||||
kubectl get cluster blumeops-pg -n databases --context=minikube-indri
|
||||
kubectl get pods -n databases -l cnpg.io/cluster=blumeops-pg --context=minikube-indri
|
||||
```
|
||||
|
||||
2. **Check pod logs**:
|
||||
```fish
|
||||
kubectl logs -n databases -l cnpg.io/cluster=blumeops-pg --context=minikube-indri --tail=30
|
||||
```
|
||||
|
||||
3. **Check whether PostgreSQL is accepting connections** (`pg_isready`):
|
||||
```fish
|
||||
pg_isready -h pg.ops.eblu.me -p 5432
|
||||
```
|
||||
|
||||
4. **Check PVC storage**:
|
||||
```fish
|
||||
kubectl get pvc -n databases --context=minikube-indri
|
||||
```
|
||||
|
||||
## Common Causes
|
||||
|
||||
- **Pod crash** — OOM, disk full, or configuration error
|
||||
- **PVC storage full** — run `df -h` inside the pod (via `kubectl exec`) to check
|
||||
- **Minikube issue** — if the node is under memory pressure, CNPG pods may be evicted
|
||||
- **Network** — Caddy L4 proxy (`pg.ops.eblu.me`) may be misconfigured
|
||||
|
||||
## Silencing
|
||||
|
||||
For planned database maintenance:
|
||||
1. Grafana → Alerting → Silences → Create Silence
|
||||
2. Match `alertname = PostgresClusterUnhealthy`
|
||||
|
||||
## Related
|
||||
|
||||
- [[postgresql]] — CNPG cluster reference
|
||||
- [[deploy-infra-alerting]] — Alerting pipeline overview
|
||||
75
docs/how-to/runbooks/runbook-service-probe-failure.md
Normal file
75
docs/how-to/runbooks/runbook-service-probe-failure.md
Normal file
|
|
@ -0,0 +1,75 @@
|
|||
---
|
||||
title: "Runbook: Service Probe Failure"
|
||||
modified: 2026-03-22
|
||||
tags:
|
||||
- how-to
|
||||
- alerting
|
||||
- runbook
|
||||
---
|
||||
|
||||
# Runbook: Service Probe Failure
|
||||
|
||||
**Alert name:** `ServiceProbeFailure`
|
||||
|
||||
A blackbox HTTP health check has failed for 2+ minutes, meaning a service is not responding to its health endpoint.
|
||||
|
||||
## Affected Services
|
||||
|
||||
This alert covers services probed by the Alloy blackbox exporter on indri's minikube cluster:
|
||||
|
||||
| Service | Health Endpoint |
|
||||
|---------|----------------|
|
||||
| miniflux | `/healthcheck` |
|
||||
| kiwix | `/` |
|
||||
| transmission | `/transmission/web/` |
|
||||
| devpi | `/+api` |
|
||||
| argocd | `/healthz` |
|
||||
|
||||
The failing service is identified by the `service` label in the alert (extracted from the `job` label).
|
||||
|
||||
## Diagnostic Steps
|
||||
|
||||
1. **Check which service is down** — the alert label `service` tells you. You can also run:
|
||||
```fish
|
||||
kubectl get pods -n <namespace> --context=minikube-indri
|
||||
```
|
||||
|
||||
2. **Check pod status** — look for CrashLoopBackOff, OOMKilled, or pending pods:
|
||||
```fish
|
||||
kubectl describe pod -n <namespace> <pod-name> --context=minikube-indri
|
||||
```
|
||||
|
||||
3. **Check pod logs**:
|
||||
```fish
|
||||
kubectl logs -n <namespace> <pod-name> --context=minikube-indri --tail=50
|
||||
```
|
||||
|
||||
4. **Check if minikube itself is healthy**:
|
||||
```fish
|
||||
ssh indri 'minikube status'
|
||||
```
|
||||
|
||||
5. **Check NFS mounts** (kiwix, transmission depend on sifaka NFS):
|
||||
```fish
|
||||
ssh indri 'df -h | grep Volumes'
|
||||
```
|
||||
|
||||
## Common Causes
|
||||
|
||||
- **Pod crashed** — check logs, restart with `kubectl delete pod`
|
||||
- **NFS mount lost** — sifaka offline or AutoMounter not running. SSH to indri and check `/Volumes/`
|
||||
- **Resource exhaustion** — check `kubectl top pods -n <namespace>` for memory/CPU pressure
|
||||
- **Minikube paused/stopped** — `ssh indri 'minikube status'`, restart if needed
|
||||
|
||||
## Silencing
|
||||
|
||||
For planned maintenance, silence this alert in Grafana:
|
||||
1. Go to Alerting → Silences → Create Silence
|
||||
2. Match label `alertname = ServiceProbeFailure`
|
||||
3. Optionally match `service = <specific-service>` to silence only one
|
||||
4. Set duration for your maintenance window
|
||||
|
||||
## Related
|
||||
|
||||
- [[deploy-infra-alerting]] — Alerting pipeline overview
|
||||
- [[configure-grafana-alerting-pipeline]] — Pipeline configuration
|
||||
58
docs/how-to/runbooks/runbook-textfile-stale.md
Normal file
58
docs/how-to/runbooks/runbook-textfile-stale.md
Normal file
|
|
@ -0,0 +1,58 @@
|
|||
---
|
||||
title: "Runbook: Textfile Stale"
|
||||
modified: 2026-03-22
|
||||
tags:
|
||||
- how-to
|
||||
- alerting
|
||||
- runbook
|
||||
---
|
||||
|
||||
# Runbook: Textfile Stale
|
||||
|
||||
**Alert name:** `TextfileStale`
|
||||
|
||||
A Prometheus textfile collector `.prom` file on indri has not been updated for over 1 hour, indicating the metrics exporter script has stopped running.
|
||||
|
||||
## Affected Textfiles
|
||||
|
||||
| File | LaunchAgent | What it monitors |
|
||||
|------|-------------|------------------|
|
||||
| `borgmatic.prom` | `mcquack.eblume.borgmatic` | Backup status |
|
||||
| `zot.prom` | `mcquack.eblume.zot` | Container registry |
|
||||
| `minikube.prom` | `mcquack.minikube-metrics` | Minikube cluster status |
|
||||
| `jellyfin.prom` | `mcquack.eblume.jellyfin-metrics` | Media server |
|
||||
|
||||
## Diagnostic Steps
|
||||
|
||||
1. **Check which file is stale** — the `file` label in the alert tells you. Verify on indri:
|
||||
```fish
|
||||
ssh indri 'ls -la /opt/homebrew/var/node_exporter/textfile/'
|
||||
```
|
||||
|
||||
2. **Check if the LaunchAgent is running**:
|
||||
```fish
|
||||
ssh indri 'launchctl list | grep mcquack'
|
||||
```
|
||||
|
||||
3. **Check LaunchAgent logs** (plist defines stdout/stderr paths):
|
||||
```fish
|
||||
ssh indri 'cat ~/Library/Logs/mcquack/<agent-name>.log'
|
||||
```
|
||||
|
||||
4. **Try running the exporter manually** — print the agent's plist, then run its `ProgramArguments` by hand:
|
||||
```fish
|
||||
ssh indri 'cat ~/Library/LaunchAgents/mcquack.<agent>.plist'
|
||||
# Find the ProgramArguments, run them manually
|
||||
```
|
||||
|
||||
## Common Causes
|
||||
|
||||
- **LaunchAgent not loaded** — `launchctl load ~/Library/LaunchAgents/mcquack.<agent>.plist`
|
||||
- **Script error** — the exporter script crashed; check logs
|
||||
- **Permissions** — the textfile directory is not writable
|
||||
- **Indri reboot** — some LaunchAgents may not auto-start
|
||||
|
||||
## Related
|
||||
|
||||
- [[alloy]] — Collects textfile metrics via `prometheus.exporter.unix`
|
||||
- [[deploy-infra-alerting]] — Alerting pipeline overview
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
---
|
||||
title: Observability
|
||||
modified: 2026-02-07
|
||||
modified: 2026-03-22
|
||||
tags:
|
||||
- operations
|
||||
---
|
||||
|
|
@ -16,3 +16,13 @@ Metrics, logs, traces, and dashboards for BlumeOps infrastructure.
|
|||
- [[tempo]] - Distributed tracing
|
||||
- [[alloy|Alloy]] - Metrics, log, and trace collection
|
||||
- [[grafana]] - Dashboards and visualization
|
||||
|
||||
## Alerting
|
||||
|
||||
- [[deploy-infra-alerting]] - Alerting pipeline (Grafana Unified Alerting → ntfy)
|
||||
- [[runbook-service-probe-failure]] - Service health check failure runbook
|
||||
- [[runbook-postgres-unhealthy]] - PostgreSQL cluster health runbook
|
||||
- [[runbook-pod-not-ready]] - Pod not ready runbook
|
||||
- [[runbook-textfile-stale]] - Metrics textfile freshness runbook
|
||||
- [[runbook-frigate-camera-down]] - Frigate camera health runbook
|
||||
- [[runbook-argocd-out-of-sync]] - ArgoCD sync status runbook
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@ set -euo pipefail
|
|||
# Colors for output
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
YELLOW='\033[0;33m'
|
||||
NC='\033[0m' # No Color
|
||||
|
||||
FAILED=0
|
||||
|
|
@ -36,11 +37,88 @@ check_http() {
|
|||
fi
|
||||
}
|
||||
|
||||
# ============== Grafana Alerting API ==============
|
||||
|
||||
GRAFANA_URL="https://grafana.ops.eblu.me"
|
||||
GRAFANA_CREDS=""
|
||||
|
||||
# Print the raw JSON returned by Grafana's Prometheus-compatible alerts
# endpoint ($GRAFANA_URL/api/prometheus/grafana/api/v1/alerts) on stdout.
#
# Credentials: reuses $GRAFANA_CREDS (base64 of "admin:<password>") when it is
# already set; otherwise fetches the password with `op read`.
#
# Output is empty (a bare newline) when no credentials could be obtained or
# when the request fails, so callers can treat empty output as "no data".
fetch_alerts() {
    if [ -z "$GRAFANA_CREDS" ]; then
        local pass
        # Best effort: a missing or locked `op` must not abort the whole check.
        pass=$(op read 'op://vg6xf6vvfmoh5hqjjhlhbeoaie/oxkcr3xtxnewy7noep2izvyr6y/password' 2>/dev/null) || true
        if [ -n "$pass" ]; then
            # printf rather than `echo -n` (not portable across shells), and
            # strip newlines: GNU base64 wraps its output at 76 columns, which
            # would split the Authorization header for long passwords.
            GRAFANA_CREDS=$(printf '%s' "admin:$pass" | base64 | tr -d '\n')
        fi
    fi

    if [ -z "$GRAFANA_CREDS" ]; then
        echo ""
        return
    fi

    # -f makes HTTP errors a non-zero exit, which is folded into empty output.
    curl -sf --max-time 10 \
        -H "Authorization: Basic $GRAFANA_CREDS" \
        "$GRAFANA_URL/api/prometheus/grafana/api/v1/alerts" 2>/dev/null || echo ""
}
|
||||
|
||||
# Fetch all alerts once
|
||||
ALERTS_JSON=$(fetch_alerts)
|
||||
|
||||
# Report whether a Grafana alert is currently firing, using the JSON cached in
# $ALERTS_JSON.
#
# Arguments:
#   $1  display name printed in the status column
#   $2  value of the `alertname` label to look for
#   $3  optional label key to filter on
#   $4  optional value the label named by $3 must have
#
# Prints one status line: OK, FIRING (followed by the alert's summary and
# runbook URL when present), or NO DATA when $ALERTS_JSON is empty or cannot
# be parsed. Sets the global FAILED=1 when a matching alert is in the
# "Alerting" or "Pending" state.
check_alert() {
    local name="$1"
    local alertname="$2"
    # Optional: filter by a label key=value
    local filter_key="${3:-}"
    local filter_value="${4:-}"

    printf "%-24s " "$name..."

    if [ -z "$ALERTS_JSON" ]; then
        echo -e "${YELLOW}NO DATA${NC} (can't reach Grafana alerting API)"
        return
    fi

    # The alert name and filter are handed to Python through argv instead of
    # being spliced into the program text, so quotes or backslashes in them
    # cannot break (or inject into) the Python source.
    # Running the pipeline inside `if !` keeps a parse failure from aborting
    # the script under `set -e`, and from being misreported as OK.
    local firing
    if ! firing=$(printf '%s' "$ALERTS_JSON" | python3 -c '
import json, sys

alertname, filter_key, filter_value = sys.argv[1:4]
try:
    data = json.load(sys.stdin)
    alerts = data.get("data", {}).get("alerts", [])
except (ValueError, AttributeError):
    sys.exit(1)
for a in alerts:
    labels = a.get("labels", {})
    if labels.get("alertname") != alertname:
        continue
    if filter_key and labels.get(filter_key) != filter_value:
        continue
    if a.get("state") in ("Alerting", "Pending"):
        notes = a.get("annotations", {})
        # One alert per output line: flatten newlines inside the summary.
        summary = notes.get("summary", "").replace("\n", " ")
        print("{}|{}".format(summary, notes.get("runbook_url", "")))
' "$alertname" "$filter_key" "$filter_value" 2>/dev/null); then
        echo -e "${YELLOW}NO DATA${NC} (unparseable response from Grafana alerting API)"
        return
    fi

    if [ -z "$firing" ]; then
        echo -e "${GREEN}OK${NC}"
        return
    fi

    # Only the first matching alert is shown. Split on the LAST "|" so that a
    # summary containing "|" stays intact (a URL cannot contain a raw "|").
    local first summary runbook
    first=${firing%%$'\n'*}
    summary=${first%|*}
    runbook=${first##*|}

    echo -e "${RED}FIRING${NC}"
    # printf '%s' prints alert text verbatim; echo -e would interpret
    # backslash sequences that happen to appear in it.
    if [ -n "$summary" ]; then
        printf ' %s\n' "$summary"
    fi
    if [ -n "$runbook" ]; then
        printf ' Runbook: %s\n' "$runbook"
    fi
    FAILED=1
}
|
||||
|
||||
echo "Checking services..."
|
||||
echo "===================="
|
||||
echo ""
|
||||
|
||||
# Local services on indri
|
||||
# Local services on indri (not yet covered by alerting)
|
||||
echo "Local services on indri:"
|
||||
check_service "forgejo (brew)" "ssh indri 'brew services list | grep forgejo | grep started'"
|
||||
check_service "alloy" "ssh indri 'launchctl list mcquack.eblume.alloy | grep -v \"^-\"'"
|
||||
|
|
@ -52,43 +130,47 @@ check_service "minikube-metrics" "ssh indri 'launchctl list mcquack.minikube-met
|
|||
check_service "jellyfin-metrics" "ssh indri 'launchctl list mcquack.eblume.jellyfin-metrics | grep -v \"^-\"'"
|
||||
|
||||
echo ""
|
||||
echo "Metrics textfiles:"
|
||||
check_service "borgmatic.prom" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/borgmatic.prom'"
|
||||
check_service "zot.prom" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/zot.prom'"
|
||||
check_service "minikube.prom" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/minikube.prom'"
|
||||
check_service "jellyfin.prom" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/jellyfin.prom'"
|
||||
echo "Metrics textfiles (via alerting):"
|
||||
check_alert "textfile-freshness" "TextfileStale"
|
||||
|
||||
echo ""
|
||||
echo "Kubernetes cluster:"
|
||||
echo "Kubernetes cluster (not yet covered by alerting):"
|
||||
check_service "minikube" "ssh indri 'minikube status --format={{.Host}} | grep -q Running'"
|
||||
check_service "k8s-apiserver (indri)" "ssh indri 'kubectl get --raw /healthz'"
|
||||
check_service "k8s-apiserver (remote)" "kubectl --kubeconfig=$HOME/.kube/minikube-indri/config.yml --context=minikube-indri get --raw /healthz"
|
||||
|
||||
echo ""
|
||||
echo "HTTP endpoints (via Caddy):"
|
||||
check_http "Prometheus" "https://prometheus.ops.eblu.me/-/healthy"
|
||||
check_http "Loki" "https://loki.ops.eblu.me/ready"
|
||||
check_http "Grafana" "https://grafana.ops.eblu.me/api/health"
|
||||
check_http "ArgoCD" "https://argocd.ops.eblu.me/healthz"
|
||||
echo "HTTP endpoints (via alerting):"
|
||||
check_alert "Prometheus" "ServiceProbeFailure" "service" "prometheus"
|
||||
check_alert "Loki" "ServiceProbeFailure" "service" "loki"
|
||||
check_alert "Grafana" "ServiceProbeFailure" "service" "grafana"
|
||||
check_alert "ArgoCD" "ServiceProbeFailure" "service" "argocd"
|
||||
check_alert "Kiwix" "ServiceProbeFailure" "service" "kiwix"
|
||||
check_alert "Miniflux" "ServiceProbeFailure" "service" "miniflux"
|
||||
check_alert "TeslaMate" "ServiceProbeFailure" "service" "teslamate"
|
||||
check_alert "Devpi" "ServiceProbeFailure" "service" "devpi"
|
||||
check_alert "Transmission" "ServiceProbeFailure" "service" "transmission"
|
||||
check_alert "Immich" "ServiceProbeFailure" "service" "immich"
|
||||
check_alert "Navidrome" "ServiceProbeFailure" "service" "navidrome"
|
||||
|
||||
echo ""
|
||||
echo "HTTP endpoints (not yet covered by alerting):"
|
||||
check_http "Forgejo" "https://forge.eblu.me/"
|
||||
check_http "Zot Registry" "https://registry.ops.eblu.me/v2/_catalog"
|
||||
check_http "Kiwix" "https://kiwix.ops.eblu.me/"
|
||||
check_http "Miniflux" "https://feed.ops.eblu.me/healthcheck"
|
||||
check_http "TeslaMate" "https://tesla.ops.eblu.me/"
|
||||
check_http "Devpi" "https://pypi.ops.eblu.me/+api"
|
||||
check_http "Transmission" "https://torrent.ops.eblu.me/"
|
||||
check_http "Immich" "https://photos.ops.eblu.me/"
|
||||
check_http "Navidrome" "https://dj.ops.eblu.me/"
|
||||
check_http "CV" "https://cv.ops.eblu.me/"
|
||||
check_http "Ntfy" "https://ntfy.ops.eblu.me/v1/health"
|
||||
check_http "Authentik" "https://authentik.ops.eblu.me/-/health/live/"
|
||||
check_http "Frigate" "https://nvr.ops.eblu.me/api/version"
|
||||
check_service "frigate-camera-fps" "curl -sf --max-time 5 https://nvr.ops.eblu.me/api/stats | jq -e '.cameras | to_entries | all(.value.camera_fps > 0)'"
|
||||
check_service "frigate-storage" "curl -sf --max-time 5 https://nvr.ops.eblu.me/api/stats | jq -e '.service.storage | to_entries | map(select(.key | startswith(\"/media\"))) | length > 0 and all(.[]; .value.free > 0)'"
|
||||
check_http "JobSync" "https://jobsync.ops.eblu.me/"
|
||||
|
||||
echo ""
|
||||
echo "Ringtail (NixOS):"
|
||||
echo "Frigate (via alerting):"
|
||||
check_alert "camera-fps" "FrigateCameraDown"
|
||||
echo "Frigate (not yet covered by alerting):"
|
||||
check_service "frigate-storage" "curl -sf --max-time 5 https://nvr.ops.eblu.me/api/stats | jq -e '.service.storage | to_entries | map(select(.key | startswith(\"/media\"))) | length > 0 and all(.[]; .value.free > 0)'"
|
||||
|
||||
echo ""
|
||||
echo "Ringtail (not yet covered by alerting):"
|
||||
check_service "ssh" "ssh -o ConnectTimeout=5 ringtail true"
|
||||
check_service "tailscale" "ssh ringtail 'tailscale status --self --json' | jq -e '.Self.Online' > /dev/null"
|
||||
check_service "k3s" "ssh ringtail 'KUBECONFIG=/etc/rancher/k3s/k3s.yaml k3s kubectl get nodes --no-headers | grep -q Ready'"
|
||||
|
|
@ -96,43 +178,30 @@ check_service "k3s-apiserver (remote)" "kubectl --context=k3s-ringtail get --raw
|
|||
check_service "forgejo-runner" "ssh ringtail 'systemctl is-active gitea-runner-nix_container_builder.service'"
|
||||
|
||||
echo ""
|
||||
echo "Ringtail k3s pods:"
|
||||
check_service "ntfy" "kubectl --context=k3s-ringtail -n ntfy get pods -l app=ntfy -o jsonpath='{.items[0].status.phase}' | grep -q Running"
|
||||
check_service "authentik" "kubectl --context=k3s-ringtail -n authentik get pods -l component=server -o jsonpath='{.items[0].status.phase}' | grep -q Running"
|
||||
check_service "frigate" "kubectl --context=k3s-ringtail -n frigate get pods -l app=frigate -o jsonpath='{.items[0].status.phase}' | grep -q Running"
|
||||
check_service "frigate-notify" "kubectl --context=k3s-ringtail -n frigate get pods -l app=frigate-notify -o jsonpath='{.items[0].status.phase}' | grep -q Running"
|
||||
check_service "nvidia-device-plugin" "kubectl --context=k3s-ringtail -n nvidia-device-plugin get pods -l app=nvidia-device-plugin -o jsonpath='{.items[0].status.phase}' | grep -q Running"
|
||||
check_service "jobsync" "kubectl --context=k3s-ringtail -n jobsync get pods -l app=jobsync -o jsonpath='{.items[0].status.phase}' | grep -q Running"
|
||||
echo "Pod health (via alerting):"
|
||||
check_alert "pod-readiness" "PodNotReady"
|
||||
|
||||
echo ""
|
||||
echo "Public services (via Fly.io):"
|
||||
echo "Database (via alerting):"
|
||||
check_alert "PostgreSQL" "PostgresClusterUnhealthy"
|
||||
|
||||
echo ""
|
||||
echo "Public services (not yet covered by alerting):"
|
||||
check_http "Docs (public)" "https://docs.eblu.me/"
|
||||
check_http "CV (public)" "https://cv.eblu.me/"
|
||||
check_http "Forge (public)" "https://forge.eblu.me/"
|
||||
check_http "Fly.io healthz" "https://blumeops-proxy.fly.dev/healthz"
|
||||
|
||||
echo ""
|
||||
echo "Database:"
|
||||
check_service "PostgreSQL (k8s)" "pg_isready -h pg.ops.eblu.me -p 5432"
|
||||
|
||||
echo ""
|
||||
echo "Indri minikube pods:"
|
||||
check_service "prometheus-0" "kubectl --context=minikube-indri -n monitoring get pod prometheus-0 -o jsonpath='{.status.phase}' | grep -q Running"
|
||||
check_service "loki-0" "kubectl --context=minikube-indri -n monitoring get pod loki-0 -o jsonpath='{.status.phase}' | grep -q Running"
|
||||
check_service "grafana" "kubectl --context=minikube-indri -n monitoring get pods -l app.kubernetes.io/name=grafana -o jsonpath='{.items[0].status.phase}' | grep -q Running"
|
||||
check_service "miniflux" "kubectl --context=minikube-indri -n miniflux get pods -l app=miniflux -o jsonpath='{.items[0].status.phase}' | grep -q Running"
|
||||
check_service "teslamate" "kubectl --context=minikube-indri -n teslamate get pods -l app=teslamate -o jsonpath='{.items[0].status.phase}' | grep -q Running"
|
||||
check_service "blumeops-pg" "kubectl --context=minikube-indri -n databases get pods -l cnpg.io/cluster=blumeops-pg -o jsonpath='{.items[0].status.phase}' | grep -q Running"
|
||||
|
||||
echo ""
|
||||
echo "ArgoCD app sync status:"
|
||||
echo "ArgoCD app sync status (via alerting):"
|
||||
check_alert "argocd-sync" "ArgoCDAppOutOfSync"
|
||||
# Keep the detailed table as a summary view
|
||||
printf "%-20s %-12s %-12s %s\n" "NAME" "SYNC" "HEALTH" "TARGET"
|
||||
while read -r name sync health target; do
|
||||
if [[ "$sync" == "Synced" ]]; then
|
||||
printf "%-20s ${GREEN}%-12s${NC} %-12s %s\n" "$name" "$sync" "$health" "$target"
|
||||
elif [[ "$sync" == "OutOfSync" ]]; then
|
||||
printf "%-20s ${RED}%-12s${NC} %-12s %s\n" "$name" "$sync" "$health" "$target"
|
||||
FAILED=1
|
||||
else
|
||||
printf "%-20s %-12s %-12s %s\n" "$name" "$sync" "$health" "$target"
|
||||
fi
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue