- Add ServiceProbeFailure alert rule to Grafana alerting provisioning
  - Queries probe_success metric from Alloy blackbox exporter
  - Extracts service name from job label via label_replace
  - Fires after 2 minutes of failure, noDataState=Alerting
  - Annotations include summary with service name and runbook URL
- Add runbook at docs/how-to/alerts/runbook-service-probe-failure.md
  - Covers all 5 probed services (miniflux, kiwix, transmission, devpi, argocd)
  - Diagnostic steps, common causes, silencing instructions
- Add alerting section to observability.md reference doc

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
91 lines
2.3 KiB
YAML
91 lines
2.3 KiB
YAML
apiVersion: 1

# Contact points: where alert notifications are delivered.
contactPoints:
  - name: ntfy-infra
    orgId: 1
    receivers:
      # Single webhook receiver that POSTs to the infra-alerts URL below.
      - type: webhook
        uid: ntfy-infra-webhook
        disableResolveMessage: false
        settings:
          httpMethod: POST
          url: https://ntfy.ops.eblu.me/infra-alerts
          # Title and message bodies are rendered by the named templates
          # "ntfy-infra.title" / "ntfy-infra.message" from the `templates`
          # section of this file.
          title: '{{ template "ntfy-infra.title" . }}'
          message: '{{ template "ntfy-infra.message" . }}'
          # Kept as a string, as in the original.
          # NOTE(review): presumably "0" means "no cap on alerts per
          # message" - confirm against the Grafana webhook settings docs.
          maxAlerts: '0'

# Notification policy: every alert is routed to the ntfy-infra contact point.
policies:
  - orgId: 1
    receiver: ntfy-infra
    # One notification group per (alertname, service) pair.
    group_by: [alertname, service]
    # Timing knobs for the group: initial wait, interval between updates
    # for an existing group, and re-send interval for unchanged alerts.
    group_wait: 1m
    group_interval: 12h
    repeat_interval: 24h

# Alert rule groups.
groups:
  - orgId: 1
    name: service-health
    folder: Infrastructure Alerts
    # Every rule in this group is evaluated on this interval.
    interval: 30s
    rules:
      # ServiceProbeFailure: fires when a probe_success series has been
      # below 1 for 2 minutes. The commit message for this file states the
      # metric comes from the Alloy blackbox exporter.
      - uid: service-probe-failure
        title: ServiceProbeFailure
        # refId of the data entry whose result decides the alert state.
        condition: B
        for: 2m
        # Missing data and evaluation errors are both treated as alerting.
        noDataState: Alerting
        execErrState: Alerting
        annotations:
          # "service" is the label added by label_replace in query A.
          summary: >-
            {{ index $labels "service" }} health check is failing
          runbook_url: https://docs.eblu.me/how-to/alerts/runbook-service-probe-failure
        labels:
          severity: warning
        data:
          # A: probe_success with a "service" label copied from the part of
          # the "job" label after "integrations/blackbox/".
          - refId: A
            datasourceUid: prometheus
            relativeTimeRange:
              from: 300
              to: 0
            model:
              expr: >-
                label_replace(probe_success, "service",
                "$1", "job", "integrations/blackbox/(.*)")
              # Run as an instant query so A yields one value per series.
              # The threshold expression in B does not reduce its input;
              # with neither flag set the query runs as a range query and
              # the rule errors ("only reduced data can be alerted on"),
              # which execErrState above would turn into a permanent alert.
              instant: true
              range: false
              interval: ""
              refId: A
          # B: threshold expression over A; true when the value is < 1.
          - refId: B
            datasourceUid: "__expr__"
            relativeTimeRange:
              from: 0
              to: 0
            model:
              type: threshold
              expression: A
              conditions:
                - evaluator:
                    type: lt
                    params:
                      - 1
                  operator:
                    type: and
                  reducer:
                    type: last
              refId: B

# Notification templates referenced by the ntfy-infra contact point.
templates:
  - orgId: 1
    name: ntfy-infra
    # Two named Go templates:
    #   "ntfy-infra.title"   -> "[STATUS] <alertname>"
    #   "ntfy-infra.message" -> for each alert, its summary annotation
    #                           followed by a "Runbook: <url>" line when the
    #                           runbook_url annotation is set.
    # Comments must stay outside the literal block: anything inside it
    # becomes part of the template text.
    template: |
      {{ define "ntfy-infra.title" -}}
      [{{ .Status | toUpper }}] {{ .CommonLabels.alertname }}
      {{- end }}

      {{ define "ntfy-infra.message" -}}
      {{ range .Alerts -}}
      {{ .Annotations.summary }}
      {{ if .Annotations.runbook_url }}Runbook: {{ .Annotations.runbook_url }}{{ end }}
      {{ end -}}
      {{- end }}