blumeops/argocd/manifests/grafana/alerting.yaml
Erich Blume 67883950c3 C2(deploy-infra-alerting): finalize rewrite cards as historical docs
Remove all Mikado frontmatter (status, branch, requires) from chain
cards. Rename docs/how-to/alerts/ to docs/how-to/runbooks/ and update
all runbook_url references. Add changelog fragment.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-22 14:40:21 -07:00

393 lines
10 KiB
YAML

# Grafana alerting provisioning (file schema version 1).
apiVersion: 1

# Contact point that delivers alert notifications to ntfy via a webhook.
contactPoints:
- orgId: 1
  name: ntfy-infra
  receivers:
  - uid: ntfy-infra-webhook
    type: webhook
    settings:
      url: https://ntfy.ops.eblu.me
      httpMethod: POST
      # Quoted on purpose so the value stays a string.
      maxAlerts: '0'
      payload:
        # The request body is rendered by the "ntfy-infra.payload"
        # notification template.
        template: >-
          {{ template "ntfy-infra.payload" . }}
    # NOTE(review): placed at receiver level (sibling of `settings`) as in
    # the Grafana provisioning schema; original nesting was lost - confirm.
    disableResolveMessage: false
# Notification policy: send alerts to the ntfy-infra contact point, batching
# them into one notification per (alertname, service) pair.
policies:
- orgId: 1
  receiver: ntfy-infra
  group_by: [alertname, service]
  # Timing knobs for a notification group: initial wait, interval between
  # updates, and interval between repeats.
  group_wait: 1m
  group_interval: 12h
  repeat_interval: 24h
# Alert rule groups. Every rule uses the same three-step pipeline:
#   A = Prometheus query, B = reduce A to its last value (dropNN mode),
#   C = threshold on B (C is the alert condition).
groups:
# Blackbox probe health, evaluated every 30s.
- orgId: 1
  name: service-health
  folder: Infrastructure Alerts
  interval: 30s
  rules:
  # Fires when probe_success stays below 1 for 2 minutes. Missing data and
  # query errors are both treated as Alerting.
  - uid: service-probe-failure
    title: ServiceProbeFailure
    condition: C
    for: 2m
    noDataState: Alerting
    execErrState: Alerting
    annotations:
      summary: >-
        {{ index $labels "service" }} health check is failing
      runbook_url: https://docs.eblu.me/how-to/runbooks/runbook-service-probe-failure
    labels:
      severity: warning
    data:
    # A: probe_success with a `service` label derived from the part of the
    # `job` label after "integrations/blackbox/".
    - refId: A
      datasourceUid: prometheus
      relativeTimeRange: {from: 300, to: 0}
      model:
        expr: >-
          label_replace(probe_success, "service",
          "$1", "job", "integrations/blackbox/(.*)")
        interval: ''
        refId: A
    # B: last value of A.
    - refId: B
      datasourceUid: '__expr__'
      relativeTimeRange: {from: 0, to: 0}
      model:
        type: reduce
        expression: A
        reducer: last
        settings:
          mode: dropNN
        refId: B
    # C: true when B < 1.
    - refId: C
      datasourceUid: '__expr__'
      relativeTimeRange: {from: 0, to: 0}
      model:
        type: threshold
        expression: B
        conditions:
        - evaluator:
            type: lt
            params: [1]
          operator:
            type: and
        refId: C
# Freshness of node_exporter textfile metrics, evaluated every 60s.
- orgId: 1
  name: textfile-freshness
  folder: Infrastructure Alerts
  interval: 60s
  rules:
  # Fires when a textfile's age (now - mtime) stays above 3600 seconds for
  # 15 minutes. Missing data and query errors are both treated as Alerting.
  - uid: textfile-stale
    title: TextfileStale
    condition: C
    for: 15m
    noDataState: Alerting
    execErrState: Alerting
    annotations:
      summary: >-
        Metrics textfile {{ index $labels "file" }} has not been updated in over 1 hour
      runbook_url: https://docs.eblu.me/how-to/runbooks/runbook-textfile-stale
    labels:
      severity: warning
      service: indri-metrics
    data:
    # A: age of each textfile in seconds.
    - refId: A
      datasourceUid: prometheus
      relativeTimeRange: {from: 300, to: 0}
      model:
        expr: >-
          time() - node_textfile_mtime_seconds
        interval: ''
        refId: A
    # B: last value of A.
    - refId: B
      datasourceUid: '__expr__'
      relativeTimeRange: {from: 0, to: 0}
      model:
        type: reduce
        expression: A
        reducer: last
        settings:
          mode: dropNN
        refId: B
    # C: true when B > 3600.
    - refId: C
      datasourceUid: '__expr__'
      relativeTimeRange: {from: 0, to: 0}
      model:
        type: threshold
        expression: B
        conditions:
        - evaluator:
            type: gt
            params: [3600]
          operator:
            type: and
        refId: C
# Frigate camera frame-rate check, evaluated every 60s.
- orgId: 1
  name: frigate-health
  folder: Infrastructure Alerts
  interval: 60s
  rules:
  # Fires when frigate_camera_fps stays below 1 for 5 minutes. Missing data
  # and query errors are both treated as Alerting.
  - uid: frigate-camera-down
    title: FrigateCameraDown
    condition: C
    for: 5m
    noDataState: Alerting
    execErrState: Alerting
    annotations:
      summary: >-
        Frigate camera {{ index $labels "camera_name" }} has 0 FPS
      runbook_url: https://docs.eblu.me/how-to/runbooks/runbook-frigate-camera-down
    labels:
      severity: warning
      service: frigate
    data:
    # A: raw per-camera FPS series.
    - refId: A
      datasourceUid: prometheus
      relativeTimeRange: {from: 300, to: 0}
      model:
        expr: frigate_camera_fps
        interval: ''
        refId: A
    # B: last value of A.
    - refId: B
      datasourceUid: '__expr__'
      relativeTimeRange: {from: 0, to: 0}
      model:
        type: reduce
        expression: A
        reducer: last
        settings:
          mode: dropNN
        refId: B
    # C: true when B < 1.
    - refId: C
      datasourceUid: '__expr__'
      relativeTimeRange: {from: 0, to: 0}
      model:
        type: threshold
        expression: B
        conditions:
        - evaluator:
            type: lt
            params: [1]
          operator:
            type: and
        refId: C
# PostgreSQL collector health, evaluated every 60s.
- orgId: 1
  name: database-health
  folder: Infrastructure Alerts
  interval: 60s
  rules:
  # Fires when cnpg_collector_up stays below 1 for 3 minutes. Missing data
  # and query errors are both treated as Alerting. This is the only rule in
  # the file labelled severity=critical.
  - uid: postgres-cluster-unhealthy
    title: PostgresClusterUnhealthy
    condition: C
    for: 3m
    noDataState: Alerting
    execErrState: Alerting
    annotations:
      summary: >-
        PostgreSQL cluster {{ index $labels "cluster" }} is unhealthy
      runbook_url: https://docs.eblu.me/how-to/runbooks/runbook-postgres-unhealthy
    labels:
      severity: critical
      service: postgresql
    data:
    # A: raw collector "up" series.
    - refId: A
      datasourceUid: prometheus
      relativeTimeRange: {from: 300, to: 0}
      model:
        expr: cnpg_collector_up
        interval: ''
        refId: A
    # B: last value of A.
    - refId: B
      datasourceUid: '__expr__'
      relativeTimeRange: {from: 0, to: 0}
      model:
        type: reduce
        expression: A
        reducer: last
        settings:
          mode: dropNN
        refId: B
    # C: true when B < 1.
    - refId: C
      datasourceUid: '__expr__'
      relativeTimeRange: {from: 0, to: 0}
      model:
        type: threshold
        expression: B
        conditions:
        - evaluator:
            type: lt
            params: [1]
          operator:
            type: and
        refId: C
# Kubernetes pod readiness, evaluated every 60s.
- orgId: 1
  name: pod-health
  folder: Infrastructure Alerts
  interval: 60s
  rules:
  # Fires when a pod has been not-ready for 5 minutes. The query only
  # returns series for not-ready pods, so an empty result maps to OK
  # (noDataState); query errors still alert.
  - uid: pod-not-ready
    title: PodNotReady
    condition: C
    for: 5m
    noDataState: OK
    execErrState: Alerting
    annotations:
      summary: >-
        Pod {{ index $labels "pod" }} in {{ index $labels "namespace" }} is not ready
      runbook_url: https://docs.eblu.me/how-to/runbooks/runbook-pod-not-ready
    labels:
      severity: warning
    data:
    # A: pods whose ready condition is 0, excluding pods owned by a Job.
    - refId: A
      datasourceUid: prometheus
      relativeTimeRange: {from: 300, to: 0}
      model:
        expr: >-
          kube_pod_status_ready{condition="true"} == 0
          unless on (namespace, pod)
          kube_pod_owner{owner_kind="Job"}
        interval: ''
        refId: A
    # B: last value of A.
    - refId: B
      datasourceUid: '__expr__'
      relativeTimeRange: {from: 0, to: 0}
      model:
        type: reduce
        expression: A
        reducer: last
        settings:
          mode: dropNN
        refId: B
    # C: true when B < 1 (the series returned by A carry the value 0).
    - refId: C
      datasourceUid: '__expr__'
      relativeTimeRange: {from: 0, to: 0}
      model:
        type: threshold
        expression: B
        conditions:
        - evaluator:
            type: lt
            params: [1]
          operator:
            type: and
        refId: C
# ArgoCD application sync state, evaluated every 60s.
- orgId: 1
  name: argocd-health
  folder: Infrastructure Alerts
  interval: 60s
  rules:
  # Fires when an application has reported a sync_status other than
  # "Synced" for 30 minutes. The query only returns non-synced apps, so an
  # empty result maps to OK (noDataState); query errors still alert.
  - uid: argocd-app-out-of-sync
    title: ArgoCDAppOutOfSync
    condition: C
    for: 30m
    noDataState: OK
    execErrState: Alerting
    annotations:
      summary: >-
        ArgoCD app {{ index $labels "name" }} is {{ index $labels "sync_status" }}
      runbook_url: https://docs.eblu.me/how-to/runbooks/runbook-argocd-out-of-sync
    labels:
      severity: warning
      service: argocd
    data:
    # A: argocd_app_info series whose sync_status is not "Synced".
    - refId: A
      datasourceUid: prometheus
      relativeTimeRange: {from: 300, to: 0}
      model:
        expr: >-
          argocd_app_info{sync_status!="Synced"}
        interval: ''
        refId: A
    # B: last value of A.
    - refId: B
      datasourceUid: '__expr__'
      relativeTimeRange: {from: 0, to: 0}
      model:
        type: reduce
        expression: A
        reducer: last
        settings:
          mode: dropNN
        refId: B
    # C: true when B > 0.
    - refId: C
      datasourceUid: '__expr__'
      relativeTimeRange: {from: 0, to: 0}
      model:
        type: threshold
        expression: B
        conditions:
        - evaluator:
            type: gt
            params: [0]
          operator:
            type: and
        refId: C
# Notification template that renders the JSON body posted to ntfy.
#
# "ntfy-infra.payload" produces an object with:
#   topic    - always "infra-alerts"
#   title    - "[<STATUS>] <alertname>", status upper-cased
#   message  - the `summary` annotation of every alert, one per line
#   priority - always 3
#   actions  - at most ONE "Open Runbook" view action
#
# Why only one action: ntfy accepts at most three action buttons per
# message. The previous version appended one action per alert, so a group
# of four or more firing alerts exceeded that limit. The notification
# policy in this file groups by alertname, and each rule carries a single
# static runbook_url, so the extra actions were duplicates anyway.
templates:
- orgId: 1
  name: ntfy-infra
  template: |
    {{ define "ntfy-infra.payload" -}}
      {{- $msg := "" -}}
      {{- range .Alerts -}}
        {{- $msg = (printf "%s%s\n" $msg .Annotations.summary) -}}
      {{- end -}}
      {{- $title := (printf "[%s] %s" (.Status | toUpper) .CommonLabels.alertname) -}}
      {{- $actions := coll.Slice -}}
      {{- range .Alerts -}}
        {{- /* Keep only the first runbook link; see the ntfy action limit. */ -}}
        {{- if and .Annotations.runbook_url (not $actions) -}}
          {{- $actions = coll.Append (coll.Dict "action" "view" "label" "Open Runbook" "url" .Annotations.runbook_url) $actions -}}
        {{- end -}}
      {{- end -}}
      {{- coll.Dict "topic" "infra-alerts" "title" $title "message" $msg "priority" 3 "actions" $actions | data.ToJSON -}}
    {{- end }}