Remove all Mikado frontmatter (status, branch, requires) from chain cards. Rename docs/how-to/alerts/ to docs/how-to/runbooks/ and update all runbook_url references. Add changelog fragment. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
393 lines
10 KiB
YAML
393 lines
10 KiB
YAML
apiVersion: 1

# Contact point "ntfy-infra": a single webhook receiver that POSTs alert
# notifications to the ntfy server. The request body is rendered by the
# "ntfy-infra.payload" notification template referenced below.
contactPoints:
- orgId: 1
  name: ntfy-infra
  receivers:
  - uid: ntfy-infra-webhook
    type: webhook
    settings:
      url: https://ntfy.ops.eblu.me
      httpMethod: POST
      # NOTE(review): presumably "0" means no cap on alerts per message -
      # confirm against the Grafana webhook notifier documentation.
      maxAlerts: "0"
      payload:
        template: >-
          {{ template "ntfy-infra.payload" . }}
    # NOTE(review): nesting level reconstructed from the provisioning schema
    # (receiver-level key, sibling of `settings`) - verify against the repo.
    disableResolveMessage: false

# Root notification policy: everything is routed to the ntfy-infra contact
# point, grouped by the alertname and service labels.
policies:
- orgId: 1
  receiver: ntfy-infra
  group_by:
  - alertname
  - service
  group_wait: 1m
  # NOTE(review): with a 12h group_interval, changes to an already-notified
  # group (new or resolved alerts) may wait up to 12h - confirm this is
  # intentional.
  group_interval: 12h
  repeat_interval: 24h

groups:
# service-health: evaluated every 30s; fires when a probe's last
# probe_success value is below 1 for 2 minutes.
- orgId: 1
  name: service-health
  folder: Infrastructure Alerts
  interval: 30s
  rules:
  - uid: service-probe-failure
    title: ServiceProbeFailure
    condition: C
    for: 2m
    # Missing data and evaluation errors are both treated as alerting.
    noDataState: Alerting
    execErrState: Alerting
    annotations:
      summary: >-
        {{ index $labels "service" }} health check is failing
      runbook_url: https://docs.eblu.me/how-to/runbooks/runbook-service-probe-failure
    labels:
      severity: warning
    data:
    # A: probe_success, with the part of the job label after
    # "integrations/blackbox/" copied into a "service" label.
    - refId: A
      datasourceUid: prometheus
      relativeTimeRange:
        from: 300
        to: 0
      model:
        expr: >-
          label_replace(probe_success, "service",
          "$1", "job", "integrations/blackbox/(.*)")
        interval: ""
        refId: A
    # B: reduce each series from A to its last value (mode dropNN).
    - refId: B
      datasourceUid: "__expr__"
      relativeTimeRange:
        from: 0
        to: 0
      model:
        type: reduce
        expression: A
        reducer: last
        settings:
          mode: dropNN
        refId: B
    # C: alert condition - B < 1.
    - refId: C
      datasourceUid: "__expr__"
      relativeTimeRange:
        from: 0
        to: 0
      model:
        type: threshold
        expression: B
        conditions:
        - evaluator:
            type: lt
            params:
            - 1
          operator:
            type: and
        refId: C

# textfile-freshness: evaluated every 60s; fires when a metrics textfile's
# age (time() minus its mtime) stays above 3600 seconds for 15 minutes.
- orgId: 1
  name: textfile-freshness
  folder: Infrastructure Alerts
  interval: 60s
  rules:
  - uid: textfile-stale
    title: TextfileStale
    condition: C
    for: 15m
    # Missing data and evaluation errors are both treated as alerting.
    noDataState: Alerting
    execErrState: Alerting
    annotations:
      summary: >-
        Metrics textfile {{ index $labels "file" }} has not been updated in over 1 hour
      runbook_url: https://docs.eblu.me/how-to/runbooks/runbook-textfile-stale
    labels:
      severity: warning
      service: indri-metrics
    data:
    # A: age of each textfile in seconds.
    - refId: A
      datasourceUid: prometheus
      relativeTimeRange:
        from: 300
        to: 0
      model:
        expr: >-
          time() - node_textfile_mtime_seconds
        interval: ""
        refId: A
    # B: reduce each series from A to its last value (mode dropNN).
    - refId: B
      datasourceUid: "__expr__"
      relativeTimeRange:
        from: 0
        to: 0
      model:
        type: reduce
        expression: A
        reducer: last
        settings:
          mode: dropNN
        refId: B
    # C: alert condition - B > 3600.
    - refId: C
      datasourceUid: "__expr__"
      relativeTimeRange:
        from: 0
        to: 0
      model:
        type: threshold
        expression: B
        conditions:
        - evaluator:
            type: gt
            params:
            - 3600
          operator:
            type: and
        refId: C

# frigate-health: evaluated every 60s; fires when a camera's last
# frigate_camera_fps value is below 1 for 5 minutes.
- orgId: 1
  name: frigate-health
  folder: Infrastructure Alerts
  interval: 60s
  rules:
  - uid: frigate-camera-down
    title: FrigateCameraDown
    condition: C
    for: 5m
    # Missing data and evaluation errors are both treated as alerting.
    noDataState: Alerting
    execErrState: Alerting
    annotations:
      summary: >-
        Frigate camera {{ index $labels "camera_name" }} has 0 FPS
      runbook_url: https://docs.eblu.me/how-to/runbooks/runbook-frigate-camera-down
    labels:
      severity: warning
      service: frigate
    data:
    # A: raw per-camera FPS metric.
    - refId: A
      datasourceUid: prometheus
      relativeTimeRange:
        from: 300
        to: 0
      model:
        expr: frigate_camera_fps
        interval: ""
        refId: A
    # B: reduce each series from A to its last value (mode dropNN).
    - refId: B
      datasourceUid: "__expr__"
      relativeTimeRange:
        from: 0
        to: 0
      model:
        type: reduce
        expression: A
        reducer: last
        settings:
          mode: dropNN
        refId: B
    # C: alert condition - B < 1.
    - refId: C
      datasourceUid: "__expr__"
      relativeTimeRange:
        from: 0
        to: 0
      model:
        type: threshold
        expression: B
        conditions:
        - evaluator:
            type: lt
            params:
            - 1
          operator:
            type: and
        refId: C

# database-health: evaluated every 60s; fires (severity critical) when the
# last cnpg_collector_up value is below 1 for 3 minutes.
- orgId: 1
  name: database-health
  folder: Infrastructure Alerts
  interval: 60s
  rules:
  - uid: postgres-cluster-unhealthy
    title: PostgresClusterUnhealthy
    condition: C
    for: 3m
    # Missing data and evaluation errors are both treated as alerting.
    noDataState: Alerting
    execErrState: Alerting
    annotations:
      summary: >-
        PostgreSQL cluster {{ index $labels "cluster" }} is unhealthy
      runbook_url: https://docs.eblu.me/how-to/runbooks/runbook-postgres-unhealthy
    labels:
      severity: critical
      service: postgresql
    data:
    # A: raw collector-up metric.
    - refId: A
      datasourceUid: prometheus
      relativeTimeRange:
        from: 300
        to: 0
      model:
        expr: cnpg_collector_up
        interval: ""
        refId: A
    # B: reduce each series from A to its last value (mode dropNN).
    - refId: B
      datasourceUid: "__expr__"
      relativeTimeRange:
        from: 0
        to: 0
      model:
        type: reduce
        expression: A
        reducer: last
        settings:
          mode: dropNN
        refId: B
    # C: alert condition - B < 1.
    - refId: C
      datasourceUid: "__expr__"
      relativeTimeRange:
        from: 0
        to: 0
      model:
        type: threshold
        expression: B
        conditions:
        - evaluator:
            type: lt
            params:
            - 1
          operator:
            type: and
        refId: C

# pod-health: evaluated every 60s; fires when a pod that is not matched by a
# Job-owner series has been not-ready for 5 minutes.
- orgId: 1
  name: pod-health
  folder: Infrastructure Alerts
  interval: 60s
  rules:
  - uid: pod-not-ready
    title: PodNotReady
    condition: C
    for: 5m
    # The query only returns series for not-ready pods, so an empty result
    # is mapped to OK; evaluation errors still alert.
    noDataState: OK
    execErrState: Alerting
    annotations:
      summary: >-
        Pod {{ index $labels "pod" }} in {{ index $labels "namespace" }} is not ready
      runbook_url: https://docs.eblu.me/how-to/runbooks/runbook-pod-not-ready
    labels:
      severity: warning
    data:
    # A: ready-condition series whose value is 0, excluding any
    # (namespace, pod) pair that also has a kube_pod_owner series with
    # owner_kind="Job".
    - refId: A
      datasourceUid: prometheus
      relativeTimeRange:
        from: 300
        to: 0
      model:
        expr: >-
          kube_pod_status_ready{condition="true"} == 0
          unless on (namespace, pod)
          kube_pod_owner{owner_kind="Job"}
        interval: ""
        refId: A
    # B: reduce each series from A to its last value (mode dropNN).
    - refId: B
      datasourceUid: "__expr__"
      relativeTimeRange:
        from: 0
        to: 0
      model:
        type: reduce
        expression: A
        reducer: last
        settings:
          mode: dropNN
        refId: B
    # C: alert condition - B < 1 (the surviving series carry the value 0).
    - refId: C
      datasourceUid: "__expr__"
      relativeTimeRange:
        from: 0
        to: 0
      model:
        type: threshold
        expression: B
        conditions:
        - evaluator:
            type: lt
            params:
            - 1
          operator:
            type: and
        refId: C

# argocd-health: evaluated every 60s; fires when an application has reported
# a sync_status other than "Synced" for 30 minutes.
- orgId: 1
  name: argocd-health
  folder: Infrastructure Alerts
  interval: 60s
  rules:
  - uid: argocd-app-out-of-sync
    title: ArgoCDAppOutOfSync
    condition: C
    for: 30m
    # The query only returns series for non-synced apps, so an empty result
    # is mapped to OK; evaluation errors still alert.
    noDataState: OK
    execErrState: Alerting
    annotations:
      summary: >-
        ArgoCD app {{ index $labels "name" }} is {{ index $labels "sync_status" }}
      runbook_url: https://docs.eblu.me/how-to/runbooks/runbook-argocd-out-of-sync
    labels:
      severity: warning
      service: argocd
    data:
    # A: app info series filtered to sync_status != "Synced".
    - refId: A
      datasourceUid: prometheus
      relativeTimeRange:
        from: 300
        to: 0
      model:
        expr: >-
          argocd_app_info{sync_status!="Synced"}
        interval: ""
        refId: A
    # B: reduce each series from A to its last value (mode dropNN).
    - refId: B
      datasourceUid: "__expr__"
      relativeTimeRange:
        from: 0
        to: 0
      model:
        type: reduce
        expression: A
        reducer: last
        settings:
          mode: dropNN
        refId: B
    # C: alert condition - B > 0.
    - refId: C
      datasourceUid: "__expr__"
      relativeTimeRange:
        from: 0
        to: 0
      model:
        type: threshold
        expression: B
        conditions:
        - evaluator:
            type: gt
            params:
            - 0
          operator:
            type: and
        refId: C

# Notification template "ntfy-infra.payload": renders the JSON body that the
# ntfy-infra webhook posts to ntfy.
#   topic    - fixed: "infra-alerts"
#   title    - "[<STATUS>] <alertname>" built from the group status and the
#              common alertname label
#   message  - the summary annotation of every alert in the group, one per line
#   priority - fixed: 3
#   actions  - at most ONE "Open Runbook" view action, taken from the first
#              alert in the group that carries a runbook_url annotation.
#              Previously one action was appended per alert; ntfy accepts at
#              most three actions per message, so a group with four or more
#              alerts (e.g. several pods under PodNotReady) produced a payload
#              that ntfy would reject. `not $actions` is true only while the
#              slice is still empty, which stops further appends.
templates:
- orgId: 1
  name: ntfy-infra
  template: |
    {{ define "ntfy-infra.payload" -}}
    {{- $msg := "" -}}
    {{- range .Alerts -}}
    {{- $msg = (printf "%s%s\n" $msg .Annotations.summary) -}}
    {{- end -}}
    {{- $title := (printf "[%s] %s" (.Status | toUpper) .CommonLabels.alertname) -}}
    {{- $actions := coll.Slice -}}
    {{- range .Alerts -}}
    {{- if and .Annotations.runbook_url (not $actions) -}}
    {{- $actions = coll.Append (coll.Dict "action" "view" "label" "Open Runbook" "url" .Annotations.runbook_url) $actions -}}
    {{- end -}}
    {{- end -}}
    {{- coll.Dict "topic" "infra-alerts" "title" $title "message" $msg "priority" 3 "actions" $actions | data.ToJSON -}}
    {{- end }}