# blumeops/argocd/manifests/grafana/alerting.yaml

apiVersion: 1

# Contact point "ntfy-infra": a single webhook receiver that POSTs to the root
# URL of the ntfy server. The request body is rendered by the
# "ntfy-infra.payload" notification template, which puts the ntfy topic inside
# the JSON document rather than in the URL path.
contactPoints:
  - name: ntfy-infra
    orgId: 1
    receivers:
      - uid: ntfy-infra-webhook
        type: webhook
        # Resolved notifications are delivered as well as firing ones.
        disableResolveMessage: false
        settings:
          url: https://ntfy.ops.eblu.me
          httpMethod: POST
          # Quoted on purpose so the value stays a string. NOTE(review): "0"
          # presumably means "no cap on alerts per notification" — confirm.
          maxAlerts: "0"
          payload:
            template: '{{ template "ntfy-infra.payload" . }}'

# Root notification policy: everything is routed to the ntfy-infra contact
# point, with one notification group per (alertname, service) combination.
policies:
  - orgId: 1
    receiver: ntfy-infra
    group_by: [alertname, service]
    # Delay before the first notification of a newly created group.
    group_wait: 1m
    # NOTE(review): alerts that join an already-notified group, and resolve
    # messages for it, are only flushed on the next group_interval tick, so
    # they may arrive up to 12h late. Confirm this low-noise setting is
    # intended.
    group_interval: 12h
    # Re-send for groups that are still firing; a multiple of group_interval.
    repeat_interval: 24h

# Alert rule groups. Every rule in this file uses the same three-step
# pipeline: A = Prometheus query, B = reduce A to its last value,
# C = threshold on B (C is the alert condition).
groups:
  # Blackbox probe results, evaluated every 30s.
  - name: service-health
    orgId: 1
    folder: Infrastructure Alerts
    interval: 30s
    rules:
      - uid: service-probe-failure
        title: ServiceProbeFailure
        condition: C
        for: 2m
        # Missing data and query errors are both treated as a failure.
        noDataState: Alerting
        execErrState: Alerting
        labels:
          severity: warning
        annotations:
          summary: '{{ index $labels "service" }} health check is failing'
          runbook_url: https://docs.eblu.me/how-to/runbooks/runbook-service-probe-failure
        data:
          # A: probe_success, with the part of the job label that follows
          # "integrations/blackbox/" copied into a "service" label (used by
          # the summary above and by the policy's group_by).
          - refId: A
            datasourceUid: prometheus
            relativeTimeRange: {from: 300, to: 0}
            model:
              refId: A
              interval: ""
              expr: >-
                label_replace(probe_success, "service",
                "$1", "job", "integrations/blackbox/(.*)")
          # B: last sample per series, non-numeric values dropped.
          - refId: B
            datasourceUid: "__expr__"
            relativeTimeRange: {from: 0, to: 0}
            model:
              refId: B
              type: reduce
              expression: A
              reducer: last
              settings:
                mode: dropNN
          # C: alert when the reduced value is below 1.
          - refId: C
            datasourceUid: "__expr__"
            relativeTimeRange: {from: 0, to: 0}
            model:
              refId: C
              type: threshold
              expression: B
              conditions:
                - evaluator:
                    type: lt
                    params: [1]
                  operator:
                    type: and

  # Age of node_exporter textfile-collector files, evaluated every 60s.
  - name: textfile-freshness
    orgId: 1
    folder: Infrastructure Alerts
    interval: 60s
    rules:
      - uid: textfile-stale
        title: TextfileStale
        condition: C
        # The age must stay above the threshold for 15m before firing.
        for: 15m
        noDataState: Alerting
        execErrState: Alerting
        labels:
          severity: warning
          service: indri-metrics
        annotations:
          summary: 'Metrics textfile {{ index $labels "file" }} has not been updated in over 1 hour'
          runbook_url: https://docs.eblu.me/how-to/runbooks/runbook-textfile-stale
        data:
          # A: seconds elapsed since each textfile was last modified.
          - refId: A
            datasourceUid: prometheus
            relativeTimeRange: {from: 300, to: 0}
            model:
              refId: A
              interval: ""
              expr: 'time() - node_textfile_mtime_seconds'
          # B: last sample per series, non-numeric values dropped.
          - refId: B
            datasourceUid: "__expr__"
            relativeTimeRange: {from: 0, to: 0}
            model:
              refId: B
              type: reduce
              expression: A
              reducer: last
              settings:
                mode: dropNN
          # C: alert when the age exceeds 3600 seconds (1 hour).
          - refId: C
            datasourceUid: "__expr__"
            relativeTimeRange: {from: 0, to: 0}
            model:
              refId: C
              type: threshold
              expression: B
              conditions:
                - evaluator:
                    type: gt
                    params: [3600]
                  operator:
                    type: and

  # Per-camera frame rate reported by Frigate, evaluated every 60s.
  - name: frigate-health
    orgId: 1
    folder: Infrastructure Alerts
    interval: 60s
    rules:
      - uid: frigate-camera-down
        title: FrigateCameraDown
        condition: C
        for: 5m
        noDataState: Alerting
        execErrState: Alerting
        labels:
          severity: warning
          service: frigate
        annotations:
          summary: 'Frigate camera {{ index $labels "camera_name" }} has 0 FPS'
          runbook_url: https://docs.eblu.me/how-to/runbooks/runbook-frigate-camera-down
        data:
          # A: raw FPS gauge, one series per camera.
          - refId: A
            datasourceUid: prometheus
            relativeTimeRange: {from: 300, to: 0}
            model:
              refId: A
              interval: ""
              expr: frigate_camera_fps
          # B: last sample per series, non-numeric values dropped.
          - refId: B
            datasourceUid: "__expr__"
            relativeTimeRange: {from: 0, to: 0}
            model:
              refId: B
              type: reduce
              expression: A
              reducer: last
              settings:
                mode: dropNN
          # C: alert when FPS is below 1. The summary says "0 FPS", but any
          # fractional rate under 1 trips the threshold as well.
          - refId: C
            datasourceUid: "__expr__"
            relativeTimeRange: {from: 0, to: 0}
            model:
              refId: C
              type: threshold
              expression: B
              conditions:
                - evaluator:
                    type: lt
                    params: [1]
                  operator:
                    type: and

  # PostgreSQL collector liveness, evaluated every 60s.
  - name: database-health
    orgId: 1
    folder: Infrastructure Alerts
    interval: 60s
    rules:
      - uid: postgres-cluster-unhealthy
        title: PostgresClusterUnhealthy
        condition: C
        for: 3m
        noDataState: Alerting
        execErrState: Alerting
        labels:
          severity: critical
          service: postgresql
        annotations:
          # NOTE(review): assumes cnpg_collector_up carries a "cluster"
          # label; otherwise the name renders empty — confirm.
          summary: 'PostgreSQL cluster {{ index $labels "cluster" }} is unhealthy'
          runbook_url: https://docs.eblu.me/how-to/runbooks/runbook-postgres-unhealthy
        data:
          # A: collector "up" gauge.
          - refId: A
            datasourceUid: prometheus
            relativeTimeRange: {from: 300, to: 0}
            model:
              refId: A
              interval: ""
              expr: cnpg_collector_up
          # B: last sample per series, non-numeric values dropped.
          - refId: B
            datasourceUid: "__expr__"
            relativeTimeRange: {from: 0, to: 0}
            model:
              refId: B
              type: reduce
              expression: A
              reducer: last
              settings:
                mode: dropNN
          # C: alert when the reduced value is below 1.
          - refId: C
            datasourceUid: "__expr__"
            relativeTimeRange: {from: 0, to: 0}
            model:
              refId: C
              type: threshold
              expression: B
              conditions:
                - evaluator:
                    type: lt
                    params: [1]
                  operator:
                    type: and

  # Pods whose Ready condition is false, excluding pods owned by a Job.
  # Evaluated every 60s.
  - orgId: 1
    name: pod-health
    folder: Infrastructure Alerts
    interval: 60s
    rules:
      - uid: pod-not-ready
        title: PodNotReady
        condition: C
        for: 5m
        # Query A only returns series for unready pods, so an empty result
        # means "everything is ready" and must map to OK.
        noDataState: OK
        execErrState: Alerting
        annotations:
          summary: >-
            Pod {{ index $labels "pod" }} in {{ index $labels "namespace" }} is not ready
          runbook_url: https://docs.eblu.me/how-to/runbooks/runbook-pod-not-ready
        labels:
          severity: warning
        data:
          # A: series (value 0) for every pod that is not ready right now.
          - refId: A
            datasourceUid: prometheus
            relativeTimeRange:
              from: 300
              to: 0
            model:
              expr: >-
                kube_pod_status_ready{condition="true"} == 0
                unless on (namespace, pod)
                kube_pod_owner{owner_kind="Job"}
              # Evaluate as an instant query. Because the expression filters
              # on "== 0", a range query over the 300s window keeps returning
              # the last matching sample of a pod that has already recovered;
              # that lets short blips satisfy `for: 5m` and delays resolution.
              instant: true
              range: false
              interval: ""
              refId: A
          # B: last sample per series, non-numeric values dropped.
          - refId: B
            datasourceUid: "__expr__"
            relativeTimeRange:
              from: 0
              to: 0
            model:
              type: reduce
              expression: A
              reducer: last
              settings:
                mode: dropNN
              refId: B
          # C: alert when the reduced value is below 1 (unready pods report 0).
          - refId: C
            datasourceUid: "__expr__"
            relativeTimeRange:
              from: 0
              to: 0
            model:
              type: threshold
              expression: B
              conditions:
                - evaluator:
                    type: lt
                    params:
                      - 1
                  operator:
                    type: and
              refId: C

  # ArgoCD applications whose sync status is anything other than "Synced".
  # Evaluated every 60s.
  - orgId: 1
    name: argocd-health
    folder: Infrastructure Alerts
    interval: 60s
    rules:
      - uid: argocd-app-out-of-sync
        title: ArgoCDAppOutOfSync
        condition: C
        for: 30m
        # Query A only returns series for non-synced apps, so an empty result
        # means "everything is synced" and must map to OK.
        noDataState: OK
        execErrState: Alerting
        annotations:
          summary: >-
            ArgoCD app {{ index $labels "name" }} is {{ index $labels "sync_status" }}
          runbook_url: https://docs.eblu.me/how-to/runbooks/runbook-argocd-out-of-sync
        labels:
          severity: warning
          service: argocd
        data:
          # A: one series per app that is not synced right now.
          - refId: A
            datasourceUid: prometheus
            relativeTimeRange:
              from: 300
              to: 0
            model:
              expr: >-
                argocd_app_info{sync_status!="Synced"}
              # Evaluate as an instant query. The selector filters on a label
              # value, so a range query over the 300s window would still
              # return an app that has synced in the meantime, counting stale
              # matches towards `for` and delaying resolution.
              instant: true
              range: false
              interval: ""
              refId: A
          # B: last sample per series, non-numeric values dropped.
          - refId: B
            datasourceUid: "__expr__"
            relativeTimeRange:
              from: 0
              to: 0
            model:
              type: reduce
              expression: A
              reducer: last
              settings:
                mode: dropNN
              refId: B
          # C: alert when the reduced value is above 0.
          - refId: C
            datasourceUid: "__expr__"
            relativeTimeRange:
              from: 0
              to: 0
            model:
              type: threshold
              expression: B
              conditions:
                - evaluator:
                    type: gt
                    params:
                      - 0
                  operator:
                    type: and
              refId: C

# Notification template that renders the JSON body sent to ntfy by the
# ntfy-infra webhook contact point.
#
# Output fields:
#   topic    - always "infra-alerts"
#   title    - "[STATUS] alertname", from the group status and common labels
#   message  - the summary annotation of every alert in the group, one per line
#   priority - always 3
#   actions  - "Open Runbook" view buttons (see below)
#
# ntfy accepts at most three actions per message and rejects the publish
# otherwise, so the action list must never grow with the number of alerts.
# When every alert in the group shares the same runbook_url (the normal case,
# since groups are keyed by alertname) a single button is emitted; otherwise
# the per-alert buttons are capped at three.
templates:
  - orgId: 1
    name: ntfy-infra
    template: |
      {{ define "ntfy-infra.payload" -}}
      {{- $msg := "" -}}
      {{- range .Alerts -}}
        {{- $msg = (printf "%s%s\n" $msg .Annotations.summary) -}}
      {{- end -}}
      {{- $title := (printf "[%s] %s" (.Status | toUpper) .CommonLabels.alertname) -}}
      {{- $actions := coll.Slice -}}
      {{- if .CommonAnnotations.runbook_url -}}
        {{- $actions = coll.Append (coll.Dict "action" "view" "label" "Open Runbook" "url" .CommonAnnotations.runbook_url) $actions -}}
      {{- else -}}
        {{- range .Alerts -}}
          {{- if and .Annotations.runbook_url (lt (len $actions) 3) -}}
            {{- $actions = coll.Append (coll.Dict "action" "view" "label" "Open Runbook" "url" .Annotations.runbook_url) $actions -}}
          {{- end -}}
        {{- end -}}
      {{- end -}}
      {{- coll.Dict "topic" "infra-alerts" "title" $title "message" $msg "priority" 3 "actions" $actions | data.ToJSON -}}
      {{- end }}