blumeops/argocd/manifests/grafana/alerting.yaml
Erich Blume fe0e913963
All checks were successful
Deploy Fly.io Proxy / deploy (push) Successful in 1m37s
Switch Fly proxy to upstream keepalive pools (#337)
## Summary

- Replace per-request DNS resolution (variable-based `proxy_pass`) with static `upstream` blocks and `keepalive` connection pools
- Reuses TLS connections through the Tailscale tunnel instead of handshaking per request
- Add `mise run fly-reload` for nginx config reload without full redeploy (re-resolves upstream DNS)

## Trade-off

DNS is resolved at config load, not per-request. If Tailscale Ingress pods get new IPs (restart, reschedule), `mise run fly-reload` is needed. A Grafana alert will be added to detect this.

## Still TODO on this branch

- [ ] Grafana alert for upstream unreachable (triggers fly-reload reminder)
- [ ] Docs pass
- [ ] Deploy from branch and verify latency improvement
- [ ] Changelog fragment

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Reviewed-on: #337
2026-04-17 16:39:52 -07:00

453 lines
12 KiB
YAML

# Grafana alerting provisioning file.
apiVersion: 1

# Contact point "ntfy-infra": notifications are sent as a webhook POST to the
# ntfy server; the request body is rendered by the "ntfy-infra.payload"
# notification template.
contactPoints:
- orgId: 1
  name: ntfy-infra
  receivers:
  - uid: ntfy-infra-webhook
    type: webhook
    # Resolved notifications are sent as well as firing ones.
    disableResolveMessage: false
    settings:
      url: https://ntfy.ops.eblu.me
      httpMethod: POST
      # NOTE(review): presumably "0" means "no per-message alert cap" - confirm
      # against the Grafana webhook contact point documentation.
      maxAlerts: '0'
      payload:
        template: '{{ template "ntfy-infra.payload" . }}'
# Root notification policy: every alert is routed to the ntfy-infra contact
# point, with one notification group per (alertname, service) pair.
policies:
- orgId: 1
  receiver: ntfy-infra
  group_by: [alertname, service]
  group_wait: 1m
  # NOTE(review): with a 12h group_interval, changes to an already-notified
  # group (new alerts joining it, presumably also resolutions) may not be sent
  # for up to 12 hours - confirm this delay is intended.
  group_interval: 12h
  repeat_interval: 24h
groups:
# service-health: fires when a blackbox probe stops reporting success.
- orgId: 1
  name: service-health
  folder: Infrastructure Alerts
  interval: 30s
  rules:
  - uid: service-probe-failure
    title: ServiceProbeFailure
    condition: C
    for: 2m
    # Missing data and query errors are both treated as a failure.
    noDataState: Alerting
    execErrState: Alerting
    labels:
      severity: warning
    annotations:
      summary: '{{ index $labels "service" }} health check is failing'
      runbook_url: https://docs.eblu.me/how-to/runbooks/runbook-service-probe-failure
    data:
    # A: probe_success, with a "service" label copied from the part of the job
    # label that follows "integrations/blackbox/".
    - refId: A
      datasourceUid: prometheus
      relativeTimeRange: {from: 300, to: 0}
      model:
        refId: A
        interval: ''
        expr: >-
          label_replace(probe_success, "service",
          "$1", "job", "integrations/blackbox/(.*)")
    # B: reduce each series from A to its last value.
    - refId: B
      datasourceUid: '__expr__'
      relativeTimeRange: {from: 0, to: 0}
      model:
        refId: B
        type: reduce
        expression: A
        reducer: last
        settings:
          mode: dropNN
    # C: alert condition - the reduced value is below 1.
    - refId: C
      datasourceUid: '__expr__'
      relativeTimeRange: {from: 0, to: 0}
      model:
        refId: C
        type: threshold
        expression: B
        conditions:
        - evaluator:
            type: lt
            params: [1]
          operator:
            type: and
# textfile-freshness: fires when a metrics textfile's mtime is more than
# 3600 seconds old.
- orgId: 1
  name: textfile-freshness
  folder: Infrastructure Alerts
  interval: 60s
  rules:
  - uid: textfile-stale
    title: TextfileStale
    condition: C
    for: 15m
    # Missing data and query errors are both treated as a failure.
    noDataState: Alerting
    execErrState: Alerting
    labels:
      severity: warning
      service: indri-metrics
    annotations:
      summary: >-
        Metrics textfile {{ index $labels "file" }} has not been updated in over 1 hour
      runbook_url: https://docs.eblu.me/how-to/runbooks/runbook-textfile-stale
    data:
    # A: age of each textfile in seconds (current time minus its mtime).
    - refId: A
      datasourceUid: prometheus
      relativeTimeRange: {from: 300, to: 0}
      model:
        refId: A
        interval: ''
        expr: 'time() - node_textfile_mtime_seconds'
    # B: reduce each series from A to its last value.
    - refId: B
      datasourceUid: '__expr__'
      relativeTimeRange: {from: 0, to: 0}
      model:
        refId: B
        type: reduce
        expression: A
        reducer: last
        settings:
          mode: dropNN
    # C: alert condition - the age exceeds 3600 seconds.
    - refId: C
      datasourceUid: '__expr__'
      relativeTimeRange: {from: 0, to: 0}
      model:
        refId: C
        type: threshold
        expression: B
        conditions:
        - evaluator:
            type: gt
            params: [3600]
          operator:
            type: and
# frigate-health: fires when a camera's frigate_camera_fps value drops below 1.
- orgId: 1
  name: frigate-health
  folder: Infrastructure Alerts
  interval: 60s
  rules:
  - uid: frigate-camera-down
    title: FrigateCameraDown
    condition: C
    for: 5m
    # Missing data and query errors are both treated as a failure.
    noDataState: Alerting
    execErrState: Alerting
    labels:
      severity: warning
      service: frigate
    annotations:
      summary: 'Frigate camera {{ index $labels "camera_name" }} has 0 FPS'
      runbook_url: https://docs.eblu.me/how-to/runbooks/runbook-frigate-camera-down
    data:
    # A: raw frigate_camera_fps series.
    - refId: A
      datasourceUid: prometheus
      relativeTimeRange: {from: 300, to: 0}
      model:
        refId: A
        interval: ''
        expr: frigate_camera_fps
    # B: reduce each series from A to its last value.
    - refId: B
      datasourceUid: '__expr__'
      relativeTimeRange: {from: 0, to: 0}
      model:
        refId: B
        type: reduce
        expression: A
        reducer: last
        settings:
          mode: dropNN
    # C: alert condition - the reduced value is below 1.
    - refId: C
      datasourceUid: '__expr__'
      relativeTimeRange: {from: 0, to: 0}
      model:
        refId: C
        type: threshold
        expression: B
        conditions:
        - evaluator:
            type: lt
            params: [1]
          operator:
            type: and
# database-health: fires (severity critical) when cnpg_collector_up drops
# below 1 for a PostgreSQL cluster.
- orgId: 1
  name: database-health
  folder: Infrastructure Alerts
  interval: 60s
  rules:
  - uid: postgres-cluster-unhealthy
    title: PostgresClusterUnhealthy
    condition: C
    for: 3m
    # Missing data and query errors are both treated as a failure.
    noDataState: Alerting
    execErrState: Alerting
    labels:
      severity: critical
      service: postgresql
    annotations:
      summary: 'PostgreSQL cluster {{ index $labels "cluster" }} is unhealthy'
      runbook_url: https://docs.eblu.me/how-to/runbooks/runbook-postgres-unhealthy
    data:
    # A: raw cnpg_collector_up series.
    - refId: A
      datasourceUid: prometheus
      relativeTimeRange: {from: 300, to: 0}
      model:
        refId: A
        interval: ''
        expr: cnpg_collector_up
    # B: reduce each series from A to its last value.
    - refId: B
      datasourceUid: '__expr__'
      relativeTimeRange: {from: 0, to: 0}
      model:
        refId: B
        type: reduce
        expression: A
        reducer: last
        settings:
          mode: dropNN
    # C: alert condition - the reduced value is below 1.
    - refId: C
      datasourceUid: '__expr__'
      relativeTimeRange: {from: 0, to: 0}
      model:
        refId: C
        type: threshold
        expression: B
        conditions:
        - evaluator:
            type: lt
            params: [1]
          operator:
            type: and
# pod-health: fires for pods whose "ready" condition is 0, excluding pods
# owned by a Job.
- orgId: 1
  name: pod-health
  folder: Infrastructure Alerts
  interval: 60s
  rules:
  - uid: pod-not-ready
    title: PodNotReady
    condition: C
    for: 5m
    # The query returns series only for not-ready pods, so no data means OK.
    noDataState: OK
    execErrState: Alerting
    labels:
      severity: warning
    annotations:
      summary: 'Pod {{ index $labels "pod" }} in {{ index $labels "namespace" }} is not ready'
      runbook_url: https://docs.eblu.me/how-to/runbooks/runbook-pod-not-ready
    data:
    # A: pods with ready == 0, minus any (namespace, pod) owned by a Job.
    - refId: A
      datasourceUid: prometheus
      relativeTimeRange: {from: 60, to: 0}
      model:
        refId: A
        interval: ''
        expr: >-
          kube_pod_status_ready{condition="true"} == 0
          unless on (namespace, pod)
          kube_pod_owner{owner_kind="Job"}
    # B: reduce each series from A to its last value.
    - refId: B
      datasourceUid: '__expr__'
      relativeTimeRange: {from: 0, to: 0}
      model:
        refId: B
        type: reduce
        expression: A
        reducer: last
        settings:
          mode: dropNN
    # C: alert condition - the reduced value is below 1 (surviving samples
    # from A carry the value 0).
    - refId: C
      datasourceUid: '__expr__'
      relativeTimeRange: {from: 0, to: 0}
      model:
        refId: C
        type: threshold
        expression: B
        conditions:
        - evaluator:
            type: lt
            params: [1]
          operator:
            type: and
# argocd-health: fires for ArgoCD applications whose sync_status label is
# anything other than "Synced".
- orgId: 1
  name: argocd-health
  folder: Infrastructure Alerts
  interval: 60s
  rules:
  - uid: argocd-app-out-of-sync
    title: ArgoCDAppOutOfSync
    condition: C
    for: 5m
    # The query returns series only for non-synced apps, so no data means OK.
    noDataState: OK
    execErrState: Alerting
    labels:
      severity: warning
      service: argocd
    annotations:
      summary: 'ArgoCD app {{ index $labels "name" }} is {{ index $labels "sync_status" }}'
      runbook_url: https://docs.eblu.me/how-to/runbooks/runbook-argocd-out-of-sync
    data:
    # A: argocd_app_info series filtered to non-"Synced" applications.
    - refId: A
      datasourceUid: prometheus
      relativeTimeRange: {from: 60, to: 0}
      model:
        refId: A
        interval: ''
        expr: 'argocd_app_info{sync_status!="Synced"}'
    # B: reduce each series from A to its last value.
    - refId: B
      datasourceUid: '__expr__'
      relativeTimeRange: {from: 0, to: 0}
      model:
        refId: B
        type: reduce
        expression: A
        reducer: last
        settings:
          mode: dropNN
    # C: alert condition - the reduced value is above 0.
    - refId: C
      datasourceUid: '__expr__'
      relativeTimeRange: {from: 0, to: 0}
      model:
        refId: C
        type: threshold
        expression: B
        conditions:
        - evaluator:
            type: gt
            params: [0]
          operator:
            type: and
# flyio-proxy-health: fires when more than half of the Fly.io proxy's requests
# over the last 5 minutes returned HTTP 502.
- orgId: 1
  name: flyio-proxy-health
  folder: Infrastructure Alerts
  interval: 30s
  rules:
  - uid: flyio-upstream-unreachable
    title: FlyioUpstreamUnreachable
    condition: C
    for: 3m
    # The query returns a sample only while the ratio is above 0.5, so no data
    # means OK.
    noDataState: OK
    execErrState: Alerting
    labels:
      severity: warning
      service: flyio-proxy
    annotations:
      summary: >-
        Fly.io proxy returning elevated 502s — upstream DNS may be stale. Run: mise run fly-reload
      runbook_url: https://docs.eblu.me/how-to/operations/manage-flyio-proxy
    data:
    # A: share of requests answered with status 502, kept only when > 0.5.
    - refId: A
      datasourceUid: prometheus
      relativeTimeRange: {from: 300, to: 0}
      model:
        refId: A
        interval: ''
        expr: >-
          sum(rate(flyio_nginx_http_requests_total{instance="flyio-proxy",status="502"}[5m]))
          / sum(rate(flyio_nginx_http_requests_total{instance="flyio-proxy"}[5m]))
          > 0.5
    # B: reduce the series from A to its last value.
    - refId: B
      datasourceUid: '__expr__'
      relativeTimeRange: {from: 0, to: 0}
      model:
        refId: B
        type: reduce
        expression: A
        reducer: last
        settings:
          mode: dropNN
    # C: alert condition - any surviving ratio (already > 0.5) is above 0.
    - refId: C
      datasourceUid: '__expr__'
      relativeTimeRange: {from: 0, to: 0}
      model:
        refId: C
        type: threshold
        expression: B
        conditions:
        - evaluator:
            type: gt
            params: [0]
          operator:
            type: and
# Notification template "ntfy-infra.payload": renders the JSON body that the
# ntfy-infra webhook contact point POSTs to ntfy.
#
# Output fields:
#   topic    - always "infra-alerts"
#   title    - "[STATUS] <alertname>" built from the group's status and the
#              common alertname label
#   message  - each alert's summary annotation, one per line
#   priority - always 3
#   actions  - "Open Runbook" view buttons built from runbook_url annotations
#
# ntfy documents a maximum of three action buttons per message, and alerts in
# one notification group normally share the same runbook_url. The actions loop
# therefore skips a URL equal to the previously added one and stops adding
# once three actions exist; otherwise a group of four or more alerts would
# produce a payload with more actions than ntfy allows.
templates:
- orgId: 1
  name: ntfy-infra
  template: |
    {{ define "ntfy-infra.payload" -}}
      {{- $msg := "" -}}
      {{- range .Alerts -}}
        {{- $msg = (printf "%s%s\n" $msg .Annotations.summary) -}}
      {{- end -}}
      {{- $title := (printf "[%s] %s" (.Status | toUpper) .CommonLabels.alertname) -}}
      {{- $actions := coll.Slice -}}
      {{- $lastURL := "" -}}
      {{- range .Alerts -}}
        {{- $url := .Annotations.runbook_url -}}
        {{- if and $url (ne $url $lastURL) (lt (len $actions) 3) -}}
          {{- $actions = coll.Append (coll.Dict "action" "view" "label" "Open Runbook" "url" $url) $actions -}}
          {{- $lastURL = $url -}}
        {{- end -}}
      {{- end -}}
      {{- coll.Dict "topic" "infra-alerts" "title" $title "message" $msg "priority" 3 "actions" $actions | data.ToJSON -}}
    {{- end }}