diff --git a/argocd/manifests/grafana/alerting.yaml b/argocd/manifests/grafana/alerting.yaml
index dfcc5a3..dcb6762 100644
--- a/argocd/manifests/grafana/alerting.yaml
+++ b/argocd/manifests/grafana/alerting.yaml
@@ -315,6 +315,75 @@ groups:
                     type: and
               refId: C
 
+  # ArgoCD application sync-status alerts.
+  - orgId: 1
+    name: argocd-health
+    folder: Infrastructure Alerts
+    interval: 60s
+    rules:
+      # Warns when an app has reported a sync_status other than "Synced"
+      # continuously for 30 minutes.
+      - uid: argocd-app-out-of-sync
+        title: ArgoCDAppOutOfSync
+        condition: C
+        for: 30m
+        # The query only returns non-Synced apps, so an empty result is
+        # expected when nothing is out of sync.
+        noDataState: OK
+        execErrState: Alerting
+        annotations:
+          summary: >-
+            ArgoCD app {{ index $labels "name" }} is {{ index $labels "sync_status" }}
+          runbook_url: https://docs.eblu.me/how-to/alerts/runbook-argocd-out-of-sync
+        labels:
+          severity: warning
+          service: argocd
+        data:
+          # A: one series per (cluster, name, sync_status). Aggregating away
+          # the other argocd_app_info labels keeps the alert instance stable
+          # when an unrelated label (e.g. health_status) changes, which would
+          # otherwise restart the 30m pending period.
+          - refId: A
+            datasourceUid: prometheus
+            relativeTimeRange:
+              from: 300
+              to: 0
+            model:
+              expr: >-
+                max by (cluster, name, sync_status) (argocd_app_info{sync_status!="Synced"})
+              interval: ""
+              refId: A
+          # B: reduce each series to its most recent value.
+          - refId: B
+            datasourceUid: "__expr__"
+            relativeTimeRange:
+              from: 0
+              to: 0
+            model:
+              type: reduce
+              expression: A
+              reducer: last
+              settings:
+                mode: dropNN
+              refId: B
+          # C: fire when the reduced value is greater than 0.
+          - refId: C
+            datasourceUid: "__expr__"
+            relativeTimeRange:
+              from: 0
+              to: 0
+            model:
+              type: threshold
+              expression: B
+              conditions:
+                - evaluator:
+                    type: gt
+                    params:
+                      - 0
+                  operator:
+                    type: and
+              refId: C
+
 templates:
   - orgId: 1
     name: ntfy-infra
diff --git a/argocd/manifests/prometheus/prometheus.yml b/argocd/manifests/prometheus/prometheus.yml
index 2d2dbcf..f96ce12 100644
--- a/argocd/manifests/prometheus/prometheus.yml
+++ b/argocd/manifests/prometheus/prometheus.yml
@@ -80,6 +80,14 @@ scrape_configs:
       - target_label: cluster
         replacement: indri
 
+  # ArgoCD application metrics
+  - job_name: "argocd"
+    static_configs:
+      - targets: ["argocd-metrics.argocd.svc.cluster.local:8082"]
+    metric_relabel_configs:
+      - target_label: cluster
+        replacement: indri
+
   # Frigate NVR metrics (via Caddy on indri — Frigate runs on ringtail)
   - job_name: "frigate"
     scheme: https
diff --git 
a/docs/how-to/alerts/runbook-argocd-out-of-sync.md b/docs/how-to/alerts/runbook-argocd-out-of-sync.md new file mode 100644 index 0000000..753b336 --- /dev/null +++ b/docs/how-to/alerts/runbook-argocd-out-of-sync.md @@ -0,0 +1,65 @@ +--- +title: "Runbook: ArgoCD App Out of Sync" +modified: 2026-03-22 +tags: + - how-to + - alerting + - runbook +--- + +# Runbook: ArgoCD App Out of Sync + +**Alert name:** `ArgoCDAppOutOfSync` + +An ArgoCD application has been out of sync for 30+ minutes. This means the live state in Kubernetes differs from what's declared in Git. + +## Diagnostic Steps + +1. **Check which app is out of sync** — the `name` label in the alert tells you: + ```fish + argocd app get <app-name> + ``` + +2. **View the diff**: + ```fish + argocd app diff <app-name> + ``` + +3. **Check if it's a branch revision issue** — during C1/C2 work, apps may be pointed at a feature branch. After merge, they need to be reset to main: + ```fish + argocd app get <app-name> -o json | python3 -c "import json,sys; print(json.load(sys.stdin)['spec']['source']['targetRevision'])" + ``` + +4. **Check ArgoCD UI** — https://argocd.ops.eblu.me — look for sync errors or degraded status. + +## Common Causes + +- **Forgot to sync after push** — ArgoCD uses manual sync; changes require explicit `argocd app sync` +- **Branch revision not reset after PR merge** — app still points at a deleted branch +- **Kustomize/manifest error** — invalid YAML or unsatisfiable resource requirements +- **Pruning needed** — old ConfigMaps from `configMapGenerator` need pruning + +## Resolution + +```fish +# Simple sync +argocd app sync <app-name> + +# If pruning is needed +argocd app sync <app-name> --prune + +# If stuck on a deleted branch +argocd app set <app-name> --revision main +argocd app sync <app-name> +``` + +## Silencing + +During active C1/C2 development, apps may intentionally be out of sync: +1. Grafana → Alerting → Silences → Create Silence +2. 
Match `alertname = ArgoCDAppOutOfSync` and `name = <app-name>` + +## Related + +- [[argocd]] — ArgoCD reference +- [[deploy-infra-alerting]] — Alerting pipeline overview diff --git a/docs/reference/operations/observability.md b/docs/reference/operations/observability.md index 9d4a7a0..35136d5 100644 --- a/docs/reference/operations/observability.md +++ b/docs/reference/operations/observability.md @@ -25,3 +25,4 @@ Metrics, logs, traces, and dashboards for BlumeOps infrastructure. - [[runbook-pod-not-ready]] - Pod not ready runbook - [[runbook-textfile-stale]] - Metrics textfile freshness runbook - [[runbook-frigate-camera-down]] - Frigate camera health runbook +- [[runbook-argocd-out-of-sync]] - ArgoCD sync status runbook