From be3cdad1cbb0967bbcc4a118d5f8034821a1bb40 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 26 Feb 2026 07:53:21 -0800 Subject: [PATCH] Add HA for CV and Docs: zero-downtime deploys (#273) ## Summary - Set `replicas: 2` with `maxUnavailable: 0` / `maxSurge: 1` on CV and Docs deployments so rolling updates never drop below 2 ready pods - Add PodDisruptionBudgets (`minAvailable: 1`) to protect against node drains and cluster maintenance - Add Fly.io cache purge step to `cv-deploy.yaml` workflow (docs already had this) so CV deploys don't serve stale cached content ## Deployment and Testing - [ ] `argocd app diff cv` / `argocd app diff docs` from branch - [ ] Deploy from branch: `argocd app set cv --revision feature/ha-cv-docs-zero-downtime && argocd app sync cv` - [ ] Verify 2 pods running: `kubectl get pods -n cv --context=minikube-indri` - [ ] Test rolling restart: `kubectl rollout restart deployment/cv -n cv --context=minikube-indri` - [ ] During rollout, confirm continuous availability via `curl -I https://cv.eblu.me` - [ ] After merge: reset ArgoCD to main, re-sync both apps Reviewed-on: https://forge.ops.eblu.me/eblume/blumeops/pulls/273 --- .forgejo/workflows/cv-deploy.yaml | 8 ++++++++ argocd/manifests/cv/deployment.yaml | 7 ++++++- argocd/manifests/cv/kustomization.yaml | 1 + argocd/manifests/cv/pdb.yaml | 10 ++++++++++ argocd/manifests/docs/deployment.yaml | 7 ++++++- argocd/manifests/docs/kustomization.yaml | 1 + argocd/manifests/docs/pdb.yaml | 10 ++++++++++ .../feature/ha-cv-docs-zero-downtime.infra.md | 1 + 8 files changed, 43 insertions(+), 2 deletions(-) create mode 100644 argocd/manifests/cv/pdb.yaml create mode 100644 argocd/manifests/docs/pdb.yaml create mode 100644 docs/changelog.d/feature/ha-cv-docs-zero-downtime.infra.md diff --git a/.forgejo/workflows/cv-deploy.yaml b/.forgejo/workflows/cv-deploy.yaml index 4aec393..983154b 100644 --- a/.forgejo/workflows/cv-deploy.yaml +++ b/.forgejo/workflows/cv-deploy.yaml @@ -112,6 +112,14 @@ jobs: echo "CV app synced successfully!" + - name: Purge Fly.io proxy cache + env: + FLY_API_TOKEN: ${{ secrets.FLY_DEPLOY_TOKEN }} + run: | + echo "Purging nginx cache on Fly.io proxy..." + fly ssh console -a blumeops-proxy -C "sh -c 'rm -rf /tmp/cache && nginx -s reload'" + echo "Cache purged" + - name: Summary run: | VERSION="${{ steps.version.outputs.version }}" diff --git a/argocd/manifests/cv/deployment.yaml b/argocd/manifests/cv/deployment.yaml index 57c850b..ba969fc 100644 --- a/argocd/manifests/cv/deployment.yaml +++ b/argocd/manifests/cv/deployment.yaml @@ -5,7 +5,12 @@ metadata: name: cv namespace: cv spec: - replicas: 1 + replicas: 2 + strategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: 0 + maxSurge: 1 selector: matchLabels: app: cv diff --git a/argocd/manifests/cv/kustomization.yaml b/argocd/manifests/cv/kustomization.yaml index 9ba628f..21e5a1f 100644 --- a/argocd/manifests/cv/kustomization.yaml +++ b/argocd/manifests/cv/kustomization.yaml @@ -6,6 +6,7 @@ resources: - deployment.yaml - service.yaml - ingress-tailscale.yaml + - pdb.yaml images: - name: registry.ops.eblu.me/blumeops/cv newTag: v1.0.3-ffa8727 diff --git a/argocd/manifests/cv/pdb.yaml b/argocd/manifests/cv/pdb.yaml new file mode 100644 index 0000000..db5240d --- /dev/null +++ b/argocd/manifests/cv/pdb.yaml @@ -0,0 +1,10 @@ +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: cv +spec: + minAvailable: 1 + selector: + matchLabels: + app: cv diff --git a/argocd/manifests/docs/deployment.yaml b/argocd/manifests/docs/deployment.yaml index f19f6f8..e4409a9 100644 --- a/argocd/manifests/docs/deployment.yaml +++ b/argocd/manifests/docs/deployment.yaml @@ -5,7 +5,12 @@ metadata: name: docs namespace: docs spec: - replicas: 1 + replicas: 2 + strategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: 0 + maxSurge: 1 selector: matchLabels: app: docs diff --git a/argocd/manifests/docs/kustomization.yaml b/argocd/manifests/docs/kustomization.yaml index 492c8a3..d014c23 100644 --- a/argocd/manifests/docs/kustomization.yaml +++ b/argocd/manifests/docs/kustomization.yaml @@ -6,6 +6,7 @@ resources: - deployment.yaml - service.yaml - ingress-tailscale.yaml + - pdb.yaml images: - name: registry.ops.eblu.me/blumeops/quartz newTag: v1.28.2-ffa8727 diff --git a/argocd/manifests/docs/pdb.yaml b/argocd/manifests/docs/pdb.yaml new file mode 100644 index 0000000..a87b8e9 --- /dev/null +++ b/argocd/manifests/docs/pdb.yaml @@ -0,0 +1,10 @@ +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: docs +spec: + minAvailable: 1 + selector: + matchLabels: + app: docs diff --git a/docs/changelog.d/feature/ha-cv-docs-zero-downtime.infra.md b/docs/changelog.d/feature/ha-cv-docs-zero-downtime.infra.md new file mode 100644 index 0000000..b6a7e03 --- /dev/null +++ b/docs/changelog.d/feature/ha-cv-docs-zero-downtime.infra.md @@ -0,0 +1 @@ +Add HA (2 replicas + PDB) for CV and Docs services for zero-downtime deploys.