From 959b6842bc6bd31697dcec363e4b50019fd2c84b Mon Sep 17 00:00:00 2001
From: Erich Blume
Date: Mon, 9 Feb 2026 11:34:19 -0800
Subject: [PATCH] Zero-downtime Fly.io deploys (#132)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Summary

- Start nginx after Tailscale connects (community best practice for Tailscale sidecars)
- Switch to `bluegreen` deploy strategy — old machine serves until new one is healthy
- Replace top-level `[checks]` with `[[http_service.checks]]` — only service-level checks gate traffic routing ([confirmed by Fly.io staff](https://community.fly.io/t/clarifying-the-types-of-health-checks/20379))
- Remove sentinel file and nginx if-check (no longer needed)

Supersedes the approach in #131 — that helped (502 window dropped from ~30s to ~3s) but couldn't fully eliminate it because top-level checks don't gate routing and Fly.io's proxy sends traffic as soon as the port is reachable.

## Deployment and Testing

- [ ] Merge and `fly deploy` from `fly/` directory
- [ ] Verify deploy completes with zero 502s (watch `fly logs` and Grafana docs-apm)
- [ ] Confirm `fly checks list` shows the new service-level check passing

Reviewed-on: https://forge.ops.eblu.me/eblume/blumeops/pulls/132
---
 .../fix-zero-downtime-deploy.infra.md | 1 +
 fly/fly.toml | 14 +++++++-----
 fly/nginx.conf | 7 ++----
 fly/start.sh | 22 ++++++++-----------
 4 files changed, 20 insertions(+), 24 deletions(-)
 create mode 100644 docs/changelog.d/fix-zero-downtime-deploy.infra.md

diff --git a/docs/changelog.d/fix-zero-downtime-deploy.infra.md b/docs/changelog.d/fix-zero-downtime-deploy.infra.md
new file mode 100644
index 0000000..0bb7b16
--- /dev/null
+++ b/docs/changelog.d/fix-zero-downtime-deploy.infra.md
@@ -0,0 +1 @@
+Eliminate 502 errors during Fly.io proxy deploys by starting nginx after Tailscale, switching to bluegreen deploys, and using service-level health checks for traffic gating.
diff --git a/fly/fly.toml b/fly/fly.toml index 90b649e..9399c8b 100644 --- a/fly/fly.toml +++ b/fly/fly.toml @@ -3,6 +3,9 @@ primary_region = "sjc" [build] +[deploy] +strategy = "bluegreen" + [http_service] internal_port = 8080 force_https = true @@ -10,10 +13,9 @@ auto_stop_machines = "off" auto_start_machines = true min_machines_running = 1 -[checks] -[checks.health] -port = 8080 -type = "http" -interval = "30s" -timeout = "5s" +[[http_service.checks]] +grace_period = "15s" +interval = "10s" +method = "GET" path = "/healthz" +timeout = "5s" diff --git a/fly/nginx.conf b/fly/nginx.conf index d27ff15..1884150 100644 --- a/fly/nginx.conf +++ b/fly/nginx.conf @@ -34,8 +34,8 @@ http { max_size=200m inactive=24h; # MagicDNS resolver — using a variable in proxy_pass defers upstream DNS - # resolution to request time, letting nginx start before Tailscale connects. - # Results are cached for 30s per worker to avoid per-request DNS lookups. + # resolution to request time (not config time). Results are cached for + # 30s per worker to avoid per-request DNS lookups. resolver 100.100.100.100 valid=30s; resolver_timeout 5s; @@ -76,9 +76,6 @@ http { listen 8080 default_server; location /healthz { - if (!-f /tmp/tailscale-ready) { - return 503 "starting\n"; - } return 200 "ok\n"; } diff --git a/fly/start.sh b/fly/start.sh index f923b81..96ccbf0 100644 --- a/fly/start.sh +++ b/fly/start.sh @@ -1,26 +1,22 @@ #!/bin/sh set -e -# Start nginx immediately so port 8080 is bound (avoids connection refused). -# Health check returns 503 until /tmp/tailscale-ready exists, so Fly.io -# keeps the old machine serving traffic until Tailscale connects. -nginx -g "daemon off;" & -NGINX_PID=$! -echo "Nginx started (waiting for Tailscale before proxying)" - -# Start tailscale daemon. Fly.io runs Firecracker microVMs which support -# TUN devices natively — no need for --tun=userspace-networking. +# Connect to tailnet first — nginx needs MagicDNS for upstream resolution. 
+# With bluegreen deploys, the old machine serves traffic until this one is +# fully ready. Fly.io runs Firecracker microVMs that support TUN devices +# natively — no need for --tun=userspace-networking. tailscaled --statedir=/var/lib/tailscale & sleep 2 -# Authenticate and join tailnet tailscale up --authkey="${TS_AUTHKEY}" --hostname=flyio-proxy - -# Wait for tailscale to be ready, then signal nginx health check until tailscale status > /dev/null 2>&1; do sleep 1; done -touch /tmp/tailscale-ready echo "Tailscale connected" +# Start nginx — MagicDNS is available, health check passes immediately. +nginx -g "daemon off;" & +NGINX_PID=$! +echo "Nginx started" + # Start Alloy for observability (logs → Loki, metrics → Prometheus) alloy run /etc/alloy/config.alloy \ --server.http.listen-addr=127.0.0.1:12345 \