From bd61da4f857eeb1312a5fa90ea4e291c06941ff5 Mon Sep 17 00:00:00 2001
From: Erich Blume
Date: Mon, 9 Feb 2026 11:07:36 -0800
Subject: [PATCH] Fix 502 errors during Fly.io proxy deploys (#131)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Summary

- Health check (`/healthz`) now returns 503 until Tailscale is connected
- `start.sh` creates `/tmp/tailscale-ready` sentinel after `tailscale up` succeeds
- Fly.io keeps the old machine serving traffic during the ~7s startup window

Previously, nginx passed the health check immediately, Fly.io routed traffic to the new machine, but MagicDNS wasn't available yet — causing upstream DNS timeouts and 502s on every request until Tailscale connected.

## Deployment and Testing

- [ ] Merge and `fly deploy` from `fly/` directory
- [ ] Verify deploy completes with zero 502s (check Grafana docs-apm dashboard)
- [ ] Confirm health check transitions from 503 → 200 in `fly logs`

Reviewed-on: https://forge.ops.eblu.me/eblume/blumeops/pulls/131
---
 docs/changelog.d/fix-deploy-healthcheck-race.bugfix.md | 1 +
 fly/nginx.conf                                         | 3 +++
 fly/start.sh                                           | 9 +++++----
 3 files changed, 9 insertions(+), 4 deletions(-)
 create mode 100644 docs/changelog.d/fix-deploy-healthcheck-race.bugfix.md

diff --git a/docs/changelog.d/fix-deploy-healthcheck-race.bugfix.md b/docs/changelog.d/fix-deploy-healthcheck-race.bugfix.md
new file mode 100644
index 0000000..0220ca3
--- /dev/null
+++ b/docs/changelog.d/fix-deploy-healthcheck-race.bugfix.md
@@ -0,0 +1 @@
+Fix 502 errors during Fly.io proxy deploys by deferring health check until Tailscale is connected.
diff --git a/fly/nginx.conf b/fly/nginx.conf
index f01a1a3..d27ff15 100644
--- a/fly/nginx.conf
+++ b/fly/nginx.conf
@@ -76,6 +76,9 @@ http {
         listen 8080 default_server;
 
         location /healthz {
+            if (!-f /tmp/tailscale-ready) {
+                return 503 "starting\n";
+            }
             return 200 "ok\n";
         }
 
diff --git a/fly/start.sh b/fly/start.sh
index 620dfea..f923b81 100644
--- a/fly/start.sh
+++ b/fly/start.sh
@@ -1,9 +1,9 @@
 #!/bin/sh
 set -e
 
-# Start nginx immediately so port 8080 is bound before Fly's deploy checks.
-# Upstream DNS resolution is deferred via resolver + variable in nginx.conf,
-# so nginx starts cleanly even before Tailscale connects.
+# Start nginx immediately so port 8080 is bound (avoids connection refused).
+# Health check returns 503 until /tmp/tailscale-ready exists, so Fly.io
+# keeps the old machine serving traffic until Tailscale connects.
 nginx -g "daemon off;" &
 NGINX_PID=$!
 echo "Nginx started (waiting for Tailscale before proxying)"
@@ -16,8 +16,9 @@ sleep 2
 # Authenticate and join tailnet
 tailscale up --authkey="${TS_AUTHKEY}" --hostname=flyio-proxy
 
-# Wait for tailscale to be ready
+# Wait for tailscale to be ready, then signal nginx health check
 until tailscale status > /dev/null 2>&1; do sleep 1; done
+touch /tmp/tailscale-ready
 echo "Tailscale connected"
 
 # Start Alloy for observability (logs → Loki, metrics → Prometheus)