diff --git a/docs/changelog.d/fix-deploy-healthcheck-race.bugfix.md b/docs/changelog.d/fix-deploy-healthcheck-race.bugfix.md
new file mode 100644
index 0000000..0220ca3
--- /dev/null
+++ b/docs/changelog.d/fix-deploy-healthcheck-race.bugfix.md
@@ -0,0 +1 @@
+Fix 502 errors during Fly.io proxy deploys by deferring health check until Tailscale is connected.
diff --git a/fly/nginx.conf b/fly/nginx.conf
index f01a1a3..d27ff15 100644
--- a/fly/nginx.conf
+++ b/fly/nginx.conf
@@ -76,6 +76,9 @@ http {
         listen 8080 default_server;
 
         location /healthz {
+            if (!-f /tmp/tailscale-ready) {
+                return 503 "starting\n";
+            }
             return 200 "ok\n";
         }
 
diff --git a/fly/start.sh b/fly/start.sh
index 620dfea..f923b81 100644
--- a/fly/start.sh
+++ b/fly/start.sh
@@ -1,9 +1,13 @@
 #!/bin/sh
 set -e
 
-# Start nginx immediately so port 8080 is bound before Fly's deploy checks.
-# Upstream DNS resolution is deferred via resolver + variable in nginx.conf,
-# so nginx starts cleanly even before Tailscale connects.
+# Remove any readiness marker left by a previous run on the same filesystem,
+# so /healthz cannot report ready before Tailscale has connected this time.
+rm -f /tmp/tailscale-ready
+
+# Start nginx immediately so port 8080 is bound (avoids connection refused).
+# Health check returns 503 until /tmp/tailscale-ready exists, so Fly.io
+# keeps the old machine serving traffic until Tailscale connects.
 nginx -g "daemon off;" &
 NGINX_PID=$!
 echo "Nginx started (waiting for Tailscale before proxying)"
@@ -16,8 +20,9 @@ sleep 2
 # Authenticate and join tailnet
 tailscale up --authkey="${TS_AUTHKEY}" --hostname=flyio-proxy
 
-# Wait for tailscale to be ready
+# Wait for tailscale to be ready, then signal nginx health check
 until tailscale status > /dev/null 2>&1; do sleep 1; done
+touch /tmp/tailscale-ready
 echo "Tailscale connected"
 
 # Start Alloy for observability (logs → Loki, metrics → Prometheus)