Zero-downtime Fly.io deploys #132
4 changed files with 20 additions and 24 deletions
Zero-downtime Fly.io deploys: bluegreen + startup reorder
Three changes to eliminate 502s during proxy deploys:

1. Start nginx after Tailscale connects (not before) so MagicDNS is always available when the first request arrives. This is the community-recommended pattern for Tailscale sidecars on Fly.io.
2. Switch the deploy strategy to bluegreen: the old machine keeps serving traffic until the new one passes health checks, then Fly.io cuts over. Rolling deploys with a single machine always cause downtime.
3. Replace top-level [checks] with [[http_service.checks]]. Top-level checks only monitor; they don't gate traffic routing. Service-level checks tell the Fly Proxy to hold traffic until the app is ready.

The sentinel file (/tmp/tailscale-ready) and the nginx if-check are removed, since nginx no longer starts before Tailscale.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
commit
4bbe4e7c20
1
docs/changelog.d/fix-zero-downtime-deploy.infra.md
Normal file
1
docs/changelog.d/fix-zero-downtime-deploy.infra.md
Normal file
|
|
@ -0,0 +1 @@
|
||||||
|
Eliminate 502 errors during Fly.io proxy deploys by starting nginx after Tailscale, switching to bluegreen deploys, and using service-level health checks for traffic gating.
|
||||||
14
fly/fly.toml
14
fly/fly.toml
|
|
@ -3,6 +3,9 @@ primary_region = "sjc"
|
||||||
|
|
||||||
[build]
|
[build]
|
||||||
|
|
||||||
|
[deploy]
|
||||||
|
strategy = "bluegreen"
|
||||||
|
|
||||||
[http_service]
|
[http_service]
|
||||||
internal_port = 8080
|
internal_port = 8080
|
||||||
force_https = true
|
force_https = true
|
||||||
|
|
@ -10,10 +13,9 @@ auto_stop_machines = "off"
|
||||||
auto_start_machines = true
|
auto_start_machines = true
|
||||||
min_machines_running = 1
|
min_machines_running = 1
|
||||||
|
|
||||||
[checks]
|
[[http_service.checks]]
|
||||||
[checks.health]
|
grace_period = "15s"
|
||||||
port = 8080
|
interval = "10s"
|
||||||
type = "http"
|
method = "GET"
|
||||||
interval = "30s"
|
|
||||||
timeout = "5s"
|
|
||||||
path = "/healthz"
|
path = "/healthz"
|
||||||
|
timeout = "5s"
|
||||||
|
|
|
||||||
|
|
@ -34,8 +34,8 @@ http {
|
||||||
max_size=200m inactive=24h;
|
max_size=200m inactive=24h;
|
||||||
|
|
||||||
# MagicDNS resolver — using a variable in proxy_pass defers upstream DNS
|
# MagicDNS resolver — using a variable in proxy_pass defers upstream DNS
|
||||||
# resolution to request time, letting nginx start before Tailscale connects.
|
# resolution to request time (not config time). Results are cached for
|
||||||
# Results are cached for 30s per worker to avoid per-request DNS lookups.
|
# 30s per worker to avoid per-request DNS lookups.
|
||||||
resolver 100.100.100.100 valid=30s;
|
resolver 100.100.100.100 valid=30s;
|
||||||
resolver_timeout 5s;
|
resolver_timeout 5s;
|
||||||
|
|
||||||
|
|
@ -76,9 +76,6 @@ http {
|
||||||
listen 8080 default_server;
|
listen 8080 default_server;
|
||||||
|
|
||||||
location /healthz {
|
location /healthz {
|
||||||
if (!-f /tmp/tailscale-ready) {
|
|
||||||
return 503 "starting\n";
|
|
||||||
}
|
|
||||||
return 200 "ok\n";
|
return 200 "ok\n";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
22
fly/start.sh
22
fly/start.sh
|
|
@ -1,26 +1,22 @@
|
||||||
#!/bin/sh
|
#!/bin/sh
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
# Start nginx immediately so port 8080 is bound (avoids connection refused).
|
# Connect to tailnet first — nginx needs MagicDNS for upstream resolution.
|
||||||
# Health check returns 503 until /tmp/tailscale-ready exists, so Fly.io
|
# With bluegreen deploys, the old machine serves traffic until this one is
|
||||||
# keeps the old machine serving traffic until Tailscale connects.
|
# fully ready. Fly.io runs Firecracker microVMs that support TUN devices
|
||||||
nginx -g "daemon off;" &
|
# natively — no need for --tun=userspace-networking.
|
||||||
NGINX_PID=$!
|
|
||||||
echo "Nginx started (waiting for Tailscale before proxying)"
|
|
||||||
|
|
||||||
# Start tailscale daemon. Fly.io runs Firecracker microVMs which support
|
|
||||||
# TUN devices natively — no need for --tun=userspace-networking.
|
|
||||||
tailscaled --statedir=/var/lib/tailscale &
|
tailscaled --statedir=/var/lib/tailscale &
|
||||||
sleep 2
|
sleep 2
|
||||||
|
|
||||||
# Authenticate and join tailnet
|
|
||||||
tailscale up --authkey="${TS_AUTHKEY}" --hostname=flyio-proxy
|
tailscale up --authkey="${TS_AUTHKEY}" --hostname=flyio-proxy
|
||||||
|
|
||||||
# Wait for tailscale to be ready, then signal nginx health check
|
|
||||||
until tailscale status > /dev/null 2>&1; do sleep 1; done
|
until tailscale status > /dev/null 2>&1; do sleep 1; done
|
||||||
touch /tmp/tailscale-ready
|
|
||||||
echo "Tailscale connected"
|
echo "Tailscale connected"
|
||||||
|
|
||||||
|
# Start nginx — MagicDNS is available, health check passes immediately.
|
||||||
|
nginx -g "daemon off;" &
|
||||||
|
NGINX_PID=$!
|
||||||
|
echo "Nginx started"
|
||||||
|
|
||||||
# Start Alloy for observability (logs → Loki, metrics → Prometheus)
|
# Start Alloy for observability (logs → Loki, metrics → Prometheus)
|
||||||
alloy run /etc/alloy/config.alloy \
|
alloy run /etc/alloy/config.alloy \
|
||||||
--server.http.listen-addr=127.0.0.1:12345 \
|
--server.http.listen-addr=127.0.0.1:12345 \
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue