From 959b6842bc6bd31697dcec363e4b50019fd2c84b Mon Sep 17 00:00:00 2001
From: Erich Blume <blume.erich@gmail.com>
Date: Mon, 9 Feb 2026 11:34:19 -0800
Subject: [PATCH] Zero-downtime Fly.io deploys (#132)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Summary
- Start nginx after Tailscale connects (community best practice for Tailscale sidecars)
- Switch to `bluegreen` deploy strategy — old machine serves until new one is healthy
- Replace top-level `[checks]` with `[[http_service.checks]]` — only service-level checks gate traffic routing ([confirmed by Fly.io staff](https://community.fly.io/t/clarifying-the-types-of-health-checks/20379))
- Remove sentinel file and nginx if-check (no longer needed)

Supersedes the approach in #131. That change helped (the 502 window dropped from ~30s to ~3s) but couldn't fully eliminate the errors, for two reasons: top-level checks don't gate routing, and Fly.io's proxy sends traffic as soon as the port is reachable.

## Deployment and Testing
- [ ] Merge and run `fly deploy` from the `fly/` directory
- [ ] Verify deploy completes with zero 502s (watch `fly logs` and Grafana docs-apm)
- [ ] Confirm `fly checks list` shows the new service-level check passing

Reviewed-on: https://forge.ops.eblu.me/eblume/blumeops/pulls/132
---
 .../fix-zero-downtime-deploy.infra.md         |  1 +
 fly/fly.toml                                  | 14 +++++++-----
 fly/nginx.conf                                |  7 ++----
 fly/start.sh                                  | 22 ++++++++-----------
 4 files changed, 20 insertions(+), 24 deletions(-)
 create mode 100644 docs/changelog.d/fix-zero-downtime-deploy.infra.md

diff --git a/docs/changelog.d/fix-zero-downtime-deploy.infra.md b/docs/changelog.d/fix-zero-downtime-deploy.infra.md
new file mode 100644
index 0000000..0bb7b16
--- /dev/null
+++ b/docs/changelog.d/fix-zero-downtime-deploy.infra.md
@@ -0,0 +1 @@
+Eliminate 502 errors during Fly.io proxy deploys by starting nginx after Tailscale, switching to bluegreen deploys, and using service-level health checks for traffic gating.
diff --git a/fly/fly.toml b/fly/fly.toml
index 90b649e..9399c8b 100644
--- a/fly/fly.toml
+++ b/fly/fly.toml
@@ -3,6 +3,9 @@ primary_region = "sjc"
 
 [build]
 
+[deploy]
+strategy = "bluegreen"
+
 [http_service]
 internal_port = 8080
 force_https = true
@@ -10,10 +13,9 @@ auto_stop_machines = "off"
 auto_start_machines = true
 min_machines_running = 1
 
-[checks]
-[checks.health]
-port = 8080
-type = "http"
-interval = "30s"
-timeout = "5s"
+[[http_service.checks]]
+grace_period = "15s"
+interval = "10s"
+method = "GET"
 path = "/healthz"
+timeout = "5s"
diff --git a/fly/nginx.conf b/fly/nginx.conf
index d27ff15..1884150 100644
--- a/fly/nginx.conf
+++ b/fly/nginx.conf
@@ -34,8 +34,8 @@ http {
                      max_size=200m inactive=24h;
 
     # MagicDNS resolver — using a variable in proxy_pass defers upstream DNS
-    # resolution to request time, letting nginx start before Tailscale connects.
-    # Results are cached for 30s per worker to avoid per-request DNS lookups.
+    # resolution to request time (not config time). Results are cached for
+    # 30s per worker to avoid per-request DNS lookups.
     resolver 100.100.100.100 valid=30s;
     resolver_timeout 5s;
 
@@ -76,9 +76,6 @@ http {
         listen 8080 default_server;
 
         location /healthz {
-            if (!-f /tmp/tailscale-ready) {
-                return 503 "starting\n";
-            }
             return 200 "ok\n";
         }
 
diff --git a/fly/start.sh b/fly/start.sh
index f923b81..96ccbf0 100644
--- a/fly/start.sh
+++ b/fly/start.sh
@@ -1,26 +1,22 @@
 #!/bin/sh
 set -e
 
-# Start nginx immediately so port 8080 is bound (avoids connection refused).
-# Health check returns 503 until /tmp/tailscale-ready exists, so Fly.io
-# keeps the old machine serving traffic until Tailscale connects.
-nginx -g "daemon off;" &
-NGINX_PID=$!
-echo "Nginx started (waiting for Tailscale before proxying)"
-
-# Start tailscale daemon. Fly.io runs Firecracker microVMs which support
-# TUN devices natively — no need for --tun=userspace-networking.
+# Connect to tailnet first — nginx needs MagicDNS for upstream resolution.
+# With bluegreen deploys, the old machine serves traffic until this one is
+# fully ready. Fly.io runs Firecracker microVMs that support TUN devices
+# natively — no need for --tun=userspace-networking.
 tailscaled --statedir=/var/lib/tailscale &
 sleep 2
 
-# Authenticate and join tailnet
 tailscale up --authkey="${TS_AUTHKEY}" --hostname=flyio-proxy
-
-# Wait for tailscale to be ready, then signal nginx health check
 until tailscale status > /dev/null 2>&1; do sleep 1; done
-touch /tmp/tailscale-ready
 echo "Tailscale connected"
 
+# Start nginx — MagicDNS is available, health check passes immediately.
+nginx -g "daemon off;" &
+NGINX_PID=$!
+echo "Nginx started"
+
 # Start Alloy for observability (logs → Loki, metrics → Prometheus)
 alloy run /etc/alloy/config.alloy \
     --server.http.listen-addr=127.0.0.1:12345 \