From 6a1d9cc0bf8b677e01365dd2206c5469a38fe763 Mon Sep 17 00:00:00 2001
From: Erich Blume <blume.erich@gmail.com>
Date: Fri, 17 Apr 2026 15:42:57 -0700
Subject: [PATCH 1/6] Switch Fly proxy to upstream keepalive pools

Replace per-request DNS resolution (variable-based proxy_pass) with
static upstream blocks and keepalive connection pools. This reuses
TLS connections through the Tailscale tunnel instead of handshaking
per request, which should significantly reduce latency at >1 req/s.

Trade-off: DNS is resolved at config load, not per-request. If
Tailscale Ingress pods get new IPs, run `mise run fly-reload` to
re-resolve.

Also adds mise-tasks/fly-reload for nginx config reload without
full redeploy.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 fly/nginx.conf        | 62 ++++++++++++++++++++++++++++---------------
 mise-tasks/fly-reload | 16 +++++++++++
 2 files changed, 57 insertions(+), 21 deletions(-)
 create mode 100755 mise-tasks/fly-reload

diff --git a/fly/nginx.conf b/fly/nginx.conf
index db02a21..ca4eb11 100644
--- a/fly/nginx.conf
+++ b/fly/nginx.conf
@@ -46,18 +46,32 @@ http {
     proxy_cache_path /tmp/cache levels=1:2 keys_zone=services:10m
                      max_size=200m inactive=24h;
 
-    # MagicDNS resolver — using a variable in proxy_pass defers upstream DNS
-    # resolution to request time (not config time). Results are cached for
-    # 30s per worker to avoid per-request DNS lookups.
+    # WebSocket-aware Connection header. Only send "upgrade" when the client
+    # actually requests a protocol switch; otherwise empty string to preserve
+    # upstream keepalive connections.
+    map $http_upgrade $connection_upgrade {
+        default "";
+        websocket upgrade;
+    }
+
+    # --- Upstream pools with keepalive ---
+    # Names resolve once, at config load/reload, via the system resolver
+    # (MagicDNS); `resolver` below only serves request-time lookups. If Tailscale
+    # Ingress pods get new IPs, run `mise run fly-reload`. Grafana alerts on 502s.
     resolver 100.100.100.100 valid=30s;
     resolver_timeout 5s;
 
-    # WebSocket-aware Connection header. Only send "upgrade" when the client
-    # actually requests a protocol switch; otherwise "close" (the HTTP/1.1
-    # default when keepalive pooling is not available).
-    map $http_upgrade $connection_upgrade {
-        default close;
-        websocket upgrade;
+    upstream forge_backend {  # named pool so proxy_pass can reuse upstream connections
+        server forge.tail8d86e.ts.net:443;  # hostname is resolved when the config loads
+        keepalive 8;  # max idle connections cached per worker process
+    }
+    upstream docs_backend {  # pool for the docs.eblu.me server block
+        server docs.tail8d86e.ts.net:443;
+        keepalive 4;  # smaller idle-connection cache than forge_backend
+    }
+    upstream cv_backend {  # pool for the cv server block
+        server cv.tail8d86e.ts.net:443;
+        keepalive 4;  # same idle-connection cache size as docs_backend
     }
 
     # --- docs.eblu.me (static site) ---
@@ -76,12 +90,14 @@ http {
             internal;
         }
         location / {
-            set $upstream_docs https://docs.tail8d86e.ts.net;
-            proxy_pass $upstream_docs$request_uri;
+            proxy_pass https://docs_backend$request_uri;
             proxy_ssl_verify off;
             proxy_ssl_server_name on;
             proxy_intercept_errors on;
 
+            proxy_http_version 1.1;
+            proxy_set_header Connection $connection_upgrade;
+
             # Cache aggressively — static site only.
             # Do NOT use these settings for dynamic services.
             proxy_cache services;
@@ -116,12 +132,14 @@ http {
         }
 
         location / {
-            set $upstream_cv https://cv.tail8d86e.ts.net;
-            proxy_pass $upstream_cv$request_uri;
+            proxy_pass https://cv_backend$request_uri;
             proxy_ssl_verify off;
             proxy_ssl_server_name on;
             proxy_intercept_errors on;
 
+            proxy_http_version 1.1;
+            proxy_set_header Connection $connection_upgrade;
+
             proxy_cache services;
             proxy_cache_valid 200 1d;
             proxy_cache_valid 404 1m;
@@ -187,8 +205,7 @@ http {
         location ~ ^/user/(login|sign_up|forgot_password) {
             limit_req zone=forge_auth burst=5 nodelay;
 
-            set $upstream_forge https://forge.tail8d86e.ts.net;
-            proxy_pass $upstream_forge$request_uri;
+            proxy_pass https://forge_backend$request_uri;
             proxy_ssl_verify off;
             proxy_ssl_server_name on;
             proxy_intercept_errors on;
@@ -206,11 +223,13 @@ http {
         # Cache release artifact downloads — immutable files keyed by tag+filename.
         # Avoids hammering Forgejo when crawlers or users re-download the same asset.
         location ~ ^/[^/]+/[^/]+/releases/download/ {
-            set $upstream_forge_releases https://forge.tail8d86e.ts.net;
-            proxy_pass $upstream_forge_releases$request_uri;
+            proxy_pass https://forge_backend$request_uri;
             proxy_ssl_verify off;
             proxy_ssl_server_name on;
 
+            proxy_http_version 1.1;
+            proxy_set_header Connection $connection_upgrade;
+
             proxy_cache services;
             proxy_cache_valid 200 7d;
             proxy_cache_key $host$uri;
@@ -226,11 +245,13 @@ http {
 
         # Selectively cache static assets only
         location ~* \.(css|js|png|jpg|svg|woff2?)$ {
-            set $upstream_forge_static https://forge.tail8d86e.ts.net;
-            proxy_pass $upstream_forge_static$request_uri;
+            proxy_pass https://forge_backend$request_uri;
             proxy_ssl_verify off;
             proxy_ssl_server_name on;
 
+            proxy_http_version 1.1;
+            proxy_set_header Connection $connection_upgrade;
+
             proxy_cache services;
             proxy_cache_valid 200 7d;
             proxy_cache_key $host$uri;
@@ -240,8 +261,7 @@ http {
         }
 
         location / {
-            set $upstream_forge https://forge.tail8d86e.ts.net;
-            proxy_pass $upstream_forge$request_uri;
+            proxy_pass https://forge_backend$request_uri;
             proxy_ssl_verify off;
             proxy_ssl_server_name on;
             proxy_intercept_errors on;
diff --git a/mise-tasks/fly-reload b/mise-tasks/fly-reload
new file mode 100755
index 0000000..34806c5
--- /dev/null
+++ b/mise-tasks/fly-reload
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+#MISE description="Reload Fly.io proxy nginx config (re-resolves upstream DNS)"
+
+set -euo pipefail
+
+export FLY_API_TOKEN
+FLY_API_TOKEN="$(op read 'op://blumeops/fly.io admin/add more/deploy-token')"
+
+# SSH into the first listed Fly machine and send nginx a reload signal, which
+# re-resolves upstream DNS without a full redeploy. Other machines are untouched.
+APP="blumeops-proxy"
+MACHINE_ID=$(fly machines list -a "$APP" --json | python3 -c "import sys,json; print(json.load(sys.stdin)[0]['id'])")
+
+echo "Reloading nginx on machine $MACHINE_ID..."
+fly ssh console -a "$APP" --machine "$MACHINE_ID" -C "nginx -s reload"
+echo "Done. Upstream DNS re-resolved."
-- 
2.50.1 (Apple Git-155)


From 1236d381ebabe92e4b7e7e98fde0fc8cf5a2fe31 Mon Sep 17 00:00:00 2001
From: Erich Blume <blume.erich@gmail.com>
Date: Fri, 17 Apr 2026 15:47:21 -0700
Subject: [PATCH 2/6] Wait for MagicDNS readiness before starting nginx

Upstream blocks resolve DNS at config load. If MagicDNS isn't ready yet
(Tailscale just connected), nginx gets empty resolution and returns 502.
Poll nslookup until resolution works before launching nginx.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 fly/start.sh | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/fly/start.sh b/fly/start.sh
index 5b08490..8fd1fd4 100644
--- a/fly/start.sh
+++ b/fly/start.sh
@@ -11,10 +11,18 @@ tailscale up --authkey="${TS_AUTHKEY}" --hostname=flyio-proxy
 until tailscale status > /dev/null 2>&1; do sleep 1; done
 echo "Tailscale connected"
 
+# Wait for MagicDNS to be ready — upstream blocks resolve DNS at config
+# load, so nginx will fail to start if any upstream host can't resolve yet.
+echo "Waiting for MagicDNS..."
+for host in forge docs cv; do
+    until nslookup "${host}.tail8d86e.ts.net" 100.100.100.100 > /dev/null 2>&1; do sleep 1; done
+done
+echo "MagicDNS ready"
+
 # Ensure fail2ban deny file exists before nginx starts
 touch /etc/nginx/forge-deny.conf
 
-# Start nginx — MagicDNS is available, health check passes immediately.
+# Start nginx — MagicDNS is available, upstreams resolved.
 nginx -g "daemon off;" &
 NGINX_PID=$!
 echo "Nginx started"
-- 
2.50.1 (Apple Git-155)


From 903db4079d213f28b298c4318f4b1f8ed6b447fc Mon Sep 17 00:00:00 2001
From: Erich Blume <blume.erich@gmail.com>
Date: Fri, 17 Apr 2026 15:51:51 -0700
Subject: [PATCH 3/6] Fix upstream keepalive: set proxy_ssl_name for correct
 SNI

With upstream blocks, nginx sends the block name as SNI instead of
the actual hostname. The Tailscale Ingress proxy needs the correct
SNI to route TLS connections. Add explicit proxy_ssl_name for each
upstream, and set Host header for docs/cv backends.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 fly/nginx.conf | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/fly/nginx.conf b/fly/nginx.conf
index ca4eb11..5723722 100644
--- a/fly/nginx.conf
+++ b/fly/nginx.conf
@@ -93,6 +93,8 @@ http {
             proxy_pass https://docs_backend$request_uri;
             proxy_ssl_verify off;
             proxy_ssl_server_name on;
+            proxy_ssl_name docs.tail8d86e.ts.net;
+            proxy_set_header Host docs.tail8d86e.ts.net;
             proxy_intercept_errors on;
 
             proxy_http_version 1.1;
@@ -135,6 +137,8 @@ http {
             proxy_pass https://cv_backend$request_uri;
             proxy_ssl_verify off;
             proxy_ssl_server_name on;
+            proxy_ssl_name cv.tail8d86e.ts.net;
+            proxy_set_header Host cv.tail8d86e.ts.net;
             proxy_intercept_errors on;
 
             proxy_http_version 1.1;
@@ -208,6 +212,7 @@ http {
             proxy_pass https://forge_backend$request_uri;
             proxy_ssl_verify off;
             proxy_ssl_server_name on;
+            proxy_ssl_name forge.tail8d86e.ts.net;
             proxy_intercept_errors on;
 
             proxy_set_header Host $host;
@@ -226,6 +231,7 @@ http {
             proxy_pass https://forge_backend$request_uri;
             proxy_ssl_verify off;
             proxy_ssl_server_name on;
+            proxy_ssl_name forge.tail8d86e.ts.net;
 
             proxy_http_version 1.1;
             proxy_set_header Connection $connection_upgrade;
@@ -248,6 +254,7 @@ http {
             proxy_pass https://forge_backend$request_uri;
             proxy_ssl_verify off;
             proxy_ssl_server_name on;
+            proxy_ssl_name forge.tail8d86e.ts.net;
 
             proxy_http_version 1.1;
             proxy_set_header Connection $connection_upgrade;
@@ -264,6 +271,7 @@ http {
             proxy_pass https://forge_backend$request_uri;
             proxy_ssl_verify off;
             proxy_ssl_server_name on;
+            proxy_ssl_name forge.tail8d86e.ts.net;
             proxy_intercept_errors on;
 
             # NO proxy_cache — dynamic content with sessions
-- 
2.50.1 (Apple Git-155)


From a700befd5b2947d7faba54f5b725cbfa4e7adcb9 Mon Sep 17 00:00:00 2001
From: Erich Blume <blume.erich@gmail.com>
Date: Fri, 17 Apr 2026 16:04:54 -0700
Subject: [PATCH 4/6] Docs: update proxy architecture for upstream keepalive

Update flyio-proxy, forgejo, routing, manage-flyio-proxy,
expose-service-publicly, and mise-tasks docs to reflect:

- Upstream keepalive pools replacing variable-based proxy_pass
- proxy_ssl_name requirement for upstream blocks
- MagicDNS readiness check in start.sh
- fly-reload task for DNS re-resolution
- Crawler mitigation (robots.txt, archive redirect, release caching)
- Forgejo /metrics endpoint and archive cleanup cron
- cv.eblu.me in routing and exposed services tables
- upstream_response_time histogram metric
- Changelog fragment

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 docs/changelog.d/fly-proxy-keepalive.infra.md |  1 +
 docs/how-to/operations/manage-flyio-proxy.md  | 20 +++-
 docs/reference/infrastructure/routing.md      |  3 +-
 docs/reference/services/flyio-proxy.md        | 33 ++++++-
 docs/reference/services/forgejo.md            | 14 ++-
 docs/reference/tools/mise-tasks.md            |  3 +-
 docs/tutorials/expose-service-publicly.md     | 97 ++++++-------------
 7 files changed, 91 insertions(+), 80 deletions(-)
 create mode 100644 docs/changelog.d/fly-proxy-keepalive.infra.md

diff --git a/docs/changelog.d/fly-proxy-keepalive.infra.md b/docs/changelog.d/fly-proxy-keepalive.infra.md
new file mode 100644
index 0000000..8853150
--- /dev/null
+++ b/docs/changelog.d/fly-proxy-keepalive.infra.md
@@ -0,0 +1 @@
+Switched Fly proxy to upstream keepalive pools, reducing forge.eblu.me latency from 35s+ p50 to sub-second. Added `mise run fly-reload` for DNS re-resolution without redeploy.
diff --git a/docs/how-to/operations/manage-flyio-proxy.md b/docs/how-to/operations/manage-flyio-proxy.md
index 519481f..73e61d1 100644
--- a/docs/how-to/operations/manage-flyio-proxy.md
+++ b/docs/how-to/operations/manage-flyio-proxy.md
@@ -1,7 +1,7 @@
 ---
 title: Manage Fly.io Proxy
-modified: 2026-02-08
-last-reviewed: 2026-03-07
+modified: 2026-04-17
+last-reviewed: 2026-04-17
 tags:
   - how-to
   - fly-io
@@ -23,6 +23,16 @@ mise run fly-deploy
 
 Pushes to `fly/` on main also trigger automatic deployment via the Forgejo CI workflow.
 
+## Reload Nginx (Re-resolve Upstream DNS)
+
+Nginx uses `upstream` blocks with keepalive connection pools. DNS is resolved at config load. If Tailscale Ingress pods get new IPs (restart, reschedule, minikube restart), reload nginx to re-resolve without a full redeploy:
+
+```bash
+mise run fly-reload
+```
+
+A Grafana alert fires when upstreams are unreachable, prompting this action. A full `fly-deploy` also re-resolves DNS (it replaces the container).
+
 ## Add a New Public Service
 
 See [[expose-service-publicly#Per-service setup]] for the full walkthrough. In short:
@@ -78,12 +88,16 @@ The auth key expires every 90 days. To rotate:
 
 ## Troubleshooting
 
-**502 Bad Gateway**: Check `fly logs` for nginx upstream errors. Verify the backend Tailscale service is running (`tailscale status` from inside the container via `fly ssh console`).
+**502 Bad Gateway after Tailscale Ingress restart**: Upstream DNS is stale. Run `mise run fly-reload` to re-resolve. This is the most common cause of 502s.
+
+**502 Bad Gateway on fresh deploy**: MagicDNS may not be ready when nginx starts. The `start.sh` script polls `nslookup` before launching nginx, but if it still fails, check that `tailscale status` is healthy inside the container.
 
 **Health check failing**: `fly ssh console -a blumeops-proxy` then `curl localhost:8080/healthz` to test locally.
 
 **TLS errors on custom domain**: Check cert status with `fly certs show <domain> -a blumeops-proxy`. Certs auto-provision via Let's Encrypt and may take a few minutes.
 
+**High latency (>1s p50)**: Likely lost keepalive — redeploy with `mise run fly-deploy`. Before the keepalive change (April 2026), per-request TLS handshakes through the WireGuard tunnel caused 35s+ p50 at >1 req/s.
+
 ## Related
 
 - [[flyio-proxy]] - Service reference card
diff --git a/docs/reference/infrastructure/routing.md b/docs/reference/infrastructure/routing.md
index a8049d6..229e724 100644
--- a/docs/reference/infrastructure/routing.md
+++ b/docs/reference/infrastructure/routing.md
@@ -1,6 +1,6 @@
 ---
 title: Routing
-modified: 2026-03-03
+modified: 2026-04-17
 tags:
   - infrastructure
   - networking
@@ -51,6 +51,7 @@ DNS CNAMEs point to `blumeops-proxy.fly.dev`. TLS via Fly.io-managed Let's Encry
 | Service | URL | Description |
 |---------|-----|-------------|
 | [[docs]] | https://docs.eblu.me | Documentation site |
+| [[cv]] | https://cv.eblu.me | CV / resume |
 | [[forgejo]] | https://forge.eblu.me | Git hosting (public) |
 
 ## Tailscale-Only Services
diff --git a/docs/reference/services/flyio-proxy.md b/docs/reference/services/flyio-proxy.md
index 3c66d4e..ad32b8a 100644
--- a/docs/reference/services/flyio-proxy.md
+++ b/docs/reference/services/flyio-proxy.md
@@ -1,6 +1,6 @@
 ---
 title: Fly.io Proxy
-modified: 2026-02-08
+modified: 2026-04-17
 tags:
   - service
   - networking
@@ -26,11 +26,21 @@ Public reverse proxy on [Fly.io](https://fly.io) that exposes selected BlumeOps
 | Public domain | Backend | Service |
 |---------------|---------|---------|
 | `docs.eblu.me` | `docs.tail8d86e.ts.net` | [[docs]] |
+| `cv.eblu.me` | `cv.tail8d86e.ts.net` | [[cv]] |
+| `forge.eblu.me` | `forge.tail8d86e.ts.net` | [[forgejo]] |
 
 ## Architecture
 
 Internet traffic hits Fly.io's Anycast edge, terminates TLS with a Let's Encrypt certificate, and is proxied by nginx to the backend service over a Tailscale WireGuard tunnel. See [[expose-service-publicly]] for the full architecture diagram.
 
+### Upstream Keepalive
+
+Nginx uses `upstream` blocks with `keepalive` connection pools to reuse TLS connections through the WireGuard tunnel. This avoids a per-request TLS handshake, which was previously the dominant source of latency (35s+ p50 before keepalive, sub-second after).
+
+**Trade-off:** DNS for upstream hostnames is resolved once at config load, not per-request. If Tailscale Ingress pods get new IPs (restart, reschedule, minikube restart), run `mise run fly-reload` to re-resolve without a full redeploy. A Grafana alert fires when upstreams are unreachable.
+
+Each upstream requires `proxy_ssl_name` set to the actual Tailscale hostname — nginx sends the upstream block name as SNI by default, which the Tailscale Ingress proxy won't recognize.
+
 ## Key Files
 
 | File | Purpose |
@@ -39,7 +49,7 @@ Internet traffic hits Fly.io's Anycast edge, terminates TLS with a Let's Encrypt
 | `fly/Dockerfile` | nginx + Tailscale + Alloy container |
 | `fly/nginx.conf` | Reverse proxy, caching, rate limiting, JSON logging |
 | `fly/alloy.river` | Alloy config: log tailing, metric extraction, remote_write |
-| `fly/start.sh` | Entrypoint: start Tailscale, Alloy, then nginx |
+| `fly/start.sh` | Entrypoint: start Tailscale, wait for MagicDNS, then nginx + Alloy |
 | `pulumi/tailscale/__main__.py` | Auth key (`tag:flyio-proxy`) |
 | `pulumi/tailscale/policy.hujson` | ACL grants for proxy |
 | `pulumi/gandi/__main__.py` | DNS CNAMEs |
@@ -57,7 +67,8 @@ The Tailscale auth key is `preauthorized=True` to avoid device approval hangs on
 - **Logs**: nginx JSON access logs tailed and pushed to [[loki|Loki]] (`{instance="flyio-proxy", job="flyio-nginx"}`)
 - **Metrics**: Derived from access logs, pushed to [[prometheus|Prometheus]] via `remote_write`
   - `flyio_nginx_http_requests_total` — request rate by status/method/host
-  - `flyio_nginx_http_request_duration_seconds` — latency histogram
+  - `flyio_nginx_http_request_duration_seconds` — total request latency histogram (includes proxy overhead)
+  - `flyio_nginx_upstream_response_time_seconds` — backend response time histogram (Forgejo processing only)
   - `flyio_nginx_http_response_bytes_total` — response bandwidth
   - `flyio_nginx_cache_requests_total` — cache HIT/MISS/EXPIRED counts
 
@@ -74,7 +85,21 @@ Alloy listens on `127.0.0.1:12345` for self-scraping its `/metrics` endpoint. Al
 
 The `tag:flyio-proxy` ACL grants access only to `tag:flyio-target:443`. Services must explicitly opt in by adding a `tailscale.com/tags: "tag:k8s,tag:flyio-target"` annotation to their Tailscale Ingress. This means the proxy can only reach endpoints that have been individually tagged — a compromised nginx config cannot route to arbitrary services on the tailnet.
 
-Currently tagged as `tag:flyio-target`: [[docs]], [[loki]], [[prometheus]]. Loki and Prometheus are tagged so that [[alloy|Alloy]] (running inside the container) can push logs and metrics directly via their Tailscale Ingress endpoints — the restricted ACL means Caddy on indri (`tag:homelab`) is not reachable from the proxy.
+Currently tagged as `tag:flyio-target`: [[docs]], [[cv]], [[forgejo]], [[loki]], [[prometheus]]. Loki and Prometheus are tagged so that [[alloy|Alloy]] (running inside the container) can push logs and metrics directly via their Tailscale Ingress endpoints — the restricted ACL means Caddy on indri (`tag:homelab`) is not reachable from the proxy.
+
+### Crawler Mitigation
+
+The proxy serves a `robots.txt` blocking crawlers from expensive endpoints:
+
+- `/mirrors/` — large mirrored repos
+- `/user/` — auth endpoints (crawlers follow redirect loops)
+- `/users/` — user profile pages
+- `/*/archive/` — git bundle generation (DoS vector, see below)
+- `/*/releases/download/` — release artifacts
+
+Archive requests (`/<owner>/<repo>/archive/*`) are 302-redirected to `forge.ops.eblu.me` (tailnet-only), preventing unauthenticated archive generation. This mitigates a known Forgejo DoS vector where crawlers requesting unique commit SHAs trigger unbounded git bundle generation.
+
+Release downloads are cached at the proxy layer (7-day TTL, keyed by URI) to absorb repeated downloads of the same artifact.
 
 To expose an additional service through the proxy, add the `tag:flyio-target` annotation to its Tailscale Ingress. See [[expose-service-publicly]] for the full workflow.
 
diff --git a/docs/reference/services/forgejo.md b/docs/reference/services/forgejo.md
index ad64cf4..11bb9a5 100644
--- a/docs/reference/services/forgejo.md
+++ b/docs/reference/services/forgejo.md
@@ -1,6 +1,6 @@
 ---
 title: Forgejo
-modified: 2026-03-28
+modified: 2026-04-17
 tags:
   - service
   - git
@@ -148,12 +148,24 @@ The UI shows `forge.eblu.me` for HTTPS clone URLs and `forge.ops.eblu.me` for SS
 - **Rate limiting:** nginx rate limits login/signup/forgot-password endpoints (3r/s per client IP via `Fly-Client-IP` header)
 - **fail2ban:** Runs in the Fly.io container; bans IPs after 5 failed logins in 10 minutes via nginx deny list (ephemeral across deploys)
 - **Swagger:** Blocked at the proxy (`/swagger` returns 403); use forge.ops.eblu.me for API access
+- **Archive redirect:** Archive endpoints (`/*/archive/*`) are 302-redirected to `forge.ops.eblu.me` — prevents unauthenticated crawlers from triggering unbounded git bundle generation (known DoS vector, see [[flyio-proxy#Crawler Mitigation]])
+- **robots.txt:** Blocks crawlers from `/mirrors/`, `/user/`, `/users/`, `/*/archive/`, `/*/releases/download/`
 - **OAuth dead-end:** "Sign in with Authentik" redirects to the (tailnet-only) Authentik URL — SSO only works from the tailnet
 
 ### Break-glass
 
 `mise run fly-shutoff` stops all public traffic immediately. forge.ops.eblu.me continues to work from the tailnet. See [[expose-service-publicly#Break-glass shutoff]].
 
+## Monitoring
+
+Forgejo exposes a Prometheus `/metrics` endpoint (enabled via `[metrics]` in `app.ini`). Alloy on indri scrapes it at `localhost:3001/metrics`. Metrics are mostly Go runtime stats and repo counters (no per-request latency histogram).
+
+Request latency is measured at the Fly.io proxy layer via the `flyio_nginx_upstream_response_time_seconds` histogram, visible on the Forgejo Grafana dashboard under "Forgejo: Upstream Response Time".
+
+### Archive Cleanup
+
+The `[cron.archive_cleanup]` section is enabled with `OLDER_THAN = 2h` and `RUN_AT_START = true`. This prevents the `repo-archive/` directory from growing unboundedly when crawlers or users trigger archive downloads. Without this, the directory grew to 54GB in 2 days during a crawler incident in April 2026.
+
 ## Mirrors
 
 Forgejo hosts pull mirrors of external repositories (GitHub, etc.) for supply chain control. Mirrors live in the `mirrors/` org and sync on a configurable interval. See [[manage-forgejo-mirrors]] for operations.
diff --git a/docs/reference/tools/mise-tasks.md b/docs/reference/tools/mise-tasks.md
index 02b8859..fefb30f 100644
--- a/docs/reference/tools/mise-tasks.md
+++ b/docs/reference/tools/mise-tasks.md
@@ -33,7 +33,8 @@ Run `mise tasks --sort name` for the live list with descriptions.
 | `provision-indri` | Run Ansible playbook for [[indri]] |
 | `provision-ringtail` | Run Ansible playbook for [[ringtail]] (NixOS) |
 | `provision-sifaka` | Run Ansible playbook for [[sifaka]] |
-| `fly-deploy` | Deploy Fly.io public proxy |
+| `fly-deploy` | Deploy Fly.io public proxy (uses op for auth) |
+| `fly-reload` | Reload nginx config, re-resolve upstream DNS (no redeploy) |
 | `fly-setup` | One-time Fly.io secrets and certs setup |
 | `fly-shutoff` | Emergency shutoff: stop all Fly.io proxy machines |
 | `dns-preview` | Preview DNS changes with [[pulumi]] |
diff --git a/docs/tutorials/expose-service-publicly.md b/docs/tutorials/expose-service-publicly.md
index b3fdda6..9a44c15 100644
--- a/docs/tutorials/expose-service-publicly.md
+++ b/docs/tutorials/expose-service-publicly.md
@@ -1,7 +1,7 @@
 ---
 title: Expose a Service Publicly
-modified: 2026-03-15
-last-reviewed: 2026-03-03
+modified: 2026-04-17
+last-reviewed: 2026-04-17
 tags:
   - tutorials
   - fly-io
@@ -116,8 +116,8 @@ See the actual files in `fly/` for current configuration. Key design points:
 
 - **`fly.toml`** — uses bluegreen deploys so the old machine serves traffic until the new one passes health checks. `auto_stop_machines = "off"` keeps the proxy always-on.
 - **`Dockerfile`** — multi-stage build pulling nginx, Tailscale, and [[alloy]] binaries. Alloy runs as a sidecar inside the container for observability (see below).
-- **`start.sh`** — starts `tailscaled` first (MagicDNS must be available before nginx resolves upstreams), then nginx in the background, then Alloy, and blocks on the nginx process.
-- **`nginx.conf`** — uses a `resolver 100.100.100.100` directive so upstream DNS resolution is deferred to request time (not config load time). Each service gets a `server` block with a `set $upstream` variable pattern. Includes a JSON access log format that Alloy tails for log collection and metric extraction. A catch-all server block serves `/healthz` and rejects unknown hosts.
+- **`start.sh`** — starts `tailscaled` first, waits for MagicDNS readiness (polls `nslookup` against `100.100.100.100`), then starts nginx, fail2ban, and Alloy, and blocks on the nginx process. The MagicDNS check is required because `upstream` blocks resolve DNS at config load.
+- **`nginx.conf`** — uses `upstream` blocks with `keepalive` connection pools for each backend service. DNS is resolved at config load via MagicDNS (`resolver 100.100.100.100`). Each upstream requires `proxy_ssl_name` set explicitly to the Tailscale hostname (nginx sends the block name as SNI by default). A `map` directive conditionally sets the `Connection` header — empty string for keepalive on normal requests, `upgrade` only for WebSocket requests. Includes a JSON access log format that Alloy tails for log collection and metric extraction. A catch-all server block serves `/healthz` and rejects unknown hosts.
 - **`error.html`** — shown via `proxy_intercept_errors` when upstreams are unreachable (indri offline, tunnel down, etc.). Cached responses still take priority via `proxy_cache_use_stale`.
 
 #### Observability sidecar
@@ -216,11 +216,18 @@ To expose an additional service (example: `wiki.eblu.me`):
 
 ### 1. Add nginx server block
 
-Edit `fly/nginx.conf` — add a new `server` block. The configuration
-differs significantly between static and dynamic services. See the
-existing `docs.eblu.me` and `cv.eblu.me` blocks in `fly/nginx.conf`
-for the current pattern (uses `set $upstream` variable for deferred
-DNS resolution, `proxy_intercept_errors` for error pages, etc.).
+Edit `fly/nginx.conf` — two changes needed:
+
+1. **Add an `upstream` block** (in the `http` context, alongside the existing ones):
+
+```nginx
+upstream wiki_backend {
+    server wiki.tail8d86e.ts.net:443;
+    keepalive 4;
+}
+```
+
+2. **Add a `server` block.** The configuration differs significantly between static and dynamic services. See the existing blocks in `fly/nginx.conf` for the current pattern.
 
 **Static site template** (simplified — adapt from existing blocks):
 
@@ -239,12 +246,16 @@ server {
     }
 
     location / {
-        set $upstream_wiki https://wiki.tail8d86e.ts.net;
-        proxy_pass $upstream_wiki$request_uri;
+        proxy_pass https://wiki_backend$request_uri;
         proxy_ssl_verify off;
         proxy_ssl_server_name on;
+        proxy_ssl_name wiki.tail8d86e.ts.net;
+        proxy_set_header Host wiki.tail8d86e.ts.net;
         proxy_intercept_errors on;
 
+        proxy_http_version 1.1;
+        proxy_set_header Connection $connection_upgrade;
+
         proxy_cache services;
         proxy_cache_valid 200 1d;
         proxy_cache_valid 404 1m;
@@ -259,66 +270,12 @@ server {
 }
 ```
 
-**Dynamic service template** (e.g., Forgejo — see `fly/nginx.conf` for the live configuration):
+**Key points for all upstream blocks:**
+- `proxy_ssl_name` must be set explicitly — nginx sends the upstream block name as SNI by default, which the Tailscale Ingress won't recognize
+- `proxy_http_version 1.1` + `Connection $connection_upgrade` enables keepalive (empty string for normal requests, "upgrade" for WebSocket)
+- `keepalive` pool size: 4 for low-traffic static sites, 8 for higher-traffic dynamic services
 
-```nginx
-# --- forge.eblu.me (dynamic, authenticated) ---
-server {
-    listen 8080;
-    server_name forge.eblu.me;
-
-    # Higher rate limit — git operations, CI webhooks, and API calls
-    # can legitimately burst. Forgejo also has its own rate limiting,
-    # so this is a safety net, not the primary control.
-    limit_req zone=general burst=50 nodelay;
-
-    # Git LFS and repo uploads can be large
-    client_max_body_size 512m;
-
-    error_page 502 503 504 /error.html;
-    location = /error.html {
-        root /usr/share/nginx/html;
-        internal;
-    }
-
-    location / {
-        set $upstream_forge https://forge.tail8d86e.ts.net;
-        proxy_pass $upstream_forge$request_uri;
-        proxy_ssl_verify off;
-        proxy_ssl_server_name on;
-        proxy_intercept_errors on;
-
-        # NO proxy_cache — dynamic content with sessions.
-        # Caching would serve stale pages and break authentication.
-
-        # Pass through headers needed for proper proxying
-        proxy_set_header Host $host;
-        proxy_set_header X-Real-IP $remote_addr;
-        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
-        proxy_set_header X-Forwarded-Proto $scheme;
-
-        # WebSocket support (Forgejo uses it for live updates)
-        proxy_http_version 1.1;
-        proxy_set_header Upgrade $http_upgrade;
-        proxy_set_header Connection "upgrade";
-    }
-
-    # Selectively cache static assets only
-    location ~* \.(css|js|png|jpg|svg|woff2?)$ {
-        set $upstream_forge_static https://forge.tail8d86e.ts.net;
-        proxy_pass $upstream_forge_static$request_uri;
-        proxy_ssl_verify off;
-        proxy_ssl_server_name on;
-
-        proxy_cache services;
-        proxy_cache_valid 200 7d;
-        proxy_cache_key $host$uri;
-
-        add_header X-Cache-Status $upstream_cache_status;
-        add_header X-Clacks-Overhead "GNU Terry Pratchett" always;
-    }
-}
-```
+**Dynamic service template** — see `fly/nginx.conf` for the live Forgejo configuration, which includes rate-limited auth endpoints, cached static assets and release downloads, archive endpoint redirects, robots.txt, and WebSocket support.
 
 Key differences for dynamic services:
 - **No blanket caching** — only static assets (CSS, JS, images) are cached
-- 
2.50.1 (Apple Git-155)


From f5ba7f03aadd2e8ea6519cfa076e2d605d2daf64 Mon Sep 17 00:00:00 2001
From: Erich Blume <blume.erich@gmail.com>
Date: Fri, 17 Apr 2026 16:08:12 -0700
Subject: [PATCH 5/6] Add Grafana alert for Fly proxy upstream unreachable (502
 rate)

Fires when >50% of requests return 502 for 3+ minutes, indicating
stale upstream DNS after Tailscale Ingress pod restart. Alert message
includes the fix: mise run fly-reload.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 argocd/manifests/grafana/alerting.yaml | 60 ++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)

diff --git a/argocd/manifests/grafana/alerting.yaml b/argocd/manifests/grafana/alerting.yaml
index b220044..4ae70d3 100644
--- a/argocd/manifests/grafana/alerting.yaml
+++ b/argocd/manifests/grafana/alerting.yaml
@@ -373,6 +373,66 @@ groups:
                     type: and
               refId: C
 
+  - orgId: 1
+    name: flyio-proxy-health  # alert group for the Fly.io public proxy
+    folder: Infrastructure Alerts
+    interval: 30s  # evaluation interval for every rule in this group
+    rules:
+      - uid: flyio-upstream-unreachable
+        title: FlyioUpstreamUnreachable
+        condition: C  # alert state is taken from expression C below
+        for: 3m  # condition must hold for 3 minutes before the alert fires
+        noDataState: OK  # query A yields no series while the 502 ratio is <= 0.5
+        execErrState: Alerting
+        annotations:
+          summary: >-
+            Fly.io proxy returning elevated 502s — upstream DNS may be stale. Run: mise run fly-reload
+          runbook_url: https://docs.eblu.me/how-to/operations/manage-flyio-proxy
+        labels:
+          severity: warning
+          service: flyio-proxy
+        data:
+          - refId: A  # PromQL: fraction of requests answered with 502, kept only if > 0.5
+            datasourceUid: prometheus
+            relativeTimeRange:
+              from: 300  # seconds back from now (5 minutes)
+              to: 0
+            model:
+              expr: >-
+                sum(rate(flyio_nginx_http_requests_total{instance="flyio-proxy",status="502"}[5m]))
+                / sum(rate(flyio_nginx_http_requests_total{instance="flyio-proxy"}[5m]))
+                > 0.5
+              interval: ""
+              refId: A
+          - refId: B  # reduce A to its last value
+            datasourceUid: "__expr__"
+            relativeTimeRange:
+              from: 0
+              to: 0
+            model:
+              type: reduce
+              expression: A
+              reducer: last
+              settings:
+                mode: dropNN  # drop non-numeric values before reducing
+              refId: B
+          - refId: C  # fires when B > 0, i.e. whenever A returned a sample at all
+            datasourceUid: "__expr__"
+            relativeTimeRange:
+              from: 0
+              to: 0
+            model:
+              type: threshold
+              expression: B
+              conditions:
+                - evaluator:
+                    type: gt
+                    params:
+                      - 0
+                  operator:
+                    type: and
+              refId: C
+
 templates:
   - orgId: 1
     name: ntfy-infra
-- 
2.50.1 (Apple Git-155)


From 5aa4cb403aaf154efb36a3ee309d76754476ea69 Mon Sep 17 00:00:00 2001
From: Erich Blume <blume.erich@gmail.com>
Date: Fri, 17 Apr 2026 16:30:49 -0700
Subject: [PATCH 6/6] Bump ProxyGroup ingress pod resource requests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Increase from 1m CPU / 1Mi memory to 100m CPU / 128Mi memory. The
ingress pods handle TLS termination for all 19 Tailscale Ingress
services — the previous minimal requests may have left them with a
negligible share of CPU when the node was under load.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 argocd/manifests/tailscale-operator-base/proxyclass.yaml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/argocd/manifests/tailscale-operator-base/proxyclass.yaml b/argocd/manifests/tailscale-operator-base/proxyclass.yaml
index a5c4675..9fb46d6 100644
--- a/argocd/manifests/tailscale-operator-base/proxyclass.yaml
+++ b/argocd/manifests/tailscale-operator-base/proxyclass.yaml
@@ -21,5 +21,9 @@ spec:
     pod:
       tailscaleContainer:
         image: docker.io/tailscale/tailscale:v1.94.2
+        resources:
+          requests:
+            cpu: 100m
+            memory: 128Mi
       tailscaleInitContainer:
         image: docker.io/tailscale/tailscale:v1.94.2
-- 
2.50.1 (Apple Git-155)