diff --git a/argocd/manifests/grafana/alerting.yaml b/argocd/manifests/grafana/alerting.yaml index b220044..4ae70d3 100644 --- a/argocd/manifests/grafana/alerting.yaml +++ b/argocd/manifests/grafana/alerting.yaml @@ -373,6 +373,66 @@ groups: type: and refId: C + - orgId: 1 + name: flyio-proxy-health + folder: Infrastructure Alerts + interval: 30s + rules: + - uid: flyio-upstream-unreachable + title: FlyioUpstreamUnreachable + condition: C + for: 3m + noDataState: OK + execErrState: Alerting + annotations: + summary: >- + Fly.io proxy returning elevated 502s — upstream DNS may be stale. Run: mise run fly-reload + runbook_url: https://docs.eblu.me/how-to/operations/manage-flyio-proxy + labels: + severity: warning + service: flyio-proxy + data: + - refId: A + datasourceUid: prometheus + relativeTimeRange: + from: 300 + to: 0 + model: + expr: >- + sum(rate(flyio_nginx_http_requests_total{instance="flyio-proxy",status="502"}[5m])) + / sum(rate(flyio_nginx_http_requests_total{instance="flyio-proxy"}[5m])) + > 0.5 + interval: "" + refId: A + - refId: B + datasourceUid: "__expr__" + relativeTimeRange: + from: 0 + to: 0 + model: + type: reduce + expression: A + reducer: last + settings: + mode: dropNN + refId: B + - refId: C + datasourceUid: "__expr__" + relativeTimeRange: + from: 0 + to: 0 + model: + type: threshold + expression: B + conditions: + - evaluator: + type: gt + params: + - 0 + operator: + type: and + refId: C + templates: - orgId: 1 name: ntfy-infra diff --git a/argocd/manifests/tailscale-operator-base/proxyclass.yaml b/argocd/manifests/tailscale-operator-base/proxyclass.yaml index a5c4675..9fb46d6 100644 --- a/argocd/manifests/tailscale-operator-base/proxyclass.yaml +++ b/argocd/manifests/tailscale-operator-base/proxyclass.yaml @@ -21,5 +21,9 @@ spec: pod: tailscaleContainer: image: docker.io/tailscale/tailscale:v1.94.2 + resources: + requests: + cpu: 100m + memory: 128Mi tailscaleInitContainer: image: docker.io/tailscale/tailscale:v1.94.2 diff --git 
a/docs/changelog.d/fly-proxy-keepalive.infra.md b/docs/changelog.d/fly-proxy-keepalive.infra.md new file mode 100644 index 0000000..8853150 --- /dev/null +++ b/docs/changelog.d/fly-proxy-keepalive.infra.md @@ -0,0 +1 @@ +Switched Fly proxy to upstream keepalive pools, reducing forge.eblu.me latency from 35s+ p50 to sub-second. Added `mise run fly-reload` for DNS re-resolution without redeploy. diff --git a/docs/how-to/operations/manage-flyio-proxy.md b/docs/how-to/operations/manage-flyio-proxy.md index 519481f..73e61d1 100644 --- a/docs/how-to/operations/manage-flyio-proxy.md +++ b/docs/how-to/operations/manage-flyio-proxy.md @@ -1,7 +1,7 @@ --- title: Manage Fly.io Proxy -modified: 2026-02-08 -last-reviewed: 2026-03-07 +modified: 2026-04-17 +last-reviewed: 2026-04-17 tags: - how-to - fly-io @@ -23,6 +23,16 @@ mise run fly-deploy Pushes to `fly/` on main also trigger automatic deployment via the Forgejo CI workflow. +## Reload Nginx (Re-resolve Upstream DNS) + +Nginx uses `upstream` blocks with keepalive connection pools. DNS is resolved at config load. If Tailscale Ingress pods get new IPs (restart, reschedule, minikube restart), reload nginx to re-resolve without a full redeploy: + +```bash +mise run fly-reload +``` + +A Grafana alert fires when upstreams are unreachable, prompting this action. A full `fly-deploy` also re-resolves DNS (it replaces the container). + ## Add a New Public Service See [[expose-service-publicly#Per-service setup]] for the full walkthrough. In short: @@ -78,12 +88,16 @@ The auth key expires every 90 days. To rotate: ## Troubleshooting -**502 Bad Gateway**: Check `fly logs` for nginx upstream errors. Verify the backend Tailscale service is running (`tailscale status` from inside the container via `fly ssh console`). +**502 Bad Gateway after Tailscale Ingress restart**: Upstream DNS is stale. Run `mise run fly-reload` to re-resolve. This is the most common cause of 502s. 
+ +**502 Bad Gateway on fresh deploy**: MagicDNS may not be ready when nginx starts. The `start.sh` script polls `nslookup` before launching nginx, but if it still fails, check that `tailscale status` is healthy inside the container. **Health check failing**: `fly ssh console -a blumeops-proxy` then `curl localhost:8080/healthz` to test locally. **TLS errors on custom domain**: Check cert status with `fly certs show -a blumeops-proxy`. Certs auto-provision via Let's Encrypt and may take a few minutes. +**High latency (>1s p50)**: Likely lost keepalive — redeploy with `mise run fly-deploy`. Before the keepalive change (April 2026), per-request TLS handshakes through the WireGuard tunnel caused 35s+ p50 at >1 req/s. + ## Related - [[flyio-proxy]] - Service reference card diff --git a/docs/reference/infrastructure/routing.md b/docs/reference/infrastructure/routing.md index a8049d6..229e724 100644 --- a/docs/reference/infrastructure/routing.md +++ b/docs/reference/infrastructure/routing.md @@ -1,6 +1,6 @@ --- title: Routing -modified: 2026-03-03 +modified: 2026-04-17 tags: - infrastructure - networking @@ -51,6 +51,7 @@ DNS CNAMEs point to `blumeops-proxy.fly.dev`. 
TLS via Fly.io-managed Let's Encry | Service | URL | Description | |---------|-----|-------------| | [[docs]] | https://docs.eblu.me | Documentation site | +| [[cv]] | https://cv.eblu.me | CV / resume | | [[forgejo]] | https://forge.eblu.me | Git hosting (public) | ## Tailscale-Only Services diff --git a/docs/reference/services/flyio-proxy.md b/docs/reference/services/flyio-proxy.md index 3c66d4e..ad32b8a 100644 --- a/docs/reference/services/flyio-proxy.md +++ b/docs/reference/services/flyio-proxy.md @@ -1,6 +1,6 @@ --- title: Fly.io Proxy -modified: 2026-02-08 +modified: 2026-04-17 tags: - service - networking @@ -26,11 +26,21 @@ Public reverse proxy on [Fly.io](https://fly.io) that exposes selected BlumeOps | Public domain | Backend | Service | |---------------|---------|---------| | `docs.eblu.me` | `docs.tail8d86e.ts.net` | [[docs]] | +| `cv.eblu.me` | `cv.tail8d86e.ts.net` | [[cv]] | +| `forge.eblu.me` | `forge.tail8d86e.ts.net` | [[forgejo]] | ## Architecture Internet traffic hits Fly.io's Anycast edge, terminates TLS with a Let's Encrypt certificate, and is proxied by nginx to the backend service over a Tailscale WireGuard tunnel. See [[expose-service-publicly]] for the full architecture diagram. +### Upstream Keepalive + +Nginx uses `upstream` blocks with `keepalive` connection pools to reuse TLS connections through the WireGuard tunnel. This avoids a per-request TLS handshake, which was previously the dominant source of latency (35s+ p50 before keepalive, sub-second after). + +**Trade-off:** DNS for upstream hostnames is resolved once at config load, not per-request. If Tailscale Ingress pods get new IPs (restart, reschedule, minikube restart), run `mise run fly-reload` to re-resolve without a full redeploy. A Grafana alert fires when upstreams are unreachable. + +Each upstream requires `proxy_ssl_name` set to the actual Tailscale hostname — nginx sends the upstream block name as SNI by default, which the Tailscale Ingress proxy won't recognize. 
+ ## Key Files | File | Purpose | @@ -39,7 +49,7 @@ Internet traffic hits Fly.io's Anycast edge, terminates TLS with a Let's Encrypt | `fly/Dockerfile` | nginx + Tailscale + Alloy container | | `fly/nginx.conf` | Reverse proxy, caching, rate limiting, JSON logging | | `fly/alloy.river` | Alloy config: log tailing, metric extraction, remote_write | -| `fly/start.sh` | Entrypoint: start Tailscale, Alloy, then nginx | +| `fly/start.sh` | Entrypoint: start Tailscale, wait for MagicDNS, then nginx + Alloy | | `pulumi/tailscale/__main__.py` | Auth key (`tag:flyio-proxy`) | | `pulumi/tailscale/policy.hujson` | ACL grants for proxy | | `pulumi/gandi/__main__.py` | DNS CNAMEs | @@ -57,7 +67,8 @@ The Tailscale auth key is `preauthorized=True` to avoid device approval hangs on - **Logs**: nginx JSON access logs tailed and pushed to [[loki|Loki]] (`{instance="flyio-proxy", job="flyio-nginx"}`) - **Metrics**: Derived from access logs, pushed to [[prometheus|Prometheus]] via `remote_write` - `flyio_nginx_http_requests_total` — request rate by status/method/host - - `flyio_nginx_http_request_duration_seconds` — latency histogram + - `flyio_nginx_http_request_duration_seconds` — total request latency histogram (includes proxy overhead) + - `flyio_nginx_upstream_response_time_seconds` — backend response time histogram (Forgejo processing only) - `flyio_nginx_http_response_bytes_total` — response bandwidth - `flyio_nginx_cache_requests_total` — cache HIT/MISS/EXPIRED counts @@ -74,7 +85,21 @@ Alloy listens on `127.0.0.1:12345` for self-scraping its `/metrics` endpoint. Al The `tag:flyio-proxy` ACL grants access only to `tag:flyio-target:443`. Services must explicitly opt in by adding a `tailscale.com/tags: "tag:k8s,tag:flyio-target"` annotation to their Tailscale Ingress. This means the proxy can only reach endpoints that have been individually tagged — a compromised nginx config cannot route to arbitrary services on the tailnet. 
-Currently tagged as `tag:flyio-target`: [[docs]], [[loki]], [[prometheus]]. Loki and Prometheus are tagged so that [[alloy|Alloy]] (running inside the container) can push logs and metrics directly via their Tailscale Ingress endpoints — the restricted ACL means Caddy on indri (`tag:homelab`) is not reachable from the proxy. +Currently tagged as `tag:flyio-target`: [[docs]], [[cv]], [[forgejo]], [[loki]], [[prometheus]]. Loki and Prometheus are tagged so that [[alloy|Alloy]] (running inside the container) can push logs and metrics directly via their Tailscale Ingress endpoints — the restricted ACL means Caddy on indri (`tag:homelab`) is not reachable from the proxy. + +### Crawler Mitigation + +The proxy serves a `robots.txt` blocking crawlers from expensive endpoints: + +- `/mirrors/` — large mirrored repos +- `/user/` — auth endpoints (crawlers follow redirect loops) +- `/users/` — user profile pages +- `/*/archive/` — git bundle generation (DoS vector, see below) +- `/*/releases/download/` — release artifacts + +Archive requests (`/<owner>/<repo>/archive/*`) are 302-redirected to `forge.ops.eblu.me` (tailnet-only), preventing unauthenticated archive generation. This mitigates a known Forgejo DoS vector where crawlers requesting unique commit SHAs trigger unbounded git bundle generation. + +Release downloads are cached at the proxy layer (7-day TTL, keyed by URI) to absorb repeated downloads of the same artifact. To expose an additional service through the proxy, add the `tag:flyio-target` annotation to its Tailscale Ingress. See [[expose-service-publicly]] for the full workflow. 
diff --git a/docs/reference/services/forgejo.md b/docs/reference/services/forgejo.md index ad64cf4..11bb9a5 100644 --- a/docs/reference/services/forgejo.md +++ b/docs/reference/services/forgejo.md @@ -1,6 +1,6 @@ --- title: Forgejo -modified: 2026-03-28 +modified: 2026-04-17 tags: - service - git @@ -148,12 +148,24 @@ The UI shows `forge.eblu.me` for HTTPS clone URLs and `forge.ops.eblu.me` for SS - **Rate limiting:** nginx rate limits login/signup/forgot-password endpoints (3r/s per client IP via `Fly-Client-IP` header) - **fail2ban:** Runs in the Fly.io container; bans IPs after 5 failed logins in 10 minutes via nginx deny list (ephemeral across deploys) - **Swagger:** Blocked at the proxy (`/swagger` returns 403); use forge.ops.eblu.me for API access +- **Archive redirect:** Archive endpoints (`/*/archive/*`) are 302-redirected to `forge.ops.eblu.me` — prevents unauthenticated crawlers from triggering unbounded git bundle generation (known DoS vector, see [[flyio-proxy#Crawler Mitigation]]) +- **robots.txt:** Blocks crawlers from `/mirrors/`, `/user/`, `/users/`, `/*/archive/`, `/*/releases/download/` - **OAuth dead-end:** "Sign in with Authentik" redirects to the (tailnet-only) Authentik URL — SSO only works from the tailnet ### Break-glass `mise run fly-shutoff` stops all public traffic immediately. forge.ops.eblu.me continues to work from the tailnet. See [[expose-service-publicly#Break-glass shutoff]]. +## Monitoring + +Forgejo exposes a Prometheus `/metrics` endpoint (enabled via `[metrics]` in `app.ini`). Alloy on indri scrapes it at `localhost:3001/metrics`. Metrics are mostly Go runtime stats and repo counters (no per-request latency histogram). + +Request latency is measured at the Fly.io proxy layer via the `flyio_nginx_upstream_response_time_seconds` histogram, visible on the Forgejo Grafana dashboard under "Forgejo: Upstream Response Time". 
+ +### Archive Cleanup + +The `[cron.archive_cleanup]` section is enabled with `OLDER_THAN = 2h` and `RUN_AT_START = true`. This prevents the `repo-archive/` directory from growing unboundedly when crawlers or users trigger archive downloads. Without this, the directory grew to 54GB in 2 days during a crawler incident in April 2026. + ## Mirrors Forgejo hosts pull mirrors of external repositories (GitHub, etc.) for supply chain control. Mirrors live in the `mirrors/` org and sync on a configurable interval. See [[manage-forgejo-mirrors]] for operations. diff --git a/docs/reference/tools/mise-tasks.md b/docs/reference/tools/mise-tasks.md index 02b8859..fefb30f 100644 --- a/docs/reference/tools/mise-tasks.md +++ b/docs/reference/tools/mise-tasks.md @@ -33,7 +33,8 @@ Run `mise tasks --sort name` for the live list with descriptions. | `provision-indri` | Run Ansible playbook for [[indri]] | | `provision-ringtail` | Run Ansible playbook for [[ringtail]] (NixOS) | | `provision-sifaka` | Run Ansible playbook for [[sifaka]] | -| `fly-deploy` | Deploy Fly.io public proxy | +| `fly-deploy` | Deploy Fly.io public proxy (uses op for auth) | +| `fly-reload` | Reload nginx config, re-resolve upstream DNS (no redeploy) | | `fly-setup` | One-time Fly.io secrets and certs setup | | `fly-shutoff` | Emergency shutoff: stop all Fly.io proxy machines | | `dns-preview` | Preview DNS changes with [[pulumi]] | diff --git a/docs/tutorials/expose-service-publicly.md b/docs/tutorials/expose-service-publicly.md index b3fdda6..9a44c15 100644 --- a/docs/tutorials/expose-service-publicly.md +++ b/docs/tutorials/expose-service-publicly.md @@ -1,7 +1,7 @@ --- title: Expose a Service Publicly -modified: 2026-03-15 -last-reviewed: 2026-03-03 +modified: 2026-04-17 +last-reviewed: 2026-04-17 tags: - tutorials - fly-io @@ -116,8 +116,8 @@ See the actual files in `fly/` for current configuration. 
Key design points: - **`fly.toml`** — uses bluegreen deploys so the old machine serves traffic until the new one passes health checks. `auto_stop_machines = "off"` keeps the proxy always-on. - **`Dockerfile`** — multi-stage build pulling nginx, Tailscale, and [[alloy]] binaries. Alloy runs as a sidecar inside the container for observability (see below). -- **`start.sh`** — starts `tailscaled` first (MagicDNS must be available before nginx resolves upstreams), then nginx in the background, then Alloy, and blocks on the nginx process. -- **`nginx.conf`** — uses a `resolver 100.100.100.100` directive so upstream DNS resolution is deferred to request time (not config load time). Each service gets a `server` block with a `set $upstream` variable pattern. Includes a JSON access log format that Alloy tails for log collection and metric extraction. A catch-all server block serves `/healthz` and rejects unknown hosts. +- **`start.sh`** — starts `tailscaled` first, waits for MagicDNS readiness (polls `nslookup` against `100.100.100.100`), then starts nginx, fail2ban, and Alloy, and blocks on the nginx process. The MagicDNS check is required because `upstream` blocks resolve DNS at config load. +- **`nginx.conf`** — uses `upstream` blocks with `keepalive` connection pools for each backend service. DNS is resolved at config load via MagicDNS (`resolver 100.100.100.100`). Each upstream requires `proxy_ssl_name` set explicitly to the Tailscale hostname (nginx sends the block name as SNI by default). A `map` directive conditionally sets the `Connection` header — empty string for keepalive on normal requests, `upgrade` only for WebSocket requests. Includes a JSON access log format that Alloy tails for log collection and metric extraction. A catch-all server block serves `/healthz` and rejects unknown hosts. - **`error.html`** — shown via `proxy_intercept_errors` when upstreams are unreachable (indri offline, tunnel down, etc.). 
Cached responses still take priority via `proxy_cache_use_stale`. #### Observability sidecar @@ -216,11 +216,18 @@ To expose an additional service (example: `wiki.eblu.me`): ### 1. Add nginx server block -Edit `fly/nginx.conf` — add a new `server` block. The configuration -differs significantly between static and dynamic services. See the -existing `docs.eblu.me` and `cv.eblu.me` blocks in `fly/nginx.conf` -for the current pattern (uses `set $upstream` variable for deferred -DNS resolution, `proxy_intercept_errors` for error pages, etc.). +Edit `fly/nginx.conf` — two changes needed: + +1. **Add an `upstream` block** (in the `http` context, alongside the existing ones): + +```nginx +upstream wiki_backend { + server wiki.tail8d86e.ts.net:443; + keepalive 4; +} +``` + +2. **Add a `server` block.** The configuration differs significantly between static and dynamic services. See the existing blocks in `fly/nginx.conf` for the current pattern. **Static site template** (simplified — adapt from existing blocks): @@ -239,12 +246,16 @@ server { } location / { - set $upstream_wiki https://wiki.tail8d86e.ts.net; - proxy_pass $upstream_wiki$request_uri; + proxy_pass https://wiki_backend$request_uri; proxy_ssl_verify off; proxy_ssl_server_name on; + proxy_ssl_name wiki.tail8d86e.ts.net; + proxy_set_header Host wiki.tail8d86e.ts.net; proxy_intercept_errors on; + proxy_http_version 1.1; + proxy_set_header Connection $connection_upgrade; + proxy_cache services; proxy_cache_valid 200 1d; proxy_cache_valid 404 1m; @@ -259,66 +270,12 @@ server { } ``` -**Dynamic service template** (e.g., Forgejo — see `fly/nginx.conf` for the live configuration): +**Key points for all upstream blocks:** +- `proxy_ssl_name` must be set explicitly — nginx sends the upstream block name as SNI by default, which the Tailscale Ingress won't recognize +- `proxy_http_version 1.1` + `Connection $connection_upgrade` enables keepalive (empty string for normal requests, "upgrade" for WebSocket) +- `keepalive` 
pool size: 4 for low-traffic static sites, 8 for higher-traffic dynamic services -```nginx -# --- forge.eblu.me (dynamic, authenticated) --- -server { - listen 8080; - server_name forge.eblu.me; - - # Higher rate limit — git operations, CI webhooks, and API calls - # can legitimately burst. Forgejo also has its own rate limiting, - # so this is a safety net, not the primary control. - limit_req zone=general burst=50 nodelay; - - # Git LFS and repo uploads can be large - client_max_body_size 512m; - - error_page 502 503 504 /error.html; - location = /error.html { - root /usr/share/nginx/html; - internal; - } - - location / { - set $upstream_forge https://forge.tail8d86e.ts.net; - proxy_pass $upstream_forge$request_uri; - proxy_ssl_verify off; - proxy_ssl_server_name on; - proxy_intercept_errors on; - - # NO proxy_cache — dynamic content with sessions. - # Caching would serve stale pages and break authentication. - - # Pass through headers needed for proper proxying - proxy_set_header Host $host; - proxy_set_header X-Real-IP $remote_addr; - proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; - proxy_set_header X-Forwarded-Proto $scheme; - - # WebSocket support (Forgejo uses it for live updates) - proxy_http_version 1.1; - proxy_set_header Upgrade $http_upgrade; - proxy_set_header Connection "upgrade"; - } - - # Selectively cache static assets only - location ~* \.(css|js|png|jpg|svg|woff2?)$ { - set $upstream_forge_static https://forge.tail8d86e.ts.net; - proxy_pass $upstream_forge_static$request_uri; - proxy_ssl_verify off; - proxy_ssl_server_name on; - - proxy_cache services; - proxy_cache_valid 200 7d; - proxy_cache_key $host$uri; - - add_header X-Cache-Status $upstream_cache_status; - add_header X-Clacks-Overhead "GNU Terry Pratchett" always; - } -} -``` +**Dynamic service template** — see `fly/nginx.conf` for the live Forgejo configuration, which includes rate-limited auth endpoints, cached static assets and release downloads, archive endpoint 
redirects, robots.txt, and WebSocket support. Key differences for dynamic services: - **No blanket caching** — only static assets (CSS, JS, images) are cached diff --git a/fly/nginx.conf b/fly/nginx.conf index db02a21..5723722 100644 --- a/fly/nginx.conf +++ b/fly/nginx.conf @@ -46,18 +46,32 @@ http { proxy_cache_path /tmp/cache levels=1:2 keys_zone=services:10m max_size=200m inactive=24h; - # MagicDNS resolver — using a variable in proxy_pass defers upstream DNS - # resolution to request time (not config time). Results are cached for - # 30s per worker to avoid per-request DNS lookups. + # WebSocket-aware Connection header. Only send "upgrade" when the client + # actually requests a protocol switch; otherwise empty string to preserve + # upstream keepalive connections. + map $http_upgrade $connection_upgrade { + default ""; + websocket upgrade; + } + + # --- Upstream pools with keepalive --- + # DNS is resolved once at config load via MagicDNS. If Tailscale Ingress + # pods get new IPs (restart, reschedule), run `mise run fly-reload` to + # re-resolve. A Grafana alert fires when upstreams are unreachable. resolver 100.100.100.100 valid=30s; resolver_timeout 5s; - # WebSocket-aware Connection header. Only send "upgrade" when the client - # actually requests a protocol switch; otherwise "close" (the HTTP/1.1 - # default when keepalive pooling is not available). 
- map $http_upgrade $connection_upgrade { - default close; - websocket upgrade; + upstream forge_backend { + server forge.tail8d86e.ts.net:443; + keepalive 8; + } + upstream docs_backend { + server docs.tail8d86e.ts.net:443; + keepalive 4; + } + upstream cv_backend { + server cv.tail8d86e.ts.net:443; + keepalive 4; } # --- docs.eblu.me (static site) --- @@ -76,12 +90,16 @@ http { internal; } location / { - set $upstream_docs https://docs.tail8d86e.ts.net; - proxy_pass $upstream_docs$request_uri; + proxy_pass https://docs_backend$request_uri; proxy_ssl_verify off; proxy_ssl_server_name on; + proxy_ssl_name docs.tail8d86e.ts.net; + proxy_set_header Host docs.tail8d86e.ts.net; proxy_intercept_errors on; + proxy_http_version 1.1; + proxy_set_header Connection $connection_upgrade; + # Cache aggressively — static site only. # Do NOT use these settings for dynamic services. proxy_cache services; @@ -116,12 +134,16 @@ http { } location / { - set $upstream_cv https://cv.tail8d86e.ts.net; - proxy_pass $upstream_cv$request_uri; + proxy_pass https://cv_backend$request_uri; proxy_ssl_verify off; proxy_ssl_server_name on; + proxy_ssl_name cv.tail8d86e.ts.net; + proxy_set_header Host cv.tail8d86e.ts.net; proxy_intercept_errors on; + proxy_http_version 1.1; + proxy_set_header Connection $connection_upgrade; + proxy_cache services; proxy_cache_valid 200 1d; proxy_cache_valid 404 1m; @@ -187,10 +209,10 @@ http { location ~ ^/user/(login|sign_up|forgot_password) { limit_req zone=forge_auth burst=5 nodelay; - set $upstream_forge https://forge.tail8d86e.ts.net; - proxy_pass $upstream_forge$request_uri; + proxy_pass https://forge_backend$request_uri; proxy_ssl_verify off; proxy_ssl_server_name on; + proxy_ssl_name forge.tail8d86e.ts.net; proxy_intercept_errors on; proxy_set_header Host $host; @@ -206,10 +228,13 @@ http { # Cache release artifact downloads — immutable files keyed by tag+filename. # Avoids hammering Forgejo when crawlers or users re-download the same asset. 
location ~ ^/[^/]+/[^/]+/releases/download/ { - set $upstream_forge_releases https://forge.tail8d86e.ts.net; - proxy_pass $upstream_forge_releases$request_uri; + proxy_pass https://forge_backend$request_uri; proxy_ssl_verify off; proxy_ssl_server_name on; + proxy_ssl_name forge.tail8d86e.ts.net; + + proxy_http_version 1.1; + proxy_set_header Connection $connection_upgrade; proxy_cache services; proxy_cache_valid 200 7d; @@ -226,10 +251,13 @@ http { # Selectively cache static assets only location ~* \.(css|js|png|jpg|svg|woff2?)$ { - set $upstream_forge_static https://forge.tail8d86e.ts.net; - proxy_pass $upstream_forge_static$request_uri; + proxy_pass https://forge_backend$request_uri; proxy_ssl_verify off; proxy_ssl_server_name on; + proxy_ssl_name forge.tail8d86e.ts.net; + + proxy_http_version 1.1; + proxy_set_header Connection $connection_upgrade; proxy_cache services; proxy_cache_valid 200 7d; @@ -240,10 +268,10 @@ http { } location / { - set $upstream_forge https://forge.tail8d86e.ts.net; - proxy_pass $upstream_forge$request_uri; + proxy_pass https://forge_backend$request_uri; proxy_ssl_verify off; proxy_ssl_server_name on; + proxy_ssl_name forge.tail8d86e.ts.net; proxy_intercept_errors on; # NO proxy_cache — dynamic content with sessions diff --git a/fly/start.sh b/fly/start.sh index 5b08490..8fd1fd4 100644 --- a/fly/start.sh +++ b/fly/start.sh @@ -11,10 +11,18 @@ tailscale up --authkey="${TS_AUTHKEY}" --hostname=flyio-proxy until tailscale status > /dev/null 2>&1; do sleep 1; done echo "Tailscale connected" +# Wait for MagicDNS to be ready — upstream blocks resolve DNS at config +# load, so nginx will fail to start if MagicDNS can't resolve yet. +echo "Waiting for MagicDNS..." +until nslookup forge.tail8d86e.ts.net 100.100.100.100 > /dev/null 2>&1; do + sleep 1 +done +echo "MagicDNS ready" + # Ensure fail2ban deny file exists before nginx starts touch /etc/nginx/forge-deny.conf -# Start nginx — MagicDNS is available, health check passes immediately. 
+ # Start nginx — MagicDNS is available, upstreams resolved. nginx -g "daemon off;" & NGINX_PID=$! echo "Nginx started" diff --git a/mise-tasks/fly-reload b/mise-tasks/fly-reload new file mode 100755 index 0000000..34806c5 --- /dev/null +++ b/mise-tasks/fly-reload @@ -0,0 +1,16 @@ +#!/usr/bin/env bash +#MISE description="Reload Fly.io proxy nginx config (re-resolves upstream DNS)" + +set -euo pipefail + +export FLY_API_TOKEN +FLY_API_TOKEN="$(op read 'op://blumeops/fly.io admin/add more/deploy-token')" + +# Look up the first machine of the app, then SSH into that exact machine (--machine) and send nginx a reload signal, so the machine named in the log line is the one reloaded. +# This re-resolves upstream DNS without a full redeploy. +APP="blumeops-proxy" +MACHINE_ID=$(fly machines list -a "$APP" --json | python3 -c "import sys,json; print(json.load(sys.stdin)[0]['id'])") + +echo "Reloading nginx on machine $MACHINE_ID..." +fly ssh console -a "$APP" --machine "$MACHINE_ID" -C "nginx -s reload" +echo "Done. Upstream DNS re-resolved."