From b64010b3c7917c6782cd8047aa0231997b1166ae Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Fri, 6 Mar 2026 18:34:37 -0800 Subject: [PATCH] Replace spider-trap nginx 404s with robots.txt disallowing /explorer/ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The /explorer/ SPA endpoints were the source of all spider-trap traffic. A robots.txt Disallow is a better fix than serving 404s — it prevents crawlers from entering the infinite URL tree in the first place, avoids serving large numbers of 404s that hurt SEO, and doesn't break legitimate deep links. Co-Authored-By: Claude Opus 4.6 --- containers/quartz/default.conf | 26 +++++-------------- .../changelog.d/+robots-txt-explorer.infra.md | 1 + 2 files changed, 7 insertions(+), 20 deletions(-) create mode 100644 docs/changelog.d/+robots-txt-explorer.infra.md diff --git a/containers/quartz/default.conf b/containers/quartz/default.conf index 2705f1e..70b8fcc 100644 --- a/containers/quartz/default.conf +++ b/containers/quartz/default.conf @@ -14,26 +14,12 @@ server { add_header Cache-Control "public, immutable"; } - # --- Spider-trap guards ------------------------------------------------ - # Quartz emits relative links (../path). When a crawler resolves these - # from a phantom URL that was already served by the SPA fallback, the - # relative prefix compounds (e.g. /tags/ref/infra → /tags/ref/infra/ref/infra) - # creating an infinite tree of unique URIs — all served as 200 via the - # fallback to index.html. Two rules cut this off: - # - # 1. /tags/ is always flat (/tags/), so block anything deeper. - # 2. Real content never exceeds depth 4 (/how-to//). - # A depth-5 cutoff gives headroom while stopping recursive paths. - # - # Together these caught ~95% of trap requests in the March 2026 incident. - # The proper fix is root-absolute links in Quartz (planned for fork). - - location ~ "^/tags/[^/]+/" { - return 404; - } - - location ~ "^(/[^/]+){5,}" { - return 404; + # Serve robots.txt inline to prevent crawlers from entering /explorer/, + # which is an SPA feature that generates infinite relative-link trees + # when crawled (the March 2026 spider-trap incident). + location = /robots.txt { + default_type text/plain; + return 200 "User-agent: *\nDisallow: /explorer/\n"; } # SPA fallback - serve index.html for client-side routing diff --git a/docs/changelog.d/+robots-txt-explorer.infra.md b/docs/changelog.d/+robots-txt-explorer.infra.md new file mode 100644 index 0000000..25ece70 --- /dev/null +++ b/docs/changelog.d/+robots-txt-explorer.infra.md @@ -0,0 +1 @@ +Replace nginx spider-trap 404 guards with robots.txt disallowing /explorer/ to prevent crawler-induced infinite URL trees.