From 4d1f4af25b9d2a55c1b0731e3a6b83259fc33dfa Mon Sep 17 00:00:00 2001
From: Erich Blume
Date: Thu, 28 May 2026 09:59:46 -0700
Subject: [PATCH 01/29] =?UTF-8?q?Upgrade=20unpoller=20v2.34.0=20=E2=86=92?=
=?UTF-8?q?=20v3.2.0,=20migrate=20to=20container.py=20(#361)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
## Summary
- Service Review pickup: unpoller (last reviewed 73 days ago).
- Upgrades unpoller from v2.34.0 to v3.2.0 (major version bump).
- Migrates the container build from a Dockerfile to a native Dagger pipeline (`containers/unpoller/container.py`) following the navidrome / miniflux pattern.
- Refreshes `service-versions.yaml` (last-reviewed, current-version).
## Breaking changes (upstream)
- **v3.0.0** — UniFi network API shifts (later 10.x). Some metric / event / log names and labels may have changed. Worth a follow-up sweep of the unpoller Grafana dashboard for missing series.
- **v3.2.0** — defaults to a 60s background poll feeding cached Prometheus scrapes (was on-demand poll per scrape). To restore previous behavior, set `interval = 0` in `up.conf`. Leaving the new default in this PR — every-15s scrapes will simply serve from cache, which is fine for our use.
## Build
- Image: `registry.ops.eblu.me/blumeops/unpoller:v3.2.0-1b27242`
- Built by build-container workflow run #559 from this branch.
## Test plan
- [ ] `argocd app set unpoller --revision unpoller-v3 && argocd app sync unpoller`
- [ ] Pod comes Ready
- [ ] Verify metrics exported (`Site/Client/UAP/USG/USW` counts in logs, `unpoller_*` series in Prometheus)
- [ ] Spot-check unpoller Grafana dashboard for missing series after the v3 API shift
- [ ] After merge: `argocd app set unpoller --revision main && argocd app sync unpoller`
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Reviewed-on: https://forge.eblu.me/eblume/blumeops/pulls/361
---
argocd/manifests/unpoller/kustomization.yaml | 2 +-
containers/unpoller/Dockerfile | 43 ----------------
containers/unpoller/container.py | 53 ++++++++++++++++++++
docs/changelog.d/unpoller-v3.infra.md | 1 +
service-versions.yaml | 4 +-
5 files changed, 57 insertions(+), 46 deletions(-)
delete mode 100644 containers/unpoller/Dockerfile
create mode 100644 containers/unpoller/container.py
create mode 100644 docs/changelog.d/unpoller-v3.infra.md
diff --git a/argocd/manifests/unpoller/kustomization.yaml b/argocd/manifests/unpoller/kustomization.yaml
index 5b7a9e2..d2c4e28 100644
--- a/argocd/manifests/unpoller/kustomization.yaml
+++ b/argocd/manifests/unpoller/kustomization.yaml
@@ -10,7 +10,7 @@ resources:
images:
- name: registry.ops.eblu.me/blumeops/unpoller
- newTag: v2.34.0-613f05d
+ newTag: v3.2.0-1b27242
configMapGenerator:
- name: unpoller-config
diff --git a/containers/unpoller/Dockerfile b/containers/unpoller/Dockerfile
deleted file mode 100644
index 241b375..0000000
--- a/containers/unpoller/Dockerfile
+++ /dev/null
@@ -1,43 +0,0 @@
-# UnPoller — UniFi metrics exporter for Prometheus
-# Two-stage build: Go compilation, then minimal Alpine runtime
-
-ARG CONTAINER_APP_VERSION=v2.34.0
-
-FROM golang:alpine3.22 AS build
-
-ARG CONTAINER_APP_VERSION
-RUN apk add --no-cache git
-
-RUN git clone --depth 1 --branch ${CONTAINER_APP_VERSION} \
- https://forge.ops.eblu.me/mirrors/unpoller.git /app
-
-WORKDIR /app
-
-ENV CGO_ENABLED=0
-
-RUN go build -ldflags="-s -w \
- -X main.version=${CONTAINER_APP_VERSION} \
- -X main.builtBy=blumeops \
- -X golift.io/version.Version=${CONTAINER_APP_VERSION} \
- -X golift.io/version.Branch=HEAD \
- -X golift.io/version.BuildUser=blumeops \
- -X golift.io/version.Revision=blumeops-build" \
- -o /bin/unpoller .
-
-FROM alpine:3.22
-
-ARG CONTAINER_APP_VERSION
-LABEL org.opencontainers.image.title="UnPoller"
-LABEL org.opencontainers.image.description="UniFi metrics exporter for Prometheus"
-LABEL org.opencontainers.image.version="${CONTAINER_APP_VERSION}"
-LABEL org.opencontainers.image.source="https://forge.eblu.me/eblume/blumeops"
-LABEL org.opencontainers.image.vendor="blumeops"
-
-RUN apk add --no-cache ca-certificates tzdata
-
-COPY --from=build /bin/unpoller /usr/bin/unpoller
-
-EXPOSE 9130
-USER 65534:65534
-ENTRYPOINT ["/usr/bin/unpoller"]
-CMD ["--config", "/etc/unpoller/up.conf"]
diff --git a/containers/unpoller/container.py b/containers/unpoller/container.py
new file mode 100644
index 0000000..bfc75ba
--- /dev/null
+++ b/containers/unpoller/container.py
@@ -0,0 +1,53 @@
+"""UnPoller β UniFi metrics exporter for Prometheus.
+
+Two-stage build: Go backend, Alpine runtime.
+Source cloned from forge mirror.
+"""
+
+import dagger
+
+from blumeops.containers import (
+ alpine_runtime,
+ clone_from_forge,
+ go_build,
+ oci_labels,
+)
+
+VERSION = "v3.2.0"
+
+
+async def build(src: dagger.Directory) -> dagger.Container:
+ source = clone_from_forge("unpoller", VERSION)
+
+ backend = go_build(
+ source,
+ "/unpoller",
+ ldflags=(
+ f"-s -w "
+ f"-X main.version={VERSION} "
+ f"-X main.builtBy=blumeops "
+ f"-X golift.io/version.Version={VERSION} "
+ f"-X golift.io/version.Branch=HEAD "
+ f"-X golift.io/version.BuildUser=blumeops "
+ f"-X golift.io/version.Revision=blumeops-build"
+ ),
+ )
+
+ runtime = alpine_runtime(
+ extra_apk=["ca-certificates", "tzdata"],
+ create_user=False,
+ )
+ runtime = oci_labels(
+ runtime,
+ title="UnPoller",
+ description="UniFi metrics exporter for Prometheus",
+ version=VERSION,
+ )
+ return (
+ runtime.with_file("/usr/bin/unpoller", backend.file("/unpoller"))
+ .with_exposed_port(9130)
+ .with_user("65534")
+ .with_default_args(
+ args=["/usr/bin/unpoller", "--config", "/etc/unpoller/up.conf"]
+ )
+ )
diff --git a/docs/changelog.d/unpoller-v3.infra.md b/docs/changelog.d/unpoller-v3.infra.md
new file mode 100644
index 0000000..fa6eaf9
--- /dev/null
+++ b/docs/changelog.d/unpoller-v3.infra.md
@@ -0,0 +1 @@
+Upgrade unpoller v2.34.0 → v3.2.0 and migrate container build from Dockerfile to native Dagger (container.py). v3.0.0 carries breaking UniFi API changes; v3.2.0 introduces a 60s background poll (cached scrapes) by default — set `interval = 0` in `up.conf` to restore on-demand polling.
diff --git a/service-versions.yaml b/service-versions.yaml
index 02f2979..63b0f15 100644
--- a/service-versions.yaml
+++ b/service-versions.yaml
@@ -345,8 +345,8 @@ services:
- name: unpoller
type: argocd
- last-reviewed: 2026-03-16
- current-version: "v2.34.0"
+ last-reviewed: 2026-05-28
+ current-version: "v3.2.0"
upstream-source: https://github.com/unpoller/unpoller/releases
notes: UniFi metrics exporter for Prometheus
From e703d25efe2b2da12793a6c459bce95ecdc48435 Mon Sep 17 00:00:00 2001
From: Erich Blume
Date: Thu, 28 May 2026 10:10:21 -0700
Subject: [PATCH 02/29] C0: rebuild unpoller container from squashed main
commit
Image was previously tagged with the unpoller-v3 branch SHA (1b27242),
which doesn't exist in main's history after squash-merge. Rebuilt from
the squashed commit so the tag references a reachable commit.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
argocd/manifests/unpoller/kustomization.yaml | 2 +-
docs/changelog.d/+unpoller-rebuild-on-main.infra.md | 1 +
2 files changed, 2 insertions(+), 1 deletion(-)
create mode 100644 docs/changelog.d/+unpoller-rebuild-on-main.infra.md
diff --git a/argocd/manifests/unpoller/kustomization.yaml b/argocd/manifests/unpoller/kustomization.yaml
index d2c4e28..bf776bb 100644
--- a/argocd/manifests/unpoller/kustomization.yaml
+++ b/argocd/manifests/unpoller/kustomization.yaml
@@ -10,7 +10,7 @@ resources:
images:
- name: registry.ops.eblu.me/blumeops/unpoller
- newTag: v3.2.0-1b27242
+ newTag: v3.2.0-4d1f4af
configMapGenerator:
- name: unpoller-config
diff --git a/docs/changelog.d/+unpoller-rebuild-on-main.infra.md b/docs/changelog.d/+unpoller-rebuild-on-main.infra.md
new file mode 100644
index 0000000..60ae8fa
--- /dev/null
+++ b/docs/changelog.d/+unpoller-rebuild-on-main.infra.md
@@ -0,0 +1 @@
+Rebuild unpoller container from squashed main commit so the image SHA tag matches a commit in main's history (was tagged with the pre-squash branch SHA).
From 1ce381cb6e15ca1226feee1d6a0fa2c449f929b7 Mon Sep 17 00:00:00 2001
From: Erich Blume
Date: Thu, 28 May 2026 14:36:33 -0700
Subject: [PATCH 03/29] C0: surface missing-log failures in runner-logs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
`mise run runner-logs -j ` previously silently succeeded with
no output when forgejo had no log for the task. Two layered causes:
1. zstdcat exits 0 even when the file is missing (writes "can't stat
… -- ignored" to stderr).
2. ssh to indri runs fish, which silently drops the remote exit code so
the subprocess returncode is always 0.
Probe `test -f` over SSH and parse a stdout marker (EXISTS / MISSING) to
detect the missing-log case, then report it explicitly with the indri
path and a hint about action_task.log_in_storage = 0 so the operator
knows where to look next.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
.../+runner-logs-missing-log.misc.md | 1 +
mise-tasks/runner-logs | 25 ++++++++++++++++++-
2 files changed, 25 insertions(+), 1 deletion(-)
create mode 100644 docs/changelog.d/+runner-logs-missing-log.misc.md
diff --git a/docs/changelog.d/+runner-logs-missing-log.misc.md b/docs/changelog.d/+runner-logs-missing-log.misc.md
new file mode 100644
index 0000000..c06704a
--- /dev/null
+++ b/docs/changelog.d/+runner-logs-missing-log.misc.md
@@ -0,0 +1 @@
+`mise run runner-logs -j ` now reports a clear error when the log file doesn't exist on indri (e.g. a runner crash that left `action_task.log_in_storage = 0`). Previously it printed only the header and exited 0, because `zstdcat` exits 0 with a "can't stat … -- ignored" stderr message and ssh+fish on indri swallows the remote exit code.
diff --git a/mise-tasks/runner-logs b/mise-tasks/runner-logs
index 3c5e8e3..0d3028b 100755
--- a/mise-tasks/runner-logs
+++ b/mise-tasks/runner-logs
@@ -229,12 +229,35 @@ def fetch_log(run_number: int, job_index: int, repo: str, token: str) -> None:
hex_prefix = f"{task_id & 0xff:02x}"
log_path = f"~/forgejo/data/actions_log/{repo}/{hex_prefix}/{task_id}.log.zst"
+ # indri's login shell (fish) silently swallows SSH exit codes, so we can't
+ # rely on returncode. zstdcat itself also exits 0 with a "can't stat ...
+ # -- ignored" stderr message when the file is missing. Detect missing logs
+ # by running `test -f` over SSH and parsing the marker line from stdout.
+ probe = subprocess.run(
+ ["ssh", "indri", f"test -f {log_path} && echo EXISTS || echo MISSING"],
+ capture_output=True,
+ text=True,
+ )
+ marker = probe.stdout.strip().splitlines()[-1] if probe.stdout.strip() else ""
+ if marker != "EXISTS":
+ typer.echo(
+ f"Error: log not found for run #{run_number} job {job_index} (task {task_id})",
+ err=True,
+ )
+ typer.echo(f"Path: indri:{log_path}", err=True)
+ typer.echo(
+ "The runner may have crashed before uploading its log buffer "
+ "(action_task.log_in_storage = 0).",
+ err=True,
+ )
+ raise typer.Exit(1)
+
result = subprocess.run(
["ssh", "indri", f"zstdcat {log_path}"],
capture_output=True,
text=True,
)
- if result.returncode != 0:
+ if result.returncode != 0 or not result.stdout:
typer.echo(
f"Error: could not read log for run #{run_number} job {job_index} (task {task_id})",
err=True,
From ecded3007368e094baebeed10fbf2a3fe49aed90 Mon Sep 17 00:00:00 2001
From: Erich Blume
Date: Thu, 28 May 2026 14:51:09 -0700
Subject: [PATCH 04/29] Make valkey local on ringtail (nix amd64) + bump to
8.1.7 (#362)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
## Summary
Weekly "make one non-local container local" pickup: immich-ringtail still pulled `docker.io/valkey/valkey:8.1.6` because the existing `containers/valkey/container.py` build was arm64-only.
- Adds `containers/valkey/default.nix` — nix-built amd64 valkey image, packaged by the ringtail nix-container-builder runner using `pkgs.dockerTools.buildLayeredImage`. Mirrors the existing `containers/authentik-redis/default.nix` pattern.
- `containers/valkey/container.py` keeps building the Alpine arm64 image for paperless on indri. Bumped both builds to upstream valkey 8.1.7 (Alpine 3.22 now ships `8.1.7-r0`; nixpkgs has 8.1.7).
- Splits `VERSION` (upstream app) from `ALPINE_PIN` (apk pin) in `container.py` so both build files can declare the same upstream version and pass `container-version-check`.
- Updates `service-versions.yaml`: current-version 8.1.7, refreshed last-reviewed, upstream-source now points at the canonical valkey-io releases page.
- Switches kustomizations:
- `immich-ringtail/kustomization.yaml`: `docker.io/valkey/valkey:8.1.6` → `registry.ops.eblu.me/blumeops/valkey:v8.1.7-02859c5-nix`, comment updated.
- `paperless/kustomization.yaml`: `v8.1.6-r0-fabca04` → `v8.1.7-02859c5`.
## Build
build-container run #563 — both jobs succeeded after a transient runner crash on the first dispatch (#562 build-nix), which surfaced two separate bugs that landed in a separate C0 on main:
- `runner-logs` silently returned 0 with no output when the log file didn't exist on indri
- `ssh indri` swallowing remote exit codes (fish login shell), which the wrapper now works around via a stdout marker
## Test plan
- [ ] `argocd app set immich-ringtail --revision valkey-nix && argocd app sync immich-ringtail`
- [ ] `argocd app set paperless --revision valkey-nix && argocd app sync paperless`
- [ ] Both valkey pods come Ready and start serving on :6379
- [ ] Immich app + paperless can read/write their respective cache
- [ ] After merge: rebuild from squashed main commit + update kustomization tags (squash-tag follow-up)
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Reviewed-on: https://forge.eblu.me/eblume/blumeops/pulls/362
---
.../immich-ringtail/kustomization.yaml | 9 +++---
argocd/manifests/paperless/kustomization.yaml | 2 +-
containers/valkey/container.py | 15 +++++-----
containers/valkey/default.nix | 30 +++++++++++++++++++
docs/changelog.d/valkey-nix.infra.md | 1 +
service-versions.yaml | 15 +++++-----
6 files changed, 53 insertions(+), 19 deletions(-)
create mode 100644 containers/valkey/default.nix
create mode 100644 docs/changelog.d/valkey-nix.infra.md
diff --git a/argocd/manifests/immich-ringtail/kustomization.yaml b/argocd/manifests/immich-ringtail/kustomization.yaml
index c1f639e..7a97fef 100644
--- a/argocd/manifests/immich-ringtail/kustomization.yaml
+++ b/argocd/manifests/immich-ringtail/kustomization.yaml
@@ -21,8 +21,9 @@ images:
- name: ghcr.io/immich-app/immich-machine-learning
# CUDA variant of the same release — ringtail has an RTX 4080
newTag: v2.6.3-cuda
- # Using upstream multi-arch valkey image directly; the
- # registry.ops.eblu.me/blumeops/valkey mirror is arm64-only (built
- # on indri) and would crashloop on ringtail.
+ # amd64 valkey built via nix on the ringtail nix-container-builder
+ # (see containers/valkey/default.nix). The Alpine container.py build
+ # is arm64-only and serves paperless on indri.
- name: docker.io/valkey/valkey
- newTag: "8.1.6"
+ newName: registry.ops.eblu.me/blumeops/valkey
+ newTag: v8.1.7-02859c5-nix
diff --git a/argocd/manifests/paperless/kustomization.yaml b/argocd/manifests/paperless/kustomization.yaml
index 9c6a086..575dfb4 100644
--- a/argocd/manifests/paperless/kustomization.yaml
+++ b/argocd/manifests/paperless/kustomization.yaml
@@ -16,4 +16,4 @@ images:
newTag: v2.20.13-07f52e9
- name: docker.io/library/redis
newName: registry.ops.eblu.me/blumeops/valkey
- newTag: v8.1.6-r0-fabca04
+ newTag: v8.1.7-02859c5
diff --git a/containers/valkey/container.py b/containers/valkey/container.py
index 5d150e7..34e8524 100644
--- a/containers/valkey/container.py
+++ b/containers/valkey/container.py
@@ -1,8 +1,8 @@
-"""Valkey β native Dagger build.
+"""Valkey β native Dagger build (arm64, indri).
Alpine 3.22 base with the `valkey` apk package (8.1.x — Redis-compatible).
-Mirrors `docker.io/valkey/valkey:8.1-alpine`, used by paperless and immich
-as a cache/queue sidecar.
+Used by paperless (sidecar) on indri. immich on ringtail uses the
+nix-built amd64 variant from `default.nix` in this directory.
"""
import dagger
@@ -10,9 +10,10 @@ from dagger import dag
from blumeops.containers import oci_labels
-# Alpine 3.22 ships valkey 8.1.6-r0. Alpine 3.23 jumps to 9.0 — hold on 3.22
-# to keep this a 1:1 swap for the upstream `valkey:8.1-alpine` image.
-VERSION = "8.1.6-r0"
+# Alpine 3.22 currently ships valkey 8.1.7-r0. Alpine 3.23 jumps to 9.0 —
+# hold on 3.22 to keep this aligned with the 8.1 line.
+VERSION = "8.1.7"
+ALPINE_PIN = "8.1.7-r0"
ALPINE_BASE = "alpine:3.22"
@@ -21,7 +22,7 @@ async def build(src: dagger.Directory) -> dagger.Container:
ctr = (
dag.container()
.from_(ALPINE_BASE)
- .with_exec(["apk", "add", "--no-cache", f"valkey={VERSION}"])
+ .with_exec(["apk", "add", "--no-cache", f"valkey={ALPINE_PIN}"])
.with_exec(["mkdir", "-p", "/data"])
.with_exec(["chown", "valkey:valkey", "/data"])
.with_workdir("/data")
diff --git a/containers/valkey/default.nix b/containers/valkey/default.nix
new file mode 100644
index 0000000..9cb1713
--- /dev/null
+++ b/containers/valkey/default.nix
@@ -0,0 +1,30 @@
+# Nix-built Valkey for ringtail (amd64)
+# Companion to container.py (Alpine 3.22, arm64 on indri).
+# Used by immich-ringtail which needs an amd64 image; paperless on indri
+# continues to use the Alpine container.py build.
+#
+# The version assertion ensures nix-build fails if a flake.lock update
+# changes the Valkey version — forcing an explicit version acknowledgment
+# here and in service-versions.yaml (enforced by container-version-check).
+{ pkgs ? import <nixpkgs> { } }:
+
+let
+ version = "8.1.7";
+in
+
+assert pkgs.valkey.version == version;
+
+pkgs.dockerTools.buildLayeredImage {
+ name = "blumeops/valkey";
+ contents = [
+ pkgs.valkey
+ ];
+
+ config = {
+ Entrypoint = [ "${pkgs.valkey}/bin/valkey-server" ];
+ Cmd = [ "--bind" "0.0.0.0" "--protected-mode" "no" "--dir" "/data" ];
+ ExposedPorts = {
+ "6379/tcp" = { };
+ };
+ };
+}
diff --git a/docs/changelog.d/valkey-nix.infra.md b/docs/changelog.d/valkey-nix.infra.md
new file mode 100644
index 0000000..e41eb63
--- /dev/null
+++ b/docs/changelog.d/valkey-nix.infra.md
@@ -0,0 +1 @@
+Add nix-built amd64 valkey for ringtail (`containers/valkey/default.nix`) so immich-ringtail can stop pulling the upstream multi-arch `docker.io/valkey/valkey` image. Existing `container.py` continues to build Alpine arm64 for paperless on indri. Both bump to valkey 8.1.7 (Alpine 3.22 8.1.7-r0 / nixpkgs 8.1.7).
diff --git a/service-versions.yaml b/service-versions.yaml
index 63b0f15..5440f01 100644
--- a/service-versions.yaml
+++ b/service-versions.yaml
@@ -146,14 +146,15 @@ services:
- name: valkey
type: argocd
- last-reviewed: 2026-05-01
- current-version: "8.1.6-r0"
- upstream-source: https://pkgs.alpinelinux.org/package/v3.22/community/aarch64/valkey
+ last-reviewed: 2026-05-28
+ current-version: "8.1.7"
+ upstream-source: https://github.com/valkey-io/valkey/releases
notes: >-
- Shared Alpine-built valkey image, used as a sidecar/cache by paperless
- (sidecar) and immich (separate Deployment). Mirrors the upstream
- docker.io/valkey/valkey:8.1-alpine. Pinned to Alpine 3.22 for valkey 8.1.x;
- Alpine 3.23 jumps to 9.0. Distinct from authentik-redis (nix-built Redis
+ Dual-build valkey image: container.py builds Alpine 3.22 + apk valkey
+ (arm64, indri) for paperless; default.nix builds via nixpkgs (amd64,
+ ringtail) for immich-ringtail. Both track upstream valkey 8.1.x; Alpine
+ 3.22 currently ships 8.1.7-r0 and nixpkgs valkey is 8.1.7. Alpine 3.23
+ jumps to 9.0. Distinct from authentik-redis (nix-built Redis
8.x) which has its own entry.
- name: external-secrets
From f588638331567d921e189cbff25db5425ccebaef Mon Sep 17 00:00:00 2001
From: Erich Blume
Date: Thu, 28 May 2026 14:53:21 -0700
Subject: [PATCH 05/29] C0: rebuild valkey from squashed main commit
Image tags from PR #362 (v8.1.7-02859c5{,-nix}) referenced a branch
SHA that no longer exists on main after squash-merge. Rebuilt both
the dagger arm64 and nix amd64 variants from the squashed commit
(ecded30) and updated paperless + immich-ringtail to the new tags.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
argocd/manifests/immich-ringtail/kustomization.yaml | 2 +-
argocd/manifests/paperless/kustomization.yaml | 2 +-
docs/changelog.d/+valkey-rebuild-on-main.infra.md | 1 +
3 files changed, 3 insertions(+), 2 deletions(-)
create mode 100644 docs/changelog.d/+valkey-rebuild-on-main.infra.md
diff --git a/argocd/manifests/immich-ringtail/kustomization.yaml b/argocd/manifests/immich-ringtail/kustomization.yaml
index 7a97fef..2fa131c 100644
--- a/argocd/manifests/immich-ringtail/kustomization.yaml
+++ b/argocd/manifests/immich-ringtail/kustomization.yaml
@@ -26,4 +26,4 @@ images:
# is arm64-only and serves paperless on indri.
- name: docker.io/valkey/valkey
newName: registry.ops.eblu.me/blumeops/valkey
- newTag: v8.1.7-02859c5-nix
+ newTag: v8.1.7-ecded30-nix
diff --git a/argocd/manifests/paperless/kustomization.yaml b/argocd/manifests/paperless/kustomization.yaml
index 575dfb4..3cd0d74 100644
--- a/argocd/manifests/paperless/kustomization.yaml
+++ b/argocd/manifests/paperless/kustomization.yaml
@@ -16,4 +16,4 @@ images:
newTag: v2.20.13-07f52e9
- name: docker.io/library/redis
newName: registry.ops.eblu.me/blumeops/valkey
- newTag: v8.1.7-02859c5
+ newTag: v8.1.7-ecded30
diff --git a/docs/changelog.d/+valkey-rebuild-on-main.infra.md b/docs/changelog.d/+valkey-rebuild-on-main.infra.md
new file mode 100644
index 0000000..c743e61
--- /dev/null
+++ b/docs/changelog.d/+valkey-rebuild-on-main.infra.md
@@ -0,0 +1 @@
+Rebuild valkey container from squashed main commit (both arm64 dagger and amd64 nix variants), and update paperless + immich-ringtail kustomizations to the main-SHA tags `v8.1.7-ecded30` and `v8.1.7-ecded30-nix`.
From e0064de83d0d15a1f34f16146542a62817dca3ef Mon Sep 17 00:00:00 2001
From: Erich Blume
Date: Mon, 1 Jun 2026 15:52:09 -0700
Subject: [PATCH 06/29] C0: update ringtail flake inputs (nixpkgs, disko)
Co-Authored-By: Claude Opus 4.8 (1M context)
---
.../+ringtail-flake-update-2026-06-01.infra.md | 4 ++++
nixos/ringtail/flake.lock | 12 ++++++------
2 files changed, 10 insertions(+), 6 deletions(-)
create mode 100644 docs/changelog.d/+ringtail-flake-update-2026-06-01.infra.md
diff --git a/docs/changelog.d/+ringtail-flake-update-2026-06-01.infra.md b/docs/changelog.d/+ringtail-flake-update-2026-06-01.infra.md
new file mode 100644
index 0000000..dd488b6
--- /dev/null
+++ b/docs/changelog.d/+ringtail-flake-update-2026-06-01.infra.md
@@ -0,0 +1,4 @@
+Update the ringtail NixOS flake lockfile (`nixos/ringtail/flake.lock`): bump
+`nixpkgs` (b77b3de → 25f5383) and `disko` (5ba0c95 → 115e521) to latest.
+`nixpkgs-services` was intentionally left pinned (skipped by the
+`flake-update` pipeline). Routine recurring maintenance per [[manage-lockfile]].
diff --git a/nixos/ringtail/flake.lock b/nixos/ringtail/flake.lock
index 0f0da7e..bb60501 100644
--- a/nixos/ringtail/flake.lock
+++ b/nixos/ringtail/flake.lock
@@ -7,11 +7,11 @@
]
},
"locked": {
- "lastModified": 1779699611,
- "narHash": "sha256-EcCaSTKnmg2o4wLKaN1aqQFomwyhO7ik0bX9COdyCas=",
+ "lastModified": 1780290312,
+ "narHash": "sha256-eTAlX0CwgB84Ts3GaBd944A3DRXVMzgA0EqroZBISUo=",
"owner": "nix-community",
"repo": "disko",
- "rev": "5ba0c9555c28685e57fa54c7a25e42c7efdbfc8d",
+ "rev": "115e5211780054d8a890b41f0b7734cafad54dfe",
"type": "github"
},
"original": {
@@ -43,11 +43,11 @@
},
"nixpkgs": {
"locked": {
- "lastModified": 1779467186,
- "narHash": "sha256-nOesoDCiXcUftqbRBMz9tt4blI5PvljMWbm3kuCA+0s=",
+ "lastModified": 1779796641,
+ "narHash": "sha256-ZsIrKmhp4vbBXoXXmR/tBXA/UCsAQiJL9vsgZEduhVY=",
"owner": "NixOS",
"repo": "nixpkgs",
- "rev": "b77b3de8775677f84492abe84635f87b0e153f0f",
+ "rev": "25f538306313eae3927264466c70d7001dcea1df",
"type": "github"
},
"original": {
From a36a18aaa6714e187834edc09eb2fc565d0f5fbb Mon Sep 17 00:00:00 2001
From: Erich Blume
Date: Mon, 1 Jun 2026 20:52:20 -0700
Subject: [PATCH 07/29] C0: black-hole /mirrors/* at Fly edge + name-and-shame
scrapers
A $29.60 Fly bill traced to ~1.25 TB/30d egress on forge.eblu.me (99.95% of
all proxy egress), ~71% of it AI scrapers (Meta meta-externalagent, OpenAI
GPTBot, Amazonbot, Bytespider) crawling the public mirror repos' infinite
git-history URL space and timing out Forgejo. robots.txt already disallowed
/mirrors/ but those agents ignore it, so enforce at the edge: return 403 (^~
to beat the regex asset locations), served as a roll-of-dishonour page with an
X-Naughty-Scrapers header. Mirrors stay reachable on the tailnet via
forge.ops.eblu.me. Tier 2 (UA denylist + Anubis) and the Cloudflare rejection
are documented in docs/explanation/ai-scraper-mitigation.md.
Co-Authored-By: Claude Opus 4.8 (1M context)
---
.../+ai-scraper-mitigation-doc.doc.md | 1 +
.../+forge-mirrors-blackhole.infra.md | 1 +
docs/explanation/ai-scraper-mitigation.md | 201 ++++++++++++++++++
docs/tutorials/expose-service-publicly.md | 7 +
fly/Dockerfile | 1 +
fly/naughty.html | 64 ++++++
fly/nginx.conf | 27 +++
7 files changed, 302 insertions(+)
create mode 100644 docs/changelog.d/+ai-scraper-mitigation-doc.doc.md
create mode 100644 docs/changelog.d/+forge-mirrors-blackhole.infra.md
create mode 100644 docs/explanation/ai-scraper-mitigation.md
create mode 100644 fly/naughty.html
diff --git a/docs/changelog.d/+ai-scraper-mitigation-doc.doc.md b/docs/changelog.d/+ai-scraper-mitigation-doc.doc.md
new file mode 100644
index 0000000..246fedb
--- /dev/null
+++ b/docs/changelog.d/+ai-scraper-mitigation-doc.doc.md
@@ -0,0 +1 @@
+Add `docs/explanation/ai-scraper-mitigation.md` — the egress-cost / AI-crawler threat model for the public Fly proxy, the tiered mitigation plan (Tier 1: mirror black-hole, shipped; Tier 2: user-agent denylist + Anubis; Tier 3: Cloudflare, rejected on principle), and the data behind it.
diff --git a/docs/changelog.d/+forge-mirrors-blackhole.infra.md b/docs/changelog.d/+forge-mirrors-blackhole.infra.md
new file mode 100644
index 0000000..29a5e6a
--- /dev/null
+++ b/docs/changelog.d/+forge-mirrors-blackhole.infra.md
@@ -0,0 +1 @@
+Black-hole the `/mirrors/*` repositories at the Fly proxy edge (`return 403` → `forge.ops.eblu.me`). A surprise $29.60 Fly bill traced to ~1.25 TB/30d of egress on `forge.eblu.me`, 99.95% of all proxy egress — of which ~71% was AI scrapers (Meta `meta-externalagent`, OpenAI `GPTBot`, Amazonbot) crawling the near-infinite git-history URL space of the public mirror repos and timing out Forgejo in the process. Mirrors exist for supply-chain control and are consumed over the tailnet, so their public web UI had no legitimate audience. `robots.txt` already disallowed `/mirrors/`, but the offending agents ignore it. Tier-2 mitigations (user-agent denylist, Anubis proof-of-work gateway) are documented in `docs/explanation/ai-scraper-mitigation.md`.
diff --git a/docs/explanation/ai-scraper-mitigation.md b/docs/explanation/ai-scraper-mitigation.md
new file mode 100644
index 0000000..fe4ba3d
--- /dev/null
+++ b/docs/explanation/ai-scraper-mitigation.md
@@ -0,0 +1,201 @@
+---
+title: AI Scraper Mitigation
+modified: 2026-06-01
+last-reviewed: 2026-06-01
+tags:
+ - explanation
+ - fly-io
+ - forgejo
+ - security
+ - networking
+---
+
+# AI Scraper Mitigation on the Public Proxy
+
+> **Note:** This article was drafted by AI and reviewed by Erich. I plan to rewrite all explanatory content in my own words — these serve as placeholders to establish the documentation structure.
+
+How BlumeOps keeps AI crawlers from running up the [[expose-service-publicly|Fly.io proxy]] egress bill and DoS-ing [[forgejo|Forgejo]] on [[indri]].
+
+## The incident
+
+A $29.60 Fly.io invoice arrived, nearly all of it a single line:
+
+```
+Bandwidth: Egress (iad) — 958,524,714,138 bytes — $19.17
+```
+
+The `iad` (Ashburn) region is a red herring: the proxy machine runs in `sjc`,
+but Fly bills egress at the edge PoP nearest the *client*, so `iad` just means
+"the traffic went to clients on the US East Coast."
+
+Tracing it through the nginx access logs (shipped to Loki via [[alloy|Alloy]]):
+
+| Signal | Value |
+|--------|-------|
+| Total proxy egress (30d) | ~1.25 TB |
+| Share that was `forge.eblu.me` | **99.95%** |
+| Share of forge egress that was `/mirrors/*` | **~71%** |
+| Share that was declared AI bots | **~85%+** |
+| Top offenders | Meta `meta-externalagent` (66% of bytes), OpenAI `GPTBot` (16%), Amazonbot, Bytespider |
+| Forgejo `5xx` (upstream timeouts) | tens of thousands/day, spiking to 112k |
+
+The crawlers were walking [[forgejo|Forgejo]]'s git-history browse endpoints —
+`src/commit/`, `commits/`, `blame/`, `raw/commit/`, plus `.patch`/`.diff`
+and `?page=N` pagination. That URL space is effectively **infinite**: every
+file × every commit × every page, multiplied across every mirrored repo. A
+crawler that follows links never finishes, and every page is a cache `MISS`
+that both tunnels to indri *and* bills as egress.
+
+Two distinct harms, not one:
+
+1. **Cost** — ~1.25 TB/mo of egress on a free-tier-ish proxy.
+2. **Availability** — the crawl alone generates ~400–530k requests/day,
+ enough to time out Forgejo regardless of how much RAM [[indri]] has. Moving
+ egress elsewhere would *not* fix this; the crawl has to be throttled at the
+ source.
+
+`robots.txt` already `Disallow`s `/mirrors/`, `/user/`, and archive/download
+paths β but **`meta-externalagent` and `GPTBot` ignore it.** For these agents,
+`robots.txt` is a dead letter, which is why edge enforcement is required.
+
+## The tiered plan
+
+### Tier 1 — Black-hole `/mirrors/*` (shipped)
+
+The mirror repositories (`tailscale`, `prometheus`, `mealie`, `paperless-ngx`,
+…) are mirrors of *already-public upstreams*, kept for supply-chain control
+(see [[spork-strategy]] and the container/mirror story in [[why-gitops]]). They
+are consumed by CI, gilbert, and other tailnet clients over
+`forge.ops.eblu.me`. Their web UI on the public internet served **no
+legitimate audience** β only scrapers. So the proxy now returns `403` for
+anything under `/mirrors/`, pointing humans at the tailnet host:
+
+```nginx
+location ^~ /mirrors/ {
+ return 403 "Mirror repositories are tailnet-only — use forge.ops.eblu.me.\n";
+}
+```
+
+The `^~` modifier matters: without it, the regex `location` blocks for static
+assets (`*.css`, `*.js`, release downloads) would match first and leak content
+under `/mirrors/`. `^~` tells nginx to stop at the prefix match and skip the
+regex round.
+
+This is config, not bot-fighting — we simply stopped serving an infinite
+tarpit to the world. It removes ~71% of forge egress and a large share of the
+upstream timeouts, with zero impact on any human or tailnet consumer. It
+mirrors the existing tailnet-only blocks for `/api/packages/` and `/swagger`.
+
+The `403` is also a small act of public shaming. Blocked requests are served a
+"roll of dishonour" page (`fly/naughty.html`, status kept at `403` via
+`error_page 403 /naughty.html`) that names the offending operators and their
+share of the stolen bytes, and every response carries an `X-Naughty-Scrapers`
+header:
+
+```
+X-Naughty-Scrapers: OpenAI/GPTBot, Meta/meta-externalagent, Amazonbot, ByteDance/Bytespider — robots.txt ignorers
+```
+
+Petty? A little. But it costs nothing, documents *why* the block exists for the
+next person who hits it, and the page is a few KB versus the megabytes of git
+HTML the crawlers were taking.
+
+**Trade-off accepted:** mirror release-artifact downloads over WAN now also
+`403`. Legitimate consumers already pull these over the tailnet, and the public
+exposure was the same crawl liability, so this is intentional.
+
+### Tier 2 — Defend the repos that *stay* public (planned)
+
+`/eblume/*` is intentionally public (a public profile is a feature). But the
+same git-history endpoints are still a tarpit there, just lower-volume. Two
+layers, in increasing order of effort and effectiveness:
+
+#### 2a. User-agent denylist (cheap, evadable)
+
+Block the declared AI crawlers at the edge regardless of path:
+
+```nginx
+# Illustrative — not yet deployed.
+map $http_user_agent $is_ai_bot {
+ default 0;
+ "~*meta-externalagent" 1;
+ "~*GPTBot" 1;
+ "~*ClaudeBot" 1;
+ "~*Amazonbot" 1;
+ "~*Bytespider" 1;
+ "~*SemrushBot" 1;
+}
+# in the forge.eblu.me server block:
+if ($is_ai_bot) { return 403; }
+```
+
+This catches ~85% of *current* traffic for a few lines of config. It is
+trivially evadable — a scraper need only spoof a browser UA — so it is a
+speed-bump, not a wall. Keep `robots.txt` too: well-behaved crawlers
+(Googlebot, Bingbot) do honor it, and it documents intent.
+
+#### 2b. Anubis proof-of-work gateway (the real wall)
+
+[Anubis](https://github.com/TecharoHQ/anubis) is a Go reverse proxy that
+weighs each request with a browser-based proof-of-work challenge before passing
+it upstream. It was written for *exactly this scenario* — its author built it
+after Amazon's scraper took down their Git server — and is widely deployed in
+front of Forgejo/Gitea (Codeberg, the UN, etc.). Headless scrapers that can't
+run the challenge JS never reach the application; humans clear it once and
+proceed.
+
+Why it fits BlumeOps better than the alternatives:
+
+- **It attacks cost *and* availability at once.** Bots receive a few-KB
+ challenge page instead of MB of git HTML (egress collapses) and never reach
+ Forgejo (timeouts collapse). No other single lever does both.
+- **It stays in-house.** No third party terminates our TLS or sees our
+ traffic.
+
+Placement options:
+
+| Where | Pros | Cons |
+|-------|------|------|
+| On [[indri]], between [[caddy|Caddy]] and Forgejo | Protects every path and every entry (WAN *and* tailnet); one config | Adds a hop and a service to the indri critical path; the challenge page still tunnels back through Fly for WAN clients (small egress) |
+| On the Fly proxy machine, in front of nginx | Challenge served at the edge — bots never even tunnel to indri | Fly VM is small (512 MB); another moving part in the boot sequence alongside `tailscaled`/nginx/`fail2ban`/Alloy |
+
+Leaning toward Caddy-side on indri for simplicity and uniform coverage, but
+this is the open design question for Tier 2. Anubis is MIT-licensed and the
+author has signalled a future move to an `equi-x`-based challenge, so pin a
+version and track upstream.
+
+### Tier 3 — Move egress off Fly entirely (rejected)
+
+A [[#The incident|Cloudflare]] Tunnel (`cloudflared` on indri → Cloudflare
+edge) would make this a non-problem on the cost axis: Cloudflare does not meter
+proxied bandwidth, and it bundles free AI-bot mitigation (Bot Fight Mode, the
+"block AI scrapers" toggle, Managed Challenge, AI Labyrinth). One move would
+zero the egress bill and add bot defense.
+
+**We are not doing this, on principle.** Cloudflare is a solid platform and a
+defensible engineering choice — but it already sits in front of an enormous
+fraction of the modern web, and routing BlumeOps through it would add one more
+site to the pile of the internet that one company can see and gate. BlumeOps
+deliberately keeps its own backbone ([[expose-service-publicly|Fly + Tailscale
++ Caddy]], DNS at [[gandi|Gandi]] — see the "no Cloudflare dependency" line in
+that doc). This is a values decision, not a technical one: we would rather pay
+a few dollars and run our own mitigation than centralize on Cloudflare.
+
+It is also worth noting that **Tier 3 would not, by itself, fix the upstream
+timeouts** — free egress just means we'd stop *caring* that bots crawl, while
+they continued to hammer Forgejo. Crawl mitigation (Tier 1 + Tier 2) is
+required regardless of where egress is billed.
+
+## Summary
+
+| Tier | Lever | Cost | Availability | Status |
+|------|-------|------|--------------|--------|
+| 1 | Black-hole `/mirrors/*` at edge | ↓~71% | big drop | **shipped** |
+| 2a | UA denylist on remaining repos | ↓most of the rest | further drop | planned |
+| 2b | Anubis PoW gateway | ↓near-total | near-total | planned |
+| 3 | Cloudflare Tunnel | ↓total | needs 2b anyway | **rejected (principle)** |
+
+The guiding insight: the cheapest, lowest-risk mitigation is to **not serve an
+infinite-URL surface that has no human audience.** Everything past Tier 1 is
+about defending the surface we *do* want public, in-house, without ceding
+control of our traffic to a third party.
diff --git a/docs/tutorials/expose-service-publicly.md b/docs/tutorials/expose-service-publicly.md
index 886cad4..65af611 100644
--- a/docs/tutorials/expose-service-publicly.md
+++ b/docs/tutorials/expose-service-publicly.md
@@ -376,6 +376,13 @@ Mitigations for dynamic services:
- fail2ban on indri (see below) can block IPs showing abuse patterns
- The break-glass shutoff remains the last resort
+The most acute version of this in practice has been **AI scrapers**, which
+ignore `robots.txt` and crawl dynamic services (notably [[forgejo|Forgejo]]'s
+infinite git-history URL space) into both a surprise egress bill and an
+effective L7 DoS. See [[ai-scraper-mitigation]] for the incident, the tiered
+defense (mirror black-hole, user-agent denylist, Anubis proof-of-work), and
+why a Cloudflare Tunnel is *not* the chosen answer here.
+
If a publicly exposed dynamic service attracts targeted attacks or the
home network bandwidth is impacted, consider migrating to Cloudflare
Tunnel for enterprise-grade DDoS protection (requires DNS migration;
diff --git a/fly/Dockerfile b/fly/Dockerfile
index d4e7a18..406c849 100644
--- a/fly/Dockerfile
+++ b/fly/Dockerfile
@@ -25,6 +25,7 @@ COPY fail2ban/action.d/nginx-deny.conf /etc/fail2ban/action.d/nginx-deny.conf
COPY nginx.conf /etc/nginx/nginx.conf
COPY error.html /usr/share/nginx/html/error.html
+COPY naughty.html /usr/share/nginx/html/naughty.html
COPY alloy.river /etc/alloy/config.alloy
COPY start.sh /start.sh
RUN chmod +x /start.sh
diff --git a/fly/naughty.html b/fly/naughty.html
new file mode 100644
index 0000000..d899171
--- /dev/null
+++ b/fly/naughty.html
@@ -0,0 +1,64 @@
+
+
+
+
+
+
+ 403 · Roll of Dishonour
+
+
+
+
+ 🪤 403 — you walked into the scraper trap
+ These are mirror repositories. They are tailnet-only.
+
+
+ This path used to serve the web UI for mirrors of public upstream
+ projects. It exists for supply-chain control, not for crawling. A
+ robots.txt politely disallowed /mirrors/.
+ A pack of AI scrapers ignored it, walked the infinite git-history URL
+ space, and ran up ~1.25 TB of egress and a real
+ money bill in a single month — while timing out the server for everyone
+ else.
+
+
+ So /mirrors/ is closed at the edge now. Roll of dishonour,
+ by share of the bytes they stole:
+
+
+ | Operator | User-Agent | Bytes |
+
+ | Meta | meta-externalagent | 66% |
+ | OpenAI | GPTBot | 16% |
+ | Amazon | Amazonbot | 3% |
+ | ByteDance | Bytespider | 1% |
+
+
+
+
+ If you are a human who actually wanted these mirrors, they are reachable
+ from the tailnet at forge.ops.eblu.me. If you are a crawler:
+ read the robots.txt next time. We left you a header, too.
+
+
+
+
+
+
diff --git a/fly/nginx.conf b/fly/nginx.conf
index 570e6c9..ec35774 100644
--- a/fly/nginx.conf
+++ b/fly/nginx.conf
@@ -215,6 +215,33 @@ http {
return 403 "API documentation is only available at forge.ops.eblu.me (tailnet).\n";
}
+ # Black-hole the mirror repositories on WAN. These are mirrors of
+ # already-public upstreams (tailscale, prometheus, mealie, …) kept
+ # for supply-chain control; CI, gilbert, and tailnet clients consume
+ # them via forge.ops.eblu.me. Their web UI served no public purpose
+ # but AI scrapers, which crawled the near-infinite git-history URL
+ # space (src/commit, commits, blame, raw) and drove ~70% of Fly
+ # egress (1.24 TB/30d — a surprise bill) plus enough upstream load to
+ # time out Forgejo. robots.txt already Disallows /mirrors/, but
+ # meta-externalagent and GPTBot ignore it — so enforce at the edge.
+ # `^~` makes this win over the regex locations below (e.g. *.css), so
+ # static assets under /mirrors/ can't leak through. We also name and
+ # shame: blocked requests get a "roll of dishonour" page (403 status
+ # preserved) and an X-Naughty-Scrapers header. See
+ # docs/explanation/ai-scraper-mitigation.md.
+ location ^~ /mirrors/ {
+ error_page 403 /naughty.html;
+ return 403;
+ }
+
+ # Roll of dishonour — served on the /mirrors/ 403, status kept at 403.
+ location = /naughty.html {
+ internal;
+ root /usr/share/nginx/html;
+ add_header X-Naughty-Scrapers "OpenAI/GPTBot, Meta/meta-externalagent, Amazonbot, ByteDance/Bytespider β robots.txt ignorers" always;
+ add_header X-Clacks-Overhead "GNU Terry Pratchett" always;
+ }
+
# Redirect archive endpoints to tailnet β archive requests generate full
# git bundles on demand. Unauthenticated crawlers hitting unique commit
# SHAs cause unbounded CPU and disk usage (DoS vector). Legitimate users
From 40bd92982015582cb7aa2680c6dc8412706498fb Mon Sep 17 00:00:00 2001
From: Erich Blume
Date: Mon, 1 Jun 2026 20:55:05 -0700
Subject: [PATCH 08/29] C0: remove visible GNU Terry Pratchett from
naughty.html body
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
GNU lives in the overhead — the X-Clacks-Overhead header — never on the
visible page. Keep the header, drop the footer.
Co-Authored-By: Claude Opus 4.8 (1M context)
---
fly/naughty.html | 3 ---
1 file changed, 3 deletions(-)
diff --git a/fly/naughty.html b/fly/naughty.html
index d899171..b6eada8 100644
--- a/fly/naughty.html
+++ b/fly/naughty.html
@@ -21,7 +21,6 @@
td.share { color: #f2c14e; text-align: right; font-variant-numeric: tabular-nums; }
.name { color: #e8867a; }
a { color: #7fb3d5; }
- footer { margin-top: 2rem; color: #5c574f; font-size: .85rem; }
@@ -57,8 +56,6 @@
from the tailnet at forge.ops.eblu.me. If you are a crawler:
read the robots.txt next time. We left you a header, too.
-
-