From 813ce2ddaf4f62108af6778361c5544005bde2aa Mon Sep 17 00:00:00 2001
From: Erich Blume <blume.erich@gmail.com>
Date: Thu, 4 Jun 2026 13:23:03 -0700
Subject: [PATCH 01/20] Recurring review sweep: 4 doc cards +
 nvidia-device-plugin v0.19.2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Doc review (last-reviewed 2026-06-04):
- cluster.md: k8s v1.34.0→v1.35.0; ringtail workload list updated for
  the in-progress minikube→k3s migration
- ntfy/tempo/alloy: images are now locally-built registry.ops.eblu.me
  nix containers (v2.19.2 / v2.10.3 / v1.16.0); Fly alloy binary v1.16.1

Service review:
- nvidia-device-plugin v0.19.0→v0.19.2 (upstream patch, no breaking
  changes for our CDI + RuntimeClass manifests)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 argocd/manifests/nvidia-device-plugin/kustomization.yaml | 2 +-
 docs/changelog.d/reviews-jun4.doc.md                     | 1 +
 docs/changelog.d/reviews-jun4.infra.md                   | 1 +
 docs/reference/kubernetes/cluster.md                     | 9 ++++++---
 docs/reference/services/alloy.md                         | 7 ++++---
 docs/reference/services/ntfy.md                          | 5 +++--
 docs/reference/services/tempo.md                         | 5 +++--
 service-versions.yaml                                    | 4 ++--
 8 files changed, 21 insertions(+), 13 deletions(-)
 create mode 100644 docs/changelog.d/reviews-jun4.doc.md
 create mode 100644 docs/changelog.d/reviews-jun4.infra.md

diff --git a/argocd/manifests/nvidia-device-plugin/kustomization.yaml b/argocd/manifests/nvidia-device-plugin/kustomization.yaml
index a46edf6..f5a33ae 100644
--- a/argocd/manifests/nvidia-device-plugin/kustomization.yaml
+++ b/argocd/manifests/nvidia-device-plugin/kustomization.yaml
@@ -10,4 +10,4 @@ resources:
 
 images:
   - name: nvcr.io/nvidia/k8s-device-plugin
-    newTag: v0.19.0
+    newTag: v0.19.2
diff --git a/docs/changelog.d/reviews-jun4.doc.md b/docs/changelog.d/reviews-jun4.doc.md
new file mode 100644
index 0000000..f1aeaa8
--- /dev/null
+++ b/docs/changelog.d/reviews-jun4.doc.md
@@ -0,0 +1 @@
+Reviewed four never-reviewed reference cards (`cluster`, `ntfy`, `tempo`, `alloy`) and corrected drift: minikube is now Kubernetes v1.35.0; ntfy, tempo, and alloy-k8s images are now locally-built `registry.ops.eblu.me/blumeops/*` nix containers (v2.19.2, v2.10.3, v1.16.0) rather than upstream Docker Hub; the Fly.io alloy binary is v1.16.1; and the ringtail workload list reflects the in-progress minikube→k3s migration.
diff --git a/docs/changelog.d/reviews-jun4.infra.md b/docs/changelog.d/reviews-jun4.infra.md
new file mode 100644
index 0000000..c128e70
--- /dev/null
+++ b/docs/changelog.d/reviews-jun4.infra.md
@@ -0,0 +1 @@
+Upgraded the nvidia-device-plugin on ringtail from v0.19.0 to v0.19.2 (upstream patch release: CDI/Tegra fixes and dependency bumps, no breaking changes for our manifest-based CDI + RuntimeClass setup).
diff --git a/docs/reference/kubernetes/cluster.md b/docs/reference/kubernetes/cluster.md
index 9b632bd..07c14af 100644
--- a/docs/reference/kubernetes/cluster.md
+++ b/docs/reference/kubernetes/cluster.md
@@ -1,6 +1,7 @@
 ---
 title: Cluster
-modified: 2026-02-19
+modified: 2026-06-04
+last-reviewed: 2026-06-04
 tags:
   - kubernetes
 ---
@@ -15,7 +16,7 @@ BlumeOps runs two Kubernetes clusters: a Minikube cluster on [[indri]] (most ser
 |----------|-------|
 | **Driver** | docker |
 | **Container Runtime** | docker |
-| **Kubernetes Version** | v1.34.0 |
+| **Kubernetes Version** | v1.35.0 |
 | **CPUs** | 6 |
 | **Memory** | 11GB |
 | **Disk** | 200GB |
@@ -41,7 +42,9 @@ Single-node k3s cluster for workloads requiring amd64 or GPU access. See [[ringt
 |----------|-------|
 | **Context** | `k3s-ringtail` |
 | **API Server** | `https://ringtail.tail8d86e.ts.net:6443` |
-| **Workloads** | Frigate (GPU), ntfy, frigate-notify, nvidia-device-plugin |
+| **Workloads** | GPU workloads (Frigate, Ollama), notifications (ntfy, frigate-notify), [[authentik]], and services migrated off indri minikube (Immich, Mealie, Paperless, TeslaMate). See [[ringtail]] for the authoritative list. |
+
+Services are being progressively migrated from indri's minikube to ringtail's k3s; the split above reflects an in-progress state, not a fixed boundary.
 
 ## Related
 
diff --git a/docs/reference/services/alloy.md b/docs/reference/services/alloy.md
index d781f2f..97d1e77 100644
--- a/docs/reference/services/alloy.md
+++ b/docs/reference/services/alloy.md
@@ -1,6 +1,7 @@
 ---
 title: Alloy
-modified: 2026-03-13
+modified: 2026-06-04
+last-reviewed: 2026-06-04
 tags:
   - service
   - observability
@@ -20,10 +21,10 @@ Unified observability collector for metrics and logs with three deployments:
 | **Indri Binary** | `~/.local/bin/alloy` |
 | **Indri Config** | `~/.config/grafana-alloy/config.alloy` |
 | **K8s Namespace** | `alloy` |
-| **K8s Image** | `grafana/alloy:v1.14.0` |
+| **K8s Image** | `registry.ops.eblu.me/blumeops/alloy:v1.16.0-9564435` (locally built) |
 | **ArgoCD App** | `alloy-k8s` |
 | **Fly.io Config** | `fly/alloy.river` |
-| **Fly.io Image** | `grafana/alloy:v1.5.1` (binary copied into nginx container) |
+| **Fly.io Image** | `grafana/alloy:v1.16.1` (binary copied into nginx container, sha-pinned) |
 
 ## Metrics Collected
 
diff --git a/docs/reference/services/ntfy.md b/docs/reference/services/ntfy.md
index b549a6d..1bf45af 100644
--- a/docs/reference/services/ntfy.md
+++ b/docs/reference/services/ntfy.md
@@ -1,6 +1,7 @@
 ---
 title: Ntfy
-modified: 2026-02-17
+modified: 2026-06-04
+last-reviewed: 2026-06-04
 tags:
   - service
   - notifications
@@ -17,7 +18,7 @@ Self-hosted push notification service. Ntfy receives HTTP POST messages and deli
 | **URL** | https://ntfy.ops.eblu.me |
 | **Tailscale URL** | https://ntfy.tail8d86e.ts.net |
 | **Namespace** | `ntfy` |
-| **Image** | `binwiederhier/ntfy:v2.17.0` |
+| **Image** | `registry.ops.eblu.me/blumeops/ntfy:v2.19.2-fd0bebb-nix` (locally built) |
 | **Upstream** | https://github.com/binwiederhier/ntfy |
 | **Manifests** | `argocd/manifests/ntfy/` |
 
diff --git a/docs/reference/services/tempo.md b/docs/reference/services/tempo.md
index 771b97f..5eb5d87 100644
--- a/docs/reference/services/tempo.md
+++ b/docs/reference/services/tempo.md
@@ -1,6 +1,7 @@
 ---
 title: Tempo
-modified: 2026-03-05
+modified: 2026-06-04
+last-reviewed: 2026-06-04
 tags:
   - service
   - observability
@@ -18,7 +19,7 @@ Distributed tracing backend for BlumeOps infrastructure. Receives traces via OTL
 | **Tailscale URL** | https://tempo.tail8d86e.ts.net |
 | **OTLP Endpoint** | https://tempo-otlp.tail8d86e.ts.net |
 | **Namespace** | `monitoring` |
-| **Image** | `grafana/tempo:2.10.1` |
+| **Image** | `registry.ops.eblu.me/blumeops/tempo:v2.10.3-75f9ba4` (locally built) |
 | **Storage** | 10Gi PVC (local filesystem) |
 | **Retention** | 7 days |
 
diff --git a/service-versions.yaml b/service-versions.yaml
index 699f89c..11ec9f9 100644
--- a/service-versions.yaml
+++ b/service-versions.yaml
@@ -56,8 +56,8 @@ services:
 
   - name: nvidia-device-plugin
     type: argocd
-    last-reviewed: 2026-03-27
-    current-version: "v0.19.0"
+    last-reviewed: 2026-06-04
+    current-version: "v0.19.2"
     upstream-source: https://github.com/NVIDIA/k8s-device-plugin/releases
     notes: DaemonSet + RuntimeClass on ringtail for GPU workloads
 

From bb55fa95667903e1b38c084a46690e7da61eef0d Mon Sep 17 00:00:00 2001
From: Erich Blume <blume.erich@gmail.com>
Date: Thu, 4 Jun 2026 13:37:02 -0700
Subject: [PATCH 02/20] Recurring review sweep: 4 doc cards +
 nvidia-device-plugin v0.19.2 (#366)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Knocks out the two daily recurring review tasks (doc review + service review) in one PR.

## Doc review (4 never-reviewed reference cards, `last-reviewed: 2026-06-04`)
- **cluster.md** — Kubernetes version v1.34.0 → **v1.35.0**; refreshed the stale ringtail workload list and noted the in-progress minikube→k3s migration (points to `[[ringtail]]` as the canonical list).
- **ntfy.md / tempo.md / alloy.md** — corrected image references: these are now **locally-built `registry.ops.eblu.me/blumeops/*` nix containers** (ntfy v2.19.2, tempo v2.10.3, alloy-k8s v1.16.0), not upstream Docker Hub. Fly.io alloy binary bumped to v1.16.1.

## Service review
- **nvidia-device-plugin** (ringtail GPU): v0.19.0 → **v0.19.2**. Upstream patch releases — CDI/Tegra fixes + dependency bumps, no breaking changes for our manifest-based CDI + RuntimeClass setup (the service-account change in the notes is helm-only).

## Not in this PR (need container rebuilds, deferred)
The other stale services are locally-built nix images, so upgrading them requires a forge-runner rebuild rather than a clean tag bump. They are left untouched (their `last-reviewed` dates are not bumped, so they will resurface in the next review sweep): **prometheus** (v3.10.0→v3.12.0), **loki** (3.6.7→3.7.2), **kube-state-metrics**, **homepage**. Happy to do these as a follow-up rebuild PR.

## Deploy / verify
Not yet deployed — the `nvidia-device-plugin` ArgoCD app still points at `main`. After review:
```
argocd app set nvidia-device-plugin --revision reviews-jun4 && argocd app sync nvidia-device-plugin
# after merge:
argocd app set nvidia-device-plugin --revision main && argocd app sync nvidia-device-plugin
```

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Reviewed-on: https://forge.eblu.me/eblume/blumeops/pulls/366
---
 argocd/manifests/nvidia-device-plugin/kustomization.yaml | 2 +-
 docs/changelog.d/reviews-jun4.doc.md                     | 1 +
 docs/changelog.d/reviews-jun4.infra.md                   | 1 +
 docs/reference/kubernetes/cluster.md                     | 9 ++++++---
 docs/reference/services/alloy.md                         | 7 ++++---
 docs/reference/services/ntfy.md                          | 5 +++--
 docs/reference/services/tempo.md                         | 5 +++--
 service-versions.yaml                                    | 4 ++--
 8 files changed, 21 insertions(+), 13 deletions(-)
 create mode 100644 docs/changelog.d/reviews-jun4.doc.md
 create mode 100644 docs/changelog.d/reviews-jun4.infra.md

diff --git a/argocd/manifests/nvidia-device-plugin/kustomization.yaml b/argocd/manifests/nvidia-device-plugin/kustomization.yaml
index a46edf6..f5a33ae 100644
--- a/argocd/manifests/nvidia-device-plugin/kustomization.yaml
+++ b/argocd/manifests/nvidia-device-plugin/kustomization.yaml
@@ -10,4 +10,4 @@ resources:
 
 images:
   - name: nvcr.io/nvidia/k8s-device-plugin
-    newTag: v0.19.0
+    newTag: v0.19.2
diff --git a/docs/changelog.d/reviews-jun4.doc.md b/docs/changelog.d/reviews-jun4.doc.md
new file mode 100644
index 0000000..f1aeaa8
--- /dev/null
+++ b/docs/changelog.d/reviews-jun4.doc.md
@@ -0,0 +1 @@
+Reviewed four never-reviewed reference cards (`cluster`, `ntfy`, `tempo`, `alloy`) and corrected drift: minikube is now Kubernetes v1.35.0; ntfy, tempo, and alloy-k8s images are now locally-built `registry.ops.eblu.me/blumeops/*` nix containers (v2.19.2, v2.10.3, v1.16.0) rather than upstream Docker Hub; the Fly.io alloy binary is v1.16.1; and the ringtail workload list reflects the in-progress minikube→k3s migration.
diff --git a/docs/changelog.d/reviews-jun4.infra.md b/docs/changelog.d/reviews-jun4.infra.md
new file mode 100644
index 0000000..c128e70
--- /dev/null
+++ b/docs/changelog.d/reviews-jun4.infra.md
@@ -0,0 +1 @@
+Upgraded the nvidia-device-plugin on ringtail from v0.19.0 to v0.19.2 (upstream patch release: CDI/Tegra fixes and dependency bumps, no breaking changes for our manifest-based CDI + RuntimeClass setup).
diff --git a/docs/reference/kubernetes/cluster.md b/docs/reference/kubernetes/cluster.md
index 9b632bd..07c14af 100644
--- a/docs/reference/kubernetes/cluster.md
+++ b/docs/reference/kubernetes/cluster.md
@@ -1,6 +1,7 @@
 ---
 title: Cluster
-modified: 2026-02-19
+modified: 2026-06-04
+last-reviewed: 2026-06-04
 tags:
   - kubernetes
 ---
@@ -15,7 +16,7 @@ BlumeOps runs two Kubernetes clusters: a Minikube cluster on [[indri]] (most ser
 |----------|-------|
 | **Driver** | docker |
 | **Container Runtime** | docker |
-| **Kubernetes Version** | v1.34.0 |
+| **Kubernetes Version** | v1.35.0 |
 | **CPUs** | 6 |
 | **Memory** | 11GB |
 | **Disk** | 200GB |
@@ -41,7 +42,9 @@ Single-node k3s cluster for workloads requiring amd64 or GPU access. See [[ringt
 |----------|-------|
 | **Context** | `k3s-ringtail` |
 | **API Server** | `https://ringtail.tail8d86e.ts.net:6443` |
-| **Workloads** | Frigate (GPU), ntfy, frigate-notify, nvidia-device-plugin |
+| **Workloads** | GPU workloads (Frigate, Ollama), notifications (ntfy, frigate-notify), [[authentik]], and services migrated off indri minikube (Immich, Mealie, Paperless, TeslaMate). See [[ringtail]] for the authoritative list. |
+
+Services are being progressively migrated from indri's minikube to ringtail's k3s; the split above reflects an in-progress state, not a fixed boundary.
 
 ## Related
 
diff --git a/docs/reference/services/alloy.md b/docs/reference/services/alloy.md
index d781f2f..97d1e77 100644
--- a/docs/reference/services/alloy.md
+++ b/docs/reference/services/alloy.md
@@ -1,6 +1,7 @@
 ---
 title: Alloy
-modified: 2026-03-13
+modified: 2026-06-04
+last-reviewed: 2026-06-04
 tags:
   - service
   - observability
@@ -20,10 +21,10 @@ Unified observability collector for metrics and logs with three deployments:
 | **Indri Binary** | `~/.local/bin/alloy` |
 | **Indri Config** | `~/.config/grafana-alloy/config.alloy` |
 | **K8s Namespace** | `alloy` |
-| **K8s Image** | `grafana/alloy:v1.14.0` |
+| **K8s Image** | `registry.ops.eblu.me/blumeops/alloy:v1.16.0-9564435` (locally built) |
 | **ArgoCD App** | `alloy-k8s` |
 | **Fly.io Config** | `fly/alloy.river` |
-| **Fly.io Image** | `grafana/alloy:v1.5.1` (binary copied into nginx container) |
+| **Fly.io Image** | `grafana/alloy:v1.16.1` (binary copied into nginx container, sha-pinned) |
 
 ## Metrics Collected
 
diff --git a/docs/reference/services/ntfy.md b/docs/reference/services/ntfy.md
index b549a6d..1bf45af 100644
--- a/docs/reference/services/ntfy.md
+++ b/docs/reference/services/ntfy.md
@@ -1,6 +1,7 @@
 ---
 title: Ntfy
-modified: 2026-02-17
+modified: 2026-06-04
+last-reviewed: 2026-06-04
 tags:
   - service
   - notifications
@@ -17,7 +18,7 @@ Self-hosted push notification service. Ntfy receives HTTP POST messages and deli
 | **URL** | https://ntfy.ops.eblu.me |
 | **Tailscale URL** | https://ntfy.tail8d86e.ts.net |
 | **Namespace** | `ntfy` |
-| **Image** | `binwiederhier/ntfy:v2.17.0` |
+| **Image** | `registry.ops.eblu.me/blumeops/ntfy:v2.19.2-fd0bebb-nix` (locally built) |
 | **Upstream** | https://github.com/binwiederhier/ntfy |
 | **Manifests** | `argocd/manifests/ntfy/` |
 
diff --git a/docs/reference/services/tempo.md b/docs/reference/services/tempo.md
index 771b97f..5eb5d87 100644
--- a/docs/reference/services/tempo.md
+++ b/docs/reference/services/tempo.md
@@ -1,6 +1,7 @@
 ---
 title: Tempo
-modified: 2026-03-05
+modified: 2026-06-04
+last-reviewed: 2026-06-04
 tags:
   - service
   - observability
@@ -18,7 +19,7 @@ Distributed tracing backend for BlumeOps infrastructure. Receives traces via OTL
 | **Tailscale URL** | https://tempo.tail8d86e.ts.net |
 | **OTLP Endpoint** | https://tempo-otlp.tail8d86e.ts.net |
 | **Namespace** | `monitoring` |
-| **Image** | `grafana/tempo:2.10.1` |
+| **Image** | `registry.ops.eblu.me/blumeops/tempo:v2.10.3-75f9ba4` (locally built) |
 | **Storage** | 10Gi PVC (local filesystem) |
 | **Retention** | 7 days |
 
diff --git a/service-versions.yaml b/service-versions.yaml
index 699f89c..11ec9f9 100644
--- a/service-versions.yaml
+++ b/service-versions.yaml
@@ -56,8 +56,8 @@ services:
 
   - name: nvidia-device-plugin
     type: argocd
-    last-reviewed: 2026-03-27
-    current-version: "v0.19.0"
+    last-reviewed: 2026-06-04
+    current-version: "v0.19.2"
     upstream-source: https://github.com/NVIDIA/k8s-device-plugin/releases
     notes: DaemonSet + RuntimeClass on ringtail for GPU workloads
 

From 0e70a1b5242183170a5d7d8ac96ee864063f65bb Mon Sep 17 00:00:00 2001
From: Erich Blume <blume.erich@gmail.com>
Date: Thu, 4 Jun 2026 14:55:55 -0700
Subject: [PATCH 03/20] Localize external-secrets container (native
 container.py build) (#367)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Knocks out the weekly "pick one non-local container and make it local" task by moving **external-secrets** off `ghcr.io` onto a locally-built image, under our own supply-chain control. Doubles as its overdue service review.

## What changed
- **`containers/external-secrets/container.py`** (new) — native Dagger build (the Dockerfile→container.py migration pattern). Clones the forge mirror at `v2.2.0` and builds the single `all_providers` static Go binary, faithful to upstream's `make build` (CGO disabled; upstream sets no version ldflags). ENTRYPOINT is `/bin/external-secrets` so the controller/webhook/cert-controller Deployments select their role via `args:` exactly as before.
- **`argocd/manifests/external-secrets/kustomization.yaml`** — image swapped to `registry.ops.eblu.me/blumeops/external-secrets:v2.2.0-2985007`. **Like-for-like (v2.2.0)**, not an upgrade.
- **`service-versions.yaml`** — marked reviewed (2026-06-04), noted the local build.

## Build
Built on the indri forge runner (run #579, ~4 min) → pushed to Zot. Image config verified: `Entrypoint=/bin/external-secrets`, `User=65534`, version label `v2.2.0`.

## Deployed from branch & verified
- All 3 pods (controller / webhook / cert-controller) rolled to the local image, `1/1 Running`
- Controller + webhook logs clean (no errors; webhook serving TLS)
- **End-to-end secret fetch proven:** force-synced `monitoring/grafana-admin` → `refreshTime` advanced to now, `Ready=True`
- All 10 ExternalSecrets cluster-wide remain `SecretSynced=True` — no collateral damage
- App `Healthy`

## Post-merge
`external-secrets` currently points at this branch (so `apps` reads OutOfSync — expected). After merge:
```
argocd app set external-secrets --revision main && argocd app sync external-secrets
```

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Reviewed-on: https://forge.eblu.me/eblume/blumeops/pulls/367
---
 .../external-secrets/kustomization.yaml       |  3 +-
 containers/external-secrets/container.py      | 51 +++++++++++++++++++
 .../local-external-secrets.infra.md           |  1 +
 service-versions.yaml                         |  7 ++-
 4 files changed, 59 insertions(+), 3 deletions(-)
 create mode 100644 containers/external-secrets/container.py
 create mode 100644 docs/changelog.d/local-external-secrets.infra.md

diff --git a/argocd/manifests/external-secrets/kustomization.yaml b/argocd/manifests/external-secrets/kustomization.yaml
index 574aaa7..c25a7d5 100644
--- a/argocd/manifests/external-secrets/kustomization.yaml
+++ b/argocd/manifests/external-secrets/kustomization.yaml
@@ -12,4 +12,5 @@ resources:
 
 images:
   - name: ghcr.io/external-secrets/external-secrets
-    newTag: v2.2.0
+    newName: registry.ops.eblu.me/blumeops/external-secrets
+    newTag: v2.2.0-2985007
diff --git a/containers/external-secrets/container.py b/containers/external-secrets/container.py
new file mode 100644
index 0000000..6be5765
--- /dev/null
+++ b/containers/external-secrets/container.py
@@ -0,0 +1,51 @@
+"""External Secrets Operator — native Dagger build.
+
+Two-stage build: Go binary (all providers), Alpine runtime.
+Source cloned from forge mirror.
+
+A single binary serves as the controller, webhook, and cert-controller; the
+Deployments select the role via a subcommand passed in `args:`, so the image
+ENTRYPOINT must be the binary itself (matching upstream's distroless image).
+"""
+
+import dagger
+
+from blumeops.containers import (
+    alpine_runtime,
+    clone_from_forge,
+    go_build,
+    oci_labels,
+)
+
+VERSION = "v2.2.0"
+
+
+async def build(src: dagger.Directory) -> dagger.Container:
+    source = clone_from_forge("external-secrets", VERSION)
+
+    # Upstream `make build` compiles every secret provider into a single
+    # static binary (`-tags all_providers`, CGO disabled). Mirror that so the
+    # local image is functionally identical to ghcr.io/.../external-secrets.
+    backend = go_build(
+        source,
+        "/external-secrets",
+        tags="all_providers",
+    )
+
+    runtime = alpine_runtime(
+        extra_apk=["ca-certificates"],
+        create_user=False,
+    )
+    runtime = oci_labels(
+        runtime,
+        title="External Secrets Operator",
+        description=(
+            "Kubernetes operator that integrates external secret management systems"
+        ),
+        version=VERSION,
+    )
+    return (
+        runtime.with_file("/bin/external-secrets", backend.file("/external-secrets"))
+        .with_user("65534")
+        .with_entrypoint(["/bin/external-secrets"])
+    )
diff --git a/docs/changelog.d/local-external-secrets.infra.md b/docs/changelog.d/local-external-secrets.infra.md
new file mode 100644
index 0000000..13cbb05
--- /dev/null
+++ b/docs/changelog.d/local-external-secrets.infra.md
@@ -0,0 +1 @@
+Localized the external-secrets controller image. It now builds from the forge mirror via a native Dagger `container.py` (single `all_providers` static Go binary, faithful to upstream's `make build`) and is served from `registry.ops.eblu.me/blumeops/external-secrets` instead of `ghcr.io`, bringing another platform component under local supply-chain control.
diff --git a/service-versions.yaml b/service-versions.yaml
index 11ec9f9..cc9dc9e 100644
--- a/service-versions.yaml
+++ b/service-versions.yaml
@@ -159,10 +159,13 @@ services:
 
   - name: external-secrets
     type: argocd
-    last-reviewed: 2026-03-25
+    last-reviewed: 2026-06-04
     current-version: "v2.2.0"
     upstream-source: https://github.com/external-secrets/external-secrets/releases
-    notes: Static kustomize manifests rendered from upstream Helm chart
+    notes: >-
+      Static kustomize manifests rendered from upstream Helm chart. Controller
+      image is locally built from the forge mirror via containers/external-secrets/container.py
+      (single all_providers static Go binary).
 
   - name: 1password-connect
     type: argocd

From 30c82079b9dbb8e2492586d979cd4ec5b04cd08d Mon Sep 17 00:00:00 2001
From: Erich Blume <blume.erich@gmail.com>
Date: Thu, 4 Jun 2026 14:59:17 -0700
Subject: [PATCH 04/20] C0: rebuild external-secrets image off main
 (v2.2.0-0e70a1b)

Repoint to the main-branch-built image so the deployed tag traces to a main
commit rather than the merged feature branch. Same v2.2.0 source, stable
provenance.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 argocd/manifests/external-secrets/kustomization.yaml         | 2 +-
 docs/changelog.d/+external-secrets-main-sha-rebuild.infra.md | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)
 create mode 100644 docs/changelog.d/+external-secrets-main-sha-rebuild.infra.md

diff --git a/argocd/manifests/external-secrets/kustomization.yaml b/argocd/manifests/external-secrets/kustomization.yaml
index c25a7d5..8b1aea5 100644
--- a/argocd/manifests/external-secrets/kustomization.yaml
+++ b/argocd/manifests/external-secrets/kustomization.yaml
@@ -13,4 +13,4 @@ resources:
 images:
   - name: ghcr.io/external-secrets/external-secrets
     newName: registry.ops.eblu.me/blumeops/external-secrets
-    newTag: v2.2.0-2985007
+    newTag: v2.2.0-0e70a1b
diff --git a/docs/changelog.d/+external-secrets-main-sha-rebuild.infra.md b/docs/changelog.d/+external-secrets-main-sha-rebuild.infra.md
new file mode 100644
index 0000000..2e931d4
--- /dev/null
+++ b/docs/changelog.d/+external-secrets-main-sha-rebuild.infra.md
@@ -0,0 +1 @@
+Rebuilt the locally-built external-secrets image from the `main` branch so the deployed tag (`v2.2.0-0e70a1b`) traces to a `main` commit rather than the now-merged feature branch, giving a stable provenance reference.

From 13895bb04a5afcbb723d7ab3355d228431d76a5d Mon Sep 17 00:00:00 2001
From: Erich Blume <blume.erich@gmail.com>
Date: Thu, 4 Jun 2026 15:37:42 -0700
Subject: [PATCH 05/20] Localize external-secrets on ringtail (amd64 nix build)
 (#368)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Follow-up to #367. That PR localized external-secrets, but its Dagger build (on indri's Apple Silicon runner) produces only an **arm64** image, and external-secrets also runs on **ringtail (amd64)** via the same shared manifest. This completes the localization so both clusters run the local binary on their native arch.

## Approach (matches the kube-state-metrics dual-build pattern)
- **`containers/external-secrets/default.nix`** (new) — builds the **amd64** image on ringtail's nix-container-builder. `buildGoModule` with Go 1.26 (v2.2.0 requires ≥1.26.1; nixpkgs default is 1.25.x) and `-tags all_providers`, faithful to upstream. Same v2.2.0 source from the forge mirror.
- **`argocd/manifests/external-secrets-ringtail/`** (new) — thin kustomize overlay that reuses the shared indri manifest as a base and overrides **only** the image to the `-nix` (amd64) tag. No manifest duplication.
- **`argocd/apps/external-secrets-ringtail.yaml`** — repointed at the new overlay.

Result: indri → `v2.2.0-…` (arm64, Dagger), ringtail → `v2.2.0-…-nix` (amd64, nix).

## Build
Run #581 built both arches at the branch commit. Verified the nix image is `linux/amd64`, entrypoint = the binary, user 65534.

## Deployed from branch & verified on ringtail (k3s, amd64)
- All 3 pods rolled to the nix amd64 image, `1/1 Running` (no exec-format error → arch correct)
- Controller logs clean
- **Live secret fetch proven:** force-synced `homepage/homepage-grafana` → `refreshTime` advanced, `Ready=True`
- **All 20** ringtail ExternalSecrets remain `SecretSynced=True`

## Post-merge
The `external-secrets-ringtail` app is temporarily pointed at this branch and the new overlay path (the `apps` app is left on `main`, manual-sync, untouched). After merge:
```
argocd app sync apps                       # picks up the new Application path on main
argocd app set external-secrets-ringtail --revision main && argocd app sync external-secrets-ringtail
```
I'll also rebuild off `main` so both clusters land on stable main-sha tags (as done for indri in #367).

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Reviewed-on: https://forge.eblu.me/eblume/blumeops/pulls/368
---
 argocd/apps/external-secrets-ringtail.yaml    |  2 +-
 .../kustomization.yaml                        | 16 ++++++
 containers/external-secrets/default.nix       | 56 +++++++++++++++++++
 .../external-secrets-ringtail-nix.infra.md    |  1 +
 4 files changed, 74 insertions(+), 1 deletion(-)
 create mode 100644 argocd/manifests/external-secrets-ringtail/kustomization.yaml
 create mode 100644 containers/external-secrets/default.nix
 create mode 100644 docs/changelog.d/external-secrets-ringtail-nix.infra.md

diff --git a/argocd/apps/external-secrets-ringtail.yaml b/argocd/apps/external-secrets-ringtail.yaml
index e2f5898..0bb8bd7 100644
--- a/argocd/apps/external-secrets-ringtail.yaml
+++ b/argocd/apps/external-secrets-ringtail.yaml
@@ -15,7 +15,7 @@ spec:
   source:
     repoURL: ssh://forgejo@forge.ops.eblu.me:2222/eblume/blumeops.git
     targetRevision: main
-    path: argocd/manifests/external-secrets
+    path: argocd/manifests/external-secrets-ringtail
   destination:
     server: https://ringtail.tail8d86e.ts.net:6443
     namespace: external-secrets
diff --git a/argocd/manifests/external-secrets-ringtail/kustomization.yaml b/argocd/manifests/external-secrets-ringtail/kustomization.yaml
new file mode 100644
index 0000000..05b6b54
--- /dev/null
+++ b/argocd/manifests/external-secrets-ringtail/kustomization.yaml
@@ -0,0 +1,16 @@
+# Ringtail (amd64) overlay for external-secrets.
+#
+# Reuses the shared indri manifest as a base and only overrides the controller
+# image to the nix-built amd64 variant (`-nix` tag). The base sets the arm64
+# image (built via containers/external-secrets/container.py on indri's Dagger
+# runner); ringtail's k3s is amd64 and needs the image built by
+# containers/external-secrets/default.nix on the nix-container-builder.
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+resources:
+  - ../external-secrets
+
+images:
+  - name: registry.ops.eblu.me/blumeops/external-secrets
+    newTag: v2.2.0-59dace8-nix
diff --git a/containers/external-secrets/default.nix b/containers/external-secrets/default.nix
new file mode 100644
index 0000000..eabe03d
--- /dev/null
+++ b/containers/external-secrets/default.nix
@@ -0,0 +1,56 @@
+# Nix-built External Secrets Operator (amd64, for ringtail k3s).
+# Builds v2.2.0 from the forge mirror with all secret providers compiled in,
+# faithful to upstream's `make build` (-tags all_providers). The container.py
+# sibling builds the arm64 image for indri's minikube; this default.nix builds
+# the amd64 image on ringtail's nix-container-builder.
+{ pkgs ? import <nixpkgs> { } }:
+
+let
+  version = "2.2.0";
+
+  src = pkgs.fetchgit {
+    url = "https://forge.ops.eblu.me/mirrors/external-secrets.git";
+    rev = "v${version}";
+    hash = "sha256-eAocOAp5s4CFRrpKfQr2lf3Ji+6nQQ1A5/eTw5B7v9U=";
+  };
+
+  # external-secrets v2.2.0 requires Go >= 1.26.1; nixpkgs default go is 1.25.x.
+  external-secrets = (pkgs.buildGoModule.override { go = pkgs.go_1_26; }) {
+    inherit src version;
+    pname = "external-secrets";
+    vendorHash = "sha256-0xuBK3fjAplPLAElHvKB6d+2lDz+De/s91fV4dPZwjE=";
+
+    doCheck = false;
+
+    subPackages = [ "." ];
+
+    tags = [ "all_providers" ];
+
+    ldflags = [ "-s" "-w" ];
+
+    meta = with pkgs.lib; {
+      description = "Kubernetes operator that integrates external secret management systems";
+      homepage = "https://github.com/external-secrets/external-secrets";
+      license = licenses.asl20;
+      mainProgram = "external-secrets";
+    };
+  };
+in
+
+pkgs.dockerTools.buildLayeredImage {
+  name = "blumeops/external-secrets";
+  contents = [
+    external-secrets
+    pkgs.cacert
+    pkgs.tzdata
+  ];
+
+  config = {
+    Entrypoint = [ "${external-secrets}/bin/external-secrets" ];
+    Env = [
+      "SSL_CERT_FILE=${pkgs.cacert}/etc/ssl/certs/ca-bundle.crt"
+      "TZDIR=${pkgs.tzdata}/share/zoneinfo"
+    ];
+    User = "65534";
+  };
+}
diff --git a/docs/changelog.d/external-secrets-ringtail-nix.infra.md b/docs/changelog.d/external-secrets-ringtail-nix.infra.md
new file mode 100644
index 0000000..9ce3f85
--- /dev/null
+++ b/docs/changelog.d/external-secrets-ringtail-nix.infra.md
@@ -0,0 +1 @@
+Completed the external-secrets localization for the ringtail (amd64) cluster. The indri Dagger build (`container.py`) only produces an arm64 image; added `containers/external-secrets/default.nix` to build the amd64 variant on ringtail's nix-container-builder, and gave `external-secrets-ringtail` a thin kustomize overlay that reuses the shared manifest and points at the `-nix` image. Both clusters now run the locally-built external-secrets binary on their native architecture.

From f6c926f1f594a0ee019bca5d31cdcc4225f6d6cf Mon Sep 17 00:00:00 2001
From: Erich Blume <blume.erich@gmail.com>
Date: Thu, 4 Jun 2026 16:19:20 -0700
Subject: [PATCH 06/20] C0: rebuild external-secrets off main, repoint both
 clusters to stable tags

indri -> v2.2.0-13895bb (arm64), ringtail -> v2.2.0-13895bb-nix (amd64).
Both deployed images now trace to main commit 13895bb instead of earlier
branch builds.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 argocd/manifests/external-secrets-ringtail/kustomization.yaml | 2 +-
 argocd/manifests/external-secrets/kustomization.yaml          | 2 +-
 docs/changelog.d/+external-secrets-stable-main-sha.infra.md   | 1 +
 3 files changed, 3 insertions(+), 2 deletions(-)
 create mode 100644 docs/changelog.d/+external-secrets-stable-main-sha.infra.md

diff --git a/argocd/manifests/external-secrets-ringtail/kustomization.yaml b/argocd/manifests/external-secrets-ringtail/kustomization.yaml
index 05b6b54..9fd4e2f 100644
--- a/argocd/manifests/external-secrets-ringtail/kustomization.yaml
+++ b/argocd/manifests/external-secrets-ringtail/kustomization.yaml
@@ -13,4 +13,4 @@ resources:
 
 images:
   - name: registry.ops.eblu.me/blumeops/external-secrets
-    newTag: v2.2.0-59dace8-nix
+    newTag: v2.2.0-13895bb-nix
diff --git a/argocd/manifests/external-secrets/kustomization.yaml b/argocd/manifests/external-secrets/kustomization.yaml
index 8b1aea5..639db66 100644
--- a/argocd/manifests/external-secrets/kustomization.yaml
+++ b/argocd/manifests/external-secrets/kustomization.yaml
@@ -13,4 +13,4 @@ resources:
 images:
   - name: ghcr.io/external-secrets/external-secrets
     newName: registry.ops.eblu.me/blumeops/external-secrets
-    newTag: v2.2.0-0e70a1b
+    newTag: v2.2.0-13895bb
diff --git a/docs/changelog.d/+external-secrets-stable-main-sha.infra.md b/docs/changelog.d/+external-secrets-stable-main-sha.infra.md
new file mode 100644
index 0000000..fbe3c21
--- /dev/null
+++ b/docs/changelog.d/+external-secrets-stable-main-sha.infra.md
@@ -0,0 +1 @@
+Rebuilt the external-secrets images off `main` and repointed both clusters to the stable main-sha tags (`v2.2.0-13895bb` arm64 / `v2.2.0-13895bb-nix` amd64), so the deployed images on indri and ringtail trace to the same `main` commit rather than earlier feature-branch builds.

From a2f1e062243a47c7c68b5a57617f14102b798503 Mon Sep 17 00:00:00 2001
From: Erich Blume <blume.erich@gmail.com>
Date: Fri, 5 Jun 2026 06:46:58 -0700
Subject: [PATCH 07/20] Add hephaestus sync hub to indri (launchagent, PWA,
 device-code OIDC) (#369)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Makes indri the canonical **heph** hub for the hub-and-spoke task/context system, deployed as a self-updating LaunchAgent managed by Ansible. Other devices (gilbert) attach as offline-capable spokes.

## What's here
- **`ansible/roles/heph`** (tag `heph`) — bootstrap `cargo install hephd` (only if absent; `--self-update` keeps it current after), version-pinned `heph-pwa` checkout served via `--web-root`, launchagent `mcquack.eblume.heph`:
  ```
  hephd --mode server --http-addr 0.0.0.0:8787 --db … --web-root …
        --oidc-issuer …/o/heph/ --oidc-audience heph
        --self-update --self-update-interval-secs 600
  ```
  `~/.cargo/bin` is on the agent `PATH` so self-update's `cargo install` works.
- **Caddy** — `heph.ops.eblu.me → localhost:8787` (TLS for the PWA secure context).
- **Authentik** — new `heph` **public device-code** OIDC app + `default-device-code-flow` bound to the default brand's `flow_device_code` (verified live: brand `authentik-default`, field currently unset → additive).
- **Docs** — `services/hephaestus.md` (Path-A seeding runbook + spoke caveat), `indri.md`, changelog fragment.

## Three features requested
- **Autoupdate** — 10-min interval (`--self-update-interval-secs 600`).
- **PWA** — `--web-root` (confirmed shipped in v1.2.0).
- **Spoke** — gilbert reconfig documented (post-merge step).

## Deploy plan (not done yet — awaiting review)
1. Seed from gilbert (Path A): `heph daemon stop` → copy `heph.db` → `DELETE FROM meta WHERE key='origin'`.
2. Sync Authentik `apps`/blueprint; verify blueprint status via API (not just logs).
3. `provision-indri --tags heph,caddy` from this branch.
4. Point gilbert at the hub + `heph auth login`.

## Known follow-ups (heph-side, tracked in the Hephaestus project)
- `heph daemon` can't bake hub/spoke config or pass `--self-update-interval-secs` → worked around by the ansible plist.
- Path-A seeding lacks a clean `hephd --owner-id`/seed command → manual `meta.origin` reset for now.
- Self-update moves hephd ahead of the ansible-pinned PWA shell over time (drift; tolerated by the SW cache, revisit on next release).

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Reviewed-on: https://forge.eblu.me/eblume/blumeops/pulls/369
---
 ansible/playbooks/indri.yml                   |   2 +
 ansible/roles/caddy/defaults/main.yml         |   3 +
 ansible/roles/heph/defaults/main.yml          |  49 +++++++
 ansible/roles/heph/handlers/main.yml          |   6 +
 ansible/roles/heph/tasks/main.yml             |  82 +++++++++++
 ansible/roles/heph/templates/heph.plist.j2    |  50 +++++++
 .../authentik/configmap-blueprint.yaml        |  79 +++++++++++
 docs/changelog.d/heph-indri-hub.infra.md      |   1 +
 docs/reference/infrastructure/indri.md        |   1 +
 docs/reference/services/hephaestus.md         | 130 ++++++++++++++++++
 10 files changed, 403 insertions(+)
 create mode 100644 ansible/roles/heph/defaults/main.yml
 create mode 100644 ansible/roles/heph/handlers/main.yml
 create mode 100644 ansible/roles/heph/tasks/main.yml
 create mode 100644 ansible/roles/heph/templates/heph.plist.j2
 create mode 100644 docs/changelog.d/heph-indri-hub.infra.md
 create mode 100644 docs/reference/services/hephaestus.md

diff --git a/ansible/playbooks/indri.yml b/ansible/playbooks/indri.yml
index ddb57f8..1e33bb1 100644
--- a/ansible/playbooks/indri.yml
+++ b/ansible/playbooks/indri.yml
@@ -260,5 +260,7 @@
       tags: cv
     - role: docs
       tags: docs
+    - role: heph
+      tags: heph
     - role: caddy
       tags: caddy
diff --git a/ansible/roles/caddy/defaults/main.yml b/ansible/roles/caddy/defaults/main.yml
index 363d09e..e6d7385 100644
--- a/ansible/roles/caddy/defaults/main.yml
+++ b/ansible/roles/caddy/defaults/main.yml
@@ -52,6 +52,9 @@ caddy_services:
   - name: devpi
     host: "pypi.{{ caddy_domain }}"
     backend: "http://localhost:3141"
+  - name: heph
+    host: "heph.{{ caddy_domain }}"
+    backend: "http://localhost:8787"  # hephaestus hub (server mode) + PWA shell
   - name: kiwix
     host: "kiwix.{{ caddy_domain }}"
     backend: "https://kiwix.tail8d86e.ts.net"
diff --git a/ansible/roles/heph/defaults/main.yml b/ansible/roles/heph/defaults/main.yml
new file mode 100644
index 0000000..e5eea36
--- /dev/null
+++ b/ansible/roles/heph/defaults/main.yml
@@ -0,0 +1,49 @@
+---
+# hephaestus hub — the canonical heph replica (server mode) on indri.
+# Other devices (e.g. gilbert) are spokes that sync against this hub.
+# See [[set-up-sync-hub]] and [[host-heph-pwa]] in the hephaestus repo.
+
+# Pinned release used for the initial `cargo install` and the PWA shell.
+# After bootstrap, hephd's own --self-update keeps the binary current; this
+# pin only governs the first install and the bundled PWA shell version.
+heph_version: v1.2.0
+
+# Anonymous public HTTPS clone — matches hephd's INSTALL_GIT_URL so the initial
+# install and unattended self-update build from the same source (no ssh-agent).
+heph_repo_url: https://forge.eblu.me/eblume/hephaestus.git
+
+heph_bin_dir: /Users/erichblume/.cargo/bin
+heph_binary: "{{ heph_bin_dir }}/hephd"
+
+# rustc/cargo here are rustup shims. The bare (non-mise) environment that the
+# launchagent and ansible run in falls back to rustup's *default* toolchain,
+# which can lag behind heph's rust-version floor (Cargo.toml: 1.89). Pin the
+# channel explicitly so both the bootstrap build and unattended self-update
+# always use a current toolchain regardless of the host's rustup default.
+heph_rust_toolchain: stable
+
+heph_data_dir: /Users/erichblume/.local/share/heph
+heph_db: "{{ heph_data_dir }}/heph.db"
+heph_socket: "{{ heph_data_dir }}/hephd.sock"
+heph_log_dir: /Users/erichblume/Library/Logs
+
+# Version-pinned source checkout; the PWA static shell is served directly from
+# its heph-pwa/ subdir (no copy). It stays at heph_version while hephd self-updates.
+heph_pwa_src_dir: /Users/erichblume/.cache/heph-pwa-src
+heph_web_root: "{{ heph_pwa_src_dir }}/heph-pwa"
+
+# Hub listens on all interfaces so tailnet spokes can reach it directly
+# (http://indri.tail8d86e.ts.net:8787) and Caddy can proxy heph.ops.eblu.me.
+# Access is gated by Authentik OIDC regardless — tailnet reachability is not
+# enough (this is the owner's most sensitive data).
+heph_http_addr: 0.0.0.0:8787
+heph_port: 8787
+heph_external_url: https://heph.ops.eblu.me
+
+# Authentik OIDC — issuer + audience together turn hub auth on. The audience is
+# the device-code client id (see argocd/manifests/authentik heph blueprint).
+heph_oidc_issuer: https://authentik.ops.eblu.me/application/o/heph/
+heph_oidc_audience: heph
+
+# Self-update poll interval (seconds). 10 minutes.
+heph_self_update_interval_secs: 600
diff --git a/ansible/roles/heph/handlers/main.yml b/ansible/roles/heph/handlers/main.yml
new file mode 100644
index 0000000..92fe9d7
--- /dev/null
+++ b/ansible/roles/heph/handlers/main.yml
@@ -0,0 +1,6 @@
+---
+- name: Restart heph
+  ansible.builtin.shell: |
+    launchctl unload ~/Library/LaunchAgents/mcquack.eblume.heph.plist 2>/dev/null || true
+    launchctl load ~/Library/LaunchAgents/mcquack.eblume.heph.plist
+  changed_when: true
diff --git a/ansible/roles/heph/tasks/main.yml b/ansible/roles/heph/tasks/main.yml
new file mode 100644
index 0000000..7a45fe3
--- /dev/null
+++ b/ansible/roles/heph/tasks/main.yml
@@ -0,0 +1,82 @@
+---
+# hephaestus hub (server mode) on indri.
+#
+# DATA SEEDING (one-time, Path A — do this BEFORE the first provision so the hub
+# adopts gilbert's existing data instead of being born empty):
+#
+#   1. On the seed device (gilbert):   heph daemon stop
+#   2. Copy its store to indri:         scp ~/.local/share/heph/heph.db \
+#                                           indri:~/.local/share/heph/heph.db
+#   3. On indri, give the hub its OWN device origin (keeps gilbert's owner_id +
+#      data; hephd regenerates a fresh origin on next start when it is missing):
+#        sqlite3 ~/.local/share/heph/heph.db "DELETE FROM meta WHERE key='origin';"
+#   4. Run this role (installs hephd, stages the PWA, loads the launchagent).
+#
+# hephd auto-creates an empty store on first start if none exists, so seeding is
+# optional — skip it only if you intend a fresh, empty hub.
+
+- name: Ensure heph data directory exists
+  ansible.builtin.file:
+    path: "{{ heph_data_dir }}"
+    state: directory
+    mode: '0700'
+
+- name: Check for installed hephd binary
+  ansible.builtin.stat:
+    path: "{{ heph_binary }}"
+  register: heph_binary_stat
+
+# Bootstrap install only when hephd is absent. Thereafter hephd's own
+# --self-update keeps it current; ansible must not fight (or downgrade) it.
+# This builds from source and can take several minutes on a cold cargo cache.
+- name: Bootstrap-install heph + hephd from the forge ({{ heph_version }})
+  ansible.builtin.command:
+    cmd: >-
+      {{ heph_bin_dir }}/cargo install --locked
+      --git {{ heph_repo_url }}
+      --tag {{ heph_version }}
+      heph hephd
+  environment:
+    PATH: "{{ heph_bin_dir }}:/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin"
+    RUSTUP_TOOLCHAIN: "{{ heph_rust_toolchain }}"
+  when: not heph_binary_stat.stat.exists
+  changed_when: true
+  notify: Restart heph
+
+# Checkout provides the PWA shell at {{ heph_web_root }} (heph-pwa/ subdir),
+# served directly by hephd. Static files are read from disk per request, so a
+# version bump needs no restart; the service worker (CACHE = "heph-pwa-vN")
+# evicts stale assets on next load.
+- name: Ensure heph cache parent directory exists
+  ansible.builtin.file:
+    path: "{{ heph_pwa_src_dir | dirname }}"
+    state: directory
+    mode: '0755'
+
+- name: Stage heph-pwa source at {{ heph_version }}
+  ansible.builtin.git:
+    repo: "{{ heph_repo_url }}"
+    dest: "{{ heph_pwa_src_dir }}"
+    version: "{{ heph_version }}"
+    depth: 1
+    single_branch: true
+    force: true
+
+- name: Deploy heph LaunchAgent plist
+  ansible.builtin.template:
+    src: heph.plist.j2
+    dest: ~/Library/LaunchAgents/mcquack.eblume.heph.plist
+    mode: '0644'
+  notify: Restart heph
+
+- name: Check if heph LaunchAgent is loaded
+  ansible.builtin.command: launchctl list mcquack.eblume.heph
+  register: heph_launchctl_check
+  changed_when: false
+  failed_when: false
+
+- name: Load heph LaunchAgent if not loaded
+  ansible.builtin.command: launchctl load ~/Library/LaunchAgents/mcquack.eblume.heph.plist
+  when: heph_launchctl_check.rc != 0
+  changed_when: true
+  failed_when: false
diff --git a/ansible/roles/heph/templates/heph.plist.j2 b/ansible/roles/heph/templates/heph.plist.j2
new file mode 100644
index 0000000..19a2367
--- /dev/null
+++ b/ansible/roles/heph/templates/heph.plist.j2
@@ -0,0 +1,50 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- {{ ansible_managed }} -->
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+	<key>Label</key>
+	<string>mcquack.eblume.heph</string>
+	<key>ProgramArguments</key>
+	<array>
+		<string>{{ heph_binary }}</string>
+		<string>--mode</string>
+		<string>server</string>
+		<string>--http-addr</string>
+		<string>{{ heph_http_addr }}</string>
+		<string>--db</string>
+		<string>{{ heph_db }}</string>
+		<string>--socket</string>
+		<string>{{ heph_socket }}</string>
+		<string>--web-root</string>
+		<string>{{ heph_web_root }}</string>
+		<string>--oidc-issuer</string>
+		<string>{{ heph_oidc_issuer }}</string>
+		<string>--oidc-audience</string>
+		<string>{{ heph_oidc_audience }}</string>
+		<string>--self-update</string>
+		<string>--self-update-interval-secs</string>
+		<string>{{ heph_self_update_interval_secs }}</string>
+	</array>
+	<key>RunAtLoad</key>
+	<true/>
+	<key>KeepAlive</key>
+	<true/>
+	<key>EnvironmentVariables</key>
+	<dict>
+		<!-- cargo + toolchain on PATH so --self-update can run `cargo install`. -->
+		<key>PATH</key>
+		<string>{{ heph_bin_dir }}:/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin</string>
+		<key>HOME</key>
+		<string>/Users/erichblume</string>
+		<!-- Pin the rustup channel: the launchagent runs without mise, so a bare
+		     cargo shim would otherwise use rustup's (stale) default toolchain. -->
+		<key>RUSTUP_TOOLCHAIN</key>
+		<string>{{ heph_rust_toolchain }}</string>
+	</dict>
+	<key>StandardOutPath</key>
+	<string>{{ heph_log_dir }}/mcquack.heph.out.log</string>
+	<key>StandardErrorPath</key>
+	<string>{{ heph_log_dir }}/mcquack.heph.err.log</string>
+</dict>
+</plist>
diff --git a/argocd/manifests/authentik/configmap-blueprint.yaml b/argocd/manifests/authentik/configmap-blueprint.yaml
index fcbb99b..56d9110 100644
--- a/argocd/manifests/authentik/configmap-blueprint.yaml
+++ b/argocd/manifests/authentik/configmap-blueprint.yaml
@@ -434,3 +434,82 @@ data:
           provider: !KeyOf mealie-provider
           meta_launch_url: https://meals.ops.eblu.me
           policy_engine_mode: all
+
+  heph.yaml: |
+    version: 1
+    metadata:
+      name: BlumeOps Heph SSO
+      labels:
+        blueprints.goauthentik.io/description: "Hephaestus hub OIDC (device-code) provider, application, and device-code flow"
+    entries:
+      # Device-code flow (RFC 8628). authentik ships no default for this, so we
+      # create one and bind it to the brand below. An empty stage_configuration
+      # flow is sufficient: the already-authenticated user just confirms the code.
+      - model: authentik_flows.flow
+        id: device-code-flow
+        identifiers:
+          slug: default-device-code-flow
+        attrs:
+          name: Device code flow
+          title: Device code flow
+          slug: default-device-code-flow
+          designation: stage_configuration
+          authentication: require_authenticated
+
+      # Enable the device-code grant globally by binding the flow to the default
+      # brand (domain authentik-default). Partial update — only sets this field.
+      - model: authentik_brands.brand
+        identifiers:
+          domain: authentik-default
+        attrs:
+          flow_device_code: !KeyOf device-code-flow
+
+      # OAuth2 provider for heph — PUBLIC client (device-code + PKCE, no secret).
+      # client_id doubles as the token audience the hub verifies (--oidc-audience heph),
+      # and the app slug 'heph' is the issuer path (/application/o/heph/).
+      - model: authentik_providers_oauth2.oauth2provider
+        id: heph-provider
+        identifiers:
+          name: Heph
+        attrs:
+          name: Heph
+          authorization_flow: !Find [authentik_flows.flow, [slug, default-provider-authorization-implicit-consent]]
+          invalidation_flow: !Find [authentik_flows.flow, [slug, default-provider-invalidation-flow]]
+          client_type: public
+          client_id: heph
+          # Device-code (RFC 8628) + PKCE use no redirect, but the provider
+          # serializer requires the field — an empty list satisfies it.
+          redirect_uris: []
+          signing_key: !Find [authentik_crypto.certificatekeypair, [name, authentik Self-signed Certificate]]
+          property_mappings:
+            - !Find [authentik_providers_oauth2.scopemapping, [scope_name, openid]]
+            - !Find [authentik_providers_oauth2.scopemapping, [scope_name, email]]
+            - !Find [authentik_providers_oauth2.scopemapping, [scope_name, profile]]
+          sub_mode: hashed_user_id
+          include_claims_in_id_token: true
+
+      # Heph application — linked to the OAuth2 provider
+      - model: authentik_core.application
+        id: heph-app
+        identifiers:
+          slug: heph
+        attrs:
+          name: Hephaestus
+          slug: heph
+          provider: !KeyOf heph-provider
+          meta_launch_url: https://heph.ops.eblu.me
+          policy_engine_mode: any
+
+      # Policy binding — restrict heph to admins group (single-owner, sensitive data)
+      - model: authentik_policies.policybinding
+        identifiers:
+          order: 0
+          target: !KeyOf heph-app
+          group: !Find [authentik_core.group, [name, admins]]
+        attrs:
+          target: !KeyOf heph-app
+          group: !Find [authentik_core.group, [name, admins]]
+          order: 0
+          enabled: true
+          negate: false
+          timeout: 30
diff --git a/docs/changelog.d/heph-indri-hub.infra.md b/docs/changelog.d/heph-indri-hub.infra.md
new file mode 100644
index 0000000..6761cb7
--- /dev/null
+++ b/docs/changelog.d/heph-indri-hub.infra.md
@@ -0,0 +1 @@
+Added the [[hephaestus]] (`heph`) sync hub to indri as a self-updating LaunchAgent managed by Ansible (`ansible/roles/heph`, tag `heph`). The hub runs `hephd --mode server` behind `heph.ops.eblu.me` (Caddy TLS), with self-update on a 10-minute interval and the heph-pwa mobile shell served from `--web-root`. Access is gated by a new Authentik device-code (RFC 8628) OIDC application. Indri is now the canonical hub; other devices (e.g. gilbert) attach as offline-capable spokes. The hub's store was seeded from gilbert via the data-safe Path A bring-up (copy store, reset `meta.origin`).
diff --git a/docs/reference/infrastructure/indri.md b/docs/reference/infrastructure/indri.md
index 67652ca..8364ba0 100644
--- a/docs/reference/infrastructure/indri.md
+++ b/docs/reference/infrastructure/indri.md
@@ -33,6 +33,7 @@ Primary BlumeOps server. Mac Mini M1 (2020).
 - [[alloy|Alloy]] - Metrics/logs collector
 - [[caddy]] - Reverse proxy for `*.ops.eblu.me`
 - [[devpi]] - PyPI mirror (LaunchAgent)
+- [[hephaestus]] - heph task/context sync hub (LaunchAgent, self-updating)
 - [[cv]] - Static CV site, served by Caddy
 - [[docs]] - Quartz-built docs site, served by Caddy
 
diff --git a/docs/reference/services/hephaestus.md b/docs/reference/services/hephaestus.md
new file mode 100644
index 0000000..1754ea0
--- /dev/null
+++ b/docs/reference/services/hephaestus.md
@@ -0,0 +1,130 @@
+---
+title: Hephaestus
+modified: 2026-06-04
+last-reviewed: 2026-06-04
+tags:
+  - service
+  - hephaestus
+---
+
+# Hephaestus
+
+[hephaestus](https://github.com/eblume/hephaestus) (`heph`) is the user's
+self-hosted task + context/knowledge system. It is **hub-and-spoke**: each device
+runs a full local SQLite replica (`hephd --mode local`) and background-syncs
+against one canonical **hub**. Indri runs that hub.
+
+## Quick Reference
+
+| Property | Value |
+|----------|-------|
+| **PWA URL** | https://heph.ops.eblu.me (browser PWA, Caddy TLS) |
+| **Spoke sync URL** | http://indri.tail8d86e.ts.net:8787 (direct, tailnet) |
+| **Local Port** | 8787 (`hephd --mode server`, bound `0.0.0.0`) |
+| **Binary** | `~/.cargo/bin/hephd` (self-updating) |
+| **Data** | `~/.local/share/heph/heph.db` |
+| **PWA shell** | `~/.cache/heph-pwa-src/heph-pwa` (version-pinned checkout) |
+| **Logs** | `~/Library/Logs/mcquack.heph.{out,err}.log` |
+| **LaunchAgent** | `mcquack.eblume.heph` |
+| **Ansible role** | `ansible/roles/heph` (tag `heph`) |
+
+## What runs on indri
+
+The launchagent runs the hub in server mode with three features enabled:
+
+```
+hephd --mode server --http-addr 0.0.0.0:8787 --db ~/.local/share/heph/heph.db
+      --web-root ~/.cache/heph-pwa-src/heph-pwa
+      --oidc-issuer https://authentik.ops.eblu.me/application/o/heph/
+      --oidc-audience heph
+      --self-update --self-update-interval-secs 600
+```
+
+- **Server mode** exposes the HTTP sync endpoint (`/rpc`, `/sync/*`) that spokes
+  reconcile their op-log against.
+- **Self-update** (10-minute poll) rebuilds `hephd` from the forge when a newer
+  release tag appears (`cargo install --git https://forge.eblu.me/eblume/hephaestus.git`).
+  Indri's Rust toolchain (`~/.cargo/bin`) is on the agent's `PATH` for this, and
+  the plist pins `RUSTUP_TOOLCHAIN=stable` — the
+  launchagent runs without mise, so a bare `cargo` shim would otherwise fall back
+  to rustup's *default* toolchain, which can lag behind heph's `rust-version` floor
+  (1.89) and silently fail the build.
+- **PWA** (`--web-root`) serves the [heph-pwa] mobile shell; Caddy terminates TLS
+  at `heph.ops.eblu.me` so the PWA runs in a secure context (service worker,
+  install-to-home-screen, voice capture).
+
+[heph-pwa]: https://github.com/eblume/hephaestus
+
+The hub binds `0.0.0.0` so tailnet spokes can also sync directly
+(`http://indri.tail8d86e.ts.net:8787`); access is gated by Authentik OIDC either
+way — tailnet reachability alone is not enough.
+
+## Authentication (Authentik OIDC, device-code)
+
+The hub verifies an OIDC bearer token on every sync. The `heph` application is a
+**public** OAuth2 client using the **device-code flow** (RFC 8628), provisioned
+in the [[authentik]] blueprint (`argocd/manifests/authentik/configmap-blueprint.yaml`):
+
+- Issuer: `https://authentik.ops.eblu.me/application/o/heph/`
+- Audience / client id: `heph`
+- Restricted to the `admins` group (single-owner, sensitive data).
+
+Because no Authentik instance ships a device-code flow by default, the blueprint
+also creates `default-device-code-flow` and binds it to the default brand's
+`flow_device_code`. Devices obtain a token with `heph auth login`; the PWA
+currently takes a pasted token (in-app device-code login is upstream follow-up).
+
+## Data seeding (Path A, one-time)
+
+The hub was seeded from the existing `gilbert` device so no task history was
+lost. heph's data-safe bring-up ("Path A") has the hub **adopt the device's
+identity** rather than rewriting the device:
+
+1. Quiesce the seed device: `heph daemon stop` (on gilbert).
+2. Copy its store to indri: `scp ~/.local/share/heph/heph.db indri:~/.local/share/heph/heph.db`.
+3. Give the hub its **own device origin** (keeps gilbert's `owner_id` + data;
+   `hephd` regenerates a fresh `origin` on next start when it is missing):
+   ```fish
+   ssh indri "sqlite3 ~/.local/share/heph/heph.db \"DELETE FROM meta WHERE key='origin';\""
+   ```
+4. `mise run provision-indri -- --tags heph` (installs hephd, stages the PWA,
+   loads the launchagent → hub starts on the seeded store).
+
+Only `meta.origin` changes; `owner_id`, nodes, op-log, and links are copied
+untouched. A clean `hephd --owner-id` / seed command is tracked upstream as
+hephaestus follow-up — until then this manual reset is the documented path.
+
+## Connecting a spoke (e.g. gilbert)
+
+A device joins by running its local daemon with the hub URL + OIDC client and
+logging in once:
+
+```bash
+hephd --mode local --hub-url http://indri.tail8d86e.ts.net:8787 \
+      --oidc-issuer https://authentik.ops.eblu.me/application/o/heph/ \
+      --oidc-client-id heph
+heph auth login --hub-url http://indri.tail8d86e.ts.net:8787 \
+      --issuer https://authentik.ops.eblu.me/application/o/heph/ --client-id heph
+```
+
+> **Use the direct `http://…:8787` tailnet URL for sync, not the Caddy HTTPS
+> URL.** hephd's sync client is plain-HTTP-only; pointing `--hub-url` at
+> `https://heph.ops.eblu.me` fails with a confusing `error sending request`
+> (the HTTP connector rejects the `https` scheme before connecting). Tailscale
+> encrypts the transport, and the OIDC bearer token still gates every request.
+> `heph.ops.eblu.me` (Caddy TLS) exists only for the browser PWA, which needs a
+> secure context. The cached token is keyed by the exact `--hub-url`, so use the
+> same value for `hephd` and `heph auth login`.
+
+> **Caveat:** `heph daemon` cannot yet bake hub/spoke flags into the generated
+> launchd plist (upstream gap). On a spoke whose plist is managed by `heph
+> daemon`, the hub/OIDC flags must be hand-added — and a later `heph daemon
+> start/restart` will regenerate the plist and drop them. Avoid `heph daemon`
+> subcommands on a configured spoke until that gap is closed; reload via
+> `launchctl` instead.
+
+## Related
+
+- [[indri]] — host
+- [[authentik]] — OIDC provider
+- [[caddy]] — TLS termination for `heph.ops.eblu.me`

From 6576880b0e8e80cd88452add47627c3b4e6d6435 Mon Sep 17 00:00:00 2001
From: Erich Blume <blume.erich@gmail.com>
Date: Fri, 5 Jun 2026 07:30:31 -0700
Subject: [PATCH 08/20] heph Authentik: register heph-pwa redirect URIs (PKCE
 login) (#370)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds the heph-pwa redirect URIs to the Authentik `heph` OAuth2 provider so the new browser **Login with Authentik** flow (Authorization Code + PKCE, hephaestus PR #9) can redirect back and exchange the code:

- `https://heph.ops.eblu.me/` (the PWA origin)
- `http://localhost:8787/` (local dev: `hephd --web-root`)

Authentik also keys token-endpoint CORS off these origins, so they're required for the browser token exchange. Additive (the provider was `redirect_uris: []`); harmless until the PWA feature deploys.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Reviewed-on: https://forge.eblu.me/eblume/blumeops/pulls/370
---
 argocd/manifests/authentik/configmap-blueprint.yaml | 13 ++++++++++---
 docs/changelog.d/heph-pwa-redirect-uris.infra.md    |  1 +
 2 files changed, 11 insertions(+), 3 deletions(-)
 create mode 100644 docs/changelog.d/heph-pwa-redirect-uris.infra.md

diff --git a/argocd/manifests/authentik/configmap-blueprint.yaml b/argocd/manifests/authentik/configmap-blueprint.yaml
index 56d9110..9da2f70 100644
--- a/argocd/manifests/authentik/configmap-blueprint.yaml
+++ b/argocd/manifests/authentik/configmap-blueprint.yaml
@@ -477,9 +477,16 @@ data:
           invalidation_flow: !Find [authentik_flows.flow, [slug, default-provider-invalidation-flow]]
           client_type: public
           client_id: heph
-          # Device-code (RFC 8628) + PKCE use no redirect, but the provider
-          # serializer requires the field — an empty list satisfies it.
-          redirect_uris: []
+          # CLI/TUI use the device-code grant (no redirect). The heph-pwa browser
+          # login uses Authorization Code + PKCE, which DOES redirect back to the
+          # app's origin — register those here (Authentik also keys token-endpoint
+          # CORS off these origins). Trailing slash matters: the PWA's redirect_uri
+          # is its base dir, e.g. https://heph.ops.eblu.me/.
+          redirect_uris:
+            - matching_mode: strict
+              url: https://heph.ops.eblu.me/
+            - matching_mode: strict
+              url: http://localhost:8787/  # local dev (hephd --web-root)
           signing_key: !Find [authentik_crypto.certificatekeypair, [name, authentik Self-signed Certificate]]
           property_mappings:
             - !Find [authentik_providers_oauth2.scopemapping, [scope_name, openid]]
diff --git a/docs/changelog.d/heph-pwa-redirect-uris.infra.md b/docs/changelog.d/heph-pwa-redirect-uris.infra.md
new file mode 100644
index 0000000..f887eed
--- /dev/null
+++ b/docs/changelog.d/heph-pwa-redirect-uris.infra.md
@@ -0,0 +1 @@
+Registered the heph-pwa redirect URIs (`https://heph.ops.eblu.me/`, plus `http://localhost:8787/` for dev) on the Authentik `heph` OAuth2 provider, enabling the PWA's new Authorization Code + PKCE "Login with Authentik" flow (and the token-endpoint CORS it needs). Pairs with hephaestus PR #9.

From 3abe80523a0b402c40a0bd3d825e5d81b87939d8 Mon Sep 17 00:00:00 2001
From: Erich Blume <blume.erich@gmail.com>
Date: Fri, 5 Jun 2026 07:40:51 -0700
Subject: [PATCH 09/20] C0: bump indri heph hub to v1.2.1 (PWA Authentik login
 + /config)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 ansible/roles/heph/defaults/main.yml       | 2 +-
 docs/changelog.d/+heph-hub-v1.2.1.infra.md | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)
 create mode 100644 docs/changelog.d/+heph-hub-v1.2.1.infra.md

diff --git a/ansible/roles/heph/defaults/main.yml b/ansible/roles/heph/defaults/main.yml
index e5eea36..88d2240 100644
--- a/ansible/roles/heph/defaults/main.yml
+++ b/ansible/roles/heph/defaults/main.yml
@@ -6,7 +6,7 @@
 # Pinned release used for the initial `cargo install` and the PWA shell.
 # After bootstrap, hephd's own --self-update keeps the binary current; this
 # pin only governs the first install and the bundled PWA shell version.
-heph_version: v1.2.0
+heph_version: v1.2.1
 
 # Anonymous public HTTPS clone — matches hephd's INSTALL_GIT_URL so the initial
 # install and unattended self-update build from the same source (no ssh-agent).
diff --git a/docs/changelog.d/+heph-hub-v1.2.1.infra.md b/docs/changelog.d/+heph-hub-v1.2.1.infra.md
new file mode 100644
index 0000000..c203323
--- /dev/null
+++ b/docs/changelog.d/+heph-hub-v1.2.1.infra.md
@@ -0,0 +1 @@
+Bumped the indri heph hub to v1.2.1, which adds the hub `GET /config` endpoint and ships the heph-pwa **Login with Authentik** flow (Authorization Code + PKCE). Pairs with the Authentik `heph` provider redirect URIs registered earlier.

From cf63fcb5b5cf379700efe3ce0986b18ec4d76625 Mon Sep 17 00:00:00 2001
From: Erich Blume <blume.erich@gmail.com>
Date: Fri, 5 Jun 2026 08:22:46 -0700
Subject: [PATCH 10/20] C0: track heph in service-versions (self-updating; note
 drift task)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 service-versions.yaml | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/service-versions.yaml b/service-versions.yaml
index cc9dc9e..866c687 100644
--- a/service-versions.yaml
+++ b/service-versions.yaml
@@ -414,6 +414,23 @@ services:
     upstream-source: https://github.com/caddyserver/caddy/releases
     notes: Built from source with Gandi DNS and Layer 4 plugins
 
+  - name: heph
+    type: ansible
+    last-reviewed: 2026-06-05
+    current-version: "v1.2.1"
+    upstream-source: https://forge.eblu.me/eblume/hephaestus/releases
+    notes: >-
+      hephaestus task/context sync hub on indri (server-mode launchagent,
+      ansible/roles/heph; cargo-built from the forge). SELF-UPDATING: hephd
+      polls the forge for newer releases every 10 min and rebuilds + restarts
+      itself, so the running version drifts AHEAD of the ansible heph_version
+      pin. current-version here is the last observed/deployed tag, not a hard
+      pin — verify the live version via `curl https://heph.ops.eblu.me/config`
+      (served means hub up) and the hub log's `current=` line. Reconciling this
+      self-update vs IaC-pin drift is tracked in the heph "Hephaestus" project:
+      "Reconcile hephd self-update with ansible-pinned version (drift on indri
+      hub)" (node 01KTBXWT6XTHNDH92CVJY88E5K).
+
   - name: borgmatic
     type: ansible
     last-reviewed: 2026-04-15

From 50a36ff93a9d1c697c976a1db498bc5633f2cd7c Mon Sep 17 00:00:00 2001
From: Erich Blume <blume.erich@gmail.com>
Date: Sat, 6 Jun 2026 18:07:13 -0700
Subject: [PATCH 11/20] heph Authentik: grant offline_access scope (fixes spoke
 sync refresh-token 400)

The heph CLI requests scope "openid offline_access", but the Authentik
heph OAuth2 provider only mapped openid/email/profile. Without the
offline_access mapping the issued refresh token is bound to the login
session rather than the 30-day refresh-token window; once the session
lapses, hephd's refresh_token grant returns 400 Bad Request and spoke
sync silently degrades (heph sync --status -> auth_failure: true).

Add the built-in offline_access scope mapping to the provider's
property_mappings and document the requirement in the service reference.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 argocd/manifests/authentik/configmap-blueprint.yaml |  4 ++++
 docs/changelog.d/heph-offline-access.bugfix.md      |  1 +
 docs/reference/services/hephaestus.md               | 11 +++++++++++
 3 files changed, 16 insertions(+)
 create mode 100644 docs/changelog.d/heph-offline-access.bugfix.md

diff --git a/argocd/manifests/authentik/configmap-blueprint.yaml b/argocd/manifests/authentik/configmap-blueprint.yaml
index 9da2f70..cc97dea 100644
--- a/argocd/manifests/authentik/configmap-blueprint.yaml
+++ b/argocd/manifests/authentik/configmap-blueprint.yaml
@@ -492,6 +492,10 @@ data:
             - !Find [authentik_providers_oauth2.scopemapping, [scope_name, openid]]
             - !Find [authentik_providers_oauth2.scopemapping, [scope_name, email]]
             - !Find [authentik_providers_oauth2.scopemapping, [scope_name, profile]]
+            # offline_access: heph CLI requests "openid offline_access"; without
+            # this mapping the refresh token is session-bound and hephd's
+            # refresh_token grant 400s once the session lapses (spoke sync dies).
+            - !Find [authentik_providers_oauth2.scopemapping, [scope_name, offline_access]]
           sub_mode: hashed_user_id
           include_claims_in_id_token: true
 
diff --git a/docs/changelog.d/heph-offline-access.bugfix.md b/docs/changelog.d/heph-offline-access.bugfix.md
new file mode 100644
index 0000000..e9721bc
--- /dev/null
+++ b/docs/changelog.d/heph-offline-access.bugfix.md
@@ -0,0 +1 @@
+Granted the `offline_access` scope on the Authentik `heph` OAuth2 provider so hephaestus spokes receive a durable 30-day refresh token. Previously the refresh token was session-bound, so spoke sync would silently fail with a `400 Bad Request` on the `refresh_token` grant once the Authentik session lapsed.
diff --git a/docs/reference/services/hephaestus.md b/docs/reference/services/hephaestus.md
index 1754ea0..7abc35b 100644
--- a/docs/reference/services/hephaestus.md
+++ b/docs/reference/services/hephaestus.md
@@ -68,6 +68,17 @@ in the [[authentik]] blueprint (`argocd/manifests/authentik/configmap-blueprint.
 - Issuer: `https://authentik.ops.eblu.me/application/o/heph/`
 - Audience / client id: `heph`
 - Restricted to the `admins` group (single-owner, sensitive data).
+- Scope mappings: `openid`, `email`, `profile`, **`offline_access`**.
+
+> **`offline_access` is required for durable sync.** The `heph` CLI requests
+> `scope = "openid offline_access"`, and a refresh token is only issued for the
+> 30-day refresh-token window when the provider actually grants `offline_access`.
+> Without that scope mapping the refresh token is bound to the login **session**;
+> once the session lapses, hephd's `refresh_token` grant returns `400 Bad
+> Request`, the bearer can't be refreshed, and spoke sync silently degrades
+> (`heph sync --status` → `auth_failure: true`). `heph auth login` papers over it
+> until the next session expiry. Keep `offline_access` in the provider's
+> `property_mappings`.
 
 Because no Authentik instance ships a device-code flow by default, the blueprint
 also creates `default-device-code-flow` and binds it to the default brand's

From 8072cd21d75b1a88f7c843e68cefdf61da2c986e Mon Sep 17 00:00:00 2001
From: Erich Blume <blume.erich@gmail.com>
Date: Mon, 8 Jun 2026 06:35:23 -0700
Subject: [PATCH 12/20] C0: review jellyfin, upgrade indri to 10.11.11
 (security fixes)

Jellyfin was 5 patch releases behind (10.11.6 -> 10.11.11). 10.11.7 and
10.11.10 contain disclosed CVE/GHSA security fixes. Upgraded via
brew upgrade --cask jellyfin on indri; service verified healthy and
externally reachable (HTTPS 200).

Documented the recurring Gatekeeper gotcha: cask upgrades re-quarantine
the .app and the launchd service hangs silently until the first-launch
dialog is approved on indri's GUI console (xattr removal over SSH is
blocked by macOS TCC).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 docs/changelog.d/+jellyfin-10-11-11.bugfix.md |  1 +
 docs/reference/services/jellyfin.md           | 22 +++++++++++++++++--
 service-versions.yaml                         | 10 +++++++--
 3 files changed, 29 insertions(+), 4 deletions(-)
 create mode 100644 docs/changelog.d/+jellyfin-10-11-11.bugfix.md

diff --git a/docs/changelog.d/+jellyfin-10-11-11.bugfix.md b/docs/changelog.d/+jellyfin-10-11-11.bugfix.md
new file mode 100644
index 0000000..779a042
--- /dev/null
+++ b/docs/changelog.d/+jellyfin-10-11-11.bugfix.md
@@ -0,0 +1 @@
+Upgraded Jellyfin on indri from 10.11.6 to 10.11.11, picking up the security fixes in 10.11.7 (disclosed CVEs/GHSAs, flagged "upgrade immediately") and 10.11.10 (three further GHSAs). Documented the recurring gotcha in both the service-versions tracking and the Jellyfin reference card: after a `brew upgrade --cask jellyfin`, the re-quarantined `.app` makes the launchd-spawned process hang silently until the Gatekeeper first-launch dialog is approved on indri's GUI console — removing the quarantine xattr over SSH is blocked by macOS TCC.
diff --git a/docs/reference/services/jellyfin.md b/docs/reference/services/jellyfin.md
index bbdfafd..c7b3074 100644
--- a/docs/reference/services/jellyfin.md
+++ b/docs/reference/services/jellyfin.md
@@ -1,7 +1,7 @@
 ---
 title: Jellyfin
-modified: 2026-02-07
-last-reviewed: 2026-03-23
+modified: 2026-06-08
+last-reviewed: 2026-06-08
 tags:
   - service
   - media
@@ -41,6 +41,24 @@ Dashboard > Playback:
 2. Allow hardware encoding: Enabled
 3. VPP Tone mapping: Enabled
 
+## Upgrades
+
+Installed via Homebrew cask (`state: present`, unpinned), so the Ansible role
+won't bump an already-installed cask. To upgrade, run on indri:
+
+```bash
+brew upgrade --cask jellyfin
+```
+
+**Gatekeeper gotcha:** a cask upgrade replaces `/Applications/Jellyfin.app` and
+re-applies the `com.apple.quarantine` xattr. When launchd respawns the service,
+the new binary hangs silently — process alive but ~0 CPU, no logs, no listening
+socket — because Gatekeeper is holding the first launch pending approval.
+Removing the xattr over SSH fails (`xattr -dr com.apple.quarantine ...` →
+"Operation not permitted", blocked by macOS TCC). Approve the first-launch
+dialog on indri's GUI console (or run the `xattr` removal from a local Terminal
+with Full Disk Access), then reload the LaunchAgent.
+
 ## Observability
 
 - Metrics: `jellyfin_metrics` ansible role
diff --git a/service-versions.yaml b/service-versions.yaml
index 866c687..419d129 100644
--- a/service-versions.yaml
+++ b/service-versions.yaml
@@ -440,9 +440,15 @@ services:
 
   - name: jellyfin
     type: ansible
-    last-reviewed: 2026-03-17
-    current-version: "10.11.6"
+    last-reviewed: 2026-06-08
+    current-version: "10.11.11"
     upstream-source: https://github.com/jellyfin/jellyfin/releases
+    notes: >-
+      Homebrew cask (state: present, unpinned). Upgrade with
+      `brew upgrade --cask jellyfin` on indri. After upgrade the .app is
+      re-quarantined; launchd-spawned launch hangs silently until the
+      Gatekeeper first-launch dialog is approved on indri's GUI console
+      (xattr removal over SSH is blocked by TCC).
 
   - name: automounter
     type: ansible

From 6370d2bddbb32a78de11516e92e114a7a713dce3 Mon Sep 17 00:00:00 2001
From: Erich Blume <blume.erich@gmail.com>
Date: Mon, 8 Jun 2026 07:00:48 -0700
Subject: [PATCH 13/20] C0: doc-review tailscale-operator (dual indri/ringtail,
 host caveat)

Add last-reviewed; document the operator now running on both indri's
minikube and ringtail's k3s; correct the ArgoCD apps row; pin upstream
v1.94.2; add the ProxyGroup Ingress 'host: *' requirement.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../+tailscale-operator-doc-review.doc.md     |  1 +
 .../kubernetes/tailscale-operator.md          | 23 +++++++++++++++----
 2 files changed, 20 insertions(+), 4 deletions(-)
 create mode 100644 docs/changelog.d/+tailscale-operator-doc-review.doc.md

diff --git a/docs/changelog.d/+tailscale-operator-doc-review.doc.md b/docs/changelog.d/+tailscale-operator-doc-review.doc.md
new file mode 100644
index 0000000..8f7d5a3
--- /dev/null
+++ b/docs/changelog.d/+tailscale-operator-doc-review.doc.md
@@ -0,0 +1 @@
+Reviewed the tailscale-operator reference card: documented the dual indri/ringtail deployment, corrected the ArgoCD apps list, pinned the upstream version, and added the ProxyGroup Ingress `host:` caveat.
diff --git a/docs/reference/kubernetes/tailscale-operator.md b/docs/reference/kubernetes/tailscale-operator.md
index c102e02..174b347 100644
--- a/docs/reference/kubernetes/tailscale-operator.md
+++ b/docs/reference/kubernetes/tailscale-operator.md
@@ -1,6 +1,7 @@
 ---
 title: Tailscale Operator
-modified: 2026-02-08
+modified: 2026-06-08
+last-reviewed: 2026-06-08
 tags:
   - kubernetes
   - tailscale
@@ -15,8 +16,16 @@ The Tailscale operator enables Kubernetes services to be exposed directly on the
 | Property | Value |
 |----------|-------|
 | **Namespace** | `tailscale` |
-| **Upstream** | `mirrors/tailscale` on forge (static manifest) |
-| **ArgoCD Apps** | `tailscale-operator-base` (upstream), `tailscale-operator` (config) |
+| **Upstream** | `mirrors/tailscale` on forge (static manifest, pinned `v1.94.2`) |
+| **ArgoCD Apps** | `tailscale-operator` (indri/minikube), `tailscale-operator-ringtail` (ringtail/k3s) |
+
+The operator runs on **both** clusters — indri's minikube and ringtail's k3s.
+Both apps layer on the shared `tailscale-operator-base` kustomize directory
+(operator manifest, `ProxyClass`, `dnsconfig`); each cluster supplies its own
+`ProxyGroup` (indri: 2 replicas, ringtail: 1) and OAuth `ExternalSecret`. The
+ringtail overlay additionally rewrites the proxy image to a locally nix-built
+mirror. See [[ringtail]] and [[migrate-wave1-ringtail]] for the ongoing
+migration of k8s workloads onto ringtail.
 
 ## How It Works
 
@@ -27,7 +36,13 @@ Ingresses use a shared ProxyGroup (`ingress`) rather than per-service Tailscale
 3. Service becomes accessible at `<hostname>.tail8d86e.ts.net`
 4. TLS is handled automatically via Tailscale
 
-Tailnet clients must have `--accept-routes` enabled to route to VIP addresses.
+Two requirements for VIP routing to work:
+
+1. Tailnet clients must have `--accept-routes` enabled to route to VIP addresses.
+2. Ingress rules must **not** set `host:` to a specific name. The ProxyGroup
+   proxy receives the FQDN as the `Host` header (e.g.
+   `prometheus.tail8d86e.ts.net`), which won't match a short name. Use
+   `host: "*"` or omit `host:` entirely.
 
 Services can be individually tagged (e.g., `tag:flyio-target`) via Ingress annotations to control which ACL grants apply. See [[expose-service-publicly]] for the tagging workflow.
 

From e592ecfca49ab4e0a0a97d975b60926192955a56 Mon Sep 17 00:00:00 2001
From: Erich Blume <blume.erich@gmail.com>
Date: Mon, 8 Jun 2026 07:17:21 -0700
Subject: [PATCH 14/20] C0: update ringtail flake inputs (nixpkgs, disko)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 docs/changelog.d/+ringtail-flake-update.infra.md |  1 +
 nixos/ringtail/flake.lock                        | 12 ++++++------
 2 files changed, 7 insertions(+), 6 deletions(-)
 create mode 100644 docs/changelog.d/+ringtail-flake-update.infra.md

diff --git a/docs/changelog.d/+ringtail-flake-update.infra.md b/docs/changelog.d/+ringtail-flake-update.infra.md
new file mode 100644
index 0000000..1d806df
--- /dev/null
+++ b/docs/changelog.d/+ringtail-flake-update.infra.md
@@ -0,0 +1 @@
+Updated ringtail NixOS flake inputs (nixpkgs `nixos-25.11`, disko) to latest via `dagger call flake-update`.
diff --git a/nixos/ringtail/flake.lock b/nixos/ringtail/flake.lock
index bb60501..340bd9d 100644
--- a/nixos/ringtail/flake.lock
+++ b/nixos/ringtail/flake.lock
@@ -7,11 +7,11 @@
         ]
       },
       "locked": {
-        "lastModified": 1780290312,
-        "narHash": "sha256-eTAlX0CwgB84Ts3GaBd944A3DRXVMzgA0EqroZBISUo=",
+        "lastModified": 1780894562,
+        "narHash": "sha256-c3430xwxwhHipl3jigUGMMBfpaMylDqytW/kdmB3ZGs=",
         "owner": "nix-community",
         "repo": "disko",
-        "rev": "115e5211780054d8a890b41f0b7734cafad54dfe",
+        "rev": "24fed06cac83bcc44ac8efbb57cab1a82fa0bedc",
         "type": "github"
       },
       "original": {
@@ -43,11 +43,11 @@
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1779796641,
-        "narHash": "sha256-ZsIrKmhp4vbBXoXXmR/tBXA/UCsAQiJL9vsgZEduhVY=",
+        "lastModified": 1780511130,
+        "narHash": "sha256-2v9lT4ya59Lh1FqPeLnz1MoX9y/wz2huqfe9RtQZITk=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "25f538306313eae3927264466c70d7001dcea1df",
+        "rev": "535f3e6942cb1cead3929c604320d3db54b542b9",
         "type": "github"
       },
       "original": {

From 1c41cca90392f25951f9040a86a767e7a4866509 Mon Sep 17 00:00:00 2001
From: Erich Blume <blume.erich@gmail.com>
Date: Mon, 8 Jun 2026 09:30:09 -0700
Subject: [PATCH 15/20] Retire Prowler image + IaC scans (keep K8s CIS only)
 (#372)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Why

Weekly compliance review (2026-06-07) surfaced the toil problem head-on:

| Report | Unmuted findings | Muted | Acted on |
|--------|------------------|-------|----------|
| **K8s CIS (In-Cluster)** | 0 | 65 | clean ✅ |
| **Container Images** | 20,005 (+713 WoW) | 0 | never |
| **IaC (manifests)** | 654 (+31/−30 WoW) | 0 | never |

The image and IaC scans generate tens of thousands of un-actioned, un-muted findings every week:

- **Image scan** — overwhelmingly unpatchable *upstream* base-image CVEs, and it re-scans every historical tag still in the registry (2× paperless, 3× mealie, 4× prowler tags in the latest report), multiplying the count.
- **IaC scan** — systemic Trivy KSV pod-security warnings against our own manifests; real but homelab-acceptable, never muted, so re-surfaced indefinitely.

The K8s CIS scan is the only one with realized value (fully mutelisted, 0 unmuted WoW) and is retained. Matches the broader scaling-back of the reporting system as minikube heads toward retirement.

## Changes

- Delete `cronjob-image-scan.yaml` and `cronjob-iac-scan.yaml` + remove from kustomization
- Drop the now-unused `mutelist/trivyignore.yaml` (only the IaC scan consumed it)
- `review-compliance-reports`: drop the two retired scans (and the grouped-findings rendering that existed solely for them)
- Docs: deploy-prowler (new 'Why only the K8s CIS scan' section), read-compliance-reports, security reference, prowler reference

## Deploy (after review)

```fish
argocd app set prowler --revision retire-prowler-image-iac-scans
argocd app sync prowler   # prune removes the two CronJobs
# after merge: argocd app set prowler --revision main && argocd app sync prowler
```

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Reviewed-on: https://forge.eblu.me/eblume/blumeops/pulls/372
---
 .../manifests/prowler/cronjob-iac-scan.yaml   |  54 -------
 .../manifests/prowler/cronjob-image-scan.yaml |  39 -----
 argocd/manifests/prowler/kustomization.yaml   |   3 -
 .../prowler/mutelist/trivyignore.yaml         |  37 -----
 .../retire-prowler-image-iac-scans.infra.md   |   1 +
 docs/how-to/operations/deploy-prowler.md      |  49 ++----
 .../operations/read-compliance-reports.md     |  11 +-
 docs/reference/operations/security.md         |   8 +-
 docs/reference/services/prowler.md            |  12 +-
 mise-tasks/review-compliance-reports          | 147 ++++--------------
 10 files changed, 65 insertions(+), 296 deletions(-)
 delete mode 100644 argocd/manifests/prowler/cronjob-iac-scan.yaml
 delete mode 100644 argocd/manifests/prowler/cronjob-image-scan.yaml
 delete mode 100644 argocd/manifests/prowler/mutelist/trivyignore.yaml
 create mode 100644 docs/changelog.d/retire-prowler-image-iac-scans.infra.md

diff --git a/argocd/manifests/prowler/cronjob-iac-scan.yaml b/argocd/manifests/prowler/cronjob-iac-scan.yaml
deleted file mode 100644
index c1303a5..0000000
--- a/argocd/manifests/prowler/cronjob-iac-scan.yaml
+++ /dev/null
@@ -1,54 +0,0 @@
----
-apiVersion: batch/v1
-kind: CronJob
-metadata:
-  name: prowler-iac-scan
-  namespace: prowler
-spec:
-  schedule: "0 2 * * 6"  # Saturday 2am
-  concurrencyPolicy: Forbid
-  jobTemplate:
-    spec:
-      ttlSecondsAfterFinished: 604800  # Auto-delete after 7 days
-      template:
-        spec:
-          securityContext:
-            seccompProfile:
-              type: RuntimeDefault
-          containers:
-            - name: prowler
-              image: registry.ops.eblu.me/blumeops/prowler:kustomized
-              command: ["/bin/sh", "-c"]
-              # Prowler's --mutelist-file is a no-op for the IaC provider
-              # (it delegates to Trivy). The Prowler image's trivy shim
-              # injects --ignorefile $TRIVY_IGNOREFILE when set; see
-              # containers/prowler/Dockerfile.
-              env:
-                - name: TRIVY_IGNOREFILE
-                  value: /mutelist/trivyignore.yaml
-              args:
-                - |
-                  DATEDIR=/reports/prowler-iac/$(date +%Y-%m-%d)
-                  mkdir -p "$DATEDIR"
-                  prowler iac \
-                    --scan-repository-url https://forge.ops.eblu.me/eblume/blumeops.git \
-                    -z \
-                    --output-formats html csv json-ocsf \
-                    --output-directory "$DATEDIR"
-              volumeMounts:
-                - name: reports
-                  mountPath: /reports
-                - name: mutelist
-                  mountPath: /mutelist
-                  readOnly: true
-          restartPolicy: OnFailure
-          volumes:
-            - name: reports
-              persistentVolumeClaim:
-                claimName: prowler-reports
-            - name: mutelist
-              configMap:
-                name: prowler-mutelist
-                items:
-                  - key: trivyignore.yaml
-                    path: trivyignore.yaml
diff --git a/argocd/manifests/prowler/cronjob-image-scan.yaml b/argocd/manifests/prowler/cronjob-image-scan.yaml
deleted file mode 100644
index b779d08..0000000
--- a/argocd/manifests/prowler/cronjob-image-scan.yaml
+++ /dev/null
@@ -1,39 +0,0 @@
----
-apiVersion: batch/v1
-kind: CronJob
-metadata:
-  name: prowler-image-scan
-  namespace: prowler
-spec:
-  schedule: "0 3 * * 6"  # Saturday 3am
-  concurrencyPolicy: Forbid
-  jobTemplate:
-    spec:
-      ttlSecondsAfterFinished: 604800  # Auto-delete after 7 days
-      template:
-        spec:
-          securityContext:
-            seccompProfile:
-              type: RuntimeDefault
-          containers:
-            - name: prowler
-              image: registry.ops.eblu.me/blumeops/prowler:kustomized
-              command: ["/bin/sh", "-c"]
-              args:
-                - |
-                  DATEDIR=/reports/prowler-images/$(date +%Y-%m-%d)
-                  mkdir -p "$DATEDIR"
-                  prowler image \
-                    --registry https://registry.ops.eblu.me \
-                    --image-filter "^blumeops/" \
-                    -z \
-                    --output-formats html csv json-ocsf \
-                    --output-directory "$DATEDIR"
-              volumeMounts:
-                - name: reports
-                  mountPath: /reports
-          restartPolicy: OnFailure
-          volumes:
-            - name: reports
-              persistentVolumeClaim:
-                claimName: prowler-reports
diff --git a/argocd/manifests/prowler/kustomization.yaml b/argocd/manifests/prowler/kustomization.yaml
index 1d92a6b..38295a3 100644
--- a/argocd/manifests/prowler/kustomization.yaml
+++ b/argocd/manifests/prowler/kustomization.yaml
@@ -10,8 +10,6 @@ resources:
   - pv-nfs.yaml
   - pvc.yaml
   - cronjob.yaml
-  - cronjob-image-scan.yaml
-  - cronjob-iac-scan.yaml
 
 configMapGenerator:
   - name: prowler-mutelist
@@ -23,7 +21,6 @@ configMapGenerator:
       - mutelist/core-pod-security.yaml
       - mutelist/manual-node-checks.yaml
       - mutelist/rbac.yaml
-      - mutelist/trivyignore.yaml
 
 images:
   - name: registry.ops.eblu.me/blumeops/prowler
diff --git a/argocd/manifests/prowler/mutelist/trivyignore.yaml b/argocd/manifests/prowler/mutelist/trivyignore.yaml
deleted file mode 100644
index 87af966..0000000
--- a/argocd/manifests/prowler/mutelist/trivyignore.yaml
+++ /dev/null
@@ -1,37 +0,0 @@
-# Trivy ignorefile for Prowler IaC scan.
-#
-# Prowler's `--mutelist-file` flag is a no-op for the IaC provider
-# (iac_provider.py sets self._mutelist = None and delegates to Trivy).
-# Trivy in turn does not auto-discover this YAML form from cwd, so the
-# Prowler image ships a shim wrapper around `trivy` that injects
-# --ignorefile $TRIVY_IGNOREFILE when the env var is set. The cronjob
-# mounts this file and sets TRIVY_IGNOREFILE accordingly.
-#
-# Schema: https://trivy.dev/latest/docs/configuration/filtering/
-# IDs use the hyphenated form Trivy displays (KSV-0041, not KSV0041).
-misconfigurations:
-  - id: KSV-0041
-    paths:
-      - "argocd/manifests/external-secrets/rbac.yaml"
-    statement: >-
-      external-secrets-operator's entire function is to read and
-      synthesize Secret objects; ClusterRole over secrets is its
-      purpose. Both the controller and cert-controller are
-      upstream-defined.
-  - id: KSV-0041
-    paths:
-      - "argocd/manifests/kube-state-metrics/rbac.yaml"
-      - "argocd/manifests/kube-state-metrics-ringtail/rbac.yaml"
-    statement: >-
-      KSM exposes only Secret metadata (name, namespace, type, labels),
-      never the data field. list/watch on secrets is required for
-      kube_secret_info / kube_secret_labels metrics.
-  - id: KSV-0114
-    paths:
-      - "argocd/manifests/external-secrets/rbac.yaml"
-    statement: >-
-      cert-controller manages the external-secrets validating webhook
-      configurations to inject its own rotating CA bundle. RBAC is
-      scoped to two named webhooks (secretstore-validate,
-      externalsecret-validate) via resourceNames; KSV-0114 doesn't see
-      the resourceNames restriction so reports the full ClusterRole.
diff --git a/docs/changelog.d/retire-prowler-image-iac-scans.infra.md b/docs/changelog.d/retire-prowler-image-iac-scans.infra.md
new file mode 100644
index 0000000..9afd261
--- /dev/null
+++ b/docs/changelog.d/retire-prowler-image-iac-scans.infra.md
@@ -0,0 +1 @@
+Retired the Prowler container-image CVE scan and IaC scan, keeping only the K8s CIS benchmark scan. The two retired scans generated tens of thousands of un-actioned, un-muted findings every week (~20,000 image findings and growing, mostly unpatchable upstream-image CVEs; ~650 systemic Trivy KSV pod-security warnings) — the weekly `mise run review-compliance-reports` re-surfaced them all as "action needed" though none were ever triaged. The K8s CIS scan is fully mutelisted and runs clean, so it stays. Removed the two CronJobs, the now-unused `trivyignore.yaml` mutelist, and the grouped-findings rendering in the review tool that existed solely for the high-volume scans.
diff --git a/docs/how-to/operations/deploy-prowler.md b/docs/how-to/operations/deploy-prowler.md
index 75dced2..1475680 100644
--- a/docs/how-to/operations/deploy-prowler.md
+++ b/docs/how-to/operations/deploy-prowler.md
@@ -1,6 +1,6 @@
 ---
 title: Deploy Prowler CIS Scanner
-modified: 2026-03-24
+modified: 2026-06-08
 last-reviewed: 2026-03-24
 tags:
   - how-to
@@ -11,7 +11,20 @@ tags:
 
 # Deploy Prowler CIS Scanner
 
-Prowler runs weekly CIS Kubernetes Benchmark scans against minikube-indri and writes HTML/CSV/JSON reports to the NFS share on sifaka.
+Prowler runs a weekly CIS Kubernetes Benchmark scan against minikube-indri and writes HTML/CSV/JSON reports to the NFS share on sifaka.
+
+## Why only the K8s CIS scan
+
+Prowler originally ran three CronJobs: K8s CIS, container-image CVE scanning, and IaC scanning. The image and IaC scans were **retired in 2026-06**.
+
+Both were pure toil with no realized value:
+
+- **Image scan** produced ~20,000 unmuted findings per run and growing, none ever triaged or muted. They were overwhelmingly CVEs in *upstream* base images we don't control and can't patch, and the job re-scanned every historical tag still in the registry, multiplying the count.
+- **IaC scan** produced ~650 Trivy KSV findings (`runAsNonRoot`, `readOnlyRootFilesystem`, drop-capabilities, …) against our own manifests — real but systemic, homelab-acceptable, and likewise never muted, so the weekly review re-surfaced all of them indefinitely.
+
+The K8s CIS scan, by contrast, is fully mutelisted and runs clean (0 unmuted findings week over week), so it stays. The guiding principle matches [[ai-scraper-mitigation]]: don't keep generating a firehose of output that has no audience. If image-CVE signal is wanted later, the right shape is critical-severity-only, currently-deployed-tags-only, alert-on-new — a rebuild, not a revival (tracked as the "Trivy for image/IaC scanning" task).
+
+Note that the K8s CIS scan itself is tied to minikube-indri, which is slated for retirement; on k3s only ~22 of 70 checks produce results (no static pods). Re-pointing a lean posture check at ringtail is tracked separately ("prowler scan against ringtail").
 
 ## What it checks
 
@@ -33,38 +46,6 @@ Prowler's Kubernetes provider runs ~70 checks from the CIS Kubernetes Benchmark
 
 **k3s note:** k3s embeds the control plane in a single binary — no static pods exist. Only core + RBAC checks (~22 of 70) produce results. Consider `kube-bench` for k3s control plane checks.
 
-### Image vulnerability scanning (Saturday 3am)
-
-Prowler's image provider scans all `blumeops/*` container images in `registry.ops.eblu.me` for:
-
-- **CVEs** — known vulnerabilities from NVD, Alpine SecDB, Debian Security Tracker, and other sources
-- **Embedded secrets** — credentials or API keys baked into image layers
-- **Misconfigurations** — Dockerfile best practices (running as root, missing HEALTHCHECK, etc.)
-
-Uses Trivy under the hood. Reports are written to `sifaka:/volume1/reports/prowler-images/`.
-
-To run an ad-hoc image scan:
-
-```fish
-kubectl create job --from=cronjob/prowler-image-scan prowler-image-manual -n prowler --context=minikube-indri
-```
-
-### IaC scanning (Saturday 2am)
-
-Prowler's IaC provider scans the blumeops repository (cloned at scan time) for misconfigurations in:
-
-- **Dockerfiles** — running as root, using `latest` tags, missing `HEALTHCHECK`
-- **Kubernetes manifests** — missing resource limits, privileged containers, insecure settings
-- **Other IaC files** — Terraform, CloudFormation, etc. if present
-
-Uses Trivy under the hood. Reports are written to `sifaka:/volume1/reports/prowler-iac/`.
-
-To run an ad-hoc IaC scan:
-
-```fish
-kubectl create job --from=cronjob/prowler-iac-scan prowler-iac-manual -n prowler --context=minikube-indri
-```
-
 ## Reports
 
 Reports are written to `sifaka:/volume1/reports/prowler/` with timestamped filenames. See [[read-compliance-reports]] for how to access and interpret them.
diff --git a/docs/how-to/operations/read-compliance-reports.md b/docs/how-to/operations/read-compliance-reports.md
index e676ad5..2990026 100644
--- a/docs/how-to/operations/read-compliance-reports.md
+++ b/docs/how-to/operations/read-compliance-reports.md
@@ -1,6 +1,6 @@
 ---
 title: Read Compliance Reports
-modified: 2026-04-06
+modified: 2026-06-08
 last-reviewed: 2026-04-06
 tags:
   - how-to
@@ -27,8 +27,13 @@ Reports are stored on sifaka at `/volume1/reports/`. Each scanner writes to its
 | Scanner | Path | Schedule |
 |---------|------|----------|
 | [[prowler]] K8s CIS | `sifaka:/volume1/reports/prowler/` | Weekly (Sunday 3am) |
-| [[prowler]] Image | `sifaka:/volume1/reports/prowler-images/` | Weekly (Saturday 3am) |
-| [[prowler]] IaC | `sifaka:/volume1/reports/prowler-iac/` | Weekly (Saturday 2am) |
+
+> **Retired (2026-06):** the Prowler **image** (`prowler-images/`) and **IaC**
+> (`prowler-iac/`) scans were retired. They produced tens of thousands of
+> un-actioned, un-muted findings every week — mostly unpatchable upstream-image
+> CVEs and systemic pod-security KSV warnings — and nobody triaged them. See
+> [[deploy-prowler#Why only the K8s CIS scan]] for the rationale. Their stale
+> report directories may linger on sifaka until manually removed.
 
 Copy reports to your local machine (remember `scp -O` for sifaka):
 
diff --git a/docs/reference/operations/security.md b/docs/reference/operations/security.md
index 11c4df9..86b3d3b 100644
--- a/docs/reference/operations/security.md
+++ b/docs/reference/operations/security.md
@@ -1,6 +1,6 @@
 ---
 title: Security & Compliance
-modified: 2026-03-24
+modified: 2026-06-08
 last-reviewed: 2026-03-24
 tags:
   - operations
@@ -21,7 +21,7 @@ Security posture and compliance scanning for BlumeOps infrastructure.
 
 ## Scanning tools
 
-- [[prowler]] — CIS Kubernetes Benchmark scanner (weekly CronJob)
+- [[prowler]] — CIS Kubernetes Benchmark scanner (weekly CronJob). The container-image CVE scan and IaC scan were retired in 2026-06 (un-actioned noise — see [[deploy-prowler#Why only the K8s CIS scan]]); only the K8s CIS scan remains.
   - [[deploy-prowler]] — deployment and ad-hoc scan how-to
   - [[read-compliance-reports]] — accessing and interpreting reports
 - [[kingfisher]] — Secret detection and live validation for Forgejo repos (weekly CronJob + prek hook)
@@ -52,5 +52,5 @@ Suppressed findings are kept in Prowler mutelist YAML under `argocd/manifests/pr
 
 - No SOC 2 compliance mapping for Kubernetes (Prowler only maps SOC 2 for AWS/Azure/GCP)
 - k3s control plane checks produce no results (embedded binary, no static pods) — consider kube-bench
-- Container image scanning covers `blumeops/*` images only — upstream images (ollama, immich, etc.) are not scanned
-- IaC scanning covers the blumeops repo only — no scanning of third-party Helm charts or vendored manifests
+- No container-image CVE scanning (the Prowler image scan was retired 2026-06 as un-actioned noise). If reintroduced, scope it to critical-severity findings on currently-deployed tags, and alert only on new findings
+- No automated IaC misconfiguration scanning (the Prowler IaC scan was retired 2026-06). Manifest pod-security hardening is now an accept-and-document decision rather than a weekly report
diff --git a/docs/reference/services/prowler.md b/docs/reference/services/prowler.md
index f45955f..9f7e4b3 100644
--- a/docs/reference/services/prowler.md
+++ b/docs/reference/services/prowler.md
@@ -1,6 +1,6 @@
 ---
 title: Prowler
-modified: 2026-03-24
+modified: 2026-06-08
 last-reviewed: 2026-03-24
 tags:
   - service
@@ -17,20 +17,20 @@ CIS Kubernetes Benchmark scanner for compliance posture reporting.
 |----------|-------|
 | **Namespace** | `prowler` |
 | **Image** | `registry.ops.eblu.me/blumeops/prowler` (see `argocd/manifests/prowler/kustomization.yaml` for current tag) |
-| **Schedule** | K8s CIS: Sunday 3am / Image: Saturday 3am / IaC: Saturday 2am |
-| **Reports** | `sifaka:/volume1/reports/prowler/`, `prowler-images/`, `prowler-iac/` (NFS) |
+| **Schedule** | K8s CIS: Sunday 3am |
+| **Reports** | `sifaka:/volume1/reports/prowler/` (NFS) |
 | **Manifests** | `argocd/manifests/prowler/` |
 
 ## What it does
 
-Runs Prowler 5 as two CronJobs:
+Runs Prowler 5 as a single CronJob:
 
 - **K8s CIS scan** (Sunday) — CIS Kubernetes Benchmark v1.11 checks across pod security, RBAC, apiserver, etcd, kubelet, controller-manager, and scheduler
-- **Image scan** (Saturday) — CVE, secret, and misconfiguration scanning of all `blumeops/*` container images in the registry via Trivy
-- **IaC scan** (Saturday) — static analysis of Dockerfiles, K8s manifests, and other IaC files in the repo via Trivy
 
 Reports are written in HTML, CSV, and JSON-OCSF to the NFS share on sifaka.
 
+The **image** and **IaC** scans (formerly Saturday CronJobs) were retired in 2026-06 — they generated tens of thousands of un-actioned findings weekly. See [[deploy-prowler#Why only the K8s CIS scan]].
+
 ## See also
 
 - [[security]] — security & compliance posture overview
diff --git a/mise-tasks/review-compliance-reports b/mise-tasks/review-compliance-reports
index 24d2afc..f2a0a54 100755
--- a/mise-tasks/review-compliance-reports
+++ b/mise-tasks/review-compliance-reports
@@ -10,19 +10,19 @@
 
 Covers:
   - Prowler K8s CIS (in-cluster): per-finding detail
-  - Prowler container image scans: grouped by check + resource
-  - Prowler IaC manifest scans: grouped by check + resource
   - Kingfisher secret scanning: TODO — pending upstream JSON/CSV output
     support (currently HTML-only; contribute from spork)
 
-For each Prowler scan, copies the two most recent CSV reports, parses
+The Prowler container-image CVE scan and IaC scan were retired in 2026-06
+(see docs/how-to/operations/deploy-prowler.md) — they produced tens of
+thousands of un-actioned findings weekly. Only the K8s CIS scan remains.
+
+For the Prowler scan, copies the two most recent CSV reports, parses
 them, and displays:
   1. Overall status (pass/fail/manual/muted counts)
   2. Unmuted failures by severity
   3. Delta from the previous report (new vs resolved)
-  4. Actionable unmuted failures (per-finding for in-cluster; grouped
-     by check ID and resource for image/IaC because they have far too
-     many findings to list individually)
+  4. Actionable unmuted failures (per-finding detail)
 
 This is the primary tool for the weekly compliance report review.
 """
@@ -39,11 +39,9 @@ from rich.console import Console
 from rich.panel import Panel
 from rich.table import Table
 
-PROWLER_SCANS: list[tuple[str, str, bool]] = [
-    # (label, sifaka base path, group_findings)
-    ("K8s CIS (In-Cluster)", "/volume1/reports/prowler", False),
-    ("Container Images", "/volume1/reports/prowler-images", True),
-    ("IaC (manifests)", "/volume1/reports/prowler-iac", True),
+PROWLER_SCANS: list[tuple[str, str]] = [
+    # (label, sifaka base path)
+    ("K8s CIS (In-Cluster)", "/volume1/reports/prowler"),
 ]
 
 console = Console()
@@ -334,14 +332,8 @@ def summarize_report(
     tmpdir: str,
     *,
     show_muted: bool = False,
-    group_findings: bool = False,
 ) -> None:
-    """Fetch and summarize the latest Prowler report under `base`.
-
-    When `group_findings` is True, top-N CHECK_ID and RESOURCE_NAME tables
-    are shown instead of a per-finding detail table — appropriate for
-    image and IaC scans that produce thousands of findings.
-    """
+    """Fetch and summarize the latest Prowler report under `base`."""
     console.rule(f"[bold]{label}[/bold]")
     csvs = list_reports(base)
     if not csvs:
@@ -458,36 +450,29 @@ def summarize_report(
         )
         console.print()
 
-        # For grouped scans the new/resolved listings are too noisy
-        # (potentially thousands of lines). Skip the listings; the count
-        # is in the panel above and detail is in the grouped tables.
-        if not group_findings:
-            if new_keys:
-                console.print("[bold red]New Unmuted Failures:[/bold red]")
-                for k in sorted(new_keys):
-                    r = curr_keys[k]
-                    console.print(
-                        f"  [{r['SEVERITY']}] {r['CHECK_ID']}: "
-                        f"{r['STATUS_EXTENDED'][:120]}"
-                    )
-                console.print()
+        if new_keys:
+            console.print("[bold red]New Unmuted Failures:[/bold red]")
+            for k in sorted(new_keys):
+                r = curr_keys[k]
+                console.print(
+                    f"  [{r['SEVERITY']}] {r['CHECK_ID']}: "
+                    f"{r['STATUS_EXTENDED'][:120]}"
+                )
+            console.print()
 
-            if resolved_keys:
-                console.print("[bold green]Resolved:[/bold green]")
-                for k in sorted(resolved_keys):
-                    r = prev_keys[k]
-                    console.print(
-                        f"  [dim][{r['SEVERITY']}] {r['CHECK_ID']}: "
-                        f"{r['STATUS_EXTENDED'][:120]}[/dim]"
-                    )
-                console.print()
+        if resolved_keys:
+            console.print("[bold green]Resolved:[/bold green]")
+            for k in sorted(resolved_keys):
+                r = prev_keys[k]
+                console.print(
+                    f"  [dim][{r['SEVERITY']}] {r['CHECK_ID']}: "
+                    f"{r['STATUS_EXTENDED'][:120]}[/dim]"
+                )
+            console.print()
 
-    # --- Unmuted failure details (grouped or per-finding) ---
+    # --- Unmuted failure details ---
     if latest["unmuted"]:
-        if group_findings:
-            _print_grouped_findings(latest["unmuted"])
-        else:
-            _print_findings_detail(latest["unmuted"])
+        _print_findings_detail(latest["unmuted"])
 
     # --- Muted findings summary ---
     if show_muted and latest["muted"]:
@@ -566,75 +551,6 @@ def _print_findings_detail(unmuted: list[dict]) -> None:
     console.print()
 
 
-def _worst_severity(rows: list[dict]) -> str:
-    """Return the most severe severity label across `rows`."""
-    if not rows:
-        return ""
-    return min(
-        (r["SEVERITY"] for r in rows),
-        key=lambda s: severity_sort({"SEVERITY": s}),
-    )
-
-
-def _print_grouped_findings(unmuted: list[dict], top_n: int = 15) -> None:
-    """Top-N tables grouped by CHECK_ID and RESOURCE_NAME.
-
-    Used for image and IaC scans where per-finding tables would be too
-    large to be useful. Shows count and worst severity for each group.
-    """
-    by_check: dict[str, list[dict]] = {}
-    by_resource: dict[str, list[dict]] = {}
-    for r in unmuted:
-        by_check.setdefault(r["CHECK_ID"], []).append(r)
-        by_resource.setdefault(r.get("RESOURCE_NAME", "") or "(no resource)", []).append(r)
-
-    check_table = Table(
-        show_header=True,
-        header_style="bold",
-        title=f"Top {top_n} Checks by Unmuted Finding Count",
-    )
-    check_table.add_column("Worst Sev")
-    check_table.add_column("Check ID")
-    check_table.add_column("Count", justify="right")
-
-    for check, rows in sorted(
-        by_check.items(), key=lambda kv: -len(kv[1])
-    )[:top_n]:
-        worst = _worst_severity(rows)
-        style = _sev_style(worst)
-        check_table.add_row(
-            f"[{style}]{worst}[/{style}]" if style else worst,
-            check,
-            str(len(rows)),
-        )
-
-    console.print(check_table)
-    console.print()
-
-    res_table = Table(
-        show_header=True,
-        header_style="bold",
-        title=f"Top {top_n} Resources by Unmuted Finding Count",
-    )
-    res_table.add_column("Worst Sev")
-    res_table.add_column("Resource")
-    res_table.add_column("Count", justify="right")
-
-    for resource, rows in sorted(
-        by_resource.items(), key=lambda kv: -len(kv[1])
-    )[:top_n]:
-        worst = _worst_severity(rows)
-        style = _sev_style(worst)
-        res_table.add_row(
-            f"[{style}]{worst}[/{style}]" if style else worst,
-            resource[:80],
-            str(len(rows)),
-        )
-
-    console.print(res_table)
-    console.print()
-
-
 def main(
     full: Annotated[
         bool, typer.Option(help="(reserved) currently a no-op; all unmuted failures already shown")
@@ -646,13 +562,12 @@ def main(
     del full  # historical flag, kept for backwards compatibility
 
     with tempfile.TemporaryDirectory() as tmpdir:
-        for label, base, group in PROWLER_SCANS:
+        for label, base in PROWLER_SCANS:
             summarize_report(
                 label,
                 base,
                 tmpdir,
                 show_muted=show_muted,
-                group_findings=group,
             )
 
     # --- Node-level MANUAL check verification ---

From db0512b5d43b3621113d6c0003abea7600fe41e6 Mon Sep 17 00:00:00 2001
From: Erich Blume <blume.erich@gmail.com>
Date: Tue, 9 Jun 2026 16:05:01 -0700
Subject: [PATCH 16/20] Doc review: 5 stalest cards; scale back ai-docs rule;
 document heph CLI (#373)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Doc review (5 stalest, all never-reviewed)

Each card was verified against live state (ArgoCD app list/health, manifests, 1Password item fields, Mealie API probe) and stamped `last-reviewed: 2026-06-09`.

| Card | Findings fixed |
|------|----------------|
| `reference/services/argocd.md` | Added Authentik SSO (public PKCE client, `--sso` login, admins→role:admin RBAC); documented dual-cluster management (minikube + ringtail k3s at `ringtail.tail8d86e.ts.net:6443`); corrected sync policy — the `apps` root is **manual**, not automated |
| `reference/services/authentik.md` | Blueprint list grown from 5 to 10 files; OIDC client table now lists all 8 clients with types; secrets table updated to `postgresql-*` fields and per-client secrets |
| `reference/services/grafana.md` | TeslaMate datasource moved to `pg.ops.eblu.me:5434` (ringtail); dashboard inventory refreshed (20 provisioned ConfigMaps); TeslaMate dashboards documented as init-container fetch from forge mirror at pinned tag; SSO role mapping wording corrected (Admin only for `admins` group) |
| `reference/infrastructure/unifi.md` | UnPoller image is now locally built (`registry.ops.eblu.me/blumeops/unpoller`); verified namespace/port |
| `how-to/mealie/plan-a-meal.md` | Procedure verified; **found that the stored API token (`op://blumeops/mealie/credential`) is rejected with HTTP 401** — operational fix in progress, doc content unchanged |

## AGENTS.md

- **Scaled back the ai-docs rule** (per discussion): agents now start by finding and reading relevant docs. The `ai-docs` mise task (~130K tokens now) is retired — `mise-tasks/ai-docs` is deleted in this change — and `ai-sources` remains as an opt-in bulk load. `agent-change-process.md`, `mise-tasks.md`, and the tutorials are updated to match.
- **Documented the heph CLI** task workflow (list/show/context/log read paths; done/drop/skip/log/edit/task write paths) so future sessions can read and manipulate Blumeops tasks without rediscovery.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Reviewed-on: https://forge.eblu.me/eblume/blumeops/pulls/373
---
 AGENTS.md                                     | 44 +++++++++++++++----
 .../changelog.d/doc-review-stalest-five.ai.md |  1 +
 .../doc-review-stalest-five.doc.md            |  1 +
 docs/explanation/agent-change-process.md      | 12 ++---
 docs/how-to/mealie/plan-a-meal.md             |  3 +-
 docs/reference/infrastructure/unifi.md        |  5 ++-
 docs/reference/services/argocd.md             | 35 +++++++++++----
 docs/reference/services/authentik.md          | 33 +++++++-------
 docs/reference/services/grafana.md            | 17 +++----
 docs/reference/tools/mise-tasks.md            |  3 +-
 docs/tutorials/ai-assistance-guide.md         |  7 ++-
 docs/tutorials/exploring-the-docs.md          | 11 +----
 mise-tasks/ai-docs                            | 13 ------
 13 files changed, 106 insertions(+), 79 deletions(-)
 create mode 100644 docs/changelog.d/doc-review-stalest-five.ai.md
 create mode 100644 docs/changelog.d/doc-review-stalest-five.doc.md
 delete mode 100755 mise-tasks/ai-docs

diff --git a/AGENTS.md b/AGENTS.md
index c64af40..510176d 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -12,10 +12,9 @@ blumeops is Erich Blume's GitOps repository for personal infrastructure, orchest
 
 ## Rules
 
-1. **Always run `mise run ai-docs` at session start**
-    This will refresh your context with important information you will be assumed to know and follow.
-    **Read the full output** — never truncate, pipe to `head`/`tail`, or skip sections.
-    For problems with a large surface area, ask the user if `mise run ai-sources` should also be run — it concatenates all non-doc source files (~270K tokens) for deep codebase context.
+1. **Start every task by finding and reading the relevant docs**
+    Search `docs/` for cards related to the change area (grep for titles/tags, follow `[[wiki-links]]`) and read what you find before acting. Wiki-links refer to cards under `docs/` by filename stem.
+    For problems with a very large surface area, `mise run ai-sources` concatenates all non-doc source files (~270K tokens) — opt-in only, confirm with the user before loading it wholesale; targeted reading is usually better.
 2. **Always use `--context=minikube-indri` with kubectl** (or `--context=k3s-ringtail` for ringtail services) - work contexts must never be touched
     **NEVER run `minikube delete`** — it destroys all PVs, etcd, and cluster state. Use `minikube stop`/`minikube start` for restarts. If minikube is stuck, see [[restart-indri]]. Full rebuild from scratch requires the DR procedure in [[rebuild-minikube-cluster]].
 3. **Classify the change as C0/C1/C2 before starting** (see below) — this determines branching and PR requirements
@@ -69,7 +68,7 @@ See [[agent-change-process]] for the full methodology.
 ~/code/3rd/             # mirrored external projects
 ~/code/work             # FORBIDDEN
 ```
-Other code paths will be listed via ai-docs, this is just an overview. When you
+This is just an overview — explore `docs/` for the rest. When you
 encounter wiki-links (`[[like-this]]`) it is referring to docs/ cards.
 
 ## Service Deployment
@@ -148,13 +147,42 @@ Create a new spork: `mise run spork-create <mirror-name>`
 ## Task Discovery
 
 BlumeOps tasks live in [hephaestus](https://github.com/eblume/hephaestus) (`heph`),
-the user's self-hosted context/task system. Fetch them with the CLI:
+the user's self-hosted context/task system. The CLI is a thin client of the
+local `hephd` daemon. (This replaced the retired `blumeops-tasks` mise task,
+which read from Todoist.)
+
+### Reading tasks
 
 ```fish
-heph list --project Blumeops --json  # outstanding Blumeops tasks as JSON
+heph list --project Blumeops --json   # outstanding Blumeops tasks as JSON
+heph next                             # tactical "what is next?" ranking
+heph show <node_id>                   # one task with its scalars
+heph context <node_id>                # print the task's canonical-context doc
+heph log <node_id>                    # print the task's latest log entries
 ```
 
-(This replaced the retired `blumeops-tasks` mise task, which read from Todoist.)
+JSON rows carry `node_id` (use this as `<node_id>` in the commands above and below), `title`,
+`state`, `do_date`/`late_on` (epoch ms), `recurrence` (RFC-5545), and
+`attention` (red|orange|white|blue — a1–a4 urgency tiers; blue = on-deck).
+
+### Manipulating tasks
+
+```fish
+heph done <node_id>                   # mark done (recurring tasks roll forward)
+heph drop <node_id>                   # mark dropped
+heph skip <node_id>                   # skip a recurring task's current occurrence
+heph log <node_id> "text"             # append a log entry
+heph context <node_id> --append "…"   # append to the canonical-context doc (--body replaces; `-` reads stdin)
+heph edit <node_id> --do-date +3d     # reschedule; also --late-on/--recur/--attention/--project (`none` clears)
+heph task "Title" --project Blumeops --do-date fri --attention white  # create a task
+```
+
+Date forms: `today|tomorrow|+3d|fri|YYYY-MM-DD`. Recurrence: presets
+(`daily|weekly|monthly|yearly|weekdays`) or natural language (`"every 3 days"`).
+
+Conventions: don't save TODOs to agent memory — file them as heph tasks under
+the Blumeops project. When completing a recurring chore (e.g. "BlumeOps doc
+review"), `heph log` a short note of what was done, then `heph done` it.
 
 Most operational scripts are stored in `./mise-tasks/`. For scripts with any logic or
 complexity, use uv run --script 's with explicit dependencies. Complex
diff --git a/docs/changelog.d/doc-review-stalest-five.ai.md b/docs/changelog.d/doc-review-stalest-five.ai.md
new file mode 100644
index 0000000..95da490
--- /dev/null
+++ b/docs/changelog.d/doc-review-stalest-five.ai.md
@@ -0,0 +1 @@
+Retired the `ai-docs` mise task and its mandatory session-start rule: the concatenated docs corpus had grown to ~130K tokens, too large to ingest wholesale. Agents now start tasks by finding and reading the relevant docs (grep + wiki-links); `ai-sources` remains for opt-in deep codebase context. Also documented the full `heph` CLI task workflow (read, log, complete, create) in AGENTS.md.
diff --git a/docs/changelog.d/doc-review-stalest-five.doc.md b/docs/changelog.d/doc-review-stalest-five.doc.md
new file mode 100644
index 0000000..8353e3d
--- /dev/null
+++ b/docs/changelog.d/doc-review-stalest-five.doc.md
@@ -0,0 +1 @@
+Reviewed the five stalest documentation cards (argocd, authentik, grafana, unifi, plan-a-meal): brought ArgoCD's SSO/dual-cluster/sync-policy story up to date, expanded Authentik's blueprint and OIDC client inventory to all eight clients, fixed Grafana's TeslaMate datasource target and dashboard list, and noted UnPoller's locally-built image.
diff --git a/docs/explanation/agent-change-process.md b/docs/explanation/agent-change-process.md
index 5141950..a6d8684 100644
--- a/docs/explanation/agent-change-process.md
+++ b/docs/explanation/agent-change-process.md
@@ -1,6 +1,6 @@
 ---
 title: Agent Change Process
-modified: 2026-03-15
+modified: 2026-06-09
 last-reviewed: 2026-02-23
 tags:
   - explanation
@@ -25,13 +25,13 @@ Before starting work, classify the change:
 
 When in doubt, start at C1. Upgrade to C2 if complexity spirals or the user requests it.
 
-**Context loading:** All change classes start with `mise run ai-docs` (~85K tokens of documentation). For problems with a large surface area, ask the user if `mise run ai-sources` should also be run — it concatenates all non-doc source files (~270K tokens). Together they cover the full codebase without overlap.
+**Context loading:** All change classes start by finding and reading the docs relevant to the change area — grep `docs/` and follow wiki-links. For problems with a very large surface area, `mise run ai-sources` concatenates all non-doc source files (~270K tokens); confirm with the user before loading it wholesale.
 
 ## C0 — Quick Fix
 
 A change where the risk is low enough that problems can be quickly fixed forward.
 
-1. Run `mise run ai-docs` to load context
+1. Find and read the docs relevant to the change area
 2. Implement the change directly on main
 3. Add a changelog fragment if the change is user-visible or noteworthy (`docs/changelog.d/+<descriptive-slug>.<type>.md`)
 4. Commit and push
@@ -46,7 +46,7 @@ A change with enough complexity or risk that a human should review it, but not s
 
 ### Process
 
-1. Run `mise run ai-docs` to load context
+1. Find and read the docs relevant to the change area
 2. **Search related docs** — read existing documentation and reference cards related to the change area
 3. **Create a feature branch** and open a PR early (draft is fine)
 4. **Documentation first** — commit doc changes reflecting the desired end state before writing code. This helps the reviewer understand intent and catches design issues early
@@ -77,7 +77,7 @@ A complex, multi-session change managed through the [Mikado method](https://mika
 
 Before writing any code, invest in understanding the problem:
 
-1. Run `mise run ai-docs` to load context
+1. Find and read the docs relevant to the change area
 2. Search related docs, reference cards, and existing how-to guides for the change area
 3. Think through the dependency graph — what prerequisites exist? What could go wrong?
 4. Create Mikado cards for everything you can anticipate (you'll discover more later — that's the point of the method)
@@ -220,7 +220,7 @@ When the final leaf node is closed and no `status: active` cards remain:
 
 When starting a new session to continue C2 work:
 
-1. Run `mise run ai-docs` to load context
+1. Find and read the docs relevant to the change area
 2. Run `mise run docs-mikado --resume` — this will:
    - Detect the current branch and match it to an active chain
    - Show the chain state, ready leaf nodes, and current position in the invariant
diff --git a/docs/how-to/mealie/plan-a-meal.md b/docs/how-to/mealie/plan-a-meal.md
index 1e6eb48..10de2cb 100644
--- a/docs/how-to/mealie/plan-a-meal.md
+++ b/docs/how-to/mealie/plan-a-meal.md
@@ -1,6 +1,7 @@
 ---
 title: Plan a Meal
-modified: 2026-03-17
+modified: 2026-06-09
+last-reviewed: 2026-06-09
 tags:
   - how-to
   - mealie
diff --git a/docs/reference/infrastructure/unifi.md b/docs/reference/infrastructure/unifi.md
index 6182880..43297e7 100644
--- a/docs/reference/infrastructure/unifi.md
+++ b/docs/reference/infrastructure/unifi.md
@@ -1,6 +1,7 @@
 ---
 title: UniFi
-modified: 2026-03-16
+modified: 2026-06-09
+last-reviewed: 2026-06-09
 tags:
   - infrastructure
   - networking
@@ -71,7 +72,7 @@ Attempted Feb 2026 with the `ubiquiti-community/unifi` Terraform provider via Pu
 
 ## Monitoring
 
-UniFi metrics are exported to Prometheus via [UnPoller](https://github.com/unpoller/unpoller), running as a k8s deployment in the `monitoring` namespace on indri. UnPoller polls the UX7 controller API using an API key and exposes metrics on port 9130.
+UniFi metrics are exported to Prometheus via [UnPoller](https://github.com/unpoller/unpoller), running as a k8s deployment in the `monitoring` namespace on indri's minikube (`argocd/manifests/unpoller/`, locally-built image `registry.ops.eblu.me/blumeops/unpoller`). UnPoller polls the UX7 controller API using an API key and exposes metrics on port 9130.
 
 - **Prometheus job:** `unpoller`
 - **Metrics prefix:** `unifi_`
diff --git a/docs/reference/services/argocd.md b/docs/reference/services/argocd.md
index e890cc5..2eaecb5 100644
--- a/docs/reference/services/argocd.md
+++ b/docs/reference/services/argocd.md
@@ -1,6 +1,7 @@
 ---
 title: ArgoCD
-modified: 2026-02-07
+modified: 2026-06-09
+last-reviewed: 2026-06-09
 tags:
   - service
   - gitops
@@ -18,22 +19,38 @@ GitOps continuous delivery platform for the [[cluster|Kubernetes cluster]].
 | **Tailscale URL** | https://argocd.tail8d86e.ts.net |
 | **Namespace** | `argocd` |
 | **Git Source** | `ssh://forgejo@forge.ops.eblu.me:2222/eblume/blumeops.git` |
-| **Manifests Path** | `argocd/` |
+| **Manifests Path** | `argocd/apps/` (Applications), `argocd/manifests/` (workloads) |
+
+## Clusters
+
+A single ArgoCD instance (on indri's minikube) manages both clusters:
+
+| Cluster | Destination | Apps |
+|---------|-------------|------|
+| minikube (indri) | `https://kubernetes.default.svc` | Most services |
+| k3s ([[ringtail]]) | `https://ringtail.tail8d86e.ts.net:6443` | GPU workloads and `*-ringtail` apps |
 
 ## Sync Policy
 
-| Application | Sync Policy | Rationale |
-|-------------|-------------|-----------|
-| `apps` | Automated | Picks up new Application manifests |
-| All workloads | Manual | Explicit control over deployments |
+All applications use **manual sync** — including the `apps` app-of-apps root. To pick up newly added Application manifests, sync `apps` explicitly:
 
-## Credentials
+```bash
+argocd app sync apps
+```
 
-- Admin password: 1Password (blumeops vault)
-- Git deploy key (SSH): 1Password
+This gives explicit control over every deployment; nothing rolls out on push alone.
+
+## Authentication
+
+- **SSO via [[authentik]]** — OIDC with a public PKCE client (`argocd`), shared by the web UI and CLI: `argocd login argocd.ops.eblu.me --sso`. The Authentik `admins` group maps to `role:admin` via the RBAC ConfigMap; the default policy grants no access.
+- **Local admin** — break-glass password in 1Password (blumeops vault), for when Authentik is down.
+
+The git deploy key (SSH) is injected via [[external-secrets]].
 
 ## Related
 
 - [[argocd-cli]] - CLI usage and deployment workflows
 - [[apps|Apps]] - Full application registry
 - [[forgejo]] - Git source
+- [[authentik]] - OIDC identity provider for SSO
+- [[federated-login]] - How authentication works across BlumeOps
diff --git a/docs/reference/services/authentik.md b/docs/reference/services/authentik.md
index 89a17cc..480f59b 100644
--- a/docs/reference/services/authentik.md
+++ b/docs/reference/services/authentik.md
@@ -1,6 +1,7 @@
 ---
 title: Authentik
-modified: 2026-02-20
+modified: 2026-06-09
+last-reviewed: 2026-06-09
 tags:
   - service
   - security
@@ -42,9 +43,7 @@ Authentik configuration is managed via Blueprints (YAML) stored as a ConfigMap m
 
 - **`common.yaml`** — shared identity resources (`admins` group)
 - **`mfa.yaml`** — MFA enforcement on the default authentication flow (`not_configured_action: configure`)
-- **`grafana.yaml`** — Grafana OAuth2 provider, application, and policy binding
-- **`forgejo.yaml`** — Forgejo OAuth2 provider, application, and policy binding
-- **`zot.yaml`** — Zot registry OAuth2 provider, application, and policy binding
+- One blueprint per OIDC client (provider, application, and policy binding): `grafana.yaml`, `forgejo.yaml`, `zot.yaml`, `argocd.yaml`, `jellyfin.yaml`, `mealie.yaml`, `paperless.yaml`, `heph.yaml`
 
 Group membership is included in the `profile` scope claim (Authentik built-in). Services use `--group-claim-name groups` to read it.
 
@@ -52,13 +51,18 @@ Blueprint file: `argocd/manifests/authentik/configmap-blueprint.yaml`
 
 ## OIDC Clients
 
-| Client | Status |
-|--------|--------|
-| [[grafana]] | Active |
-| [[forgejo]] | Active |
-| [[zot]] | Active |
+| Client | Type |
+|--------|------|
+| [[grafana]] | Confidential |
+| [[forgejo]] | Confidential |
+| [[zot]] | Confidential |
+| [[argocd]] | Public (PKCE, shared by web UI and CLI) |
+| [[jellyfin]] | Confidential |
+| [[mealie]] | Confidential |
+| [[paperless]] | Confidential |
+| heph | Public (PKCE, with `offline_access` for spoke sync refresh tokens) |
 
-Future clients: [[argocd]], [[miniflux]]
+Future clients: [[miniflux]]
 
 ## Secrets
 
@@ -67,11 +71,10 @@ Injected via [[external-secrets]] from the "Authentik (blumeops)" 1Password item
 | 1Password Field | Purpose |
 |-----------------|---------|
 | `secret-key` | Authentik secret key |
-| `db-password` | PostgreSQL password |
-| `grafana-client-secret` | OIDC client secret for Grafana |
-| `forgejo-client-secret` | OIDC client secret for Forgejo |
-| `zot-client-secret` | OIDC client secret for Zot |
-| `api-token` | Authentik API token |
+| `postgresql-host` / `-port` / `-name` / `-user` / `-password` | PostgreSQL connection |
+| `<client>-client-secret` | OIDC client secret, one per confidential client (grafana, forgejo, zot, jellyfin, mealie, paperless) |
+
+The item also holds an `api-token` field (Authentik API access for admin scripting); it is not synced into the cluster.
 
 ## Container Image
 
diff --git a/docs/reference/services/grafana.md b/docs/reference/services/grafana.md
index 3a9ae01..d6b812c 100644
--- a/docs/reference/services/grafana.md
+++ b/docs/reference/services/grafana.md
@@ -1,6 +1,7 @@
 ---
 title: Grafana
-modified: 2026-02-28
+modified: 2026-06-09
+last-reviewed: 2026-06-09
 tags:
   - service
   - observability
@@ -25,7 +26,7 @@ Dashboards and visualization for BlumeOps observability.
 
 Grafana supports two login methods:
 
-- **SSO via [[authentik]]** — OIDC login through Authentik (`auth.generic_oauth`). Users click "Sign in with Authentik", authenticate at Authentik, and are redirected back as Admin.
+- **SSO via [[authentik]]** — OIDC login through Authentik (`auth.generic_oauth`). Members of the Authentik `admins` group get the Admin role; everyone else gets Viewer (`role_attribute_path` in `grafana.ini`).
 - **Local admin** — break-glass login using the password from 1Password ("Grafana (blumeops)"). Always available if Authentik is down.
 
 The OIDC client secret is injected via [[external-secrets]] (`grafana-authentik-oauth` secret in monitoring namespace).
@@ -37,7 +38,7 @@ The OIDC client secret is injected via [[external-secrets]] (`grafana-authentik-
 | Prometheus | prometheus | `prometheus.monitoring.svc.cluster.local:9090` |
 | Loki | loki | `loki.monitoring.svc.cluster.local:3100` |
 | Tempo | tempo | `tempo.monitoring.svc.cluster.local:3200` |
-| TeslaMate | postgres | `blumeops-pg-rw.databases.svc.cluster.local:5432` |
+| TeslaMate | postgres | `pg.ops.eblu.me:5434` (TeslaMate's database on [[ringtail]], via Caddy L4) |
 
 ## Dashboard Provisioning
 
@@ -49,13 +50,9 @@ Optional annotation: `grafana_folder: "FolderName"`
 
 ## Key Dashboards
 
-- macOS System - Host metrics for indri
-- Minikube - Kubernetes cluster overview
-- Borgmatic Backups - Backup status and trends
-- Services Health - HTTP probe results
-- Docs APM - Request rate, latency, cache for docs.eblu.me
-- Fly.io Proxy Health - Aggregate proxy health across all upstream services
-- TeslaMate (18 dashboards) - Vehicle data
+Provisioned dashboards live in `argocd/manifests/grafana-config/dashboards/` (one ConfigMap per dashboard). Coverage as of 2026-06: alerts, borgmatic, CV APM, devpi, docs APM, fly.io proxy, forgejo, frigate, jellyfin, kubernetes, loki, macOS (indri host), postgresql, ringtail, shower APM, sifaka disks, snowflake proxy, tempo, transmission, zot.
+
+TeslaMate's dashboards are not in the repo — an init container fetches them from the forge mirror at a pinned tag (`TESLAMATE_VERSION` in `argocd/manifests/grafana/deployment.yaml`).
 
 ## Related
 
diff --git a/docs/reference/tools/mise-tasks.md b/docs/reference/tools/mise-tasks.md
index b614cb1..f777aa5 100644
--- a/docs/reference/tools/mise-tasks.md
+++ b/docs/reference/tools/mise-tasks.md
@@ -1,6 +1,6 @@
 ---
 title: Mise Tasks
-modified: 2026-04-11
+modified: 2026-06-09
 tags:
   - reference
   - tools
@@ -17,7 +17,6 @@ Run `mise tasks --sort name` for the live list with descriptions.
 
 | Task | Description |
 |------|-------------|
-| `ai-docs` | All documentation concatenated for AI context (~85K tokens) |
 | `ai-sources` | All non-doc source files for deep AI context (~270K tokens) |
 | `docs-check-frontmatter` | Check required frontmatter fields |
 | `docs-check-links` | Validate wiki-links resolve correctly (supports path-based links) |
diff --git a/docs/tutorials/ai-assistance-guide.md b/docs/tutorials/ai-assistance-guide.md
index 4f0c595..d3e23d7 100644
--- a/docs/tutorials/ai-assistance-guide.md
+++ b/docs/tutorials/ai-assistance-guide.md
@@ -1,6 +1,6 @@
 ---
 title: AI Assistance Guide
-modified: 2026-02-23
+modified: 2026-06-09
 tags:
   - tutorials
   - ai
@@ -17,7 +17,7 @@ This guide provides context for AI agents assisting with BlumeOps operations, an
 These are non-negotiable for AI agents working in this repo:
 
 1. **Always use `--context=minikube-indri` with kubectl** - Work contexts exist that must never be touched
-2. **Run `mise run ai-docs` at session start** - Review current infrastructure state
+2. **Start every task by finding and reading the relevant docs** - Grep `docs/` and follow wiki-links
 3. **Never commit secrets** - The repo is public at github.com/eblume/blumeops
 4. **Wait for user review before deploying** - Create PRs, don't auto-deploy
 5. **Never merge PRs without explicit request** - The user merges after review
@@ -91,8 +91,7 @@ BlumeOps operations are driven by mise tasks. Run `mise tasks` to list all avail
 
 | Task | When to Use |
 |------|-------------|
-| `ai-docs` | At session start - all documentation concatenated for AI context (~85K tokens, see [[mise-tasks]]) |
-| `ai-sources` | Deep context - all non-doc source files (~270K tokens). Ask user before running; useful for problems with a large surface area |
+| `ai-sources` | Deep context - all non-doc source files (~270K tokens). Ask user before running; useful for problems with a large surface area (see [[mise-tasks]]) |
 | `docs-mikado` | View active Mikado dependency chains for C2 changes |
 | `docs-mikado --resume` | Resume a C2 chain: detect branch, show state and next steps |
 | `provision-indri` | Deploy changes to [[indri]]-hosted services via Ansible |
diff --git a/docs/tutorials/exploring-the-docs.md b/docs/tutorials/exploring-the-docs.md
index 2fd5f66..43966ec 100644
--- a/docs/tutorials/exploring-the-docs.md
+++ b/docs/tutorials/exploring-the-docs.md
@@ -1,6 +1,6 @@
 ---
 title: Exploring the Docs
-modified: 2026-02-10
+modified: 2026-06-09
 tags:
   - tutorials
   - getting-started
@@ -31,7 +31,6 @@ You probably want quick access to operational details:
 - [How-to](/how-to/) guides for common operations (deploy, troubleshoot, update ACLs)
 - [Reference](/reference/) has service URLs, commands, and config locations
 - [[ai-assistance-guide]] explains how to work effectively with AI agents
-- Run `mise run ai-docs` to prime AI context with key documentation
 
 ### For AI Agents
 
@@ -75,13 +74,7 @@ Prek hooks validate that all wiki-links resolve to existing files and flag ambig
 
 ## AI Context Priming
 
-The `ai-docs` mise task concatenates key documentation files for AI context:
-
-```bash
-mise run ai-docs
-```
-
-This outputs key documentation files and a full tree listing of all docs, providing an agent with essential context for BlumeOps operations.
+AI agents prime themselves by searching `docs/` for cards relevant to the task at hand and following wiki-links from there. (The retired `ai-docs` mise task used to concatenate every doc for this purpose, but the corpus outgrew a context window.) For deep codebase questions, `mise run ai-sources` concatenates all non-doc source files.
 
 ## Related
 
diff --git a/mise-tasks/ai-docs b/mise-tasks/ai-docs
deleted file mode 100755
index 66e11d7..0000000
--- a/mise-tasks/ai-docs
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/usr/bin/env bash
-#MISE description="Prime AI context with all BlumeOps documentation"
-
-set -euo pipefail
-
-DOCS_DIR="$(cd "$(dirname "$0")/.." && pwd)/docs"
-
-# Concatenate all docs (excluding changelog fragments)
-find "$DOCS_DIR" -name '*.md' -not -path '*/changelog.d/*' | sort | while read -r f; do
-  printf '=== %s ===\n' "${f#"$DOCS_DIR/"}"
-  cat "$f"
-  printf '\n'
-done

From ccbc2ff0a94ecd22df68c255e4bbc0149bab4a70 Mon Sep 17 00:00:00 2001
From: Erich Blume <blume.erich@gmail.com>
Date: Tue, 9 Jun 2026 16:08:39 -0700
Subject: [PATCH 17/20] C0: service-review automounter (1.13.0, healthy); fix
 tracking-file path in script

AutoMounter on indri auto-updated to 1.13.0 via the App Store, matching
the latest upstream release; all seven sifaka SMB mounts are live and
the app + helper are running. The service-review script's guidance text
pointed at docs/reference/services/service-versions.yaml, but the file
lives at the repo root (where the script actually reads it).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 docs/changelog.d/+service-review-automounter.misc.md | 1 +
 mise-tasks/service-review                            | 4 ++--
 service-versions.yaml                                | 4 ++--
 3 files changed, 5 insertions(+), 4 deletions(-)
 create mode 100644 docs/changelog.d/+service-review-automounter.misc.md

diff --git a/docs/changelog.d/+service-review-automounter.misc.md b/docs/changelog.d/+service-review-automounter.misc.md
new file mode 100644
index 0000000..31e5644
--- /dev/null
+++ b/docs/changelog.d/+service-review-automounter.misc.md
@@ -0,0 +1 @@
+Service review: AutoMounter on indri is current at 1.13.0 (App Store auto-updated from the tracked 1.11.0); all sifaka SMB mounts verified healthy. Fixed the stale tracking-file path shown by `mise run service-review`.
diff --git a/mise-tasks/service-review b/mise-tasks/service-review
index f83b104..d22097f 100755
--- a/mise-tasks/service-review
+++ b/mise-tasks/service-review
@@ -8,7 +8,7 @@
 #USAGE flag "--type <type>" help="Filter by service type (argocd, ansible, nixos, fly, mise)"
 """Review the most stale service for version freshness.
 
-Reads ``docs/reference/services/service-versions.yaml`` and sorts services
+Reads ``service-versions.yaml`` (repo root) and sorts services
 by the ``last-reviewed`` field. Services without the field (or null) are
 treated as never-reviewed and float to the top. Displays a staleness table
 and then shows the most stale service with a review checklist.
@@ -210,7 +210,7 @@ def main(
         "• Verify the service is running and healthy\n",
         "• Check logs for errors or warnings\n",
         "\n[bold]After Review:[/bold]\n",
-        "• Update the tracking file: [cyan]docs/reference/services/service-versions.yaml[/cyan]\n",
+        "• Update the tracking file: [cyan]service-versions.yaml[/cyan] (repo root)\n",
         f"• Set [cyan]last-reviewed: {today}[/cyan] and [cyan]current-version[/cyan]\n",
         "• Commit the change (along with any upgrades)",
     ]
diff --git a/service-versions.yaml b/service-versions.yaml
index 419d129..95b9e44 100644
--- a/service-versions.yaml
+++ b/service-versions.yaml
@@ -452,8 +452,8 @@ services:
 
   - name: automounter
     type: ansible
-    last-reviewed: 2026-03-17
-    current-version: "1.11.0"
+    last-reviewed: 2026-06-09
+    current-version: "1.13.0"
     upstream-source: https://www.pixeleyes.co.nz/automounter/
     notes: Mac App Store app, no Ansible role. Updates via App Store.
 

From b24fd147ac13b802a52c6eb609a7bb4e345ba5c4 Mon Sep 17 00:00:00 2001
From: Erich Blume <blume.erich@gmail.com>
Date: Tue, 9 Jun 2026 16:27:08 -0700
Subject: [PATCH 18/20] C0: fix 1Password export menu wording in backup how-to

The desktop app's menu is File > Export > <account name> (e.g.
Blume/Davis), not "All Vaults". Verified an account-level 1PUX export
contains all four vaults (Private, blumeops, Payrix, Shared). Updated
the op-backup script's prompt text to match.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 docs/changelog.d/+1password-export-menu-wording.doc.md | 1 +
 docs/how-to/operations/run-1password-backup.md         | 6 +++---
 mise-tasks/op-backup                                   | 2 +-
 3 files changed, 5 insertions(+), 4 deletions(-)
 create mode 100644 docs/changelog.d/+1password-export-menu-wording.doc.md

diff --git a/docs/changelog.d/+1password-export-menu-wording.doc.md b/docs/changelog.d/+1password-export-menu-wording.doc.md
new file mode 100644
index 0000000..1236ffc
--- /dev/null
+++ b/docs/changelog.d/+1password-export-menu-wording.doc.md
@@ -0,0 +1 @@
+Corrected the 1Password backup how-to: the desktop app's export menu item is named after the account ("File > Export > Blume/Davis"), not "All Vaults". Verified an account export contains all four vaults (Private, blumeops, Payrix, Shared).
diff --git a/docs/how-to/operations/run-1password-backup.md b/docs/how-to/operations/run-1password-backup.md
index 0dc9ec9..2f8c88a 100644
--- a/docs/how-to/operations/run-1password-backup.md
+++ b/docs/how-to/operations/run-1password-backup.md
@@ -1,7 +1,7 @@
 ---
 title: Run 1Password Backup
-modified: 2026-03-11
-last-reviewed: 2026-03-16
+modified: 2026-06-09
+last-reviewed: 2026-06-09
 tags:
   - how-to
   - operations
@@ -24,7 +24,7 @@ How to export and encrypt your 1Password vaults for inclusion in [[borgmatic]] b
 ### 1. Export Vaults From 1Password
 
 1. Open the 1Password desktop app
-2. **File > Export > All Vaults**
+2. **File > Export > Blume/Davis** (the menu item is named after the account, not "All Vaults" — exporting the account covers all vaults: Private, blumeops, Payrix, and Shared)
 3. Choose **1PUX** format
 4. Save to `~/Documents/` — 1Password names the file `1PasswordExport-<account-uuid>-<timestamp>.1pux` automatically; don't bother renaming it, pass the path to the task in the next step
 
diff --git a/mise-tasks/op-backup b/mise-tasks/op-backup
index 7db033b..a8a5dc2 100755
--- a/mise-tasks/op-backup
+++ b/mise-tasks/op-backup
@@ -86,7 +86,7 @@ def get_export_path(argv_path: str | None) -> Path | None:
         else:
             console.print("Export your vaults from the 1Password desktop app:")
             console.print("  1. Open 1Password")
-            console.print("  2. File > Export > All Vaults (or select specific vaults)")
+            console.print("  2. File > Export > <account name> (exports all vaults in the account)")
             console.print(f"  3. Save as 1PUX format to: [cyan]{EXPORT_DIR}[/cyan]")
             console.print()
             raw = console.input("Path to .1pux file: ").strip()

From d03ed337a9dfb4dc3cafb658d2cb3fccc193c797 Mon Sep 17 00:00:00 2001
From: Erich Blume <blume.erich@gmail.com>
Date: Tue, 9 Jun 2026 17:45:23 -0700
Subject: [PATCH 19/20] Localize the Tailscale operator stack (k8s-operator +
 indri ProxyClass) (#374)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Weekly non-local-container task: localize the Tailscale operator stack on **both clusters**.

## What

- **`containers/tailscale-operator/`** (new) — builds `cmd/k8s-operator` v1.94.2 from the forge mirror, mirroring upstream's mkctr recipe (`/usr/local/bin/operator`, `ts_kube,ts_package_container` go tags, version stamps). `container.py` (dagger) for indri/arm64; `default.nix` for ringtail/amd64.
- **`containers/tailscale/container.py`** (new) — dagger/arm64 build of the proxy image (containerboot), mirroring the upstream Dockerfile (iptables-legacy symlinks, `/tailscale/run.sh` compat). Ringtail already consumes the existing nix build; this completes parity for indri.
- **Version pinned at v1.94.2** (same as currently deployed) — this PR is a pure supply-chain swap, no version change. v1.96.x is avoided deliberately (MagicDNS-in-containers regression).
- Docs-first: tailscale-operator card gains **Local Images** and **Rollout Safety** sections.

## Rollout plan (after image builds)

1. Manifest commit: per-overlay `images:` override for the operator + ProxyClass strategic-merge patch on indri (kustomize `images:` can't touch CR fields).
2. `argocd app set tailscale-operator --revision <branch> && argocd app sync` — indri first, verify, then ringtail.
3. **Shadow-device safety**: device identity lives in the tailscale state Secrets; an image swap re-uses existing node keys, so no `-1` clones. State Secrets are not touched. Post-sync verification: pod health, device names unchanged, `mise run services-check`.

## Follow-ups (not this PR)

- `dnsconfig` nameserver image (`tailscale/k8s-nameserver:stable`) still upstream.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Reviewed-on: https://forge.eblu.me/eblume/blumeops/pulls/374
---
 .../kustomization.yaml                        |  19 +++-
 .../tailscale-operator/kustomization.yaml     |  20 ++++
 .../tailscale-operator/proxyclass-image.yaml  |  11 ++
 containers/tailscale-operator/container.py    |  53 +++++++++
 containers/tailscale-operator/default.nix     |  67 +++++++++++
 containers/tailscale/container.py             | 104 ++++++++++++++++++
 .../localize-tailscale-operator.infra.md      |   1 +
 .../kubernetes/tailscale-operator.md          |  44 +++++++-
 8 files changed, 307 insertions(+), 12 deletions(-)
 create mode 100644 argocd/manifests/tailscale-operator/proxyclass-image.yaml
 create mode 100644 containers/tailscale-operator/container.py
 create mode 100644 containers/tailscale-operator/default.nix
 create mode 100644 containers/tailscale/container.py
 create mode 100644 docs/changelog.d/localize-tailscale-operator.infra.md

diff --git a/argocd/manifests/tailscale-operator-ringtail/kustomization.yaml b/argocd/manifests/tailscale-operator-ringtail/kustomization.yaml
index 2d9ceb2..fc119c9 100644
--- a/argocd/manifests/tailscale-operator-ringtail/kustomization.yaml
+++ b/argocd/manifests/tailscale-operator-ringtail/kustomization.yaml
@@ -9,12 +9,19 @@ resources:
   - proxygroup-ingress.yaml
   - external-secret.yaml
 
-# Rewrite the proxyclass image to our local nix-built mirror.
-# Scoped to ringtail only; indri's tailscale-operator/kustomization.yaml still
-# pulls from upstream docker.io. A strategic merge patch is used instead of
-# kustomize's `images:` directive because that directive only rewrites images
-# in standard k8s container fields, not custom-resource fields like
-# ProxyClass.spec.statefulSet.pod.tailscaleContainer.image.
+# Rewrite the operator image to the locally nix-built (amd64) mirror.
+# The name must match the post-base-render image (base already rewrites
+# tailscale/k8s-operator -> docker.io/tailscale/k8s-operator).
+images:
+  - name: docker.io/tailscale/k8s-operator
+    newName: registry.ops.eblu.me/blumeops/tailscale-operator
+    newTag: v1.94.2-ac40a18-nix
+
+# Rewrite the proxyclass image to our local nix-built mirror (indri's overlay
+# carries the equivalent dagger/arm64 patch). A strategic merge patch is used
+# instead of kustomize's `images:` directive because that directive only
+# rewrites images in standard k8s container fields, not custom-resource fields
+# like ProxyClass.spec.statefulSet.pod.tailscaleContainer.image.
 patches:
   - path: proxyclass-image.yaml
     target:
diff --git a/argocd/manifests/tailscale-operator/kustomization.yaml b/argocd/manifests/tailscale-operator/kustomization.yaml
index f1d6f89..ad275a9 100644
--- a/argocd/manifests/tailscale-operator/kustomization.yaml
+++ b/argocd/manifests/tailscale-operator/kustomization.yaml
@@ -14,3 +14,23 @@ resources:
   # Endpoints). Apply manually:
   #   kubectl --context=minikube-indri apply -f endpoints-forge.yaml
   - ingress-forge.yaml
+
+# Rewrite the operator image to the locally dagger-built (arm64) mirror.
+# The name must match the post-base-render image (base already rewrites
+# tailscale/k8s-operator -> docker.io/tailscale/k8s-operator).
+images:
+  - name: docker.io/tailscale/k8s-operator
+    newName: registry.ops.eblu.me/blumeops/tailscale-operator
+    newTag: v1.94.2-ac40a18
+
+# Rewrite the proxyclass image to the local mirror. A strategic merge patch
+# is used instead of kustomize's `images:` directive because that directive
+# only rewrites standard k8s container fields, not custom-resource fields
+# like ProxyClass.spec.statefulSet.pod.tailscaleContainer.image.
+patches:
+  - path: proxyclass-image.yaml
+    target:
+      group: tailscale.com
+      version: v1alpha1
+      kind: ProxyClass
+      name: default
diff --git a/argocd/manifests/tailscale-operator/proxyclass-image.yaml b/argocd/manifests/tailscale-operator/proxyclass-image.yaml
new file mode 100644
index 0000000..eae73eb
--- /dev/null
+++ b/argocd/manifests/tailscale-operator/proxyclass-image.yaml
@@ -0,0 +1,11 @@
+apiVersion: tailscale.com/v1alpha1
+kind: ProxyClass
+metadata:
+  name: default
+spec:
+  statefulSet:
+    pod:
+      tailscaleContainer:
+        image: registry.ops.eblu.me/blumeops/tailscale:v1.94.2-ac40a18
+      tailscaleInitContainer:
+        image: registry.ops.eblu.me/blumeops/tailscale:v1.94.2-ac40a18
diff --git a/containers/tailscale-operator/container.py b/containers/tailscale-operator/container.py
new file mode 100644
index 0000000..ff63845
--- /dev/null
+++ b/containers/tailscale-operator/container.py
@@ -0,0 +1,53 @@
+"""Tailscale Kubernetes operator — native Dagger build.
+
+Single Go binary (cmd/k8s-operator) from the forge mirror, mirroring
+upstream's build_docker.sh mkctr recipe: binary at /usr/local/bin/operator,
+go tags ts_kube + ts_package_container, version stamps in ldflags.
+
+Consumed by the tailscale-operator app on indri's minikube (arm64); the
+ringtail app uses the -nix tag from default.nix instead.
+"""
+
+import dagger
+
+from blumeops.containers import (
+    alpine_runtime,
+    clone_from_forge,
+    go_build,
+    oci_labels,
+)
+
+VERSION = "v1.94.2"
+
+
+async def build(src: dagger.Directory) -> dagger.Container:
+    source = clone_from_forge("tailscale", VERSION)
+    semver = VERSION.removeprefix("v")
+
+    builder = go_build(
+        source,
+        "/out/operator",
+        cmd_path="./cmd/k8s-operator",
+        tags="ts_kube,ts_package_container",
+        ldflags=(
+            "-w -s"
+            f" -X tailscale.com/version.longStamp={semver}"
+            f" -X tailscale.com/version.shortStamp={semver}"
+        ),
+    )
+
+    # Upstream runs the operator as root on a minimal base, hence
+    # create_user=False; only CA certs are needed at runtime (the operator
+    # talks to the k8s API and Tailscale control plane over HTTPS).
+    runtime = alpine_runtime(extra_apk=["ca-certificates"], create_user=False)
+    runtime = oci_labels(
+        runtime,
+        title="Tailscale Kubernetes Operator",
+        description="Tailscale operator for Kubernetes Ingress/egress proxies",
+        version=VERSION,
+    )
+    return runtime.with_file(
+        "/usr/local/bin/operator",
+        builder.file("/out/operator"),
+        permissions=0o555,
+    ).with_entrypoint(["/usr/local/bin/operator"])
diff --git a/containers/tailscale-operator/default.nix b/containers/tailscale-operator/default.nix
new file mode 100644
index 0000000..8b279d5
--- /dev/null
+++ b/containers/tailscale-operator/default.nix
@@ -0,0 +1,67 @@
+# Nix-built tailscale k8s-operator for ringtail's tailscale-operator app.
+# Builds cmd/k8s-operator v1.94.2 from the forge mirror, mirroring upstream's
+# build_docker.sh mkctr recipe (binary at /usr/local/bin/operator, ts_kube +
+# ts_package_container go tags). Built on the ringtail nix-container-builder.
+{ pkgs ? import <nixpkgs> { } }:
+
+let
+  version = "1.94.2";
+
+  src = pkgs.fetchgit {
+    url = "https://forge.ops.eblu.me/mirrors/tailscale.git";
+    rev = "v${version}";
+    hash = "sha256-qjWVB8xWVgIVUgrf27F6hwiFIE+4ERXWeHv26ugg/x4=";
+  };
+
+  operator = pkgs.buildGoModule {
+    inherit src version;
+    pname = "tailscale-operator";
+    vendorHash = "sha256-WeMTOkERj4hvdg4yPaZ1gRgKnhRIBXX55kUVbX/k/xM=";
+
+    subPackages = [ "cmd/k8s-operator" ];
+
+    tags = [
+      "ts_kube"
+      "ts_package_container"
+    ];
+
+    ldflags = [
+      "-s"
+      "-w"
+      "-X tailscale.com/version.longStamp=${version}"
+      "-X tailscale.com/version.shortStamp=${version}"
+    ];
+
+    doCheck = false;
+
+    meta = with pkgs.lib; {
+      description = "Tailscale operator for Kubernetes";
+      homepage = "https://tailscale.com";
+      license = licenses.bsd3;
+    };
+  };
+in
+
+pkgs.dockerTools.buildLayeredImage {
+  name = "blumeops/tailscale-operator";
+  tag = "v${version}";
+
+  contents = [
+    operator
+    pkgs.cacert
+  ];
+
+  # buildGoModule names the binary after the package dir (k8s-operator), but
+  # the entrypoint is upstream's /usr/local/bin/operator, so symlink it there.
+  extraCommands = ''
+    mkdir -p usr/local/bin
+    ln -s /bin/k8s-operator usr/local/bin/operator
+  '';
+
+  config = {
+    Entrypoint = [ "/usr/local/bin/operator" ];
+    Env = [
+      "SSL_CERT_FILE=${pkgs.cacert}/etc/ssl/certs/ca-bundle.crt"
+    ];
+  };
+}
diff --git a/containers/tailscale/container.py b/containers/tailscale/container.py
new file mode 100644
index 0000000..8e3e509
--- /dev/null
+++ b/containers/tailscale/container.py
@@ -0,0 +1,104 @@
+"""Tailscale proxy image (containerboot) — native Dagger build.
+
+Builds cmd/tailscale, cmd/tailscaled, and cmd/containerboot from the forge
+mirror, mirroring the upstream Dockerfile: Alpine runtime with iptables
+(legacy symlinked over the default, per upstream issue #17854), iproute2,
+and the /tailscale/run.sh compat symlink.
+
+Consumed by the tailscale-operator ProxyClass on indri's minikube (arm64);
+ringtail's ProxyClass uses the -nix tag from default.nix instead.
+"""
+
+import dagger
+
+from blumeops.containers import (
+    alpine_runtime,
+    clone_from_forge,
+    go_build,
+    oci_labels,
+)
+
+VERSION = "v1.94.2"
+
+
+async def build(src: dagger.Directory) -> dagger.Container:
+    source = clone_from_forge("tailscale", VERSION)
+    semver = VERSION.removeprefix("v")
+
+    ldflags = (
+        "-w -s"
+        f" -X tailscale.com/version.longStamp={semver}"
+        f" -X tailscale.com/version.shortStamp={semver}"
+    )
+    builder = go_build(
+        source,
+        "/out/tailscale",
+        cmd_path="./cmd/tailscale",
+        ldflags=ldflags,
+    )
+    builder = builder.with_exec(
+        [
+            "go",
+            "build",
+            f"-ldflags={ldflags}",
+            "-o",
+            "/out/tailscaled",
+            "./cmd/tailscaled",
+        ]
+    ).with_exec(
+        [
+            "go",
+            "build",
+            f"-ldflags={ldflags}",
+            "-o",
+            "/out/containerboot",
+            "./cmd/containerboot",
+        ]
+    )
+
+    runtime = alpine_runtime(
+        extra_apk=["ca-certificates", "iptables", "iproute2", "ip6tables"],
+        create_user=False,
+    )
+    runtime = oci_labels(
+        runtime,
+        title="Tailscale",
+        description="Tailscale containerboot proxy image for the k8s operator",
+        version=VERSION,
+    )
+    return (
+        runtime
+        # Match upstream Dockerfile: nftables-backed iptables misbehaves in some
+        # environments, so force the legacy backend (tailscale/tailscale#17854).
+        .with_exec(
+            [
+                "sh",
+                "-c",
+                "rm /usr/sbin/iptables && ln -s /usr/sbin/iptables-legacy /usr/sbin/iptables"
+                " && rm /usr/sbin/ip6tables && ln -s /usr/sbin/ip6tables-legacy /usr/sbin/ip6tables",
+            ]
+        )
+        .with_file(
+            "/usr/local/bin/tailscale",
+            builder.file("/out/tailscale"),
+            permissions=0o555,
+        )
+        .with_file(
+            "/usr/local/bin/tailscaled",
+            builder.file("/out/tailscaled"),
+            permissions=0o555,
+        )
+        .with_file(
+            "/usr/local/bin/containerboot",
+            builder.file("/out/containerboot"),
+            permissions=0o555,
+        )
+        .with_exec(
+            [
+                "sh",
+                "-c",
+                "mkdir /tailscale && ln -s /usr/local/bin/containerboot /tailscale/run.sh",
+            ]
+        )
+        .with_entrypoint(["/usr/local/bin/containerboot"])
+    )
diff --git a/docs/changelog.d/localize-tailscale-operator.infra.md b/docs/changelog.d/localize-tailscale-operator.infra.md
new file mode 100644
index 0000000..324eac6
--- /dev/null
+++ b/docs/changelog.d/localize-tailscale-operator.infra.md
@@ -0,0 +1 @@
+Localized the Tailscale operator stack: the k8s-operator image (both clusters) and the ProxyClass proxy image (indri, completing parity with ringtail) are now built from the forge mirror instead of pulled from Docker Hub.
diff --git a/docs/reference/kubernetes/tailscale-operator.md b/docs/reference/kubernetes/tailscale-operator.md
index 174b347..ba03014 100644
--- a/docs/reference/kubernetes/tailscale-operator.md
+++ b/docs/reference/kubernetes/tailscale-operator.md
@@ -1,7 +1,7 @@
 ---
 title: Tailscale Operator
-modified: 2026-06-08
-last-reviewed: 2026-06-08
+modified: 2026-06-09
+last-reviewed: 2026-06-09
 tags:
   - kubernetes
   - tailscale
@@ -22,10 +22,42 @@ The Tailscale operator enables Kubernetes services to be exposed directly on the
 The operator runs on **both** clusters — indri's minikube and ringtail's k3s.
 Both apps layer on the shared `tailscale-operator-base` kustomize directory
 (operator manifest, `ProxyClass`, `dnsconfig`); each cluster supplies its own
-`ProxyGroup` (indri: 2 replicas, ringtail: 1) and OAuth `ExternalSecret`. The
-ringtail overlay additionally rewrites the proxy image to a locally nix-built
-mirror. See [[ringtail]] and [[migrate-wave1-ringtail]] for the ongoing
-migration of k8s workloads onto ringtail.
+`ProxyGroup` (indri: 2 replicas, ringtail: 1) and OAuth `ExternalSecret`. See
+[[ringtail]] and [[migrate-wave1-ringtail]] for the ongoing migration of k8s
+workloads onto ringtail.
+
+## Local Images
+
+Both the operator and the proxy run images built locally from the forge
+mirror (`mirrors/tailscale`) rather than pulled from Docker Hub:
+
+| Image | Build | Used by |
+|-------|-------|---------|
+| `blumeops/tailscale-operator` | `containers/tailscale-operator/` (`container.py` for indri/arm64; `default.nix`, tagged `-nix`, for ringtail/amd64) | operator Deployment, via each overlay's `images:` override |
+| `blumeops/tailscale` | `containers/tailscale/` (same dual build) | `ProxyClass` proxy pods, via a strategic-merge patch in each overlay |
+
+The ProxyClass image must be set with a **patch**, not kustomize's `images:`
+directive — that directive only rewrites standard container fields, not
+custom-resource fields like `ProxyClass.spec.statefulSet.pod.tailscaleContainer.image`.
+
+The `dnsconfig` nameserver image (`tailscale/k8s-nameserver:stable`) is still
+upstream — a known follow-up.
+
+## Rollout Safety (device identity)
+
+Proxy and operator tailnet identity lives in Kubernetes state Secrets in the
+`tailscale` namespace, not in pods or images. An image swap rolls the
+Deployment/StatefulSets, but pods re-authenticate with their existing node
+keys, so devices keep their names. Shadow devices (`foo-1` suffixes) appear
+only when a pod registers *fresh* while a stale device record still holds the
+name (deleted state Secrets, cluster rebuilds). When rolling out image changes:
+
+1. Never delete the `tailscale` namespace state Secrets.
+2. Verify after sync: pods healthy, device names unchanged in the admin
+   console, `mise run services-check` green.
+3. If a collision does occur: delete the stale device in the admin console
+   AND the affected state Secret, then restart the pod (see
+   [[rebuild-minikube-cluster]]).
 
 ## How It Works
 

From 7581b61cbf2da69b06c38396d8967e7dcd903c08 Mon Sep 17 00:00:00 2001
From: Erich Blume <blume.erich@gmail.com>
Date: Tue, 9 Jun 2026 17:55:52 -0700
Subject: [PATCH 20/20] C0: tailscale-operator manifests to [main] image tags

Post-squash-merge rebuild of PR #374's containers from main (runs
585/586); same v1.94.2 content, tags now traceable to d03ed337.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 .../manifests/tailscale-operator-ringtail/kustomization.yaml  | 2 +-
 argocd/manifests/tailscale-operator/kustomization.yaml        | 2 +-
 argocd/manifests/tailscale-operator/proxyclass-image.yaml     | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/argocd/manifests/tailscale-operator-ringtail/kustomization.yaml b/argocd/manifests/tailscale-operator-ringtail/kustomization.yaml
index fc119c9..25c3545 100644
--- a/argocd/manifests/tailscale-operator-ringtail/kustomization.yaml
+++ b/argocd/manifests/tailscale-operator-ringtail/kustomization.yaml
@@ -15,7 +15,7 @@ resources:
 images:
   - name: docker.io/tailscale/k8s-operator
     newName: registry.ops.eblu.me/blumeops/tailscale-operator
-    newTag: v1.94.2-ac40a18-nix
+    newTag: v1.94.2-d03ed33-nix
 
 # Rewrite the proxyclass image to our local nix-built mirror (indri's overlay
 # carries the equivalent dagger/arm64 patch). A strategic merge patch is used
diff --git a/argocd/manifests/tailscale-operator/kustomization.yaml b/argocd/manifests/tailscale-operator/kustomization.yaml
index ad275a9..239f7ea 100644
--- a/argocd/manifests/tailscale-operator/kustomization.yaml
+++ b/argocd/manifests/tailscale-operator/kustomization.yaml
@@ -21,7 +21,7 @@ resources:
 images:
   - name: docker.io/tailscale/k8s-operator
     newName: registry.ops.eblu.me/blumeops/tailscale-operator
-    newTag: v1.94.2-ac40a18
+    newTag: v1.94.2-d03ed33
 
 # Rewrite the proxyclass image to the local mirror. A strategic merge patch
 # is used instead of kustomize's `images:` directive because that directive
diff --git a/argocd/manifests/tailscale-operator/proxyclass-image.yaml b/argocd/manifests/tailscale-operator/proxyclass-image.yaml
index eae73eb..82a7e0b 100644
--- a/argocd/manifests/tailscale-operator/proxyclass-image.yaml
+++ b/argocd/manifests/tailscale-operator/proxyclass-image.yaml
@@ -6,6 +6,6 @@ spec:
   statefulSet:
     pod:
       tailscaleContainer:
-        image: registry.ops.eblu.me/blumeops/tailscale:v1.94.2-ac40a18
+        image: registry.ops.eblu.me/blumeops/tailscale:v1.94.2-d03ed33
       tailscaleInitContainer:
-        image: registry.ops.eblu.me/blumeops/tailscale:v1.94.2-ac40a18
+        image: registry.ops.eblu.me/blumeops/tailscale:v1.94.2-d03ed33