From 1425bf1f5c1486a8f739bc8c91bc99ee94a761d9 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Mon, 20 Apr 2026 09:03:54 -0700 Subject: [PATCH 001/122] Upgrade forgejo-runner to v12.8, adopt server.connections, and clean up docs (#338) ## Summary - consolidate forgejo-runner how-to docs into current cards - upgrade the k8s forgejo-runner deployment to the latest v12.8.x runner image - switch the k8s runner from first-boot register flow to declarative server.connections config - keep the runner image on the native Dagger build path and update the surrounding manifests/secrets ## Notes - PR opened early for C1 review - implementation and deployment verification will follow in subsequent commits Reviewed-on: https://forge.eblu.me/eblume/blumeops/pulls/338 --- argocd/manifests/forgejo-runner/config.yaml | 13 ++- .../manifests/forgejo-runner/deployment.yaml | 22 +--- .../forgejo-runner/external-secret.yaml | 16 +-- .../forgejo-runner/kustomization.yaml | 2 +- containers/forgejo-runner/container.py | 4 +- ...o-runner-v12-8-server-connections.infra.md | 1 + .../forgejo-runner/configure-k8s-runner.md | 100 ++++++++++++++++++ .../review-runner-config-v12.md | 39 ------- .../forgejo-runner/upgrade-k8s-runner.md | 52 --------- ...t-v12.md => validate-forgejo-workflows.md} | 15 ++- docs/reference/services/forgejo-runner.md | 15 +-- docs/reference/services/forgejo.md | 1 + service-versions.yaml | 2 +- 13 files changed, 142 insertions(+), 140 deletions(-) create mode 100644 docs/changelog.d/forgejo-runner-v12-8-server-connections.infra.md create mode 100644 docs/how-to/forgejo-runner/configure-k8s-runner.md delete mode 100644 docs/how-to/forgejo-runner/review-runner-config-v12.md delete mode 100644 docs/how-to/forgejo-runner/upgrade-k8s-runner.md rename docs/how-to/forgejo-runner/{validate-workflows-against-v12.md => validate-forgejo-workflows.md} (61%) diff --git a/argocd/manifests/forgejo-runner/config.yaml b/argocd/manifests/forgejo-runner/config.yaml index 4894825..121d327 
100644 --- a/argocd/manifests/forgejo-runner/config.yaml +++ b/argocd/manifests/forgejo-runner/config.yaml @@ -1,9 +1,8 @@ -# Reviewed against v12.7.3 defaults (2026-03-30) +# Reviewed against v12.8.2 defaults (2026-04-20) log: level: info runner: - file: /data/.runner capacity: 2 timeout: 3h shutdown_timeout: 3h @@ -13,7 +12,15 @@ runner: TZ: America/Los_Angeles container: - # Job execution image is set via RUNNER_LABELS in deployment.yaml network: "host" # Connect to DinD sidecar via TCP (not socket) docker_host: tcp://127.0.0.1:2375 + +server: + connections: + forgejo: + url: https://forge.ops.eblu.me/ + uuid: ${FORGEJO_RUNNER_UUID} + token: ${FORGEJO_RUNNER_TOKEN} + labels: + - k8s:docker://registry.ops.eblu.me/blumeops/runner-job-image:v0.20.1-24f7512 diff --git a/argocd/manifests/forgejo-runner/deployment.yaml b/argocd/manifests/forgejo-runner/deployment.yaml index c793895..7db7798 100644 --- a/argocd/manifests/forgejo-runner/deployment.yaml +++ b/argocd/manifests/forgejo-runner/deployment.yaml @@ -25,14 +25,6 @@ spec: env: - name: TZ value: America/Los_Angeles - - name: DOCKER_HOST - value: tcp://localhost:2375 - - name: FORGEJO_URL - value: "https://forge.ops.eblu.me" - - name: RUNNER_NAME - value: "k8s-runner" - - name: RUNNER_LABELS - value: "k8s:docker://registry.ops.eblu.me/blumeops/runner-job-image:v0.20.1-24f7512" command: - /bin/sh - -c @@ -44,19 +36,11 @@ spec: done echo "Docker daemon ready" - # Register if not already registered - if [ ! -f /data/.runner ]; then - echo "Registering runner..." - forgejo-runner register \ - --instance "$FORGEJO_URL" \ - --token "$RUNNER_TOKEN" \ - --name "$RUNNER_NAME" \ - --labels "$RUNNER_LABELS" \ - --no-interactive - fi + # Render config with credentials from ExternalSecret. 
+ envsubst < /config/config.yaml > /tmp/config.yaml # Start daemon - exec forgejo-runner daemon --config /config/config.yaml + exec forgejo-runner daemon --config /tmp/config.yaml envFrom: - secretRef: name: forgejo-runner-env diff --git a/argocd/manifests/forgejo-runner/external-secret.yaml b/argocd/manifests/forgejo-runner/external-secret.yaml index fce28bb..ab7a691 100644 --- a/argocd/manifests/forgejo-runner/external-secret.yaml +++ b/argocd/manifests/forgejo-runner/external-secret.yaml @@ -1,11 +1,7 @@ -# ExternalSecret for Forgejo Runner token +# ExternalSecret for Forgejo Runner credentials # # 1Password item: "Forgejo Secrets" in blumeops vault -# Field: runner_reg (runner registration token) -# -# Non-secret env vars (FORGEJO_URL, RUNNER_NAME, RUNNER_LABELS) live in the -# deployment spec so that changes (e.g. image version bumps) trigger a rollout -# automatically. +# Fields: runner_k8s_uuid, runner_k8s_token # apiVersion: external-secrets.io/v1 kind: ExternalSecret @@ -21,7 +17,11 @@ spec: name: forgejo-runner-env creationPolicy: Owner data: - - secretKey: RUNNER_TOKEN + - secretKey: FORGEJO_RUNNER_UUID remoteRef: key: Forgejo Secrets - property: runner_reg + property: runner_k8s_uuid + - secretKey: FORGEJO_RUNNER_TOKEN + remoteRef: + key: Forgejo Secrets + property: runner_k8s_token diff --git a/argocd/manifests/forgejo-runner/kustomization.yaml b/argocd/manifests/forgejo-runner/kustomization.yaml index f8d9377..0df16e2 100644 --- a/argocd/manifests/forgejo-runner/kustomization.yaml +++ b/argocd/manifests/forgejo-runner/kustomization.yaml @@ -11,7 +11,7 @@ resources: images: - name: code.forgejo.org/forgejo/runner newName: registry.ops.eblu.me/blumeops/forgejo-runner - newTag: v12.7.3-352b95c + newTag: v12.8.2-bf16b8a - name: docker newTag: 27-dind diff --git a/containers/forgejo-runner/container.py b/containers/forgejo-runner/container.py index ffaca88..dfb2edf 100644 --- a/containers/forgejo-runner/container.py +++ 
b/containers/forgejo-runner/container.py @@ -13,7 +13,7 @@ from blumeops.containers import ( oci_labels, ) -VERSION = "12.7.3" +VERSION = "12.8.2" async def build(src: dagger.Directory) -> dagger.Container: @@ -34,7 +34,7 @@ async def build(src: dagger.Directory) -> dagger.Container: # Stage 2: Runtime runtime = alpine_runtime( - extra_apk=["git", "bash", "ca-certificates"], + extra_apk=["git", "bash", "ca-certificates", "gettext-envsubst"], uid=1000, gid=1000, username="runner", diff --git a/docs/changelog.d/forgejo-runner-v12-8-server-connections.infra.md b/docs/changelog.d/forgejo-runner-v12-8-server-connections.infra.md new file mode 100644 index 0000000..cc35684 --- /dev/null +++ b/docs/changelog.d/forgejo-runner-v12-8-server-connections.infra.md @@ -0,0 +1 @@ +Upgraded the k8s Forgejo runner to the v12.8 line, switched it from first-boot registration to declarative `server.connections` credentials from 1Password, and consolidated the supporting runner how-to documentation. diff --git a/docs/how-to/forgejo-runner/configure-k8s-runner.md b/docs/how-to/forgejo-runner/configure-k8s-runner.md new file mode 100644 index 0000000..3c095d0 --- /dev/null +++ b/docs/how-to/forgejo-runner/configure-k8s-runner.md @@ -0,0 +1,100 @@ +--- +title: Configure K8s Forgejo Runner +modified: 2026-04-20 +last-reviewed: 2026-04-20 +tags: + - how-to + - forgejo-runner + - ci +--- + +# Configure K8s Forgejo Runner + +Configure the Kubernetes Forgejo runner on [[indri]] using declarative `server.connections` config instead of first-boot `register`. + +## Why This Flow + +The older bootstrap pattern used `forgejo-runner register` on container start and persisted `/data/.runner` in an `emptyDir`. That works, but it depends on deprecated CLI flows and mutates runner identity at runtime. 
+ +The preferred pattern is: + +- Create runner credentials once on the Forgejo host +- Store the runner UUID and token in 1Password +- Inject them into Kubernetes via [[external-secrets]] +- Render `server.connections` in `argocd/manifests/forgejo-runner/config.yaml` + +This keeps runner identity under secret management and makes pod restarts idempotent. + +## Create Runner Credentials + +On [[indri]], use Forgejo's local CLI instead of the web UI: + +```bash +ssh indri 'cd ~/code/3rd/forgejo && ./forgejo forgejo-cli actions register \ + --name k8s-runner \ + --scope instance \ + --secret "$(openssl rand -hex 32)"' +``` + +This returns a runner UUID. The generated secret becomes the runner token. Store both in 1Password under the "Forgejo Secrets" item as: + +- `runner_k8s_uuid` +- `runner_k8s_token` + +## Kubernetes Secret Wiring + +Expose those fields with `argocd/manifests/forgejo-runner/external-secret.yaml` and make them available to the runner container as environment variables. + +The deployment should not carry registration-only env vars like `FORGEJO_URL`, `RUNNER_NAME`, or `RUNNER_TOKEN`. + +## Runner Config + +Keep the runner configuration in `argocd/manifests/forgejo-runner/config.yaml`. The key change is adopting `server.connections`: + +```yaml +server: + connections: + forgejo: + url: https://forge.ops.eblu.me + uuid: ${FORGEJO_RUNNER_UUID} + token: ${FORGEJO_RUNNER_TOKEN} + labels: + - k8s:docker://registry.ops.eblu.me/blumeops/runner-job-image:<tag> +``` + +Other settings that still matter for this deployment: + +- `runner.capacity: 2` +- `runner.timeout: 3h` +- `runner.shutdown_timeout: 3h` +- `container.network: host` +- `container.docker_host: tcp://127.0.0.1:2375` + +We do not currently use cache configuration, extra volume mounts, or multiple Forgejo connections. + +## Deployment Shape + +The pod still runs two containers: + +1. `runner` — Forgejo runner daemon +2. 
`dind` — Docker-in-Docker sidecar + +The startup script waits for DinD, renders `/config/config.yaml` through `envsubst` into `/tmp/config.yaml` so the credentials from the ExternalSecret are substituted, and then launches the daemon with the rendered file. It should no longer call `forgejo-runner register` or depend on `/data/.runner`. + +## Upgrade Procedure + +When bumping the runner version: + +1. Update `VERSION` in `containers/forgejo-runner/container.py` +2. Review release notes for runner breaking changes +3. Confirm `config.yaml` is still compatible with the current runner defaults +4. Build and release the updated `forgejo-runner` image +5. Update `argocd/manifests/forgejo-runner/kustomization.yaml` to the new image tag +6. Validate workflows with [[validate-forgejo-workflows]] +7. Sync the `forgejo-runner` ArgoCD app and trigger a test workflow + +## Related + +- [[validate-forgejo-workflows]] — Validate workflow schema against the deployed runner line +- [[forgejo-runner]] — Service reference +- [[build-container-image]] — Build and release the runner image diff --git a/docs/how-to/forgejo-runner/review-runner-config-v12.md b/docs/how-to/forgejo-runner/review-runner-config-v12.md deleted file mode 100644 index af50090..0000000 --- a/docs/how-to/forgejo-runner/review-runner-config-v12.md +++ /dev/null @@ -1,39 +0,0 @@ ---- -title: Review Runner Config for v12 -modified: 2026-02-27 -last-reviewed: 2026-02-27 -tags: - - how-to - - forgejo-runner - - ci ---- - -# Review Runner Config for v12 - -Compare the current runner ConfigMap against the v12.7.0 default config to identify new, changed, or deprecated keys. - -## Findings - -Compared `forgejo-runner generate-config` output from v6.3.1 and v12.7.0. Our config is minimal and remains valid for v12. - -### New sections in v12 (not adopted) - -- **`server.connections`** — multi-server polling. Not needed (single Forgejo instance). -- **`cache.secret_url`** — load cache secret from file URL. Not needed. -- **`runner.report_retry`** — retry config for log uploads. Defaults are fine. 
- -### Changed semantics - -- **`container.docker_host`** — v12 supports `unix://` and `ssh://` URLs. Our explicit `tcp://127.0.0.1:2375` still correct for DinD sidecar. -- **`cache`** section restructured with proxy/server split and better docs. We don't configure cache, so defaults apply. - -### Config update applied - -Added `shutdown_timeout: 3h` to allow graceful job completion on pod termination (v12 default, was missing from our v6 config). Added review date comment. - -`container.valid_volumes` and `container.options` left empty — our jobs use host networking and don't mount volumes. Can harden later if needed. - -## Related - -- [[upgrade-k8s-runner]] — Parent goal -- [[validate-workflows-against-v12]] — Sibling prerequisite diff --git a/docs/how-to/forgejo-runner/upgrade-k8s-runner.md b/docs/how-to/forgejo-runner/upgrade-k8s-runner.md deleted file mode 100644 index 3d285ac..0000000 --- a/docs/how-to/forgejo-runner/upgrade-k8s-runner.md +++ /dev/null @@ -1,52 +0,0 @@ ---- -title: Upgrade K8s Forgejo Runner to v12 -modified: 2026-02-27 -last-reviewed: 2026-02-27 -tags: - - how-to - - forgejo-runner - - ci ---- - -# Upgrade K8s Forgejo Runner to v12 - -Upgrade the k8s forgejo-runner daemon from v6.3.1 to v12.7.0 (or latest v12.x at time of execution). - -## Background - -The k8s runner on indri (minikube) uses the upstream `code.forgejo.org/forgejo/runner` image, currently pinned to v6.3.1. The latest is v12.7.0. The runner is still in alpha and uses major version bumps for each breaking change, so v6→v12 crosses six major versions. The ringtail runner is already at ~v12.6.4 via nixpkgs and needs no work. - -Blast radius is low — if the upgrade breaks CI, revert the image tag in `argocd/manifests/forgejo-runner/deployment.yaml` and sync. 
- -## Breaking Changes Crossed - -| Version | Change | Impact | -|---------|--------|--------| -| v7.0 | CLI `--gitea-instance` → `--forgejo-instance`; `FORGEJO_*` env vars | Low — our registration doesn't use the old flag | -| v8.0 | Workflow schema validation; default image → `node:22-bookworm` | Workflows must pass validation | -| v9.0 | Stricter schema + actions validation; `forgejo-runner validate` added | Same — but now we have a tool | -| v10.0 | Cache isolation; skip v10.0.0 (regression) | Low | -| v11.0 | License MIT → GPLv3 | Non-technical | -| v12.0 | Git binary required; git worktrees for remote actions | Low — OCI image includes git | - -## Execution Steps - -Once prerequisites are met: - -1. Update `argocd/manifests/forgejo-runner/deployment.yaml`: - - Change runner image from `code.forgejo.org/forgejo/runner:6.3.1` to `code.forgejo.org/forgejo/runner:12.7.0` -2. Update `argocd/manifests/forgejo-runner/config.yaml` with any config changes from [[review-runner-config-v12]] -3. Push, sync ArgoCD: `argocd app sync forgejo-runner` -4. Verify runner registers and connects: check Forgejo admin → runners -5. Trigger a test workflow (manual dispatch of `build-container.yaml` or `branch-cleanup.yaml`) -6. Update `service-versions.yaml` to note the daemon version - -## Rollback - -Revert the image tag to `6.3.1` in `deployment.yaml`, push, and sync. 
- -## Related - -- [[forgejo]] — Forgejo service reference -- [[validate-workflows-against-v12]] — Pre-upgrade workflow validation -- [[review-runner-config-v12]] — Config format review diff --git a/docs/how-to/forgejo-runner/validate-workflows-against-v12.md b/docs/how-to/forgejo-runner/validate-forgejo-workflows.md similarity index 61% rename from docs/how-to/forgejo-runner/validate-workflows-against-v12.md rename to docs/how-to/forgejo-runner/validate-forgejo-workflows.md index 5f98502..ed21de7 100644 --- a/docs/how-to/forgejo-runner/validate-workflows-against-v12.md +++ b/docs/how-to/forgejo-runner/validate-forgejo-workflows.md @@ -1,20 +1,20 @@ --- -title: Validate Workflows Against v12 +title: Validate Forgejo Workflows modified: 2026-04-11 -last-reviewed: 2026-02-27 +last-reviewed: 2026-04-20 tags: - how-to - forgejo-runner - ci --- -# Validate Workflows Against v12 +# Validate Forgejo Workflows -Run `forgejo-runner validate` (available from v9.0+) against all workflow files to catch schema issues before upgrading the k8s runner daemon. +Run `forgejo-runner validate` against all workflow files to catch schema issues before upgrading the k8s runner daemon. ## Result -All 6 workflows pass v12.7.0 schema validation with no changes needed: +All current workflows pass the validation step with no changes needed: - `branch-cleanup.yaml` — OK - `build-blumeops.yaml` — OK @@ -27,7 +27,7 @@ All 6 workflows pass v12.7.0 schema validation with no changes needed: 1. `validate_workflows` function added to `src/blumeops/main.py` (formerly `.dagger/src/blumeops_ci/main.py`) - Uses `forgejo-runner validate --directory .` inside the upstream runner container - - `runner_version` parameter (default `12.7.0`) pins to deployed version + - `runner_version` parameter pins validation to the deployed runner line 2. `mise run validate-workflows` task wired to `dagger call validate-workflows` 3. 
Pre-commit hook triggers on `.forgejo/workflows/` changes @@ -41,5 +41,4 @@ dagger call validate-workflows --src=. ## Related -- [[upgrade-k8s-runner]] — Parent goal -- [[review-runner-config-v12]] — Sibling prerequisite +- [[configure-k8s-runner]] — Runner configuration and upgrade flow diff --git a/docs/reference/services/forgejo-runner.md b/docs/reference/services/forgejo-runner.md index d61f378..612f20f 100644 --- a/docs/reference/services/forgejo-runner.md +++ b/docs/reference/services/forgejo-runner.md @@ -1,7 +1,7 @@ --- title: Forgejo Runner -modified: 2026-03-30 -last-reviewed: 2026-03-30 +modified: 2026-04-20 +last-reviewed: 2026-04-20 tags: - service - ci-cd @@ -22,21 +22,21 @@ Forgejo Actions runner daemon for CI/CD job execution. Runs as a Kubernetes pod | **Capacity** | 2 concurrent jobs | | **Timeout** | 3h | | **Forgejo Instance** | https://forge.ops.eblu.me | -| **Image** | `code.forgejo.org/forgejo/runner` (see `argocd/manifests/forgejo-runner/kustomization.yaml` for current tag) | +| **Image** | `registry.ops.eblu.me/blumeops/forgejo-runner` (see `argocd/manifests/forgejo-runner/kustomization.yaml` for current tag) | | **DinD Sidecar** | `docker:27-dind` | ## Architecture The pod runs two containers: -1. **runner** - The Forgejo runner daemon. Registers with the forge on first start, then polls for jobs. Talks to DinD via `tcp://localhost:2375`. +1. **runner** - The Forgejo runner daemon. Loads a rendered `server.connections` config at startup, then polls for jobs. Talks to DinD via `tcp://localhost:2375`. 2. **dind** - Docker-in-Docker sidecar (privileged). Provides the Docker daemon for job container execution. Uses a registry mirror at `host.minikube.internal:5050` ([[zot]]). -Runner state (`/data/.runner`) is stored in an `emptyDir` volume, so re-registration happens on pod restart. The registration token comes from 1Password via [[external-secrets]]. 
+The runner daemon image is built from `containers/forgejo-runner/container.py`, not pulled directly from upstream. Credentials come from 1Password via [[external-secrets]], and the startup script renders the final config before launching the daemon. The `/data` volume remains for the runner home directory and job scratch space, not for `.runner` registration state. ## Job Execution Image -The actual container image used to run workflow steps is set via `RUNNER_LABELS` in the deployment, not in the runner config. This image is tracked separately as `runner-job-image` in `service-versions.yaml`. See [[build-container-image]] for how it's built. +The actual container image used to run workflow steps is declared in `server.connections.labels` in the runner config. This image is tracked separately as `runner-job-image` in `service-versions.yaml`. See [[build-container-image]] for how it's built. ## Network @@ -46,7 +46,8 @@ Jobs run with `network: "host"` to share the DinD network namespace. This gives | Secret | Source | Purpose | |--------|--------|---------| -| `RUNNER_TOKEN` | 1Password ("Forgejo Secrets" → `runner_reg`) | Runner registration with forge | +| `FORGEJO_RUNNER_UUID` | 1Password ("Forgejo Secrets" → `runner_k8s_uuid`) | Static runner identity for `server.connections` | +| `FORGEJO_RUNNER_TOKEN` | 1Password ("Forgejo Secrets" → `runner_k8s_token`) | Static runner credential for `server.connections` | ## Related diff --git a/docs/reference/services/forgejo.md b/docs/reference/services/forgejo.md index 11bb9a5..5b16b0e 100644 --- a/docs/reference/services/forgejo.md +++ b/docs/reference/services/forgejo.md @@ -85,6 +85,7 @@ Both container workflows trigger on the same tag pattern (`*-v[0-9]*`). 
Each che Server configuration secrets managed via 1Password → Ansible: - `lfs-jwt-secret`, `internal-token`, `oauth2-jwt-secret` - Forgejo server tokens - `runner_reg` - Runner registration token (also in k8s via [[external-secrets]]) +- `runner_k8s_uuid`, `runner_k8s_token` - Static credentials for the k8s runner `server.connections` flow ## Forgejo Actions Secrets diff --git a/service-versions.yaml b/service-versions.yaml index 761aa8d..8584322 100644 --- a/service-versions.yaml +++ b/service-versions.yaml @@ -236,7 +236,7 @@ services: - name: forgejo-runner type: argocd last-reviewed: 2026-03-30 - current-version: "12.7.3" + current-version: "12.8.2" upstream-source: https://code.forgejo.org/forgejo/runner/releases notes: >- Runner daemon version (code.forgejo.org/forgejo/runner). Job execution From 21177ff47f48de9ade604c794c124e91478ca3fc Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Mon, 20 Apr 2026 09:11:37 -0700 Subject: [PATCH 002/122] chore: update forgejo-runner image tag --- argocd/manifests/forgejo-runner/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/argocd/manifests/forgejo-runner/kustomization.yaml b/argocd/manifests/forgejo-runner/kustomization.yaml index 0df16e2..93cd33b 100644 --- a/argocd/manifests/forgejo-runner/kustomization.yaml +++ b/argocd/manifests/forgejo-runner/kustomization.yaml @@ -11,7 +11,7 @@ resources: images: - name: code.forgejo.org/forgejo/runner newName: registry.ops.eblu.me/blumeops/forgejo-runner - newTag: v12.8.2-bf16b8a + newTag: v12.8.2-1425bf1 - name: docker newTag: 27-dind From d6ad8e8e59a73faf9ab71e22ff6d9da728aaa82f Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Mon, 20 Apr 2026 09:15:35 -0700 Subject: [PATCH 003/122] chore: refresh forgejo-runner review date --- service-versions.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/service-versions.yaml b/service-versions.yaml index 8584322..75ad89d 100644 --- a/service-versions.yaml +++ 
b/service-versions.yaml @@ -235,7 +235,7 @@ services: - name: forgejo-runner type: argocd - last-reviewed: 2026-03-30 + last-reviewed: 2026-04-20 current-version: "12.8.2" upstream-source: https://code.forgejo.org/forgejo/runner/releases notes: >- From 54841dbf70f2f61000ff77f9685055db09d7597e Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Mon, 20 Apr 2026 10:09:16 -0700 Subject: [PATCH 004/122] Update ringtail flake inputs --- nixos/ringtail/flake.lock | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/nixos/ringtail/flake.lock b/nixos/ringtail/flake.lock index 86c20af..90fdff1 100644 --- a/nixos/ringtail/flake.lock +++ b/nixos/ringtail/flake.lock @@ -7,11 +7,11 @@ ] }, "locked": { - "lastModified": 1773889306, - "narHash": "sha256-PAqwnsBSI9SVC2QugvQ3xeYCB0otOwCacB1ueQj2tgw=", + "lastModified": 1776613567, + "narHash": "sha256-gC9Cp5ibBmGD5awCA9z7xy6MW6iJufhazTYJOiGlCUI=", "owner": "nix-community", "repo": "disko", - "rev": "5ad85c82cc52264f4beddc934ba57f3789f28347", + "rev": "32f4236bfc141ae930b5ba2fb604f561fed5219d", "type": "github" }, "original": { @@ -43,11 +43,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1775811116, - "narHash": "sha256-t+HZK42pB6N+i5RGbuy7Xluez/VvWbembBdvzsc23Ss=", + "lastModified": 1776434932, + "narHash": "sha256-gyqXNMgk3sh+ogY5svd2eNLJ6oEwzbAeaoBrrxD0lKk=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "54170c54449ea4d6725efd30d719c5e505f1c10e", + "rev": "c7f47036d3df2add644c46d712d14262b7d86c0c", "type": "github" }, "original": { From 58fe4f0073dbec11242d451b2a946cbd54c34436 Mon Sep 17 00:00:00 2001 From: Erich Blume <725328+eblume@users.noreply.github.com> Date: Mon, 20 Apr 2026 15:48:15 -0700 Subject: [PATCH 005/122] ty --- mise.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mise.toml b/mise.toml index 12c92df..82821c6 100644 --- a/mise.toml +++ b/mise.toml @@ -9,4 +9,4 @@ prek = "0.3.4" pulumi = "3.215.0" dagger = "0.20.1" -ty = "0.0.29" +"pipx:ty" = "0.0.29" From 
db8fd946ae0de2288b3f7126c4106a936c8a5227 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Tue, 21 Apr 2026 08:12:33 -0700 Subject: [PATCH 006/122] Bump Dagger to 0.20.6 and migrate runner-job-image to Alpine container.py Bumps the Dagger engine/CLI from v0.20.1 to v0.20.6 (mise pin, dagger.json engineVersion, SDK regen) and rewrites the runner-job-image container as a native Dagger pipeline on Alpine 3.23 using the shared alpine_runtime helper, replacing the Debian-based Dockerfile. All Forgejo Actions in this repo use actions/checkout (a JS action), so musl is not a compatibility concern. Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitignore | 2 + containers/runner-job-image/Dockerfile | 84 ------------------- containers/runner-job-image/container.py | 79 +++++++++++++++++ dagger.json | 5 +- ...dagger-0-20-6-runner-image-alpine.infra.md | 1 + docs/reference/tools/dagger.md | 2 +- mise.toml | 2 +- service-versions.yaml | 8 +- 8 files changed, 90 insertions(+), 93 deletions(-) delete mode 100644 containers/runner-job-image/Dockerfile create mode 100644 containers/runner-job-image/container.py create mode 100644 docs/changelog.d/dagger-0-20-6-runner-image-alpine.infra.md diff --git a/.gitignore b/.gitignore index acfafba..48c4b97 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,5 @@ __pycache__/ # OS .DS_Store +/**/__pycache__ +/.env diff --git a/containers/runner-job-image/Dockerfile b/containers/runner-job-image/Dockerfile deleted file mode 100644 index 0018c64..0000000 --- a/containers/runner-job-image/Dockerfile +++ /dev/null @@ -1,84 +0,0 @@ -# Forgejo Actions Job Execution Image -# -# This image is used as the job execution environment for Forgejo Actions. -# The host runner daemon creates containers from this image to run workflow steps. -# -# Build logic (container images, docs site) runs inside Dagger containers, -# so this image only needs: git, Docker CLI, Dagger CLI, ArgoCD CLI, uv, yq, and basic tools. 
-# -# Usage: Configure runner with label like: -# docker:docker://registry.ops.eblu.me/blumeops/runner-job-image:latest - -ARG CONTAINER_APP_VERSION=0.20.1 - -FROM debian:bookworm-slim - -ARG TARGETARCH -ARG CONTAINER_APP_VERSION -ARG DAGGER_VERSION=${CONTAINER_APP_VERSION} - -LABEL org.opencontainers.image.title="Runner Job Image" -LABEL org.opencontainers.image.description="Forgejo Actions job execution environment" -LABEL org.opencontainers.image.version="${CONTAINER_APP_VERSION}" -LABEL org.opencontainers.image.source="https://forge.eblu.me/eblume/blumeops" -LABEL org.opencontainers.image.vendor="blumeops" - -# Install base dependencies -RUN apt-get update && apt-get install -y --no-install-recommends \ - ca-certificates \ - curl \ - git \ - gnupg \ - jq \ - tzdata \ - && rm -rf /var/lib/apt/lists/* - -# Install Node.js (required by actions/checkout and other JavaScript Actions) -RUN curl -fsSL https://deb.nodesource.com/setup_22.x | bash - \ - && apt-get install -y --no-install-recommends nodejs \ - && rm -rf /var/lib/apt/lists/* \ - && node --version - -# Install Docker CLI (Dagger shells out to `docker` to provision its engine) -RUN install -m 0755 -d /etc/apt/keyrings \ - && curl -fsSL https://download.docker.com/linux/debian/gpg -o /etc/apt/keyrings/docker.asc \ - && chmod a+r /etc/apt/keyrings/docker.asc \ - && echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/debian bookworm stable" > /etc/apt/sources.list.d/docker.list \ - && apt-get update \ - && apt-get install -y --no-install-recommends docker-ce-cli \ - && rm -rf /var/lib/apt/lists/* - -# Install uv (Python package runner for towncrier) -RUN curl -LsSf https://astral.sh/uv/install.sh | sh \ - && mv /root/.local/bin/uv /usr/local/bin/uv \ - && mv /root/.local/bin/uvx /usr/local/bin/uvx - -# Install argocd CLI (for syncing apps from workflows) -RUN ARCH="${TARGETARCH:-$(dpkg --print-architecture)}" \ - && curl -fsSL -o 
/usr/local/bin/argocd \ - "https://github.com/argoproj/argo-cd/releases/latest/download/argocd-linux-${ARCH}" \ - && chmod +x /usr/local/bin/argocd \ - && argocd version --client - -# Install Dagger CLI (for running Dagger CI pipelines) -RUN ARCH="${TARGETARCH:-$(dpkg --print-architecture)}" \ - && curl -fsSL -o /tmp/dagger.tar.gz \ - "https://dl.dagger.io/dagger/releases/${DAGGER_VERSION}/dagger_v${DAGGER_VERSION}_linux_${ARCH}.tar.gz" \ - && tar -xzf /tmp/dagger.tar.gz -C /usr/local/bin dagger \ - && rm /tmp/dagger.tar.gz \ - && dagger version - -# Install yq (for editing YAML files in workflows) -RUN ARCH="${TARGETARCH:-$(dpkg --print-architecture)}" \ - && curl -fsSL -o /usr/local/bin/yq \ - "https://github.com/mikefarah/yq/releases/latest/download/yq_linux_${ARCH}" \ - && chmod +x /usr/local/bin/yq \ - && yq --version - -# Install flyctl (for Fly.io cache purge after docs deploy) -RUN curl -L https://fly.io/install.sh | sh \ - && mv /root/.fly/bin/flyctl /usr/local/bin/fly \ - && rm -rf /root/.fly - -# Default to bash -CMD ["/bin/bash"] diff --git a/containers/runner-job-image/container.py b/containers/runner-job-image/container.py new file mode 100644 index 0000000..c5710ff --- /dev/null +++ b/containers/runner-job-image/container.py @@ -0,0 +1,79 @@ +"""Forgejo Actions job execution image — native Dagger build. + +The forgejo-runner daemon creates containers from this image to run +workflow steps. Contains the tools workflows reach for: git, Docker CLI, +Node.js (for JavaScript Actions), Dagger CLI, ArgoCD CLI, uv, yq, flyctl. + +VERSION tracks the Dagger CLI version, the primary build tool. +""" + +import dagger + +from blumeops.containers import alpine_runtime, oci_labels + +VERSION = "0.20.6" + + +async def build(src: dagger.Directory) -> dagger.Container: + # Map `uname -m` to the arch suffix each upstream uses. 
+ arch_setup = ( + 'ARCH_UNAME="$(uname -m)"; ' + 'case "$ARCH_UNAME" in ' + " x86_64) ARCH=amd64 ;; " + " aarch64) ARCH=arm64 ;; " + ' *) echo "unsupported arch: $ARCH_UNAME" >&2; exit 1 ;; ' + "esac; " + ) + + runtime = alpine_runtime( + extra_apk=[ + "bash", + "ca-certificates", + "curl", + "docker-cli", + "git", + "gnupg", + "jq", + "nodejs", + "npm", + "tzdata", + ], + create_user=False, + ) + runtime = oci_labels( + runtime, + title="Runner Job Image", + description="Forgejo Actions job execution environment", + version=VERSION, + ) + + install_tools = ( + arch_setup + + "set -eux; " + # Dagger CLI (pinned) + + f'curl -fsSL -o /tmp/dagger.tar.gz "https://dl.dagger.io/dagger/releases/{VERSION}/dagger_v{VERSION}_linux_${{ARCH}}.tar.gz"; ' + + "tar -xzf /tmp/dagger.tar.gz -C /usr/local/bin dagger; " + + "rm /tmp/dagger.tar.gz; " + + "dagger version; " + # ArgoCD CLI (latest — matches cluster server version over time) + + 'curl -fsSL -o /usr/local/bin/argocd "https://github.com/argoproj/argo-cd/releases/latest/download/argocd-linux-${ARCH}"; ' + + "chmod +x /usr/local/bin/argocd; " + + "argocd version --client; " + # yq (latest) + + 'curl -fsSL -o /usr/local/bin/yq "https://github.com/mikefarah/yq/releases/latest/download/yq_linux_${ARCH}"; ' + + "chmod +x /usr/local/bin/yq; " + + "yq --version; " + # uv / uvx (latest; musl target auto-selected by installer) + + "curl -LsSf https://astral.sh/uv/install.sh " + + '| env UV_INSTALL_DIR=/usr/local/bin UV_UNMANAGED_INSTALL="/usr/local/bin" sh; ' + + "uv --version; " + # flyctl (latest) + + "curl -L https://fly.io/install.sh | sh; " + + "mv /root/.fly/bin/flyctl /usr/local/bin/fly; " + + "rm -rf /root/.fly; " + + "fly version" + ) + + return runtime.with_exec(["sh", "-c", install_tools]).with_default_args( + args=["/bin/bash"] + ) diff --git a/dagger.json b/dagger.json index c982487..3309378 100644 --- a/dagger.json +++ b/dagger.json @@ -1,8 +1,7 @@ { "name": "blumeops", - "engineVersion": "v0.20.1", + "engineVersion": 
"v0.20.6", "sdk": { "source": "python" - }, - "source": "." + } } diff --git a/docs/changelog.d/dagger-0-20-6-runner-image-alpine.infra.md b/docs/changelog.d/dagger-0-20-6-runner-image-alpine.infra.md new file mode 100644 index 0000000..35f77c2 --- /dev/null +++ b/docs/changelog.d/dagger-0-20-6-runner-image-alpine.infra.md @@ -0,0 +1 @@ +Upgraded Dagger from v0.20.1 to v0.20.6 (engine, CLI pin, and SDK regen) and migrated `runner-job-image` from a Debian-based Dockerfile to a native Dagger `container.py` on Alpine 3.23, reusing the shared `alpine_runtime` helper. diff --git a/docs/reference/tools/dagger.md b/docs/reference/tools/dagger.md index 379c10f..89be50c 100644 --- a/docs/reference/tools/dagger.md +++ b/docs/reference/tools/dagger.md @@ -16,7 +16,7 @@ Build engine for BlumeOps CI/CD pipelines. Replaces shell-based build scripts wi | Property | Value | |----------|-------| | **Module** | `blumeops` | -| **Engine Version** | v0.20.1 | +| **Engine Version** | v0.20.6 | | **SDK** | Python | | **Source** | `src/blumeops/main.py` | | **Config** | `dagger.json` (source: `.`) | diff --git a/mise.toml b/mise.toml index 82821c6..286c4e0 100644 --- a/mise.toml +++ b/mise.toml @@ -8,5 +8,5 @@ "pipx:borgmatic" = "2.1.4" prek = "0.3.4" pulumi = "3.215.0" -dagger = "0.20.1" +dagger = "0.20.6" "pipx:ty" = "0.0.29" diff --git a/service-versions.yaml b/service-versions.yaml index 75ad89d..f5811b5 100644 --- a/service-versions.yaml +++ b/service-versions.yaml @@ -244,8 +244,8 @@ services: - name: runner-job-image type: argocd - last-reviewed: 2026-03-06 - current-version: "0.20.1" + last-reviewed: 2026-04-21 + current-version: "0.20.6" upstream-source: https://github.com/dagger/dagger/releases notes: >- Forgejo Actions job execution image. 
CONTAINER_APP_VERSION tracks the @@ -396,8 +396,8 @@ services: - name: dagger type: mise - last-reviewed: 2026-04-12 - current-version: "0.20.1" + last-reviewed: 2026-04-21 + current-version: "0.20.6" upstream-source: https://github.com/dagger/dagger/releases notes: Dagger CI/CD engine; pinned in mise.toml From 50f8c2a33f53bcea167a185412bdf54a49b36be9 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Tue, 21 Apr 2026 08:18:25 -0700 Subject: [PATCH 007/122] Roll k8s runner to runner-job-image v0.20.6-9b6be09 Points the k8s Forgejo runner label at the locally-bootstrapped runner-job-image built from the Alpine container.py on this branch. Once merged, CI will rebuild the same image from the same SHA. Co-Authored-By: Claude Opus 4.7 (1M context) --- argocd/manifests/forgejo-runner/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/argocd/manifests/forgejo-runner/config.yaml b/argocd/manifests/forgejo-runner/config.yaml index 121d327..7c5196e 100644 --- a/argocd/manifests/forgejo-runner/config.yaml +++ b/argocd/manifests/forgejo-runner/config.yaml @@ -23,4 +23,4 @@ server: uuid: ${FORGEJO_RUNNER_UUID} token: ${FORGEJO_RUNNER_TOKEN} labels: - - k8s:docker://registry.ops.eblu.me/blumeops/runner-job-image:v0.20.1-24f7512 + - k8s:docker://registry.ops.eblu.me/blumeops/runner-job-image:v0.20.6-9b6be09 From fb32cc07c4b3fe2e64d0387a26765ba9c7b25397 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Tue, 21 Apr 2026 08:38:33 -0700 Subject: [PATCH 008/122] chore: repoint runner-job-image tag at CI-built v0.20.6-50f8c2a Swaps the k8s runner label from the local bootstrap tag (v0.20.6-9b6be09) to the equivalent image rebuilt by CI from main. Functionally identical; closes the bootstrap loop. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- argocd/manifests/forgejo-runner/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/argocd/manifests/forgejo-runner/config.yaml b/argocd/manifests/forgejo-runner/config.yaml index 7c5196e..01ede7c 100644 --- a/argocd/manifests/forgejo-runner/config.yaml +++ b/argocd/manifests/forgejo-runner/config.yaml @@ -23,4 +23,4 @@ server: uuid: ${FORGEJO_RUNNER_UUID} token: ${FORGEJO_RUNNER_TOKEN} labels: - - k8s:docker://registry.ops.eblu.me/blumeops/runner-job-image:v0.20.6-9b6be09 + - k8s:docker://registry.ops.eblu.me/blumeops/runner-job-image:v0.20.6-50f8c2a From 30f39ae0507e46bc4caa0c777393452b25eef052 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Tue, 21 Apr 2026 08:53:41 -0700 Subject: [PATCH 009/122] Review contributing tutorial: add last-reviewed, .ai.md fragment type, prek provenance Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/changelog.d/+review-contributing-doc.doc.md | 1 + docs/tutorials/contributing.md | 10 ++++++---- 2 files changed, 7 insertions(+), 4 deletions(-) create mode 100644 docs/changelog.d/+review-contributing-doc.doc.md diff --git a/docs/changelog.d/+review-contributing-doc.doc.md b/docs/changelog.d/+review-contributing-doc.doc.md new file mode 100644 index 0000000..c394a01 --- /dev/null +++ b/docs/changelog.d/+review-contributing-doc.doc.md @@ -0,0 +1 @@ +Refresh the contributing tutorial: add `last-reviewed`, include the `.ai.md` changelog fragment type, and clarify that `prek` is pinned via `mise`. diff --git a/docs/tutorials/contributing.md b/docs/tutorials/contributing.md index cddafea..a2a7069 100644 --- a/docs/tutorials/contributing.md +++ b/docs/tutorials/contributing.md @@ -1,6 +1,7 @@ --- title: Contributing -modified: 2026-02-07 +modified: 2026-04-21 +last-reviewed: 2026-04-21 tags: - tutorials - contributing @@ -37,14 +38,14 @@ brew bundle # installs tea, argocd, mise, etc. 
### Using Mise (Optional) -Mise manages language toolchains and runs tasks: +Mise manages language toolchains, runs tasks, and pins tools like `prek`: ```bash -mise install # installs Python, Node.js, etc. from mise.toml +mise install # installs Python, Node.js, prek, etc. from mise.toml ``` ### Git Hooks (prek) -Git hooks validate changes on `git commit`: +Git hooks validate changes on `git commit` (prek is pinned in `mise.toml`): ```bash prek install prek run --all-files # verify setup @@ -104,6 +105,7 @@ Fragment types (file suffix): - `.bugfix.md` - Bug fixes - `.infra.md` - Infrastructure changes - `.doc.md` - Documentation +- `.ai.md` - AI-assisted changes - `.misc.md` - Other ### 4. Test Your Changes From fb4bf5a7a350b677856f6f1a5ebc16c78190e71d Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Tue, 21 Apr 2026 09:28:02 -0700 Subject: [PATCH 010/122] Add frigate-notify nix container build (#339) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary - Mirrors `github.com/0x2142/frigate-notify` at `v0.5.4` to `forge.ops.eblu.me/mirrors/frigate-notify`. - Adds `containers/frigate-notify/default.nix` — `buildGoModule` + `dockerTools.buildLayeredImage`, following the `ntfy` pattern. - Uses `-tags goolm` to avoid the libolm CGO dependency (matrix notifier is imported unconditionally in the upstream but we only use ntfy alerts). - Runs as nonroot (UID 65534), exposes port 8000, bundles `cacert`/`tzdata`. ## Why Move `ghcr.io/0x2142/frigate-notify:v0.5.4` (ringtail-deployed) under local control. Aligns with the [[indri → ringtail migration plan]] and the `default.nix` convention for ringtail-targeted containers documented in [[build-container-image]]. ## Verification - `dagger call build-nix --src=. --container-name=frigate-notify export --path=./out.tar.gz` produces a valid 20MB docker archive (10 layers) with `blumeops/frigate-notify` tag locally. - Hashes pinned for `fetchgit` (src) and `vendorHash` (go modules). 
## Follow-up (post-merge) 1. `mise run container-build-and-release frigate-notify` — release from main SHA. 2. C0 follow-up: update `argocd/manifests/frigate/kustomization.yaml` image ref to `registry.ops.eblu.me/blumeops/frigate-notify:v0.5.4--nix`. 3. ArgoCD auto-syncs the deployment. ## Test plan - [ ] `dagger call build-nix` succeeds from a clean checkout. - [ ] `mise run container-build-and-release frigate-notify --dry-run` looks correct. - [ ] After release + kustomization swap: frigate-notify pod comes up healthy on ringtail; ntfy alerts still fire on Frigate events. Reviewed-on: https://forge.eblu.me/eblume/blumeops/pulls/339 --- containers/frigate-notify/default.nix | 57 +++++++++++++++++++ .../+frigate-notify-local.infra.md | 1 + 2 files changed, 58 insertions(+) create mode 100644 containers/frigate-notify/default.nix create mode 100644 docs/changelog.d/+frigate-notify-local.infra.md diff --git a/containers/frigate-notify/default.nix b/containers/frigate-notify/default.nix new file mode 100644 index 0000000..1ddbe4e --- /dev/null +++ b/containers/frigate-notify/default.nix @@ -0,0 +1,57 @@ +# Nix-built frigate-notify — polls Frigate webapi and pushes alerts to ntfy. +{ pkgs ? import { } }: + +let + version = "0.5.4"; + + src = pkgs.fetchgit { + url = "https://forge.ops.eblu.me/mirrors/frigate-notify.git"; + rev = "v${version}"; + hash = "sha256-c/QOSQNNJ+ElMDm45lBOsru/ujBhCWethiRefj3hBOk="; + }; + + frigate-notify = pkgs.buildGoModule { + inherit src version; + pname = "frigate-notify"; + + vendorHash = "sha256-Ho9oaK01wJDPf3ufV2klV1dG4qFNVNJkWmWvEgAy10s="; + + doCheck = false; + subPackages = [ "." ]; + + # `goolm` swaps the matrix crypto backend from libolm (CGO) to pure-Go olm, + # avoiding the libolm.h dependency. Our deployment doesn't use matrix, but + # the package is imported unconditionally. 
+ tags = [ "goolm" ]; + + ldflags = [ "-s" "-w" ]; + + meta = with pkgs.lib; { + description = "Bridge between Frigate NVR events and notification services"; + homepage = "https://github.com/0x2142/frigate-notify"; + license = licenses.mit; + mainProgram = "frigate-notify"; + }; + }; +in + +pkgs.dockerTools.buildLayeredImage { + name = "blumeops/frigate-notify"; + contents = [ + frigate-notify + pkgs.cacert + pkgs.tzdata + ]; + + config = { + Entrypoint = [ "${frigate-notify}/bin/frigate-notify" ]; + Env = [ + "SSL_CERT_FILE=${pkgs.cacert}/etc/ssl/certs/ca-bundle.crt" + "TZDIR=${pkgs.tzdata}/share/zoneinfo" + ]; + ExposedPorts = { + "8000/tcp" = { }; + }; + User = "65534"; + }; +} diff --git a/docs/changelog.d/+frigate-notify-local.infra.md b/docs/changelog.d/+frigate-notify-local.infra.md new file mode 100644 index 0000000..120f915 --- /dev/null +++ b/docs/changelog.d/+frigate-notify-local.infra.md @@ -0,0 +1 @@ +Add local nix container build for `frigate-notify` (`containers/frigate-notify/default.nix`) so the Frigate→ntfy bridge is rebuilt on ringtail from the forge mirror instead of pulled from `ghcr.io/0x2142/frigate-notify`. From c88b6d773cc9513952a94bf547365f44fab5cf78 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Tue, 21 Apr 2026 09:31:29 -0700 Subject: [PATCH 011/122] C0: point frigate-notify at local registry tag v0.5.4-fb4bf5a-nix Built from main in run #516 after #339 merged. Follows the navidrome kustomization convention (deployment image = local ref + :kustomized, kustomization override = newTag only). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- argocd/manifests/frigate/deployment-notify.yaml | 2 +- argocd/manifests/frigate/kustomization.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/argocd/manifests/frigate/deployment-notify.yaml b/argocd/manifests/frigate/deployment-notify.yaml index 740d104..91f4237 100644 --- a/argocd/manifests/frigate/deployment-notify.yaml +++ b/argocd/manifests/frigate/deployment-notify.yaml @@ -16,7 +16,7 @@ spec: spec: containers: - name: frigate-notify - image: ghcr.io/0x2142/frigate-notify:kustomized + image: registry.ops.eblu.me/blumeops/frigate-notify:kustomized env: - name: TZ value: America/Los_Angeles diff --git a/argocd/manifests/frigate/kustomization.yaml b/argocd/manifests/frigate/kustomization.yaml index b424bd0..3a679c6 100644 --- a/argocd/manifests/frigate/kustomization.yaml +++ b/argocd/manifests/frigate/kustomization.yaml @@ -17,8 +17,8 @@ images: newTag: "1.37" - name: ghcr.io/blakeblackshear/frigate newTag: 0.17.1-tensorrt - - name: ghcr.io/0x2142/frigate-notify - newTag: v0.5.4 + - name: registry.ops.eblu.me/blumeops/frigate-notify + newTag: v0.5.4-fb4bf5a-nix configMapGenerator: - name: frigate-config From e92805409e05961919fb71ee32605e67a86eb21c Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Tue, 21 Apr 2026 09:43:00 -0700 Subject: [PATCH 012/122] fix(frigate-notify): set WorkingDir=/app and create writable /app MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The upstream binary expects CWD=/app (relative config.yml lookup, lumberjack logfile at ./log/app.log). Without this, the pod crashed on startup — the ConfigMap-mounted /app/config.yml wasn't found and zerolog spammed "mkdir log: permission denied" as it tried to create ./log at / as nonroot. Creates /app as 1777 (tmp-style) so nonroot can write logs; WorkingDir set to /app so the default config path resolves correctly. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- containers/frigate-notify/default.nix | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/containers/frigate-notify/default.nix b/containers/frigate-notify/default.nix index 1ddbe4e..701b194 100644 --- a/containers/frigate-notify/default.nix +++ b/containers/frigate-notify/default.nix @@ -43,8 +43,17 @@ pkgs.dockerTools.buildLayeredImage { pkgs.tzdata ]; + # Upstream Dockerfile expects WORKDIR=/app (config at ./config.yml, logfile at + # ./log/app.log via lumberjack). Create /app world-writable so nonroot can + # write logs; the config is mounted in from a ConfigMap. + extraCommands = '' + mkdir -p app + chmod 1777 app + ''; + config = { Entrypoint = [ "${frigate-notify}/bin/frigate-notify" ]; + WorkingDir = "/app"; Env = [ "SSL_CERT_FILE=${pkgs.cacert}/etc/ssl/certs/ca-bundle.crt" "TZDIR=${pkgs.tzdata}/share/zoneinfo" From a9ef02a602a0483a0cf061d4e9a01c9765bf9044 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Tue, 21 Apr 2026 09:44:24 -0700 Subject: [PATCH 013/122] C0: bump frigate-notify to v0.5.4-e928054-nix (workdir fix) Co-Authored-By: Claude Opus 4.7 (1M context) --- argocd/manifests/frigate/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/argocd/manifests/frigate/kustomization.yaml b/argocd/manifests/frigate/kustomization.yaml index 3a679c6..a61c758 100644 --- a/argocd/manifests/frigate/kustomization.yaml +++ b/argocd/manifests/frigate/kustomization.yaml @@ -18,7 +18,7 @@ images: - name: ghcr.io/blakeblackshear/frigate newTag: 0.17.1-tensorrt - name: registry.ops.eblu.me/blumeops/frigate-notify - newTag: v0.5.4-fb4bf5a-nix + newTag: v0.5.4-e928054-nix configMapGenerator: - name: frigate-config From 0ceafc374db22f21d35a72288d76a55464fd9b92 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Tue, 21 Apr 2026 09:59:48 -0700 Subject: [PATCH 014/122] C0: review operator-managed-pods CC (2026-04-21) Tailscale operator still defaults to privileged proxy pods with no seccomp 
profile (issue #7359 open upstream). Control remains valid. Added note about ProxyClass + device plugin remediation path. Co-Authored-By: Claude Opus 4.7 (1M context) --- compensating-controls.yaml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/compensating-controls.yaml b/compensating-controls.yaml index b441341..67bbf75 100644 --- a/compensating-controls.yaml +++ b/compensating-controls.yaml @@ -77,11 +77,16 @@ controls: operator, not user manifests. Operator is tracked in service-versions.yaml and regularly updated. created: 2026-03-30 - last-reviewed: 2026-03-30 + last-reviewed: 2026-04-21 notes: >- Verify operator version is current via 'mise run service-review'. Check Tailscale changelog for security fixes. If operator adds - seccomp support, remove these mutes. + seccomp support, remove these mutes. As of 2026-04-21: still no + default seccomp on operator-generated pods (upstream issue #7359 + open). A ProxyClass + generic device plugin can downgrade proxies + from privileged to NET_ADMIN+NET_RAW and set seccompProfile — + potential future remediation to remove the seccomp mute without + waiting for upstream defaults. - id: ephemeral-privileged-jobs description: >- From e6a6a6042e6765e5c1aba62c3ad4c315da64608c Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Tue, 21 Apr 2026 10:12:00 -0700 Subject: [PATCH 015/122] C0: suggest mise run runner-logs in container-build-and-release After dispatching, poll the Forgejo API for the run matching our head_sha and print `mise run runner-logs ` so the suggested monitor command is one copy-paste away. Falls back to the bare command if the poll times out. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- ...ontainer-build-suggest-runner-logs.misc.md | 1 + mise-tasks/container-build-and-release | 61 ++++++++++++++++++- 2 files changed, 60 insertions(+), 2 deletions(-) create mode 100644 docs/changelog.d/+container-build-suggest-runner-logs.misc.md diff --git a/docs/changelog.d/+container-build-suggest-runner-logs.misc.md b/docs/changelog.d/+container-build-suggest-runner-logs.misc.md new file mode 100644 index 0000000..d10ea51 --- /dev/null +++ b/docs/changelog.d/+container-build-suggest-runner-logs.misc.md @@ -0,0 +1 @@ +`container-build-and-release` now prints the specific `mise run runner-logs ` command after dispatching, polling the Forgejo API to resolve the run number for the commit it just triggered. diff --git a/mise-tasks/container-build-and-release b/mise-tasks/container-build-and-release index 2e1be27..afa970e 100755 --- a/mise-tasks/container-build-and-release +++ b/mise-tasks/container-build-and-release @@ -15,6 +15,7 @@ Dockerfile and Nix builds in a single workflow. import subprocess import sys +import time from pathlib import Path import httpx @@ -48,6 +49,52 @@ def get_forge_token() -> str: return result.stdout.strip() +def max_run_number(headers: dict[str, str]) -> int: + """Return the highest current run_number for WORKFLOW, or 0 if none.""" + resp = httpx.get( + f"{FORGE_API}/repos/{REPO}/actions/tasks", + params={"limit": 50}, + headers=headers, + timeout=15, + ) + if resp.status_code != 200: + return 0 + runs = [ + t["run_number"] + for t in resp.json().get("workflow_runs", []) + if t.get("workflow_id") == WORKFLOW + ] + return max(runs, default=0) + + +def find_dispatched_run( + ref: str, floor: int, headers: dict[str, str], timeout_s: int = 20 +) -> int | None: + """Poll the tasks endpoint for the run triggered by our dispatch. + + Matches by head_sha + workflow + run_number > floor so we don't pick up + an older build of the same commit or a concurrent unrelated dispatch. 
+ """ + deadline = time.monotonic() + timeout_s + while time.monotonic() < deadline: + resp = httpx.get( + f"{FORGE_API}/repos/{REPO}/actions/tasks", + params={"limit": 20}, + headers=headers, + timeout=15, + ) + if resp.status_code == 200: + for task in resp.json().get("workflow_runs", []): + if ( + task.get("head_sha") == ref + and task.get("workflow_id") == WORKFLOW + and task.get("run_number", 0) > floor + ): + return task["run_number"] + time.sleep(1) + return None + + def list_containers() -> None: typer.echo("Available containers:") for d in sorted(Path("containers").iterdir()): @@ -112,7 +159,8 @@ def main( if dry_run: typer.echo(f"[dry-run] Would dispatch {WORKFLOW}") typer.echo() - typer.echo(f"Monitor builds at: {FORGE_ACTIONS}") + typer.echo("Monitor builds with: mise run runner-logs") + typer.echo(f" or visit: {FORGE_ACTIONS}") return token = get_forge_token() @@ -132,6 +180,10 @@ def main( typer.echo("Push your changes before triggering a build: git push origin main") raise typer.Exit(1) + # Snapshot the highest existing run_number so we can identify the one + # our dispatch creates. + floor = max_run_number(headers) + url = f"{FORGE_API}/repos/{REPO}/actions/workflows/{WORKFLOW}/dispatches" payload = { "ref": "main", @@ -148,7 +200,12 @@ def main( raise typer.Exit(1) typer.echo() - typer.echo(f"Monitor builds at: {FORGE_ACTIONS}") + run_number = find_dispatched_run(ref, floor, headers) + if run_number is not None: + typer.echo(f"Monitor builds with: mise run runner-logs {run_number}") + else: + typer.echo("Monitor builds with: mise run runner-logs") + typer.echo(f" or visit: {FORGE_ACTIONS}") if __name__ == "__main__": From 225b0e700870725ef08c453cd879e5da06c69327 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Tue, 21 Apr 2026 10:18:08 -0700 Subject: [PATCH 016/122] C0: allow argocd CLI --sso localhost callback Adds http://localhost:8085/auth/callback to the ArgoCD OAuth2 provider's redirect_uris so `argocd login --sso` works. 
Loopback redirect is the RFC 8252 pattern for native CLI apps; PKCE (already enabled) covers the code-interception risk. Co-Authored-By: Claude Opus 4.7 (1M context) --- argocd/manifests/authentik/configmap-blueprint.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/argocd/manifests/authentik/configmap-blueprint.yaml b/argocd/manifests/authentik/configmap-blueprint.yaml index 27910ef..aa6a07e 100644 --- a/argocd/manifests/authentik/configmap-blueprint.yaml +++ b/argocd/manifests/authentik/configmap-blueprint.yaml @@ -270,6 +270,8 @@ data: url: https://argocd.ops.eblu.me/auth/callback - matching_mode: strict url: https://argocd.tail8d86e.ts.net/auth/callback + - matching_mode: strict + url: http://localhost:8085/auth/callback signing_key: !Find [authentik_crypto.certificatekeypair, [name, authentik Self-signed Certificate]] property_mappings: - !Find [authentik_providers_oauth2.scopemapping, [scope_name, openid]] From 0e62ad55961bca8b14a33d766cd4cd0f197c4a9c Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Tue, 21 Apr 2026 10:34:39 -0700 Subject: [PATCH 017/122] =?UTF-8?q?C0:=20argocd=20OIDC=20=E2=80=94=20switc?= =?UTF-8?q?h=20to=20public=20client=20for=20CLI=20SSO?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Changes argocd's Authentik OAuth2 client from confidential to public and drops the clientSecret from argocd-cm. Public + PKCE works for both the web UI (argocd-server backend) and the argocd CLI (`argocd login --sso`) without a shared secret, matching OAuth 2.1 guidance. Confidential → public was needed because the CLI can't hold a client secret; Authentik's per-app issuer model made the alternative ("cliClientID" pattern with separate public client) awkward since it requires a shared issuer across apps which Authentik doesn't serve. Follow-up: deadcode AUTHENTIK_ARGOCD_CLIENT_SECRET env wiring and the argocd-oidc-authentik ExternalSecret once verified. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- argocd/manifests/argocd/argocd-cm-patch.yaml | 1 - argocd/manifests/authentik/configmap-blueprint.yaml | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/argocd/manifests/argocd/argocd-cm-patch.yaml b/argocd/manifests/argocd/argocd-cm-patch.yaml index cb7e27f..54e4ede 100644 --- a/argocd/manifests/argocd/argocd-cm-patch.yaml +++ b/argocd/manifests/argocd/argocd-cm-patch.yaml @@ -16,7 +16,6 @@ data: name: Authentik issuer: https://authentik.ops.eblu.me/application/o/argocd/ clientID: argocd - clientSecret: $argocd-oidc-authentik:client-secret requestedScopes: - openid - profile diff --git a/argocd/manifests/authentik/configmap-blueprint.yaml b/argocd/manifests/authentik/configmap-blueprint.yaml index aa6a07e..fcbb99b 100644 --- a/argocd/manifests/authentik/configmap-blueprint.yaml +++ b/argocd/manifests/authentik/configmap-blueprint.yaml @@ -262,9 +262,8 @@ data: name: ArgoCD authorization_flow: !Find [authentik_flows.flow, [slug, default-provider-authorization-implicit-consent]] invalidation_flow: !Find [authentik_flows.flow, [slug, default-provider-invalidation-flow]] - client_type: confidential + client_type: public client_id: argocd - client_secret: !Env AUTHENTIK_ARGOCD_CLIENT_SECRET redirect_uris: - matching_mode: strict url: https://argocd.ops.eblu.me/auth/callback From 86317315edbb17a4013b1de9a8bf77325db7658f Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Tue, 21 Apr 2026 10:38:26 -0700 Subject: [PATCH 018/122] C0: remove argocd OIDC client_secret wiring Now that argocd's Authentik OAuth2 client is public (PKCE-only), the client_secret plumbing is dead code: - delete argocd-oidc-authentik ExternalSecret and drop it from kustomization - remove AUTHENTIK_ARGOCD_CLIENT_SECRET env from authentik-worker - remove argocd-client-secret mapping from authentik-config ExternalSecret The argocd-client-secret field in the 1Password "Authentik (blumeops)" item is now unreferenced and can be deleted 
there. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../external-secret-oidc-authentik.yaml | 31 ------------------- argocd/manifests/argocd/kustomization.yaml | 1 - .../authentik/deployment-worker.yaml | 5 --- .../manifests/authentik/external-secret.yaml | 4 --- 4 files changed, 41 deletions(-) delete mode 100644 argocd/manifests/argocd/external-secret-oidc-authentik.yaml diff --git a/argocd/manifests/argocd/external-secret-oidc-authentik.yaml b/argocd/manifests/argocd/external-secret-oidc-authentik.yaml deleted file mode 100644 index 475a713..0000000 --- a/argocd/manifests/argocd/external-secret-oidc-authentik.yaml +++ /dev/null @@ -1,31 +0,0 @@ -# ExternalSecret for ArgoCD OIDC client secret (Authentik) -# -# Referenced from argocd-cm as $argocd-oidc-authentik:client-secret -# Must have app.kubernetes.io/part-of: argocd label for ArgoCD to read it -# ---- -apiVersion: external-secrets.io/v1 -kind: ExternalSecret -metadata: - name: argocd-oidc-authentik - namespace: argocd -spec: - refreshInterval: 1h - secretStoreRef: - kind: ClusterSecretStore - name: onepassword-blumeops - target: - name: argocd-oidc-authentik - creationPolicy: Owner - template: - metadata: - labels: - app.kubernetes.io/part-of: argocd - data: - - secretKey: client-secret - remoteRef: - conversionStrategy: Default - decodingStrategy: None - key: "Authentik (blumeops)" - metadataPolicy: None - property: argocd-client-secret diff --git a/argocd/manifests/argocd/kustomization.yaml b/argocd/manifests/argocd/kustomization.yaml index 9bdac10..6deb7ec 100644 --- a/argocd/manifests/argocd/kustomization.yaml +++ b/argocd/manifests/argocd/kustomization.yaml @@ -9,7 +9,6 @@ resources: - https://raw.githubusercontent.com/argoproj/argo-cd/998fb59dc355653c0657908a6ea2f87136e022d1/manifests/install.yaml - ingress-tailscale.yaml - external-secret-repo-forge.yaml - - external-secret-oidc-authentik.yaml patches: - path: argocd-cmd-params-cm.yaml diff --git a/argocd/manifests/authentik/deployment-worker.yaml 
b/argocd/manifests/authentik/deployment-worker.yaml index b81ec32..053fa3d 100644 --- a/argocd/manifests/authentik/deployment-worker.yaml +++ b/argocd/manifests/authentik/deployment-worker.yaml @@ -75,11 +75,6 @@ spec: secretKeyRef: name: authentik-config key: jellyfin-client-secret - - name: AUTHENTIK_ARGOCD_CLIENT_SECRET - valueFrom: - secretKeyRef: - name: authentik-config - key: argocd-client-secret - name: AUTHENTIK_MEALIE_CLIENT_SECRET valueFrom: secretKeyRef: diff --git a/argocd/manifests/authentik/external-secret.yaml b/argocd/manifests/authentik/external-secret.yaml index 9abf699..93de499 100644 --- a/argocd/manifests/authentik/external-secret.yaml +++ b/argocd/manifests/authentik/external-secret.yaml @@ -53,10 +53,6 @@ spec: remoteRef: key: "Authentik (blumeops)" property: jellyfin-client-secret - - secretKey: argocd-client-secret - remoteRef: - key: "Authentik (blumeops)" - property: argocd-client-secret - secretKey: mealie-client-secret remoteRef: key: "Authentik (blumeops)" From 7d94b9073ae3230e78901f5b95351bf0f4fe6016 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Tue, 21 Apr 2026 10:43:21 -0700 Subject: [PATCH 019/122] =?UTF-8?q?C0:=20docs=20=E2=80=94=20default=20argo?= =?UTF-8?q?cd=20login=20to=20--sso;=20drop=20extraneous=20--grpc-web?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that argocd's Authentik OAuth2 client is public, `argocd login --sso` works for day-to-day use. Promote it to the default in AGENTS.md, argocd-cli reference, and troubleshooting; keep the admin/password flow documented as a break-glass fallback for when Authentik is unavailable. Also drops --grpc-web from every interactive login command — confirmed extraneous (login succeeds without it). Left in CI workflows and `argocd cluster add` untouched; those are different contexts that I didn't re-test. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- AGENTS.md | 2 +- argocd/manifests/argocd/README.md | 4 ++-- .../operations/rebuild-minikube-cluster.md | 24 ++++++------------- docs/how-to/operations/troubleshooting.md | 5 ++++ docs/reference/tools/argocd-cli.md | 8 +++++++ 5 files changed, 23 insertions(+), 20 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 80f9852..9e7350d 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -86,7 +86,7 @@ Most services run in minikube on indri via ArgoCD (app-of-apps, manual sync). GP **Commands:** `argocd app list|get|diff|sync ` -**Login:** `argocd login argocd.ops.eblu.me --username admin --password "$(op read 'op://vg6xf6vvfmoh5hqjjhlhbeoaie/srogeebssulhtb6tnqd7ls6qey/password')"` +**Login:** `argocd login argocd.ops.eblu.me --sso` (opens browser for Authentik SSO). Admin fallback for break-glass: `argocd login argocd.ops.eblu.me --username admin --password "$(op read 'op://vg6xf6vvfmoh5hqjjhlhbeoaie/srogeebssulhtb6tnqd7ls6qey/password')"` ### Indri (Ansible) diff --git a/argocd/manifests/argocd/README.md b/argocd/manifests/argocd/README.md index 615e3bb..2eaf4d4 100644 --- a/argocd/manifests/argocd/README.md +++ b/argocd/manifests/argocd/README.md @@ -25,7 +25,7 @@ kubectl wait --for=condition=available deployment/argocd-server -n argocd --time kubectl -n argocd get secret argocd-initial-admin-secret -o jsonpath="{.data.password}" | base64 -d && echo # 5. Login and change password -argocd login argocd.tail8d86e.ts.net --username admin --grpc-web +argocd login argocd.tail8d86e.ts.net --username admin argocd account update-password # 6. Apply repo-creds-forge credential template for SSH access to all forge repos @@ -114,4 +114,4 @@ spec: Future improvement: integrate with a secrets operator (e.g., External Secrets). - The credential template (`repo-creds`) uses a URL prefix to match all repos on forge. - ArgoCD uses Tailscale Ingress with Let's Encrypt for TLS termination. 
-- The `--grpc-web` flag is required for CLI access through the Tailscale ingress. +- After Authentik is up, prefer `argocd login argocd.ops.eblu.me --sso` over the admin password login above; admin is only needed during bootstrap or as break-glass. diff --git a/docs/how-to/operations/rebuild-minikube-cluster.md b/docs/how-to/operations/rebuild-minikube-cluster.md index ad64c89..e23d027 100644 --- a/docs/how-to/operations/rebuild-minikube-cluster.md +++ b/docs/how-to/operations/rebuild-minikube-cluster.md @@ -108,18 +108,13 @@ kubectl --context=minikube-indri apply -f argocd/apps/apps.yaml # 6. Login and sync apps argocd login argocd.tail8d86e.ts.net --username admin \ --password "$(kubectl --context=minikube-indri -n argocd get secret argocd-initial-admin-secret -o jsonpath='{.data.password}' | base64 -d)" \ - --grpc-web -argocd app sync apps --grpc-web -``` + argocd app sync apps``` ## Phase 4: Bootstrap 1Password Connect + External Secrets ```bash # 1. Sync foundation -argocd app sync external-secrets-crds --grpc-web -argocd app sync external-secrets --grpc-web -argocd app sync 1password-connect --grpc-web - +argocd app sync external-secrets-crdsargocd app sync external-secretsargocd app sync 1password-connect # 2. 
Create 1Password Connect secrets manually CREDS_RAW=$(op read "op://blumeops/1Password Connect/credentials-file") echo "$CREDS_RAW" | kubectl --context=minikube-indri create secret generic op-credentials -n 1password \ @@ -140,25 +135,20 @@ kubectl --context=minikube-indri get clustersecretstores ```bash # Foundation (CRDs, operators) -argocd app sync cloudnative-pg kube-state-metrics --grpc-web - +argocd app sync cloudnative-pg kube-state-metrics # Databases -argocd app sync blumeops-pg --grpc-web - +argocd app sync blumeops-pg # Observability -argocd app sync loki prometheus tempo grafana grafana-config --grpc-web - +argocd app sync loki prometheus tempo grafana grafana-config # Register ringtail cluster (for authentik, ntfy, ollama, frigate) ssh ringtail 'sudo cat /etc/rancher/k3s/k3s.yaml' | \ sed 's|127.0.0.1|ringtail.tail8d86e.ts.net|' > /tmp/k3s-ringtail.yaml KUBECONFIG=/tmp/k3s-ringtail.yaml argocd cluster add default --name k3s-ringtail --grpc-web -y # Authentik (critical — Zot OIDC depends on it, most image pulls depend on Zot) -argocd app sync authentik --grpc-web - +argocd app sync authentik # Everything else -argocd app sync tailscale-operator alloy-k8s --grpc-web -# ... remaining apps +argocd app sync tailscale-operator alloy-k8s# ... 
remaining apps ``` ## Phase 6: Restore Databases from Borgmatic diff --git a/docs/how-to/operations/troubleshooting.md b/docs/how-to/operations/troubleshooting.md index 63dc79a..84301c3 100644 --- a/docs/how-to/operations/troubleshooting.md +++ b/docs/how-to/operations/troubleshooting.md @@ -72,6 +72,11 @@ kubectl --context=minikube-indri -n get pods --field-selector=status **ArgoCD login expired:** ```bash +argocd login argocd.ops.eblu.me --sso +``` + +If Authentik itself is down, fall back to admin: +```bash argocd login argocd.ops.eblu.me --username admin --password "$(op read 'op://vg6xf6vvfmoh5hqjjhlhbeoaie/srogeebssulhtb6tnqd7ls6qey/password')" ``` diff --git a/docs/reference/tools/argocd-cli.md b/docs/reference/tools/argocd-cli.md index 7a60490..a2aa223 100644 --- a/docs/reference/tools/argocd-cli.md +++ b/docs/reference/tools/argocd-cli.md @@ -24,6 +24,14 @@ argocd app sync apps # Sync the app-of-apps (picks up new Application ## Login +Default (Authentik SSO, PKCE, opens browser): + +```bash +argocd login argocd.ops.eblu.me --sso +``` + +Break-glass admin login (only if Authentik is down): + ```bash argocd login argocd.ops.eblu.me \ --username admin \ From 88eabc3de6f322e099fccd8c2df5d3bea9b5d587 Mon Sep 17 00:00:00 2001 From: Erich Blume <725328+eblume@users.noreply.github.com> Date: Tue, 21 Apr 2026 14:47:13 -0700 Subject: [PATCH 020/122] Disable Xalia --- nixos/ringtail/gaming.nix | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/nixos/ringtail/gaming.nix b/nixos/ringtail/gaming.nix index 2b361b3..d84ef9b 100644 --- a/nixos/ringtail/gaming.nix +++ b/nixos/ringtail/gaming.nix @@ -7,6 +7,11 @@ dedicatedServer.openFirewall = true; }; + # Proton Experimental ships an accessibility bridge (xalia) that hangs during + # game launch when AT-SPI is not running on the host. This host has no AT-SPI, + # so disable xalia globally to avoid wedging iscriptevaluator.exe. 
+ environment.sessionVariables.PROTON_USE_XALIA = "0"; + # Gamescope — micro-compositor for game fullscreen/resolution management. # Use as Steam launch option: gamescope -W 2560 -H 1440 -f -- %command% programs.gamescope = { From 34fa2ef28abd785655e75a8ea7ac7bc1662e3f6f Mon Sep 17 00:00:00 2001 From: Erich Blume <725328+eblume@users.noreply.github.com> Date: Thu, 23 Apr 2026 12:16:02 -0700 Subject: [PATCH 021/122] =?UTF-8?q?C0:=20ringtail=20=E2=80=94=20restore=20?= =?UTF-8?q?sway=20default=20keybindings,=20fix=20fuzzel=20border=20config?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend (not replace) home-manager's default sway keybindings via lib.mkOptionDefault, with lib.mkForce on the custom overrides that conflict with defaults. Add Mod+F1 cheatsheet binding (fuzzel-filterable). Move fuzzel's border-radius/border-width out of [main] into a proper [border] section with the expected short names. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../+ringtail-sway-fuzzel.bugfix.md | 3 +++ nixos/ringtail/configuration.nix | 23 +++++++++++-------- 2 files changed, 17 insertions(+), 9 deletions(-) create mode 100644 docs/changelog.d/+ringtail-sway-fuzzel.bugfix.md diff --git a/docs/changelog.d/+ringtail-sway-fuzzel.bugfix.md b/docs/changelog.d/+ringtail-sway-fuzzel.bugfix.md new file mode 100644 index 0000000..6801040 --- /dev/null +++ b/docs/changelog.d/+ringtail-sway-fuzzel.bugfix.md @@ -0,0 +1,3 @@ +Fixed sway keybindings on ringtail — the home-manager `keybindings` block was replacing the module's defaults entirely, leaving only explicit overrides (no workspace switching, focus, move, splits, resize mode, etc). Switched to `lib.mkOptionDefault` with `lib.mkForce` on the conflicting custom binds (`Mod+Return`, `Mod+d`, `Mod+space`, `Mod+l`) so defaults merge back in. Also added `Mod+F1` to show a filterable fuzzel list of current keybindings. 
+ +Fixed fuzzel config errors on launch — `border-radius` and `border-width` were under `[main]`, but fuzzel expects them as `radius`/`width` under a `[border]` section. diff --git a/nixos/ringtail/configuration.nix b/nixos/ringtail/configuration.nix index 052f38d..2cc5280 100644 --- a/nixos/ringtail/configuration.nix +++ b/nixos/ringtail/configuration.nix @@ -323,13 +323,16 @@ in bg = "~/.config/sway/wallpaper.jpg fill"; }; }; - keybindings = let mod = "Mod4"; in { - "${mod}+Return" = "exec wezterm"; - "${mod}+Shift+q" = "kill"; - "${mod}+d" = "exec wmenu-run"; - "${mod}+space" = "exec fuzzel"; - "${mod}+Shift+c" = "reload"; - "${mod}+l" = "exec swaylock -f"; + # Extend (not replace) the home-manager default sway keybindings. + # lib.mkForce is needed on keys whose defaults we want to override + # (same priority otherwise conflicts). Audio keys and Mod+d (wmenu-run + # vs the default menu binding) don't collide with defaults. + keybindings = let mod = "Mod4"; in lib.mkOptionDefault { + "${mod}+Return" = lib.mkForce "exec wezterm"; + "${mod}+d" = lib.mkForce "exec wmenu-run"; + "${mod}+space" = lib.mkForce "exec fuzzel"; + "${mod}+l" = lib.mkForce "exec swaylock -f"; + "${mod}+F1" = "exec grep '^bindsym' ~/.config/sway/config | fuzzel --dmenu"; "--locked XF86AudioMute" = "exec pactl set-sink-mute @DEFAULT_SINK@ toggle"; "--locked XF86AudioLowerVolume" = "exec pactl set-sink-volume @DEFAULT_SINK@ -5%"; "--locked XF86AudioRaiseVolume" = "exec pactl set-sink-volume @DEFAULT_SINK@ +5%"; @@ -401,8 +404,10 @@ in width = 40; horizontal-pad = 16; vertical-pad = 8; - border-radius = 8; - border-width = 2; + }; + border = { + radius = 8; + width = 2; }; colors = { background = "24273add"; From 72b27b7fd258482f225d2b60b11cd149a479ddd4 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Fri, 24 Apr 2026 19:04:28 -0700 Subject: [PATCH 022/122] =?UTF-8?q?C0:=20docs=20=E2=80=94=20add=20mealie?= =?UTF-8?q?=20borg=20restore=20how-to?= MIME-Version: 1.0 Content-Type: text/plain; 
charset=UTF-8 Content-Transfer-Encoding: 8bit Captures the procedure used to restore mealie's SQLite DB from a borgmatic archive after the post-DR wipe: extract from borg, snapshot the wiped DB, swap via a helper pod on the ReadWriteOnce PVC, fix UID 911 ownership. Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/how-to/mealie/restore-from-borg.md | 157 ++++++++++++++++++++++++ docs/reference/services/mealie.md | 2 + 2 files changed, 159 insertions(+) create mode 100644 docs/how-to/mealie/restore-from-borg.md diff --git a/docs/how-to/mealie/restore-from-borg.md b/docs/how-to/mealie/restore-from-borg.md new file mode 100644 index 0000000..7ff3625 --- /dev/null +++ b/docs/how-to/mealie/restore-from-borg.md @@ -0,0 +1,157 @@ +--- +title: Restore Mealie from Borg +modified: 2026-04-24 +last-reviewed: 2026-04-24 +tags: + - how-to + - mealie + - backup +--- + +# Restore Mealie from Borg + +How to restore [[mealie]]'s SQLite database from a [[borgmatic]] archive when data has been lost (e.g. PVC wiped, accidental deletion, post-DR rebuild). + +## Prerequisites + +- SSH access to [[indri]] (where borgmatic runs and stores k8s SQLite dumps) +- Mealie deployment present in the cluster (the PVC `mealie-data` exists in namespace `mealie`) +- Know which borg archive predates the data loss + +## Procedure + +### 1. Identify a Pre-Loss Archive + +List archives and pick one before the incident: + +```bash +ssh indri 'BORG_PASSCOMMAND="cat /Users/erichblume/.borg/config.yaml" \ + /opt/homebrew/bin/borg list /Volumes/backups/borg | tail -30' +``` + +Compare dump sizes across archives if you're unsure when the loss happened — the daily borgmatic run captures `/Users/erichblume/.local/share/borgmatic/k8s-dumps/mealie.db`. 
A sudden drop in size signals the wipe: + +```bash +ssh indri 'bash -c "BORG_PASSCOMMAND=\"cat /Users/erichblume/.borg/config.yaml\" \ + /opt/homebrew/bin/borg list /Volumes/backups/borg:: \ + --pattern=+Users/erichblume/.local/share/borgmatic/k8s-dumps/mealie.db"' +``` + +### 2. Extract the Pre-Loss Dump + +```bash +ssh indri 'mkdir -p ~/tmp/mealie-restore && cd ~/tmp/mealie-restore && \ + BORG_PASSCOMMAND="cat /Users/erichblume/.borg/config.yaml" \ + /opt/homebrew/bin/borg extract /Volumes/backups/borg:: \ + Users/erichblume/.local/share/borgmatic/k8s-dumps/mealie.db' +``` + +The file lands at `~/tmp/mealie-restore/Users/erichblume/.local/share/borgmatic/k8s-dumps/mealie.db` (borg preserves the full path). + +### 3. Verify the Extracted DB + +```bash +ssh indri 'sqlite3 ~/tmp/mealie-restore/Users/erichblume/.local/share/borgmatic/k8s-dumps/mealie.db \ + "PRAGMA integrity_check; SELECT COUNT(*) FROM recipes; SELECT COUNT(*) FROM users;"' +``` + +Expect `ok` and non-zero recipe/user counts. + +### 4. Snapshot the Current (Wiped) DB + +Belt and suspenders — keep a copy of the live DB before overwriting, in case the restore goes wrong: + +```bash +ssh indri 'bash -c "kubectl --context=minikube -n mealie exec deploy/mealie -- \ + python3 -c \"import sqlite3; sqlite3.connect(\\\"/app/data/mealie.db\\\").backup(sqlite3.connect(\\\"/tmp/wiped-mealie.db\\\"))\" && \ + POD=\$(kubectl --context=minikube -n mealie get pod -l app=mealie -o jsonpath=\"{.items[0].metadata.name}\") && \ + kubectl --context=minikube cp mealie/\$POD:/tmp/wiped-mealie.db /Users/erichblume/tmp/mealie-restore/wiped-mealie.db"' +``` + +### 5. Scale Mealie Down + +The PVC is `ReadWriteOnce`, so the helper pod can't mount it while mealie is running: + +```bash +ssh indri 'kubectl --context=minikube -n mealie scale deploy/mealie --replicas=0 && \ + kubectl --context=minikube -n mealie wait --for=delete pod -l app=mealie --timeout=60s' +``` + +### 6. 
Start a Helper Pod on the PVC + +```bash +ssh indri 'bash -c "cat > /tmp/mealie-helper.yaml < Date: Mon, 27 Apr 2026 09:48:46 -0700 Subject: [PATCH 023/122] C0: split gandi-operations docs; add dns-acme-cleanup mise task MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Splits the nebulous gandi-operations how-to into two single-topic cards (manage-eblu-me-dns, rotate-gandi-pat) and adds a mise task for the recurring _acme-challenge TXT cleanup needed due to a value-comparison bug in libdns/gandi v1.1.0 that prevents certmagic's cleanup phase from removing presented TXT values. The gandi reference card is updated to drop the false "different credential from Pulumi PAT" claim — verified during the 2026-04-27 incident that Caddy and Pulumi share a single PAT. Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/how-to/configuration/gandi-operations.md | 90 ------------- .../configuration/manage-eblu-me-dns.md | 52 ++++++++ .../configuration/manage-forgejo-mirrors.md | 2 +- docs/how-to/configuration/rotate-gandi-pat.md | 125 ++++++++++++++++++ docs/reference/infrastructure/gandi.md | 48 +++---- docs/reference/tools/mise-tasks.md | 1 + docs/reference/tools/pulumi.md | 3 +- mise-tasks/dns-acme-cleanup | 112 ++++++++++++++++ pulumi/gandi/README.md | 39 +----- pulumi/gandi/__main__.py | 2 +- 10 files changed, 315 insertions(+), 159 deletions(-) delete mode 100644 docs/how-to/configuration/gandi-operations.md create mode 100644 docs/how-to/configuration/manage-eblu-me-dns.md create mode 100644 docs/how-to/configuration/rotate-gandi-pat.md create mode 100755 mise-tasks/dns-acme-cleanup diff --git a/docs/how-to/configuration/gandi-operations.md b/docs/how-to/configuration/gandi-operations.md deleted file mode 100644 index 0be00dc..0000000 --- a/docs/how-to/configuration/gandi-operations.md +++ /dev/null @@ -1,90 +0,0 @@ ---- -title: Gandi Operations -modified: 2026-02-17 -last-reviewed: 2026-02-17 -tags: - - how-to - - dns - - pulumi 
---- - -# Gandi Operations - -How to manage DNS records and cycle the Gandi API token. - -## Prerequisites - -- Pulumi CLI installed (`brew install pulumi`) -- Access to 1Password blumeops vault (for PAT) -- On the tailnet (Pulumi resolves indri's IP via MagicDNS) - -## Preview and Apply DNS Changes - -```bash -# Preview changes (always do this first) -mise run dns-preview - -# Apply changes -mise run dns-up -``` - -Both tasks fetch the Gandi PAT from 1Password automatically. - -To run Pulumi directly: - -```bash -export GANDI_PERSONAL_ACCESS_TOKEN=$(op read "op://vg6xf6vvfmoh5hqjjhlhbeoaie/mco6ka3dc3rmw7zkg2dhia5d2m/pat") -cd pulumi/gandi -pulumi preview -pulumi up --yes -``` - -## Cycle the Gandi PAT - -The Gandi Personal Access Token has a maximum lifetime of 90 days. Currently set to 30 days as a security compromise, though shorter may be appropriate given infrequent use. - -### 1. Create a new PAT - -Go to the [Gandi admin console](https://admin.gandi.net/organizations/1db8d76a-f729-11ed-b8d1-00163e94b645/account/pat) and create a new token: - -- **Name:** `blumeops-pulumi` (or similar) -- **Expiration:** 30 days (max 90; shorter is fine if you run this rarely) -- **Required permission:** Manage domain name technical configurations -- **Also enable:** See and renew domain names - -Copy the new PAT to your clipboard. - -### 2. Update 1Password - -With the new PAT on your clipboard: - -```bash -op item edit mco6ka3dc3rmw7zkg2dhia5d2m pat="$(pbpaste)" --vault vg6xf6vvfmoh5hqjjhlhbeoaie -``` - -### 3. Delete the old PAT - -Return to the Gandi admin console and delete the previous token. - -### 4. Verify - -```bash -mise run dns-preview -``` - -A successful preview confirms the new PAT is working. - -## Break-Glass Override - -If MagicDNS is unavailable and Pulumi can't resolve indri's IP, set the target IP manually. 
Find indri's current Tailscale IP via `tailscale status` or the admin console: - -```bash -export BLUMEOPS_REVERSE_PROXY_IP= -mise run dns-up -``` - -## Related - -- [[gandi]] - DNS configuration reference -- [[caddy]] - Reverse proxy (also uses a Gandi token for TLS) -- [[update-tailscale-acls]] - Similar Pulumi workflow for Tailscale diff --git a/docs/how-to/configuration/manage-eblu-me-dns.md b/docs/how-to/configuration/manage-eblu-me-dns.md new file mode 100644 index 0000000..4c37d4c --- /dev/null +++ b/docs/how-to/configuration/manage-eblu-me-dns.md @@ -0,0 +1,52 @@ +--- +title: Manage eblu.me DNS Records +modified: 2026-04-27 +last-reviewed: 2026-04-27 +tags: + - how-to + - dns + - pulumi +--- + +# Manage eblu.me DNS Records + +How to add, change, and apply DNS records for `eblu.me` via [[pulumi]]. + +## Prerequisites + +- Pulumi CLI installed (`brew install pulumi`) +- 1Password access (`blumeops` vault) — Pulumi reads the Gandi PAT from there +- On the tailnet — Pulumi resolves [[indri]]'s IP via MagicDNS at apply time + +## Preview and apply + +```bash +mise run dns-preview # always do this first +mise run dns-up # apply +``` + +Both fetch the PAT from 1Password automatically. The Pulumi program is in `pulumi/gandi/`; stack is `eblu-me`. + +## Adding a record + +Edit `pulumi/gandi/__main__.py` and add a `gandi.livedns.Record(...)`. The stack config (`Pulumi.eblu-me.yaml`) only holds `domain` and `subdomain`; everything else is in the program. + +After editing, preview, then apply. + +## Break-glass: override the indri target IP + +The wildcard `*.ops.eblu.me` is computed from `indri.tail8d86e.ts.net` via MagicDNS at apply time. If MagicDNS is unavailable: + +```bash +export BLUMEOPS_REVERSE_PROXY_IP= +mise run dns-up +``` + +Find the IP via `tailscale status` or the Tailscale admin console. 
+ +## Related + +- [[gandi]] — Gandi reference card +- [[rotate-gandi-pat]] — Rotate the PAT shared with [[caddy]] +- [[pulumi]] — Pulumi tooling reference +- [[routing]] — Service URLs and routing architecture diff --git a/docs/how-to/configuration/manage-forgejo-mirrors.md b/docs/how-to/configuration/manage-forgejo-mirrors.md index 7f98549..9c0e113 100644 --- a/docs/how-to/configuration/manage-forgejo-mirrors.md +++ b/docs/how-to/configuration/manage-forgejo-mirrors.md @@ -144,6 +144,6 @@ Trigger a manual sync on one mirror to confirm the new PAT works: ## Related - [[forgejo]] — Forgejo service reference -- [[gandi-operations]] — Similar PAT rotation workflow for Gandi DNS +- [[rotate-gandi-pat]] — Similar PAT rotation workflow for Gandi DNS - [[spork-strategy]] — floating-branch soft-fork strategy explanation - [[create-a-spork]] — create a spork on top of a mirror diff --git a/docs/how-to/configuration/rotate-gandi-pat.md b/docs/how-to/configuration/rotate-gandi-pat.md new file mode 100644 index 0000000..94a0b4e --- /dev/null +++ b/docs/how-to/configuration/rotate-gandi-pat.md @@ -0,0 +1,125 @@ +--- +title: Rotate the Gandi PAT +modified: 2026-04-27 +last-reviewed: 2026-04-27 +tags: + - how-to + - dns + - secrets +--- + +# Rotate the Gandi PAT + +How to rotate the Gandi Personal Access Token. **One PAT** is shared by [[caddy]] (TLS via ACME DNS-01) and Pulumi (DNS records). It lives in 1Password at `op://blumeops/gandi - blumeops/pat`. + +## When to rotate + +- Every 60 days (Todoist recurring task) +- After any compromise / accidental disclosure +- Whenever Gandi starts rejecting the PAT (see [Debugging](#debugging)) + +Gandi caps PAT lifetime at 90 days; rotating at 60 leaves a 30-day buffer. 
+ +## Prerequisites + +- Access to the [Gandi PAT admin console](https://admin.gandi.net/organizations/1db8d76a-f729-11ed-b8d1-00163e94b645/account/pat) +- 1Password (`blumeops` vault) +- Ability to run `mise run provision-indri` (ssh to [[indri]] + 1Password biometric) + +## Procedure + +### 1. Create a new PAT in Gandi + +In the [Gandi PAT console](https://admin.gandi.net/organizations/1db8d76a-f729-11ed-b8d1-00163e94b645/account/pat), create a token: + +- **Name:** `blumeops` +- **Expiration:** **90 days** (the max — paired with the 60-day rotation cadence) +- **Permissions:** + - Manage domain name technical configurations *(required — DNS records and ACME TXT writes)* + - See and renew domain names + +Other permissions are not used. + +Copy the new PAT to your clipboard. + +### 2. Update 1Password + +```bash +op item edit mco6ka3dc3rmw7zkg2dhia5d2m pat="$(pbpaste)" --vault vg6xf6vvfmoh5hqjjhlhbeoaie +``` + +### 3. Push to indri + +The PAT lives in two places: 1Password (read by Pulumi at runtime) and `~/.config/caddy/gandi-token` on indri (read by Caddy at startup). The 1Password edit only updates the first. + +```bash +mise run provision-indri --tags caddy +``` + +This re-fetches the PAT from 1Password, writes it to indri, and restarts Caddy. Caddy will renew any due certificates within minutes. + +### 4. Verify + +```bash +mise run dns-preview +``` + +A successful preview confirms Pulumi can use the PAT. + +```bash +ssh indri 'tail -50 ~/Library/Logs/mcquack.caddy.err.log' \ + | grep -E "obtained|renew|error" +``` + +Expect to see no `LiveDNS returned a 403` lines, and either no renewal activity (if no certs were due) or `certificate obtained successfully`. + +### 5. Delete the old PAT in Gandi + +Return to the Gandi PAT console and delete the previous token. + +### 6. Clean up orphan ACME records + +Each successful Caddy renewal leaves orphan `_acme-challenge.ops` TXT records in the zone (a bug in `libdns/gandi` v1.1.0 — see the script docstring). 
Cadence aligns with rotation: + +```bash +mise run dns-acme-cleanup --dry-run +mise run dns-acme-cleanup +``` + +## Debugging + +### Caddy logs `LiveDNS returned a 403` + +The PAT is invalid (expired, revoked, or insufficient scope). **Gandi returns 403 — not 401 — for an expired PAT**, which can read as a permissions issue. The most common cause is plain expiry. Rotate. + +### `mise run dns-preview` returns 403 + +Same root cause — Pulumi and Caddy share this PAT. + +### After a fresh PAT, Caddy still fails + +Check that the value on indri matches 1Password: + +```bash +diff <(ssh indri 'cat ~/.config/caddy/gandi-token') \ + <(op read 'op://blumeops/gandi - blumeops/pat') +``` + +If they differ, `mise run provision-indri --tags caddy` was skipped or failed. + +Confirm the new PAT works against Gandi directly: + +```bash +curl -s -o /dev/null -w "HTTP %{http_code}\n" \ + -H "Authorization: Bearer $(op read 'op://blumeops/gandi - blumeops/pat')" \ + https://api.gandi.net/v5/livedns/domains/eblu.me +``` + +`200` = healthy. `403` = scope or expiry. `401` = malformed token. + +## Related + +- [[gandi]] — Gandi reference card +- [[manage-eblu-me-dns]] — DNS records workflow (separate operation, same PAT) +- [[caddy]] — Reverse proxy that uses the PAT for TLS +- [[mise-tasks]] — `dns-acme-cleanup`, `provision-indri`, `dns-preview` reference diff --git a/docs/reference/infrastructure/gandi.md b/docs/reference/infrastructure/gandi.md index ae1fe56..763bae3 100644 --- a/docs/reference/infrastructure/gandi.md +++ b/docs/reference/infrastructure/gandi.md @@ -1,7 +1,7 @@ --- title: Gandi -modified: 2026-04-09 -last-reviewed: 2026-04-09 +modified: 2026-04-27 +last-reviewed: 2026-04-27 tags: - infrastructure - networking @@ -20,12 +20,11 @@ DNS hosting provider for the `eblu.me` domain, managed via Pulumi IaC. 
| **Provider** | Gandi LiveDNS | | **IaC** | `pulumi/gandi/` | | **Stack** | `eblu-me` | +| **PAT** | `op://blumeops/gandi - blumeops/pat` | ## What It Does -Gandi hosts the DNS records that make `*.ops.eblu.me` resolve to [[indri]]'s Tailscale IP (`indri.tail8d86e.ts.net`). Since Tailscale IPs are not publicly routable, this gives services real DNS names while keeping them private to the tailnet. - -The target IP is resolved dynamically from `indri.tail8d86e.ts.net` at deploy time, so if indri's Tailscale IP changes, re-running the deployment is sufficient. +Gandi hosts the DNS records that make `*.ops.eblu.me` resolve to [[indri]]'s Tailscale IP. Since Tailscale IPs are not publicly routable, this gives services real DNS names while keeping them private to the tailnet. The target IP is resolved dynamically from `indri.tail8d86e.ts.net` at deploy time. ## DNS Records @@ -46,38 +45,25 @@ Both records point to [[indri]], which runs [[caddy]] as the reverse proxy for a | `cv.eblu.me` | CNAME | `blumeops-proxy.fly.dev` | 300s | | `forge.eblu.me` | CNAME | `blumeops-proxy.fly.dev` | 300s | -Public CNAMEs point to [[flyio-proxy]] on Fly.io. See [[expose-service-publicly]] for adding new public services. - -See [[routing]] for the full service URL map. - -## Pulumi Configuration - -The Pulumi program lives in `pulumi/gandi/`: - -- `__main__.py` - Creates A and CNAME records via `pulumiverse_gandi` -- `Pulumi.eblu-me.yaml` - Stack config (domain, subdomain) - -Stack config values: - -| Key | Value | -|-----|-------| -| `blumeops-dns:domain` | `eblu.me` | -| `blumeops-dns:subdomain` | `ops` | - -A break-glass override is available via the `BLUMEOPS_REVERSE_PROXY_IP` environment variable, which bypasses dynamic IP resolution. +Public CNAMEs point to [[flyio-proxy]] on Fly.io. See [[expose-service-publicly]] for adding new public services. See [[routing]] for the full service URL map. 
## TLS Integration -[[caddy]] uses Gandi's API separately (via `GANDI_BEARER_TOKEN`) for ACME DNS-01 challenges to obtain a wildcard Let's Encrypt certificate for `*.ops.eblu.me`. This is a different credential from the Pulumi PAT. +[[caddy]] uses this same Gandi PAT for ACME DNS-01 challenges to obtain a wildcard Let's Encrypt certificate for `*.ops.eblu.me`. Caddy reads the PAT from `~/.config/caddy/gandi-token` on [[indri]], populated by ansible from 1Password. ## Authentication -Gandi requires a Personal Access Token (PAT) for API access. PATs have a maximum lifetime of 90 days (currently set to 30). See [[gandi-operations]] for deployment and PAT cycling instructions. +One Gandi Personal Access Token, shared by Pulumi and Caddy. Gandi caps PATs at 90 days; rotate every 60 days via [[rotate-gandi-pat]]. + +## ACME Challenge Cleanup + +Caddy's renewal flow leaves `_acme-challenge.ops` TXT orphans in the zone — a value-comparison bug in `libdns/gandi` v1.1.0 makes the cleanup phase a no-op. Run `mise run dns-acme-cleanup` periodically (alongside PAT rotation works well). ## Related -- [[gandi-operations]] - PAT cycling and deployment how-to -- [[routing]] - Service URLs and routing architecture -- [[caddy]] - Reverse proxy using Gandi for TLS -- [[tailscale]] - Tailnet networking -- [[indri]] - Server hosting Caddy (DNS target) +- [[manage-eblu-me-dns]] — Add/change DNS records via Pulumi +- [[rotate-gandi-pat]] — Rotate the shared Gandi PAT +- [[routing]] — Service URLs and routing architecture +- [[caddy]] — Reverse proxy using this PAT for TLS +- [[tailscale]] — Tailnet networking +- [[indri]] — Server hosting Caddy (DNS target) diff --git a/docs/reference/tools/mise-tasks.md b/docs/reference/tools/mise-tasks.md index fefb30f..4ec3438 100644 --- a/docs/reference/tools/mise-tasks.md +++ b/docs/reference/tools/mise-tasks.md @@ -39,6 +39,7 @@ Run `mise tasks --sort name` for the live list with descriptions. 
| `fly-shutoff` | Emergency shutoff: stop all Fly.io proxy machines | | `dns-preview` | Preview DNS changes with [[pulumi]] | | `dns-up` | Apply DNS changes with [[pulumi]] | +| `dns-acme-cleanup` | Delete orphaned `_acme-challenge.ops` TXT records (libdns/gandi v1.1.0 workaround) | | `tailnet-preview` | Preview Tailscale ACL changes with [[pulumi]] | | `tailnet-up` | Apply Tailscale ACL changes with [[pulumi]] | diff --git a/docs/reference/tools/pulumi.md b/docs/reference/tools/pulumi.md index bdc7e8f..a716bb9 100644 --- a/docs/reference/tools/pulumi.md +++ b/docs/reference/tools/pulumi.md @@ -49,7 +49,8 @@ mise run tailnet-up # Apply ACL/tag changes ## Related -- [[gandi-operations]] — DNS PAT rotation and Pulumi workflow +- [[manage-eblu-me-dns]] — DNS records workflow +- [[rotate-gandi-pat]] — Rotate the Gandi PAT - [[update-tailscale-acls]] — ACL editing and Pulumi workflow - [[gandi]] — DNS hosting - [[tailscale]] — Tailnet configuration diff --git a/mise-tasks/dns-acme-cleanup b/mise-tasks/dns-acme-cleanup new file mode 100755 index 0000000..5152ae2 --- /dev/null +++ b/mise-tasks/dns-acme-cleanup @@ -0,0 +1,112 @@ +#!/usr/bin/env -S uv run --script +# /// script +# requires-python = ">=3.12" +# dependencies = ["httpx>=0.28.1", "rich>=14.0.0", "typer>=0.24.0"] +# /// +#MISE description="Delete orphaned ACME challenge TXT records in eblu.me" +#USAGE flag "--dry-run" help="List orphans without deleting" +"""Clean up orphaned _acme-challenge TXT records in the eblu.me zone. + +Workaround for libdns/gandi v1.1.0: its DeleteRecords compares unquoted +certmagic values to Gandi-quoted stored values, so cleanup is a silent +no-op. Without this script, the rrset grows by ~2 values per successful +Caddy renewal cycle. + +In healthy steady state these records should be absent. Run alongside +PAT rotation, or any time after Caddy ACME activity. 
+""" + +import os +import subprocess +from typing import Annotated + +import httpx +import typer +from rich.console import Console +from rich.table import Table + +DOMAIN = "eblu.me" +RRSET = "_acme-challenge.ops" +GANDI_API = "https://api.gandi.net/v5/livedns" +OP_PAT_REF = "op://blumeops/gandi - blumeops/pat" + + +def resolve_token(console: Console) -> str: + env_token = os.environ.get("GANDI_PERSONAL_ACCESS_TOKEN", "").strip() + if env_token: + return env_token + console.print("[dim]Reading Gandi PAT from 1Password...[/dim]") + try: + result = subprocess.run( + ["op", "read", OP_PAT_REF], + capture_output=True, + text=True, + check=True, + ) + return result.stdout.strip() + except (subprocess.CalledProcessError, FileNotFoundError) as e: + console.print(f"[red]Failed to read PAT from 1Password:[/red] {e}") + raise typer.Exit(1) + + +app = typer.Typer(add_completion=False) + + +@app.command() +def main( + dry_run: Annotated[ + bool, + typer.Option("--dry-run", help="List orphans without deleting"), + ] = False, +) -> None: + """Delete orphan _acme-challenge TXT records in eblu.me.""" + console = Console() + token = resolve_token(console) + + url = f"{GANDI_API}/domains/{DOMAIN}/records/{RRSET}/TXT" + headers = {"Authorization": f"Bearer {token}"} + + with httpx.Client(timeout=15, headers=headers) as client: + resp = client.get(url) + if resp.status_code == 404: + console.print( + f"[green]Clean — {RRSET}.{DOMAIN} TXT rrset is absent.[/green]" + ) + raise typer.Exit(0) + resp.raise_for_status() + values = resp.json().get("rrset_values", []) + + if not values: + console.print( + f"[green]Clean — {RRSET}.{DOMAIN} TXT rrset is empty.[/green]" + ) + raise typer.Exit(0) + + table = Table(title=f"Orphan ACME challenge values: {RRSET}.{DOMAIN}") + table.add_column("#", justify="right") + table.add_column("Value") + for i, v in enumerate(values, 1): + table.add_row(str(i), v) + console.print(table) + console.print(f"\n[bold]{len(values)}[/bold] orphan(s).") + + if dry_run: 
+ console.print("\n[dim]Dry run — no records deleted.[/dim]") + raise typer.Exit(0) + + del_resp = client.delete(url) + if del_resp.status_code == 204: + console.print( + f"[green]Deleted {RRSET}.{DOMAIN} TXT " + f"({len(values)} values).[/green]" + ) + else: + console.print( + f"[red]Delete failed: HTTP {del_resp.status_code}[/red]\n" + f"{del_resp.text[:300]}" + ) + raise typer.Exit(1) + + +if __name__ == "__main__": + app() diff --git a/pulumi/gandi/README.md b/pulumi/gandi/README.md index 9d7b7aa..70d2821 100644 --- a/pulumi/gandi/README.md +++ b/pulumi/gandi/README.md @@ -27,50 +27,19 @@ pulumi stack select eblu-me # or: pulumi stack init eblu-me ## Authentication -This project requires a Gandi Personal Access Token (PAT) with LiveDNS permissions. +This project uses a Gandi Personal Access Token (PAT) shared with Caddy. See the [Gandi reference card](../../docs/reference/infrastructure/gandi.md) and [Rotate the Gandi PAT](../../docs/how-to/configuration/rotate-gandi-pat.md). -**The PAT expires every 30 days and must be cycled manually.** - -### Cycling the PAT - -1. Go to [Gandi PAT Management](https://admin.gandi.net/organizations/1db8d76a-f729-11ed-b8d1-00163e94b645/account/pat) - -2. Create a new PAT: - - Name: `blumeops-pulumi` (or similar) - - Expiration: 30 days (maximum is 90; shorter is fine if used rarely) - - Permissions required: - - **Manage domain name technical configurations** (required for DNS records) - - See and renew domain names - - Optional permissions (enabled but not strictly required): - - See & download SSL certificates - - Manage Cloud resources - - See Cloud resources - - View Organization - - Deploy Web Hosting instances - - Manage Web Hosting instances - - See and renew Web Hosting instances - -3. Update 1Password: - ```bash - # Update the existing item with the new PAT value - op item edit mco6ka3dc3rmw7zkg2dhia5d2m pat="" --vault vg6xf6vvfmoh5hqjjhlhbeoaie - ``` - -4. 
Delete the old PAT from Gandi admin console - -### Running with Authentication - -The mise task handles fetching the PAT from 1Password: +The mise tasks handle fetching the PAT from 1Password: ```bash -mise run dns-up # Preview and apply changes mise run dns-preview # Preview only +mise run dns-up # Preview and apply ``` Or manually: ```bash -export GANDI_PERSONAL_ACCESS_TOKEN=$(op read "op://vg6xf6vvfmoh5hqjjhlhbeoaie/mco6ka3dc3rmw7zkg2dhia5d2m/pat") +export GANDI_PERSONAL_ACCESS_TOKEN=$(op read "op://blumeops/gandi - blumeops/pat") pulumi up ``` diff --git a/pulumi/gandi/__main__.py b/pulumi/gandi/__main__.py index e448ed2..bda7a8a 100644 --- a/pulumi/gandi/__main__.py +++ b/pulumi/gandi/__main__.py @@ -8,7 +8,7 @@ This program manages DNS records for blumeops infrastructure: Authentication: Set GANDI_PERSONAL_ACCESS_TOKEN environment variable. - See docs/how-to/gandi-operations.md for PAT management instructions. + See docs/how-to/configuration/rotate-gandi-pat.md for PAT management. 
""" import os From f9d9e00057a6a00887b16b18d3831a36b7837f44 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Mon, 27 Apr 2026 11:18:16 -0700 Subject: [PATCH 024/122] =?UTF-8?q?C0:=20blumeops-tasks=20=E2=80=94=20show?= =?UTF-8?q?=20due=20offset=20+=20recurrence,=20sort=20by=20overdue-ness?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Opus 4.7 (1M context) --- .../+blumeops-tasks-due-recurrence.feature.md | 1 + mise-tasks/blumeops-tasks | 50 ++++++++++++++++++- 2 files changed, 49 insertions(+), 2 deletions(-) create mode 100644 docs/changelog.d/+blumeops-tasks-due-recurrence.feature.md diff --git a/docs/changelog.d/+blumeops-tasks-due-recurrence.feature.md b/docs/changelog.d/+blumeops-tasks-due-recurrence.feature.md new file mode 100644 index 0000000..3d00e1c --- /dev/null +++ b/docs/changelog.d/+blumeops-tasks-due-recurrence.feature.md @@ -0,0 +1 @@ +`blumeops-tasks` now annotates each task with a signed `due:±N` offset (or `due:today`) and a `↻ ` marker for recurring tasks, and sorts by overdue-ness (most overdue first, no-due-date last) with priority as tiebreaker. diff --git a/mise-tasks/blumeops-tasks b/mise-tasks/blumeops-tasks index 333178e..1c41dea 100755 --- a/mise-tasks/blumeops-tasks +++ b/mise-tasks/blumeops-tasks @@ -101,9 +101,45 @@ def is_due(task: dict) -> bool: return due_date <= date.today() +def days_until_due(task: dict) -> int | None: + """Return signed days offset from today, or None if no due date. + + Negative = days remaining before due (e.g. -2 = due in 2 days). + Positive = days past due (overdue). Zero = due today. + """ + due = task.get("due") + if due is None: + return None + due_date = date.fromisoformat(due["date"][:10]) + return (date.today() - due_date).days + + +def recurrence_string(task: dict) -> str | None: + """Return the Todoist natural-language recurrence string, or None. 
+ + Todoist's REST API doesn't expose RFC 5545 RRULE; the natural-language + `due.string` (e.g. "every monday", "every 2 weeks") is the terse form. + """ + due = task.get("due") + if due is None or not due.get("is_recurring"): + return None + return due.get("string") + + def sort_tasks(tasks: list[dict]) -> list[dict]: - """Sort tasks by custom priority order: p1, p2, p4, p3.""" - return sorted(tasks, key=lambda t: PRIORITY_SORT_ORDER.get(t["priority"], 5)) + """Sort by overdue-ness, then priority. + + Most overdue first (largest +N); tasks with no due date come last. + Within a given day, tiebreaker is the custom priority order p1, p2, p4, p3. + """ + + def key(task: dict) -> tuple[int, int, int]: + days = days_until_due(task) + no_due = 1 if days is None else 0 + days_key = -(days if days is not None else 0) # descending + return (no_due, days_key, PRIORITY_SORT_ORDER.get(task["priority"], 5)) + + return sorted(tasks, key=key) def main() -> int: @@ -149,6 +185,16 @@ def main() -> int: header = Text() header.append(f"[{label}]", style="bold") header.append(f" {content}") + + meta = [] + days = days_until_due(task) + if days is not None: + meta.append(f"due:{days:+d}" if days != 0 else "due:today") + recurrence = recurrence_string(task) + if recurrence: + meta.append(f"↻ {recurrence}") + if meta: + header.append(f" ({', '.join(meta)})", style="dim") console.print(header) # Description indented (escape rich markup to preserve brackets) From 4a37ffcdc239772caedb17b03fcd2a284bef13d1 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Mon, 27 Apr 2026 11:41:13 -0700 Subject: [PATCH 025/122] =?UTF-8?q?C0:=20CLAUDE.md=20=E2=80=94=20import=20?= =?UTF-8?q?AGENTS.md=20instead=20of=20redirecting=20to=20it?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Claude Code only auto-loads CLAUDE.md. The prose shim told agents to go read AGENTS.md, which is easy to skip. 
Replacing the shim with `@AGENTS.md` inlines AGENTS.md content into the session prompt, so the startup rules (ai-docs, blumeops-tasks, change classification) land in context unconditionally. Co-Authored-By: Claude Opus 4.7 (1M context) --- CLAUDE.md | 8 +------- docs/changelog.d/+claude-md-import-agents.ai.md | 1 + 2 files changed, 2 insertions(+), 7 deletions(-) create mode 100644 docs/changelog.d/+claude-md-import-agents.ai.md diff --git a/CLAUDE.md b/CLAUDE.md index d825c0f..43c994c 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,7 +1 @@ -# CLAUDE.md - -Claude Code compatibility shim. - -The canonical agent instructions for this repository now live in [`AGENTS.md`](AGENTS.md). - -If a tool specifically looks for `CLAUDE.md`, read `AGENTS.md` and follow that file as the source of truth. +@AGENTS.md diff --git a/docs/changelog.d/+claude-md-import-agents.ai.md b/docs/changelog.d/+claude-md-import-agents.ai.md new file mode 100644 index 0000000..f63231e --- /dev/null +++ b/docs/changelog.d/+claude-md-import-agents.ai.md @@ -0,0 +1 @@ +CLAUDE.md now imports AGENTS.md via `@AGENTS.md` instead of telling agents to go read it. Claude Code only auto-loads CLAUDE.md, so the prose shim was easy to skip; the import inlines AGENTS.md into the session prompt unconditionally. From c9eb188e0530634ac7487f63b1a51a1e033f180f Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Mon, 27 Apr 2026 11:49:46 -0700 Subject: [PATCH 026/122] =?UTF-8?q?C0:=20blumeops-tasks=20=E2=80=94=20repl?= =?UTF-8?q?ace=20ambiguous=20due:+N=20with=20"Nd=20overdue"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The signed offset format read as "due in 5 days" rather than "5 days overdue", causing misreads. Switch to self-explanatory text: "5d overdue" / "due in 2d" / "due today". 
Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/changelog.d/+blumeops-tasks-due-recurrence.feature.md | 2 +- mise-tasks/blumeops-tasks | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/docs/changelog.d/+blumeops-tasks-due-recurrence.feature.md b/docs/changelog.d/+blumeops-tasks-due-recurrence.feature.md index 3d00e1c..83072dd 100644 --- a/docs/changelog.d/+blumeops-tasks-due-recurrence.feature.md +++ b/docs/changelog.d/+blumeops-tasks-due-recurrence.feature.md @@ -1 +1 @@ -`blumeops-tasks` now annotates each task with a signed `due:±N` offset (or `due:today`) and a `↻ ` marker for recurring tasks, and sorts by overdue-ness (most overdue first, no-due-date last) with priority as tiebreaker. +`blumeops-tasks` now annotates each task with a human-readable due offset (`5d overdue` / `due in 2d` / `due today`) and a `↻ ` marker for recurring tasks, and sorts by overdue-ness (most overdue first, no-due-date last) with priority as tiebreaker. diff --git a/mise-tasks/blumeops-tasks b/mise-tasks/blumeops-tasks index 1c41dea..e07e9bf 100755 --- a/mise-tasks/blumeops-tasks +++ b/mise-tasks/blumeops-tasks @@ -189,7 +189,12 @@ def main() -> int: meta = [] days = days_until_due(task) if days is not None: - meta.append(f"due:{days:+d}" if days != 0 else "due:today") + if days == 0: + meta.append("due today") + elif days > 0: + meta.append(f"{days}d overdue") + else: + meta.append(f"due in {-days}d") recurrence = recurrence_string(task) if recurrence: meta.append(f"↻ {recurrence}") From cfb6d7a7aa32cde44eebfc139a9927ac7f03f8c0 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Mon, 27 Apr 2026 11:57:33 -0700 Subject: [PATCH 027/122] =?UTF-8?q?C0:=20service-review=20=E2=80=94=20mark?= =?UTF-8?q?=20cv=20reviewed=202026-04-27?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit No version bump; build deps (jinja2, pyyaml) still loose-pinned and fine. 
Known issue: deployed v1.0.3 package predates phone-hide commit; tracked separately in Todoist by user. Co-Authored-By: Claude Opus 4.7 (1M context) --- service-versions.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/service-versions.yaml b/service-versions.yaml index f5811b5..0a4fe93 100644 --- a/service-versions.yaml +++ b/service-versions.yaml @@ -221,7 +221,7 @@ services: - name: cv type: argocd - last-reviewed: 2026-03-07 + last-reviewed: 2026-04-27 current-version: "1.0.3" upstream-source: https://forge.eblu.me/eblume/cv notes: Personal static site; review build deps (WeasyPrint, Jinja2) in source repo From 718e0a00433cc896acacd67d9718f9f6025a215c Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Mon, 27 Apr 2026 12:18:06 -0700 Subject: [PATCH 028/122] =?UTF-8?q?C0:=20review-compliance-reports=20?= =?UTF-8?q?=E2=80=94=20summarize=20image=20and=20IaC=20scans?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously only the K8s CIS in-cluster scan was processed; the weekly container-image and IaC Prowler scans were running on schedule but never reviewed. Now each scan gets its own status / severity / week-over-week delta, with top-N grouped tables (by check ID and resource) for the high-volume image and IaC outputs. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../+review-compliance-image-iac.feature.md | 1 + mise-tasks/review-compliance-reports | 531 +++++++++++------- 2 files changed, 324 insertions(+), 208 deletions(-) create mode 100644 docs/changelog.d/+review-compliance-image-iac.feature.md diff --git a/docs/changelog.d/+review-compliance-image-iac.feature.md b/docs/changelog.d/+review-compliance-image-iac.feature.md new file mode 100644 index 0000000..1125359 --- /dev/null +++ b/docs/changelog.d/+review-compliance-image-iac.feature.md @@ -0,0 +1 @@ +`review-compliance-reports` now also fetches and summarizes the weekly Prowler container-image and IaC scans (previously only the K8s CIS in-cluster scan was processed). For each scan it shows status counts, severity breakdown, week-over-week delta, and — for the high-volume image/IaC scans — top-N tables grouped by check ID and resource instead of per-finding listings. diff --git a/mise-tasks/review-compliance-reports b/mise-tasks/review-compliance-reports index 080271c..72f35cc 100755 --- a/mise-tasks/review-compliance-reports +++ b/mise-tasks/review-compliance-reports @@ -9,23 +9,26 @@ """Fetch and summarize compliance reports from sifaka. Covers: - - Prowler K8s CIS: CSV-based, full analysis with delta tracking + - Prowler K8s CIS (in-cluster): per-finding detail + - Prowler container image scans: grouped by check + resource + - Prowler IaC manifest scans: grouped by check + resource - Kingfisher secret scanning: TODO — pending upstream JSON/CSV output support (currently HTML-only; contribute from spork) -For Prowler, copies the two most recent K8s CIS reports, parses them, -and displays: +For each Prowler scan, copies the two most recent CSV reports, parses +them, and displays: 1. Overall status (pass/fail/manual/muted counts) 2. Unmuted failures by severity 3. Delta from the previous report (new vs resolved) - 4. Actionable unmuted failures with details + 4. 
Actionable unmuted failures (per-finding for in-cluster; grouped + by check ID and resource for image/IaC because they have far too + many findings to list individually) This is the primary tool for the weekly compliance report review. """ import csv import subprocess -import sys import tempfile from collections import Counter from pathlib import Path @@ -36,7 +39,12 @@ from rich.console import Console from rich.panel import Panel from rich.table import Table -REPORT_BASE = "sifaka:/volume1/reports/prowler" +PROWLER_SCANS: list[tuple[str, str, bool]] = [ + # (label, sifaka base path, group_findings) + ("K8s CIS (In-Cluster)", "/volume1/reports/prowler", False), + ("Container Images", "/volume1/reports/prowler-images", True), + ("IaC (manifests)", "/volume1/reports/prowler-iac", True), +] console = Console() @@ -52,18 +60,18 @@ def scp(remote: str, local: str) -> bool: return result.returncode == 0 -def list_reports() -> list[str]: - """List Prowler CSV reports on sifaka, sorted by embedded timestamp.""" +def list_reports(base: str) -> list[str]: + """List Prowler CSV reports under `base` on sifaka, sorted by timestamp.""" result = subprocess.run( - ["ssh", "sifaka", "find /volume1/reports/prowler/ -name '*.csv' " + ["ssh", "sifaka", f"find {base}/ -name '*.csv' " "-not -path '*/compliance/*' -not -name '@*'"], capture_output=True, text=True, timeout=15, ) if result.returncode != 0: - console.print("[bold red]Failed to list reports on sifaka[/bold red]") - raise typer.Exit(code=1) + console.print(f"[bold red]Failed to list reports under {base}[/bold red]") + return [] csvs = [p.strip() for p in result.stdout.strip().splitlines() if p.strip()] # Sort by the timestamp embedded in the filename (e.g. 
20260405030007) @@ -306,136 +314,151 @@ def run_node_verification(console: Console) -> None: console.print() -def main( - full: Annotated[ - bool, typer.Option(help="Show all unmuted failures, not just new ones") - ] = False, - show_muted: Annotated[ - bool, typer.Option(help="Also show muted failures") - ] = False, +SEVERITY_STYLE = { + "critical": "bold red", + "high": "red", + "medium": "yellow", +} + + +def _sev_style(sev: str) -> str: + return SEVERITY_STYLE.get(sev.lower(), "") + + +def summarize_report( + label: str, + base: str, + tmpdir: str, + *, + show_muted: bool = False, + group_findings: bool = False, ) -> None: - csvs = list_reports() + """Fetch and summarize the latest Prowler report under `base`. + + When `group_findings` is True, top-N CHECK_ID and RESOURCE_NAME tables + are shown instead of a per-finding detail table — appropriate for + image and IaC scans that produce thousands of findings. + """ + console.rule(f"[bold]{label}[/bold]") + csvs = list_reports(base) if not csvs: - console.print("[bold red]No Prowler CSV reports found on sifaka[/bold red]") - raise typer.Exit(code=1) - - with tempfile.TemporaryDirectory() as tmpdir: - # Fetch the two most recent reports - latest_remote = csvs[-1] - latest_local = Path(tmpdir) / "latest.csv" - - console.print(f"[dim]Fetching {latest_remote}...[/dim]") - if not scp(f"sifaka:{latest_remote}", str(latest_local)): - console.print("[bold red]Failed to copy latest report[/bold red]") - raise typer.Exit(code=1) - - prev_local = None - if len(csvs) >= 2: - prev_remote = csvs[-2] - prev_local = Path(tmpdir) / "prev.csv" - console.print(f"[dim]Fetching {prev_remote}...[/dim]") - if not scp(f"sifaka:{prev_remote}", str(prev_local)): - prev_local = None - - latest = parse_findings(load_csv(str(latest_local))) - - # Extract report date from filename - report_name = Path(latest_remote).stem - console.print() - - # --- Overall status --- - status_table = Table( - show_header=True, header_style="bold", 
title=f"Report: {report_name}" + console.print( + f"[bold yellow]{label}: no Prowler CSV reports found " + f"under {base}[/bold yellow]" ) - status_table.add_column("Status") - status_table.add_column("Count", justify="right") + console.print() + return - for status in ["PASS", "FAIL", "MANUAL"]: - count = latest["statuses"].get(status, 0) - style = "red" if status == "FAIL" and count > 0 else "" - status_table.add_row( - f"[{style}]{status}[/{style}]" if style else status, + safe = "".join(c if c.isalnum() else "_" for c in label.lower()) + latest_remote = csvs[-1] + latest_local = Path(tmpdir) / f"{safe}_latest.csv" + + console.print(f"[dim]Fetching {latest_remote}...[/dim]") + if not scp(f"sifaka:{latest_remote}", str(latest_local)): + console.print(f"[bold red]Failed to copy {latest_remote}[/bold red]") + return + + prev_local: Path | None = None + if len(csvs) >= 2: + prev_remote = csvs[-2] + prev_path = Path(tmpdir) / f"{safe}_prev.csv" + console.print(f"[dim]Fetching {prev_remote}...[/dim]") + if scp(f"sifaka:{prev_remote}", str(prev_path)): + prev_local = prev_path + + latest = parse_findings(load_csv(str(latest_local))) + report_name = Path(latest_remote).stem + console.print() + + # --- Overall status --- + status_table = Table( + show_header=True, header_style="bold", title=f"Report: {report_name}" + ) + status_table.add_column("Status") + status_table.add_column("Count", justify="right") + + for status in ["PASS", "FAIL", "MANUAL"]: + count = latest["statuses"].get(status, 0) + style = "red" if status == "FAIL" and count > 0 else "" + status_table.add_row( + f"[{style}]{status}[/{style}]" if style else status, + f"[{style}]{count}[/{style}]" if style else str(count), + ) + + muted_count = len(latest["muted"]) + unmuted_count = len(latest["unmuted"]) + status_table.add_row("", "") + status_table.add_row("[dim]↳ muted[/dim]", f"[dim]{muted_count}[/dim]") + status_table.add_row( + "[bold]↳ unmuted (action needed)[/bold]", + f"[bold 
red]{unmuted_count}[/bold red]" + if unmuted_count > 0 + else "[bold green]0[/bold green]", + ) + status_table.add_row("", "") + status_table.add_row("[bold]Total[/bold]", f"[bold]{latest['total']}[/bold]") + + console.print(status_table) + console.print() + + # --- Unmuted failures by severity --- + if latest["unmuted"]: + sev_table = Table( + show_header=True, + header_style="bold", + title="Unmuted Failures by Severity", + ) + sev_table.add_column("Severity") + sev_table.add_column("Count", justify="right") + + for sev, count in sorted( + Counter(r["SEVERITY"] for r in latest["unmuted"]).items(), + key=lambda kv: severity_sort({"SEVERITY": kv[0]}), + ): + style = _sev_style(sev) + sev_table.add_row( + f"[{style}]{sev}[/{style}]" if style else sev, f"[{style}]{count}[/{style}]" if style else str(count), ) - fail_count = len(latest["fails"]) - muted_count = len(latest["muted"]) - unmuted_count = len(latest["unmuted"]) - status_table.add_row("", "") - status_table.add_row("[dim]↳ muted[/dim]", f"[dim]{muted_count}[/dim]") - status_table.add_row( - "[bold]↳ unmuted (action needed)[/bold]", - f"[bold red]{unmuted_count}[/bold red]" - if unmuted_count > 0 - else "[bold green]0[/bold green]", - ) - status_table.add_row("", "") - status_table.add_row("[bold]Total[/bold]", f"[bold]{latest['total']}[/bold]") - - console.print(status_table) + console.print(sev_table) console.print() - # --- Unmuted failures by severity --- - if latest["unmuted"]: - sev_table = Table( - show_header=True, - header_style="bold", - title="Unmuted Failures by Severity", + # --- Delta from previous report --- + if prev_local: + prev = parse_findings(load_csv(str(prev_local))) + + prev_keys = {finding_key(r): r for r in prev["unmuted"]} + curr_keys = {finding_key(r): r for r in latest["unmuted"]} + + new_keys = set(curr_keys.keys()) - set(prev_keys.keys()) + resolved_keys = set(prev_keys.keys()) - set(curr_keys.keys()) + + prev_name = Path(csvs[-2]).stem + delta_lines = [ + f"Compared against: 
[dim]{prev_name}[/dim]", + "", + f"Previous unmuted FAILs: {len(prev['unmuted'])}", + f"Current unmuted FAILs: {len(latest['unmuted'])}", + f"[green]Resolved: {len(resolved_keys)}[/green]", + f"[red]New: {len(new_keys)}[/red]" + if new_keys + else "[green]New: 0[/green]", + ] + + console.print( + Panel( + "\n".join(delta_lines), + title="[bold]Week-over-Week Delta (unmuted only)[/bold]", + border_style="cyan", ) - sev_table.add_column("Severity") - sev_table.add_column("Count", justify="right") - - for sev, count in Counter( - r["SEVERITY"] for r in latest["unmuted"] - ).most_common(): - style = ( - "bold red" - if sev == "critical" - else "red" - if sev == "high" - else "yellow" - if sev == "medium" - else "" - ) - sev_table.add_row( - f"[{style}]{sev}[/{style}]" if style else sev, - f"[{style}]{count}[/{style}]" if style else str(count), - ) - - console.print(sev_table) - console.print() - - # --- Delta from previous report --- - if prev_local: - prev = parse_findings(load_csv(str(prev_local))) - - prev_keys = {finding_key(r): r for r in prev["unmuted"]} - curr_keys = {finding_key(r): r for r in latest["unmuted"]} - - new_keys = set(curr_keys.keys()) - set(prev_keys.keys()) - resolved_keys = set(prev_keys.keys()) - set(curr_keys.keys()) - - prev_name = Path(csvs[-2]).stem - delta_lines = [ - f"Compared against: [dim]{prev_name}[/dim]", - "", - f"Previous unmuted FAILs: {len(prev['unmuted'])}", - f"Current unmuted FAILs: {len(latest['unmuted'])}", - f"[green]Resolved: {len(resolved_keys)}[/green]", - f"[red]New: {len(new_keys)}[/red]" - if new_keys - else f"[green]New: 0[/green]", - ] - - console.print( - Panel( - "\n".join(delta_lines), - title="[bold]Week-over-Week Delta (unmuted only)[/bold]", - border_style="cyan", - ) - ) - console.print() + ) + console.print() + # For grouped scans the new/resolved listings are too noisy + # (potentially thousands of lines). Skip the listings; the count + # is in the panel above and detail is in the grouped tables. 
+ if not group_findings: if new_keys: console.print("[bold red]New Unmuted Failures:[/bold red]") for k in sorted(new_keys): @@ -456,85 +479,177 @@ def main( ) console.print() - # --- Unmuted failure details --- - findings_to_show = latest["unmuted"] if full else [] - if not full and latest["unmuted"]: - findings_to_show = latest["unmuted"] - - if findings_to_show: - detail_table = Table( - show_header=True, - header_style="bold", - title="Unmuted Failures — Action Needed", - ) - detail_table.add_column("Severity") - detail_table.add_column("Check") - detail_table.add_column("Resource") - detail_table.add_column("Detail", max_width=60) - - for r in sorted(findings_to_show, key=severity_sort): - sev = r["SEVERITY"] - style = ( - "bold red" - if sev == "critical" - else "red" - if sev == "high" - else "yellow" - if sev == "medium" - else "" - ) - detail_table.add_row( - f"[{style}]{sev}[/{style}]" if style else sev, - r["CHECK_ID"], - r.get("RESOURCE_NAME", ""), - r["STATUS_EXTENDED"][:60], - ) - - console.print(detail_table) - console.print() - - # --- Muted findings summary --- - if show_muted and latest["muted"]: - muted_table = Table( - show_header=True, - header_style="bold", - title="Muted Failures (for reference)", - ) - muted_table.add_column("Severity") - muted_table.add_column("Check") - muted_table.add_column("Count", justify="right") - - muted_groups: dict[tuple[str, str], int] = Counter() - for r in latest["muted"]: - muted_groups[(r["SEVERITY"], r["CHECK_ID"])] += 1 - - for (sev, check), count in sorted( - muted_groups.items(), key=lambda x: severity_sort({"SEVERITY": x[0][0]}) - ): - muted_table.add_row(f"[dim]{sev}[/dim]", f"[dim]{check}[/dim]", f"[dim]{count}[/dim]") - - console.print(muted_table) - console.print() - - # --- Verdict --- - if not latest["unmuted"]: - console.print( - Panel( - "[bold green]All clear.[/bold green] No unmuted failures.", - title="Prowler Verdict", - border_style="green", - ) - ) + # --- Unmuted failure details (grouped 
or per-finding) --- + if latest["unmuted"]: + if group_findings: + _print_grouped_findings(latest["unmuted"]) else: - console.print( - Panel( - f"[bold yellow]{len(latest['unmuted'])} unmuted failure(s) " - f"need triage.[/bold yellow]\n\n" - "For each: remediate (fix the pod spec) or mute " - "(add to mutelist + compensating control).", - title="Prowler Verdict", - border_style="yellow", - ) + _print_findings_detail(latest["unmuted"]) + + # --- Muted findings summary --- + if show_muted and latest["muted"]: + muted_table = Table( + show_header=True, + header_style="bold", + title="Muted Failures (for reference)", + ) + muted_table.add_column("Severity") + muted_table.add_column("Check") + muted_table.add_column("Count", justify="right") + + muted_groups: dict[tuple[str, str], int] = Counter() + for r in latest["muted"]: + muted_groups[(r["SEVERITY"], r["CHECK_ID"])] += 1 + + for (sev, check), count in sorted( + muted_groups.items(), + key=lambda x: severity_sort({"SEVERITY": x[0][0]}), + ): + muted_table.add_row( + f"[dim]{sev}[/dim]", + f"[dim]{check}[/dim]", + f"[dim]{count}[/dim]", + ) + + console.print(muted_table) + console.print() + + # --- Verdict --- + if not latest["unmuted"]: + console.print( + Panel( + "[bold green]All clear.[/bold green] No unmuted failures.", + title=f"{label} Verdict", + border_style="green", + ) + ) + else: + console.print( + Panel( + f"[bold yellow]{len(latest['unmuted'])} unmuted failure(s) " + f"need triage.[/bold yellow]\n\n" + "For each: remediate or mute " + "(add to mutelist + compensating control).", + title=f"{label} Verdict", + border_style="yellow", + ) + ) + console.print() + + +def _print_findings_detail(unmuted: list[dict]) -> None: + """Per-finding detail table — appropriate when finding count is small.""" + detail_table = Table( + show_header=True, + header_style="bold", + title="Unmuted Failures — Action Needed", + ) + detail_table.add_column("Severity") + detail_table.add_column("Check") + 
detail_table.add_column("Resource") + detail_table.add_column("Detail", max_width=60) + + for r in sorted(unmuted, key=severity_sort): + sev = r["SEVERITY"] + style = _sev_style(sev) + detail_table.add_row( + f"[{style}]{sev}[/{style}]" if style else sev, + r["CHECK_ID"], + r.get("RESOURCE_NAME", ""), + r["STATUS_EXTENDED"][:60], + ) + + console.print(detail_table) + console.print() + + +def _worst_severity(rows: list[dict]) -> str: + """Return the most severe severity label across `rows`.""" + if not rows: + return "" + return min( + (r["SEVERITY"] for r in rows), + key=lambda s: severity_sort({"SEVERITY": s}), + ) + + +def _print_grouped_findings(unmuted: list[dict], top_n: int = 15) -> None: + """Top-N tables grouped by CHECK_ID and RESOURCE_NAME. + + Used for image and IaC scans where per-finding tables would be too + large to be useful. Shows count and worst severity for each group. + """ + by_check: dict[str, list[dict]] = {} + by_resource: dict[str, list[dict]] = {} + for r in unmuted: + by_check.setdefault(r["CHECK_ID"], []).append(r) + by_resource.setdefault(r.get("RESOURCE_NAME", "") or "(no resource)", []).append(r) + + check_table = Table( + show_header=True, + header_style="bold", + title=f"Top {top_n} Checks by Unmuted Finding Count", + ) + check_table.add_column("Worst Sev") + check_table.add_column("Check ID") + check_table.add_column("Count", justify="right") + + for check, rows in sorted( + by_check.items(), key=lambda kv: -len(kv[1]) + )[:top_n]: + worst = _worst_severity(rows) + style = _sev_style(worst) + check_table.add_row( + f"[{style}]{worst}[/{style}]" if style else worst, + check, + str(len(rows)), + ) + + console.print(check_table) + console.print() + + res_table = Table( + show_header=True, + header_style="bold", + title=f"Top {top_n} Resources by Unmuted Finding Count", + ) + res_table.add_column("Worst Sev") + res_table.add_column("Resource") + res_table.add_column("Count", justify="right") + + for resource, rows in sorted( + 
by_resource.items(), key=lambda kv: -len(kv[1]) + )[:top_n]: + worst = _worst_severity(rows) + style = _sev_style(worst) + res_table.add_row( + f"[{style}]{worst}[/{style}]" if style else worst, + resource[:80], + str(len(rows)), + ) + + console.print(res_table) + console.print() + + +def main( + full: Annotated[ + bool, typer.Option(help="(reserved) currently a no-op; all unmuted failures already shown") + ] = False, + show_muted: Annotated[ + bool, typer.Option(help="Also show muted failures") + ] = False, +) -> None: + del full # historical flag, kept for backwards compatibility + + with tempfile.TemporaryDirectory() as tmpdir: + for label, base, group in PROWLER_SCANS: + summarize_report( + label, + base, + tmpdir, + show_muted=show_muted, + group_findings=group, ) # --- Node-level MANUAL check verification --- From 495e45d01dc3aa6d42124a4f3ca88a6816cfe9bb Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Wed, 29 Apr 2026 10:43:32 -0700 Subject: [PATCH 029/122] Address 6 critical Prowler IaC findings (mute + grafana RBAC tighten) (#340) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary The weekly Prowler IaC scan reported 6 critical findings against `argocd/manifests/`. They split cleanly into two patterns: - **Legitimate-by-design RBAC → mute with new compensating controls** - `external-secrets-controller`, `external-secrets-cert-controller` manage `secrets` (KSV-0041) and the cert-controller mutates its own webhook configurations (KSV-0114). This is what the operator is *for*. New CC: `operator-purpose-bound-rbac`. - `kube-state-metrics` (both `minikube-indri` and `k3s-ringtail`) holds `list/watch` on secrets to expose `kube_secret_info` and `kube_secret_labels` metrics. KSM's metric schema only reads metadata, never the `data:` field. New CC: `kube-state-metrics-metadata-only`. 
- **Over-broad RBAC → fix** - `grafana-clusterrole` had `get/watch/list` on `secrets` because the dashboard-sidecar config used `RESOURCE=both` (ConfigMaps + Secrets). Nothing in the cluster labels Secrets with `grafana_dashboard=1`, so this was unused power. Switched both sidecar instances to `RESOURCE=configmap` and removed `secrets` from the ClusterRole. Prowler's `--mutelist-file` is a no-op for the IaC provider (it delegates to Trivy), which is why every IaC finding reported as unmuted regardless of mutelist configuration. The new `mutelist/trivyignore.yaml` is bundled into the existing `prowler-mutelist` ConfigMap, mounted via `items:` selector, and handed to Trivy through `TRIVY_IGNOREFILE` by the `trivy` shim in the Prowler image. ## Test plan - [ ] `kubectl --context=minikube-indri kustomize argocd/manifests/prowler/` — already passes locally - [ ] `kubectl --context=minikube-indri kustomize argocd/manifests/grafana/` — already passes locally - [ ] Deploy from this branch via `argocd app set prowler --revision prowler-iac-mutelist && argocd app sync prowler` and same for `grafana` - [ ] Manually trigger the IaC cronjob and verify `MUTED=True` on the 6 critical findings (`kubectl --context=minikube-indri -n prowler create job --from=cronjob/prowler-iac-scan prowler-iac-test`) - [ ] Restart grafana pod and confirm dashboards still render (sidecar still finds them via ConfigMap watch) - [ ] After verify, `argocd app set <app> --revision main && argocd app sync <app>` post-merge 🤖 Generated with [Claude Code](https://claude.com/claude-code) Reviewed-on: https://forge.eblu.me/eblume/blumeops/pulls/340 --- argocd/manifests/grafana/deployment.yaml | 6 ++- argocd/manifests/grafana/rbac.yaml | 2 +- .../manifests/prowler/cronjob-iac-scan.yaml | 16 ++++++++ argocd/manifests/prowler/kustomization.yaml | 3 +- .../prowler/mutelist/trivyignore.yaml | 39 +++++++++++++++++++ compensating-controls.yaml | 36 +++++++++++++++++ containers/prowler/Dockerfile | 20 +++++++++- .../changelog.d/prowler-iac-mutelist.infra.md | 1 + 8 files changed, 118 insertions(+), 5 deletions(-) create mode 100644
argocd/manifests/prowler/mutelist/trivyignore.yaml create mode 100644 docs/changelog.d/prowler-iac-mutelist.infra.md diff --git a/argocd/manifests/grafana/deployment.yaml b/argocd/manifests/grafana/deployment.yaml index 848503e..0aad9b3 100644 --- a/argocd/manifests/grafana/deployment.yaml +++ b/argocd/manifests/grafana/deployment.yaml @@ -156,7 +156,9 @@ spec: - name: FOLDER value: /tmp/dashboards - name: RESOURCE - value: both + # ConfigMap-only — no dashboards are sourced from Secrets, + # so the ServiceAccount has no read access to secrets. + value: configmap - name: FOLDER_ANNOTATION value: grafana_folder securityContext: @@ -183,7 +185,7 @@ spec: - name: FOLDER value: /tmp/dashboards - name: RESOURCE - value: both + value: configmap - name: FOLDER_ANNOTATION value: grafana_folder - name: REQ_USERNAME diff --git a/argocd/manifests/grafana/rbac.yaml b/argocd/manifests/grafana/rbac.yaml index d0d0c843..1c2dee3 100644 --- a/argocd/manifests/grafana/rbac.yaml +++ b/argocd/manifests/grafana/rbac.yaml @@ -7,7 +7,7 @@ metadata: app.kubernetes.io/instance: grafana rules: - apiGroups: [""] - resources: ["configmaps", "secrets"] + resources: ["configmaps"] verbs: ["get", "watch", "list"] --- apiVersion: rbac.authorization.k8s.io/v1 diff --git a/argocd/manifests/prowler/cronjob-iac-scan.yaml b/argocd/manifests/prowler/cronjob-iac-scan.yaml index 49c8ce6..c1303a5 100644 --- a/argocd/manifests/prowler/cronjob-iac-scan.yaml +++ b/argocd/manifests/prowler/cronjob-iac-scan.yaml @@ -19,6 +19,13 @@ spec: - name: prowler image: registry.ops.eblu.me/blumeops/prowler:kustomized command: ["/bin/sh", "-c"] + # Prowler's --mutelist-file is a no-op for the IaC provider + # (it delegates to Trivy). The Prowler image's trivy shim + # injects --ignorefile $TRIVY_IGNOREFILE when set; see + # containers/prowler/Dockerfile. 
+ env: + - name: TRIVY_IGNOREFILE + value: /mutelist/trivyignore.yaml args: - | DATEDIR=/reports/prowler-iac/$(date +%Y-%m-%d) @@ -31,8 +38,17 @@ spec: volumeMounts: - name: reports mountPath: /reports + - name: mutelist + mountPath: /mutelist + readOnly: true restartPolicy: OnFailure volumes: - name: reports persistentVolumeClaim: claimName: prowler-reports + - name: mutelist + configMap: + name: prowler-mutelist + items: + - key: trivyignore.yaml + path: trivyignore.yaml diff --git a/argocd/manifests/prowler/kustomization.yaml b/argocd/manifests/prowler/kustomization.yaml index 7024aff..cf644dc 100644 --- a/argocd/manifests/prowler/kustomization.yaml +++ b/argocd/manifests/prowler/kustomization.yaml @@ -23,7 +23,8 @@ configMapGenerator: - mutelist/core-pod-security.yaml - mutelist/manual-node-checks.yaml - mutelist/rbac.yaml + - mutelist/trivyignore.yaml images: - name: registry.ops.eblu.me/blumeops/prowler - newTag: v5.23.0-7c1cd11 + newTag: v5.23.0-2daf629 diff --git a/argocd/manifests/prowler/mutelist/trivyignore.yaml b/argocd/manifests/prowler/mutelist/trivyignore.yaml new file mode 100644 index 0000000..22c612a --- /dev/null +++ b/argocd/manifests/prowler/mutelist/trivyignore.yaml @@ -0,0 +1,39 @@ +# Trivy ignorefile for Prowler IaC scan. +# +# Prowler's `--mutelist-file` flag is a no-op for the IaC provider +# (iac_provider.py sets self._mutelist = None and delegates to Trivy). +# Trivy in turn does not auto-discover this YAML form from cwd, so the +# Prowler image ships a shim wrapper around `trivy` that injects +# --ignorefile $TRIVY_IGNOREFILE when the env var is set. The cronjob +# mounts this file and sets TRIVY_IGNOREFILE accordingly. +# +# Schema: https://trivy.dev/latest/docs/configuration/filtering/ +# IDs use the hyphenated form Trivy displays (KSV-0041, not KSV0041). +misconfigurations: + - id: KSV-0041 + paths: + - "argocd/manifests/external-secrets/rbac.yaml" + statement: >- + CC: operator-purpose-bound-rbac. 
external-secrets-operator's entire + function is to read and synthesize Secret objects; ClusterRole over + secrets is its purpose. Both the controller and cert-controller are + upstream-defined. + - id: KSV-0041 + paths: + - "argocd/manifests/kube-state-metrics/rbac.yaml" + - "argocd/manifests/kube-state-metrics-ringtail/rbac.yaml" + statement: >- + CC: kube-state-metrics-metadata-only. KSM exposes only Secret + metadata (name, namespace, type, labels), never the data field. + list/watch on secrets is required for kube_secret_info / + kube_secret_labels metrics. + - id: KSV-0114 + paths: + - "argocd/manifests/external-secrets/rbac.yaml" + statement: >- + CC: operator-purpose-bound-rbac. cert-controller manages the + external-secrets validating webhook configurations to inject its + own rotating CA bundle. RBAC is scoped to two named webhooks + (secretstore-validate, externalsecret-validate) via resourceNames; + KSV-0114 doesn't see the resourceNames restriction so reports the + full ClusterRole. diff --git a/compensating-controls.yaml b/compensating-controls.yaml index 67bbf75..d9d7c6c 100644 --- a/compensating-controls.yaml +++ b/compensating-controls.yaml @@ -139,6 +139,42 @@ controls: MANUAL findings appear in Prowler, add corresponding verification logic to the script and update the mutelist. + - id: operator-purpose-bound-rbac + description: >- + Operators whose entire function is to manage a sensitive resource + legitimately need RBAC over that resource. external-secrets-operator + manages Secret objects (its purpose) and the cert-controller mutates + its own ValidatingWebhookConfigurations to inject rotating CA bundles. + Risk is bounded by: (1) the operator code being upstream open-source + and reviewed; (2) RBAC scoped to specific named webhooks where + possible; (3) supply chain controls on the operator image (mirrored + to local registry, version tracked in service-versions.yaml). 
+ created: 2026-04-27 + last-reviewed: 2026-04-27 + notes: >- + Verify by checking that the operators in question still match their + stated purpose (i.e. external-secrets is still the only consumer of + these ClusterRoles) and that upstream hasn't published advisories + for credential-handling bugs. Re-evaluate if a non-secrets-managing + ClusterRole appears under this control. + + - id: kube-state-metrics-metadata-only + description: >- + kube-state-metrics holds list/watch on Secrets cluster-wide but only + exposes Secret object *metadata* (name, namespace, type, creation + timestamp, labels) via the kube_secret_info / kube_secret_labels + metrics. Secret data fields are never read into KSM's exposed + metrics by upstream design. Mitigation rests on KSM's metric + schema, the version pin in service-versions.yaml, and the metrics + endpoint being reachable only on the cluster network. + created: 2026-04-27 + last-reviewed: 2026-04-27 + notes: >- + Verify by inspecting the /metrics endpoint output for any series + that include secret data (only *_info and *_labels metrics should + reference secrets, and labels should be limited to user-applied + labels — never the data:). Re-evaluate on KSM version bumps. 
+ - id: observability-stack-audit description: >- Alloy collects pod logs and ships them to Loki, providing an diff --git a/containers/prowler/Dockerfile b/containers/prowler/Dockerfile index bd74bdb..c5157cb 100644 --- a/containers/prowler/Dockerfile +++ b/containers/prowler/Dockerfile @@ -44,10 +44,28 @@ RUN ARCH=$(dpkg --print-architecture) \ && apt-get update && apt-get install -y --no-install-recommends wget ca-certificates \ && wget -q "https://github.com/aquasecurity/trivy/releases/download/v${TRIVY_VERSION}/trivy_${TRIVY_VERSION}_${TRIVY_ARCH}.tar.gz" -O /tmp/trivy.tar.gz \ && tar xzf /tmp/trivy.tar.gz -C /usr/local/bin trivy \ - && chmod +x /usr/local/bin/trivy \ + && mv /usr/local/bin/trivy /usr/local/bin/trivy.real \ + && chmod +x /usr/local/bin/trivy.real \ && rm /tmp/trivy.tar.gz \ && apt-get purge -y wget && apt-get autoremove -y && rm -rf /var/lib/apt/lists/* +# Shim: Prowler's IaC provider invokes `trivy fs` directly with no +# --ignorefile flag, so any TRIVY_IGNOREFILE the user sets is ignored. +# This wrapper injects --ignorefile when the env var points at a real +# file and the invocation is `trivy fs ...`. Other subcommands and +# global-only invocations (--version, --help) pass through unchanged. +# TODO(upstream): contribute --ignorefile plumbing to prowler-cloud/prowler +# iac_provider.py so this shim isn't necessary. 
+RUN printf '%s\n' \ + '#!/bin/sh' \ + 'if [ "${1:-}" = "fs" ] && [ -n "${TRIVY_IGNOREFILE:-}" ] && [ -f "${TRIVY_IGNOREFILE}" ]; then' \ + ' shift' \ + ' exec /usr/local/bin/trivy.real fs --ignorefile "${TRIVY_IGNOREFILE}" "$@"' \ + 'fi' \ + 'exec /usr/local/bin/trivy.real "$@"' \ + > /usr/local/bin/trivy \ + && chmod +x /usr/local/bin/trivy + RUN addgroup --gid 1000 prowler \ && adduser --uid 1000 --gid 1000 --disabled-password --gecos "" prowler \ && mkdir -p /tmp/.cache/trivy && chown prowler:prowler /tmp/.cache/trivy diff --git a/docs/changelog.d/prowler-iac-mutelist.infra.md b/docs/changelog.d/prowler-iac-mutelist.infra.md new file mode 100644 index 0000000..793c1ec --- /dev/null +++ b/docs/changelog.d/prowler-iac-mutelist.infra.md @@ -0,0 +1 @@ +Address the 6 critical Prowler IaC findings against `argocd/manifests/`. Prowler's IaC provider hardcodes `self._mutelist = None` and delegates filtering to Trivy, but doesn't plumb `--ignorefile` through — so the documented "use Trivy filtering" path is actually broken. Added a shim around `trivy` in the Prowler image that injects `--ignorefile $TRIVY_IGNOREFILE` for `trivy fs` invocations when the env var points at a real file. The IaC cronjob now mounts `mutelist/trivyignore.yaml` (Trivy's per-path schema) and sets the env var. Two new compensating controls — `operator-purpose-bound-rbac` and `kube-state-metrics-metadata-only` — justify muting the `external-secrets` and `kube-state-metrics` Secret-access findings (KSV-0041, KSV-0114). Separately, `grafana-clusterrole` is tightened to remove `secrets` access entirely: the dashboard sidecar already only consumes ConfigMap-labeled dashboards, so its `RESOURCE` env var is now `configmap` instead of `both`. 
From 4d76fd5de5774013b997c7c8d9cf4623d3fe526c Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Wed, 29 Apr 2026 10:49:27 -0700 Subject: [PATCH 030/122] =?UTF-8?q?C0:=20prowler=20=E2=80=94=20rebuild=20i?= =?UTF-8?q?mage=20against=20main=20HEAD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Squash-merge of #340 changed the SHA. Bump prowler tag from v5.23.0-2daf629 (PR branch) to v5.23.0-495e45d (main HEAD) so the Dockerfile changes are present in the image deployed off main. Co-Authored-By: Claude Opus 4.7 (1M context) --- argocd/manifests/prowler/kustomization.yaml | 2 +- docs/changelog.d/+prowler-rebuild-on-main.infra.md | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 docs/changelog.d/+prowler-rebuild-on-main.infra.md diff --git a/argocd/manifests/prowler/kustomization.yaml b/argocd/manifests/prowler/kustomization.yaml index cf644dc..1d92a6b 100644 --- a/argocd/manifests/prowler/kustomization.yaml +++ b/argocd/manifests/prowler/kustomization.yaml @@ -27,4 +27,4 @@ configMapGenerator: images: - name: registry.ops.eblu.me/blumeops/prowler - newTag: v5.23.0-2daf629 + newTag: v5.23.0-495e45d diff --git a/docs/changelog.d/+prowler-rebuild-on-main.infra.md b/docs/changelog.d/+prowler-rebuild-on-main.infra.md new file mode 100644 index 0000000..107b687 --- /dev/null +++ b/docs/changelog.d/+prowler-rebuild-on-main.infra.md @@ -0,0 +1 @@ +Rebuild Prowler container against main HEAD (v5.23.0-495e45d) after merging the IaC mutelist Dockerfile changes. From 817acc5e5eaa0db51276231fcb22af897391a5ab Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Wed, 29 Apr 2026 11:00:01 -0700 Subject: [PATCH 031/122] =?UTF-8?q?C0:=20transmission=20doc=20=E2=80=94=20?= =?UTF-8?q?review=20and=20correct=20storage/monitoring=20details?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Marked last-reviewed: 2026-04-29. 
Fixed the storage layout table — `/config/` is an emptyDir (ephemeral), not NFS, and the watch directory is disabled. Documented the transmission-exporter sidecar that exposes Prometheus metrics on port 19091. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../+transmission-doc-review.doc.md | 1 + docs/reference/services/transmission.md | 18 +++++++++--------- 2 files changed, 10 insertions(+), 9 deletions(-) create mode 100644 docs/changelog.d/+transmission-doc-review.doc.md diff --git a/docs/changelog.d/+transmission-doc-review.doc.md b/docs/changelog.d/+transmission-doc-review.doc.md new file mode 100644 index 0000000..418504f --- /dev/null +++ b/docs/changelog.d/+transmission-doc-review.doc.md @@ -0,0 +1 @@ +Reviewed transmission card: corrected storage layout (`/config/` is emptyDir, watch dir disabled) and noted the Prometheus exporter sidecar. diff --git a/docs/reference/services/transmission.md b/docs/reference/services/transmission.md index 3676177..89904ce 100644 --- a/docs/reference/services/transmission.md +++ b/docs/reference/services/transmission.md @@ -1,6 +1,7 @@ --- title: Transmission -modified: 2026-02-07 +modified: 2026-04-29 +last-reviewed: 2026-04-29 tags: - service - torrent @@ -22,14 +23,13 @@ BitTorrent daemon, primarily for downloading ZIM archives for [[kiwix]]. 
## Storage Layout -NFS share on sifaka (`/volume1/torrents`): +| Path | Backing | Purpose | +|------|---------|---------| +| `/downloads/incomplete/` | NFS (`sifaka:/volume1/torrents`) | Active downloads | +| `/downloads/complete/` | NFS (`sifaka:/volume1/torrents`) | Completed downloads | +| `/config/` | `emptyDir` (ephemeral) | Transmission `settings.json`, regenerated on pod start | -| Path | Purpose | -|------|---------| -| `/downloads/` | Active downloads and metadata | -| `/downloads/complete/` | Completed downloads | -| `/config/` | Transmission configuration | -| `/watch/` | Watch directory for .torrent files | +The watch directory is disabled (`watch-dir-enabled: false`); torrents are added via RPC (see Kiwix integration below). [[kiwix]] reads from `/downloads/complete/` to serve ZIM archives. @@ -44,7 +44,7 @@ When downloads complete, the zim-watcher CronJob detects new ZIMs and restarts K ## Monitoring -Basic uptime via blackbox probe in [[alloy|Alloy]] k8s (Services Health dashboard). +A `transmission-exporter` sidecar (image `registry.ops.eblu.me/blumeops/transmission-exporter`) scrapes the local RPC and exposes Prometheus metrics on port 19091. Uptime is also covered by a blackbox probe in [[alloy|Alloy]] k8s (Services Health dashboard). Web UI shows: active/seeding/paused counts, speeds, disk usage. From f4a24595b124cb21fcdcbf95ac9ddbdff3901caa Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Wed, 29 Apr 2026 11:09:34 -0700 Subject: [PATCH 032/122] C0: review CC ephemeral-privileged-jobs Verified TTL=604800s and hostPID limited to ephemeral Prowler CronJob on indri. Noted that alloy-tracing on ringtail also uses hostPID but is out of scope until Prowler scans ringtail (tracked in Todoist). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- compensating-controls.yaml | 9 +++++++-- .../+review-cc-ephemeral-privileged-jobs.misc.md | 1 + 2 files changed, 8 insertions(+), 2 deletions(-) create mode 100644 docs/changelog.d/+review-cc-ephemeral-privileged-jobs.misc.md diff --git a/compensating-controls.yaml b/compensating-controls.yaml index d9d7c6c..fb5450d 100644 --- a/compensating-controls.yaml +++ b/compensating-controls.yaml @@ -94,10 +94,15 @@ controls: auto-deletion, not as a persistent privileged workload. hostPID exposure is time-bounded to scan duration (~20s). created: 2026-03-30 - last-reviewed: 2026-03-30 + last-reviewed: 2026-04-29 notes: >- Verify TTL is set in cronjob.yaml. Check that no persistent - pods run with hostPID. + pods run with hostPID on the scanned cluster (indri). The + alloy-tracing DaemonSet on ringtail also uses hostPID but is + out of scope — Prowler only scans indri. Tracked in Todoist: + "prowler scan against ringtail" — once that lands, the + DaemonSet's hostPID+privileged posture will surface as a CIS + finding and need its own CC or remediation. - id: trusted-ci-only description: >- diff --git a/docs/changelog.d/+review-cc-ephemeral-privileged-jobs.misc.md b/docs/changelog.d/+review-cc-ephemeral-privileged-jobs.misc.md new file mode 100644 index 0000000..14dcdca --- /dev/null +++ b/docs/changelog.d/+review-cc-ephemeral-privileged-jobs.misc.md @@ -0,0 +1 @@ +Reviewed compensating control `ephemeral-privileged-jobs`: TTL and hostPID scope verified on indri. Noted that the alloy-tracing DaemonSet on ringtail is out of scope until Prowler scans ringtail (tracked in Todoist). 
From 14ca0160ba5f76ab8ad348f64f68c75fb6ec3659 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Wed, 29 Apr 2026 13:38:36 -0700 Subject: [PATCH 033/122] Migrate devpi from minikube to indri (launchd) (#341) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary Devpi was crash-looping under memory pressure on the minikube StatefulSet, breaking the Python toolchain across the repo (`mise run docs-mikado`, `prek`, every `uv pip install`). It moves to indri as a native LaunchAgent. ## What changed - **New ansible role** `ansible/roles/devpi/`: installs `devpi-server` + `devpi-web` into a uv-managed venv, initializes the server-dir on first run via 1Password root password, runs as a LaunchAgent (`mcquack.eblume.devpi`) bound to `127.0.0.1:3141`. Bootstraps from upstream PyPI (so devpi can install itself on a fresh box). - **Caddy**: `pypi.ops.eblu.me` now proxies to `http://localhost:3141`. - **Playbook**: `indri.yml` gains pre_tasks for the root password and the new role. - **service-versions.yaml**: devpi flipped from `type: argocd` to `type: ansible`. - **ArgoCD**: removed `apps/devpi.yaml` and `manifests/devpi/`. The in-cluster Application, namespace, and PVC have been deleted. - **Docs**: new how-to `docs/how-to/operations/devpi-on-indri.md`; `restart-indri.md` lists devpi in the LaunchAgent stop list. 
## Already deployed (live on indri) - Service running: `launchctl list mcquack.eblume.devpi` → PID 53888 - `curl https://pypi.ops.eblu.me/+api` returns 200 ✅ - `mise run docs-mikado` works again ✅ - 1.0G of cached PyPI data was migrated from the PVC to `~erichblume/devpi/server-dir/` - Minikube namespace and PVC fully reclaimed ## Test plan - [ ] `mise run services-check` (after merge) - [ ] CI workflows that use devpi succeed - [ ] No regressions in tools that depend on `pypi.ops.eblu.me` (prek, uv-script tasks, dagger pipelines) ## Context This is the C1 prelude to a planned C2 chain (`mikado/retire-minikube-indri`) to retire minikube on indri entirely. Doing devpi as a standalone C1 was the right call because (a) it was urgent — it was breaking the toolchain — and (b) it shakes out the migration recipe before we commit to a multi-leaf chain. Reviewed-on: https://forge.eblu.me/eblume/blumeops/pulls/341 --- ansible/playbooks/indri.yml | 19 +++++ ansible/roles/caddy/defaults/main.yml | 2 +- ansible/roles/devpi/defaults/main.yml | 21 ++++++ ansible/roles/devpi/handlers/main.yml | 6 ++ ansible/roles/devpi/tasks/main.yml | 71 ++++++++++++++++++ ansible/roles/devpi/templates/devpi.plist.j2 | 34 +++++++++ argocd/apps/devpi.yaml | 29 -------- argocd/manifests/alloy-k8s/config.alloy | 4 +- argocd/manifests/devpi/README.md | 72 ------------------ argocd/manifests/devpi/external-secret.yaml | 25 ------- argocd/manifests/devpi/ingress-tailscale.yaml | 25 ------- argocd/manifests/devpi/kustomization.yaml | 14 ---- argocd/manifests/devpi/service.yaml | 13 ---- argocd/manifests/devpi/statefulset.yaml | 64 ---------------- .../migrate-devpi-to-indri.infra.md | 1 + docs/how-to/operations/devpi-on-indri.md | 74 +++++++++++++++++++ .../operations/rebuild-minikube-cluster.md | 20 +---- docs/how-to/operations/restart-indri.md | 1 + docs/reference/infrastructure/tailscale.md | 2 +- docs/reference/services/devpi.md | 34 +++++---- docs/reference/storage/backups.md | 2 +- 
pulumi/tailscale/__main__.py | 2 +- pulumi/tailscale/policy.hujson | 9 +-- service-versions.yaml | 5 +- 24 files changed, 260 insertions(+), 289 deletions(-) create mode 100644 ansible/roles/devpi/defaults/main.yml create mode 100644 ansible/roles/devpi/handlers/main.yml create mode 100644 ansible/roles/devpi/tasks/main.yml create mode 100644 ansible/roles/devpi/templates/devpi.plist.j2 delete mode 100644 argocd/apps/devpi.yaml delete mode 100644 argocd/manifests/devpi/README.md delete mode 100644 argocd/manifests/devpi/external-secret.yaml delete mode 100644 argocd/manifests/devpi/ingress-tailscale.yaml delete mode 100644 argocd/manifests/devpi/kustomization.yaml delete mode 100644 argocd/manifests/devpi/service.yaml delete mode 100644 argocd/manifests/devpi/statefulset.yaml create mode 100644 docs/changelog.d/migrate-devpi-to-indri.infra.md create mode 100644 docs/how-to/operations/devpi-on-indri.md diff --git a/ansible/playbooks/indri.yml b/ansible/playbooks/indri.yml index ce6a930..fa87b36 100644 --- a/ansible/playbooks/indri.yml +++ b/ansible/playbooks/indri.yml @@ -212,6 +212,23 @@ no_log: true tags: [forgejo_metrics] + # Devpi root password (PyPI mirror admin) + - name: Fetch devpi root password + ansible.builtin.command: + cmd: op read "op://vg6xf6vvfmoh5hqjjhlhbeoaie/kyhzfifryqnuk7jeyibmmjvxxm/add more/root password" + delegate_to: localhost + register: _devpi_root_password + changed_when: false + no_log: true + check_mode: false + tags: [devpi] + + - name: Set devpi root password fact + ansible.builtin.set_fact: + devpi_root_password: "{{ _devpi_root_password.stdout }}" + no_log: true + tags: [devpi] + roles: - role: alloy tags: alloy @@ -227,6 +244,8 @@ tags: zot - role: zot_metrics tags: zot_metrics + - role: devpi + tags: devpi - role: minikube tags: minikube - role: minikube_metrics diff --git a/ansible/roles/caddy/defaults/main.yml b/ansible/roles/caddy/defaults/main.yml index ebb210b..80993ee 100644 --- a/ansible/roles/caddy/defaults/main.yml +++ 
b/ansible/roles/caddy/defaults/main.yml @@ -51,7 +51,7 @@ caddy_services: backend: "https://feed.tail8d86e.ts.net" - name: devpi host: "pypi.{{ caddy_domain }}" - backend: "https://pypi.tail8d86e.ts.net" + backend: "http://localhost:3141" - name: kiwix host: "kiwix.{{ caddy_domain }}" backend: "https://kiwix.tail8d86e.ts.net" diff --git a/ansible/roles/devpi/defaults/main.yml b/ansible/roles/devpi/defaults/main.yml new file mode 100644 index 0000000..6d52b9b --- /dev/null +++ b/ansible/roles/devpi/defaults/main.yml @@ -0,0 +1,21 @@ +--- +# devpi PyPI caching mirror (native launchd, replaces minikube StatefulSet) + +devpi_home: /Users/erichblume/devpi +devpi_venv: "{{ devpi_home }}/venv" +devpi_server_dir: "{{ devpi_home }}/server-dir" +devpi_binary: "{{ devpi_venv }}/bin/devpi-server" +devpi_init_binary: "{{ devpi_venv }}/bin/devpi-init" + +devpi_python_version: "3.12" +devpi_server_version: "6.19.3" +devpi_web_version: "5.0.2" + +devpi_host: 127.0.0.1 +devpi_port: 3141 +devpi_outside_url: "https://pypi.ops.eblu.me" + +devpi_log_dir: /Users/erichblume/Library/Logs + +# uv binary on indri — mise shim so version bumps via `mise upgrade uv` flow through transparently +devpi_uv_binary: /Users/erichblume/.local/share/mise/shims/uv diff --git a/ansible/roles/devpi/handlers/main.yml b/ansible/roles/devpi/handlers/main.yml new file mode 100644 index 0000000..2765850 --- /dev/null +++ b/ansible/roles/devpi/handlers/main.yml @@ -0,0 +1,6 @@ +--- +- name: Restart devpi + ansible.builtin.shell: | + launchctl unload ~/Library/LaunchAgents/mcquack.eblume.devpi.plist 2>/dev/null || true + launchctl load ~/Library/LaunchAgents/mcquack.eblume.devpi.plist + changed_when: true diff --git a/ansible/roles/devpi/tasks/main.yml b/ansible/roles/devpi/tasks/main.yml new file mode 100644 index 0000000..985ca46 --- /dev/null +++ b/ansible/roles/devpi/tasks/main.yml @@ -0,0 +1,71 @@ +--- +# devpi role — devpi-server in a uv-managed venv, run via LaunchAgent. 
+# Replaces the prior minikube StatefulSet; see [[devpi-on-indri]]. +# +# The root password is fetched in the indri.yml playbook pre_tasks and +# exposed as `devpi_root_password`. + +- name: Ensure devpi home exists + ansible.builtin.file: + path: "{{ devpi_home }}" + state: directory + mode: '0755' + +- name: Ensure devpi server-dir exists + ansible.builtin.file: + path: "{{ devpi_server_dir }}" + state: directory + mode: '0700' + +- name: Create devpi venv if missing + ansible.builtin.command: + cmd: "{{ devpi_uv_binary }} venv --python {{ devpi_python_version }} {{ devpi_venv }}" + creates: "{{ devpi_venv }}/bin/python" + +- name: Install devpi-server and devpi-web into venv + # Always bootstrap from upstream PyPI — devpi is the index it would otherwise resolve through, + # and that's a circular dependency (devpi cannot install itself from itself). + ansible.builtin.command: + cmd: >- + {{ devpi_uv_binary }} pip install + --python {{ devpi_venv }}/bin/python + --index-url https://pypi.org/simple/ + devpi-server=={{ devpi_server_version }} + devpi-web=={{ devpi_web_version }} + register: devpi_pip_install + changed_when: "'Installed' in (devpi_pip_install.stdout ~ devpi_pip_install.stderr) or 'Uninstalled' in (devpi_pip_install.stdout ~ devpi_pip_install.stderr)" + notify: Restart devpi + +- name: Check if devpi server-dir is initialized + ansible.builtin.stat: + path: "{{ devpi_server_dir }}/.serverversion" + register: devpi_serverversion + +- name: Initialize devpi server-dir + ansible.builtin.command: + cmd: >- + {{ devpi_init_binary }} + --serverdir {{ devpi_server_dir }} + --root-passwd {{ devpi_root_password | quote }} + when: not devpi_serverversion.stat.exists + changed_when: true + no_log: true + +- name: Deploy devpi LaunchAgent plist + ansible.builtin.template: + src: devpi.plist.j2 + dest: ~/Library/LaunchAgents/mcquack.eblume.devpi.plist + mode: '0644' + notify: Restart devpi + +- name: Check if devpi LaunchAgent is loaded + ansible.builtin.command: launchctl list mcquack.eblume.devpi + register: 
devpi_launchctl_check + changed_when: false + failed_when: false + +- name: Load devpi LaunchAgent if not loaded + ansible.builtin.command: launchctl load ~/Library/LaunchAgents/mcquack.eblume.devpi.plist + when: devpi_launchctl_check.rc != 0 + changed_when: true + failed_when: false diff --git a/ansible/roles/devpi/templates/devpi.plist.j2 b/ansible/roles/devpi/templates/devpi.plist.j2 new file mode 100644 index 0000000..b9485e6 --- /dev/null +++ b/ansible/roles/devpi/templates/devpi.plist.j2 @@ -0,0 +1,34 @@ + + + + + + Label + mcquack.eblume.devpi + ProgramArguments + + {{ devpi_binary }} + --serverdir + {{ devpi_server_dir }} + --host + {{ devpi_host }} + --port + {{ devpi_port }} + --outside-url + {{ devpi_outside_url }} + + RunAtLoad + + KeepAlive + + EnvironmentVariables + + PATH + {{ devpi_venv }}/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin + + StandardOutPath + {{ devpi_log_dir }}/mcquack.devpi.out.log + StandardErrorPath + {{ devpi_log_dir }}/mcquack.devpi.err.log + + diff --git a/argocd/apps/devpi.yaml b/argocd/apps/devpi.yaml deleted file mode 100644 index 4a15672..0000000 --- a/argocd/apps/devpi.yaml +++ /dev/null @@ -1,29 +0,0 @@ -# devpi PyPI Caching Proxy -# Provides PyPI cache and private package hosting -# -# After first deployment, initialize devpi: -# kubectl -n devpi exec -it devpi-0 -- devpi-init --serverdir /devpi --root-passwd -# kubectl -n devpi rollout restart statefulset devpi -# -# Then create user/index: -# uvx devpi use https://pypi.tail8d86e.ts.net -# uvx devpi login root -# uvx devpi user -c eblume email=blume.erich@gmail.com -# uvx devpi index -c eblume/dev bases=root/pypi -apiVersion: argoproj.io/v1alpha1 -kind: Application -metadata: - name: devpi - namespace: argocd -spec: - project: default - source: - repoURL: ssh://forgejo@forge.ops.eblu.me:2222/eblume/blumeops.git - targetRevision: main - path: argocd/manifests/devpi - destination: - server: https://kubernetes.default.svc - namespace: devpi - syncPolicy: - syncOptions: - 
- CreateNamespace=true diff --git a/argocd/manifests/alloy-k8s/config.alloy b/argocd/manifests/alloy-k8s/config.alloy index a716ddc..56a2e13 100644 --- a/argocd/manifests/alloy-k8s/config.alloy +++ b/argocd/manifests/alloy-k8s/config.alloy @@ -159,8 +159,10 @@ prometheus.exporter.blackbox "services" { } target { + // devpi runs natively on indri (LaunchAgent), not in-cluster. + // We probe through Caddy (https://pypi.ops.eblu.me) which the cluster can reach via Tailscale. name = "devpi" - address = "http://devpi.devpi.svc.cluster.local:3141/+api" + address = "https://pypi.ops.eblu.me/+api" module = "http_2xx" } diff --git a/argocd/manifests/devpi/README.md b/argocd/manifests/devpi/README.md deleted file mode 100644 index 11fd697..0000000 --- a/argocd/manifests/devpi/README.md +++ /dev/null @@ -1,72 +0,0 @@ -# devpi PyPI Caching Proxy - -devpi-server running in Kubernetes, providing: -- PyPI caching proxy at `root/pypi` -- Private package hosting at `eblume/dev` - -## Setup - -### 1. Create the root password secret - -```fish -kubectl create namespace devpi -op inject -i argocd/manifests/devpi/secret-root.yaml.tpl | kubectl apply -f - -``` - -### 2. Deploy via ArgoCD - -```fish -argocd app sync apps -argocd app sync devpi -``` - -The container will auto-initialize on first startup using the root password from the secret. - -### 3. 
Create user and index (first time only) - -After the pod is running: - -```fish -# Login to devpi as root -uvx --from devpi-client devpi use https://pypi.tail8d86e.ts.net -uvx --from devpi-client devpi login root -# Enter root password when prompted - -# Create eblume user (prompts for password - use the one from 1Password) -uvx --from devpi-client devpi user -c eblume email=blume.erich@gmail.com - -# Create private index inheriting from PyPI -uvx --from devpi-client devpi index -c eblume/dev bases=root/pypi -``` - -## Usage - -### As pip index (caching proxy) - -Configure `~/.config/pip/pip.conf`: - -```ini -[global] -index-url = https://pypi.tail8d86e.ts.net/root/pypi/+simple/ -trusted-host = pypi.tail8d86e.ts.net -``` - -### Upload private packages - -```fish -cd ~/code/personal/your-package -uv build -uv publish --publish-url https://pypi.tail8d86e.ts.net/eblume/dev/ -``` - -## URLs - -- Web UI: https://pypi.tail8d86e.ts.net -- PyPI cache: https://pypi.tail8d86e.ts.net/root/pypi/+simple/ -- Private index: https://pypi.tail8d86e.ts.net/eblume/dev/+simple/ - -## Credentials - -Stored in 1Password vault `blumeops`, item `kyhzfifryqnuk7jeyibmmjvxxm`: -- `root password` - devpi root user -- `password` - eblume user password diff --git a/argocd/manifests/devpi/external-secret.yaml b/argocd/manifests/devpi/external-secret.yaml deleted file mode 100644 index 290ea67..0000000 --- a/argocd/manifests/devpi/external-secret.yaml +++ /dev/null @@ -1,25 +0,0 @@ -# ExternalSecret for devpi root password -# -# Replaces the manual op inject workflow from secret-root.yaml.tpl -# -# 1Password item: "devpi" in blumeops vault -# Field: "root password" -# -apiVersion: external-secrets.io/v1 -kind: ExternalSecret -metadata: - name: devpi-root - namespace: devpi -spec: - refreshInterval: 1h - secretStoreRef: - kind: ClusterSecretStore - name: onepassword-blumeops - target: - name: devpi-root - creationPolicy: Owner - data: - - secretKey: password - remoteRef: - key: devpi - property: 
root password diff --git a/argocd/manifests/devpi/ingress-tailscale.yaml b/argocd/manifests/devpi/ingress-tailscale.yaml deleted file mode 100644 index 474bf72..0000000 --- a/argocd/manifests/devpi/ingress-tailscale.yaml +++ /dev/null @@ -1,25 +0,0 @@ -apiVersion: networking.k8s.io/v1 -kind: Ingress -metadata: - name: devpi-tailscale - namespace: devpi - annotations: - tailscale.com/proxy-class: "default" - tailscale.com/proxy-group: "ingress" - gethomepage.dev/enabled: "true" - gethomepage.dev/name: "PyPI" - gethomepage.dev/group: "Infrastructure" - gethomepage.dev/icon: "pypi.png" - gethomepage.dev/description: "PyPI cache" - gethomepage.dev/href: "https://pypi.ops.eblu.me" - gethomepage.dev/pod-selector: "app=devpi" -spec: - ingressClassName: tailscale - defaultBackend: - service: - name: devpi - port: - number: 3141 - tls: - - hosts: - - pypi diff --git a/argocd/manifests/devpi/kustomization.yaml b/argocd/manifests/devpi/kustomization.yaml deleted file mode 100644 index 2083aaa..0000000 --- a/argocd/manifests/devpi/kustomization.yaml +++ /dev/null @@ -1,14 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization - -namespace: devpi - -resources: - - statefulset.yaml - - service.yaml - - ingress-tailscale.yaml - - external-secret.yaml - -images: - - name: registry.ops.eblu.me/blumeops/devpi - newTag: v6.19.3-37b8a21 diff --git a/argocd/manifests/devpi/service.yaml b/argocd/manifests/devpi/service.yaml deleted file mode 100644 index 42e1543..0000000 --- a/argocd/manifests/devpi/service.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: devpi - namespace: devpi -spec: - selector: - app: devpi - ports: - - name: http - port: 3141 - targetPort: 3141 - protocol: TCP diff --git a/argocd/manifests/devpi/statefulset.yaml b/argocd/manifests/devpi/statefulset.yaml deleted file mode 100644 index 91875df..0000000 --- a/argocd/manifests/devpi/statefulset.yaml +++ /dev/null @@ -1,64 +0,0 @@ -apiVersion: apps/v1 -kind: 
StatefulSet -metadata: - name: devpi - namespace: devpi -spec: - serviceName: devpi - replicas: 1 - selector: - matchLabels: - app: devpi - template: - metadata: - labels: - app: devpi - spec: - securityContext: - fsGroup: 1000 - seccompProfile: - type: RuntimeDefault - containers: - - name: devpi - image: registry.ops.eblu.me/blumeops/devpi:kustomized - env: - - name: DEVPI_ROOT_PASSWORD - valueFrom: - secretKeyRef: - name: devpi-root - key: password - - name: DEVPI_OUTSIDE_URL - value: "https://pypi.ops.eblu.me" - ports: - - containerPort: 3141 - name: http - volumeMounts: - - name: data - mountPath: /devpi - resources: - requests: - memory: "256Mi" - cpu: "100m" - limits: - memory: "2Gi" # High limit for initial PyPI index build, reclaimed after - cpu: "500m" - livenessProbe: - httpGet: - path: /+api - port: 3141 - initialDelaySeconds: 30 - periodSeconds: 30 - readinessProbe: - httpGet: - path: /+api - port: 3141 - initialDelaySeconds: 10 - periodSeconds: 10 - volumeClaimTemplates: - - metadata: - name: data - spec: - accessModes: ["ReadWriteOnce"] - resources: - requests: - storage: 50Gi diff --git a/docs/changelog.d/migrate-devpi-to-indri.infra.md b/docs/changelog.d/migrate-devpi-to-indri.infra.md new file mode 100644 index 0000000..418db70 --- /dev/null +++ b/docs/changelog.d/migrate-devpi-to-indri.infra.md @@ -0,0 +1 @@ +Migrated devpi (PyPI mirror at `pypi.ops.eblu.me`) from a minikube StatefulSet to a launchd-managed service on indri. devpi-server now runs in a uv-managed venv with pinned `devpi-server` and `devpi-web` versions, listens on `127.0.0.1:3141`, and is fronted by Caddy. The minikube StatefulSet was crash-looping under memory pressure (and breaking the Python toolchain everywhere); the new layout removes a layer of dependency on cluster health for critical-path tooling. See [[devpi-on-indri]]. 
diff --git a/docs/how-to/operations/devpi-on-indri.md b/docs/how-to/operations/devpi-on-indri.md new file mode 100644 index 0000000..0334d37 --- /dev/null +++ b/docs/how-to/operations/devpi-on-indri.md @@ -0,0 +1,74 @@ +--- +title: Devpi on Indri +modified: 2026-04-29 +last-reviewed: 2026-04-29 +tags: + - how-to + - operations +--- + +# Devpi on Indri + +How devpi (the PyPI caching mirror at `pypi.ops.eblu.me`) is deployed on indri as a launchd-managed native service. Replaces the prior minikube StatefulSet. + +## Why native, not Kubernetes + +Devpi has no runtime dependencies beyond a Python interpreter, a writable directory, and outbound HTTPS to upstream PyPI. Running it on indri natively removes a layer of operational complexity, frees minikube resources, and decouples this critical-path tooling (used by every Python build, including `mise run docs-mikado` itself) from cluster health. + +## Layout + +| Concern | Path / detail | +|---|---| +| Service binary | `/Users/erichblume/devpi/venv/bin/devpi-server` | +| Server-dir (data) | `/Users/erichblume/devpi/server-dir/` | +| Logs | `/Users/erichblume/Library/Logs/mcquack.devpi.{out,err}.log` | +| LaunchAgent label | `mcquack.eblume.devpi` | +| LaunchAgent plist | `~/Library/LaunchAgents/mcquack.eblume.devpi.plist` | +| Listen address | `127.0.0.1:3141` (loopback only) | +| Public URL | `https://pypi.ops.eblu.me` (via Caddy reverse proxy) | +| Root password secret | 1Password item `devpi`, field `root password` | + +The venv is built fresh by ansible from a pinned `devpi-server` and `devpi-web` version; bumping versions is a config change in `ansible/roles/devpi/defaults/main.yml`. + +## Deploy + +```fish +mise run provision-indri -- --tags devpi +``` + +Ansible will: + +1. Fetch the root password from 1Password (in playbook `pre_tasks`) +2. Create the venv at `~/devpi/venv` if absent and install/upgrade `devpi-server` + `devpi-web` to the pinned versions +3. 
Initialize the server-dir (only on first run, when `.serverversion` is missing) +4. Render and load the LaunchAgent plist +5. Restart the service if the plist or config changed + +Caddy already proxies `pypi.ops.eblu.me` → `127.0.0.1:3141`; nothing else routes traffic. + +## Verify + +```fish +ssh indri 'launchctl list mcquack.eblume.devpi' +curl -fsS https://pypi.ops.eblu.me/+api | jq +uv pip install --index-url https://pypi.ops.eblu.me/root/pypi/+simple/ requests +``` + +## Logs + +```fish +ssh indri 'tail -f ~/Library/Logs/mcquack.devpi.err.log' +``` + +## Bumping devpi versions + +Edit `devpi_server_version` / `devpi_web_version` in `ansible/roles/devpi/defaults/main.yml`, then re-run the playbook with `--tags devpi`. The role installs the newly pinned versions into the existing venv (the venv itself is only created when missing, not rebuilt); the server-dir survives. + +## Backup + +The server-dir is **not** in `borgmatic_source_directories` and is not backed up. The PyPI cache (`+files/`) is re-fetchable from upstream on first request; the local `eblume/dev` index can be republished from source. If retention becomes important, add `/Users/erichblume/devpi/server-dir/` to the borgmatic source list. + +## Related + +- [[restart-indri]] — devpi is one of the LaunchAgents to stop on graceful shutdown +- [[connect-to-postgres]] — pattern for indri-native services (different stack, similar shape) diff --git a/docs/how-to/operations/rebuild-minikube-cluster.md b/docs/how-to/operations/rebuild-minikube-cluster.md index e23d027..0d924e9 100644 --- a/docs/how-to/operations/rebuild-minikube-cluster.md +++ b/docs/how-to/operations/rebuild-minikube-cluster.md @@ -235,25 +235,7 @@ mise run services-check ## Post-Rebuild: Cold Cache Failures -### Devpi (PyPI Cache) - -After a rebuild, devpi's package cache is empty. The first Dagger-based container build will trigger a flood of concurrent package downloads. Devpi uses lazy caching — it serves package metadata (simple index) immediately from upstream PyPI but fetches wheel files on demand. 
Under heavy concurrent load with a cold cache, the upstream fetch can race with the client request, causing devpi to return `no such file` (HTTP 404) for packages it knows about but hasn't finished downloading yet. - -**Why devpi, not PyPI?** The repo's `uv.lock` was generated with devpi as the index, so every package source URL points at `pypi.ops.eblu.me`. Dagger's Python SDK runtime does a locked install (`uv sync`), not fresh resolution — it fetches from whatever URLs are in the lockfile. This is intentional (supply chain control), but means all builds — local and CI — depend on devpi being available and warm. - -**Symptoms:** Forgejo Actions Dagger builds fail during module initialization with errors like: -``` -Failed to download `googleapis-common-protos==1.74.0` -HTTP status client error (404 Not Found) for url (https://pypi.ops.eblu.me/root/pypi/+f/...) -``` - -**Fix:** Re-run the failed build. The first attempt warms the cache; subsequent builds succeed. Alternatively, warm the cache manually before triggering CI builds: - -```bash -# From any machine that can reach pypi.ops.eblu.me, install the Dagger SDK -# to pre-populate the most common packages: -pip install --dry-run --index-url https://pypi.ops.eblu.me/root/pypi/+simple/ dagger-io -``` +Devpi runs natively on indri (see [[devpi-on-indri]]) and is unaffected by minikube rebuilds, so the historical "devpi cold cache after rebuild" failure mode no longer applies. If devpi itself goes cold (fresh server-dir), the same lazy-cache race can still cause `404` on the first Dagger build under concurrent load — re-run the build to warm the cache, or pre-warm with `uv pip install --dry-run --index-url https://pypi.ops.eblu.me/root/pypi/+simple/ dagger-io`. 
## Related diff --git a/docs/how-to/operations/restart-indri.md b/docs/how-to/operations/restart-indri.md index a956644..e92581e 100644 --- a/docs/how-to/operations/restart-indri.md +++ b/docs/how-to/operations/restart-indri.md @@ -41,6 +41,7 @@ Native services managed by launchd will stop automatically during macOS shutdown ssh indri 'launchctl unload ~/Library/LaunchAgents/mcquack.eblume.forgejo.plist' ssh indri 'launchctl unload ~/Library/LaunchAgents/mcquack.eblume.caddy.plist' ssh indri 'launchctl unload ~/Library/LaunchAgents/mcquack.eblume.zot.plist' +ssh indri 'launchctl unload ~/Library/LaunchAgents/mcquack.eblume.devpi.plist' # see [[devpi-on-indri]] ssh indri 'launchctl unload ~/Library/LaunchAgents/mcquack.jellyfin.plist' ssh indri 'launchctl unload ~/Library/LaunchAgents/mcquack.eblume.alloy.plist' ssh indri 'launchctl unload ~/Library/LaunchAgents/mcquack.eblume.borgmatic.plist' diff --git a/docs/reference/infrastructure/tailscale.md b/docs/reference/infrastructure/tailscale.md index 2794111..9c15d83 100644 --- a/docs/reference/infrastructure/tailscale.md +++ b/docs/reference/infrastructure/tailscale.md @@ -33,7 +33,7 @@ ACLs managed via Pulumi in `pulumi/tailscale/policy.hujson`. 
| `tag:loki` | indri | Loki log aggregation | | `tag:k8s-api` | indri | Kubernetes API server (minikube) | | `tag:k8s-operator` | (operator pod) | Tailscale operator for k8s — see [[tailscale-operator]] | -| `tag:k8s` | (Ingress proxy pods) | Kubernetes Tailscale Ingress nodes; each also carries a per-service tag (`tag:grafana`, `tag:kiwix`, `tag:devpi`, `tag:feed`, `tag:pg`) | +| `tag:k8s` | (Ingress proxy pods) | Kubernetes Tailscale Ingress nodes; each also carries a per-service tag (`tag:grafana`, `tag:kiwix`, `tag:feed`, `tag:pg`) | | `tag:ci-gateway` | (ephemeral CI containers) | CI containers pushing images to registry | | `tag:flyio-proxy` | (Fly.io proxy container) | Public reverse proxy | | `tag:flyio-target` | indri, designated Ingress endpoints | Endpoints reachable by the Fly.io proxy (indri for Caddy routing, Ingress pods for Alloy metrics/logs) | diff --git a/docs/reference/services/devpi.md b/docs/reference/services/devpi.md index c6493fe..589a802 100644 --- a/docs/reference/services/devpi.md +++ b/docs/reference/services/devpi.md @@ -1,7 +1,7 @@ --- title: Devpi -modified: 2026-03-23 -last-reviewed: 2026-03-23 +modified: 2026-04-29 +last-reviewed: 2026-04-29 tags: - service - python @@ -9,31 +9,37 @@ tags: # devpi (PyPI Proxy) -PyPI caching proxy and private package index. +PyPI caching proxy and private package index. Runs natively on [[indri]] as a LaunchAgent (not in-cluster). See [[devpi-on-indri]] for deploy and operations. 
## Quick Reference | Property | Value | |----------|-------| -| **URL** | https://pypi.ops.eblu.me | -| **Namespace** | `devpi` | -| **ArgoCD App** | `devpi` | -| **Storage** | 50Gi PVC | -| **Image** | `registry.ops.eblu.me/blumeops/devpi` (see `argocd/manifests/devpi/kustomization.yaml` for current tag) | +| **URL** | `https://pypi.ops.eblu.me` | +| **Listen** | `127.0.0.1:3141` (loopback only; reached via Caddy) | +| **Service** | LaunchAgent `mcquack.eblume.devpi` on indri | +| **Server-dir** | `/Users/erichblume/devpi/server-dir/` | +| **Runtime** | uv-managed venv at `/Users/erichblume/devpi/venv/` | +| **Ansible role** | `ansible/roles/devpi/` | +| **Versions** | Pinned in `ansible/roles/devpi/defaults/main.yml` (`devpi_server_version`, `devpi_web_version`) | ## Indices | Index | Purpose | |-------|---------| -| `root/pypi` | PyPI mirror/cache (auto-created) | -| `eblume/dev` | Private packages (inherits from root/pypi) | +| `root/pypi` | PyPI mirror/cache (auto-created by `devpi-init`) | +| `eblume/dev` | Private packages (inherits from `root/pypi`) | ## Credentials -Root password stored in 1Password (blumeops vault), injected via ExternalSecret. +Root password stored in 1Password (`blumeops` vault, item `devpi`, field `root password`). Fetched via `op read` in the `ansible/playbooks/indri.yml` `pre_tasks` and passed to the role on first init. + +## Backup + +The server-dir is **not** backed up. The PyPI cache (`+files/`) is re-fetchable from upstream on first request. The local `eblume/dev` index metadata is small but also not critical to retain — packages can be republished from source. If retention becomes important, add `/Users/erichblume/devpi/server-dir/` to `borgmatic_source_directories`. 
## Related -- [[use-pypi-proxy]] - Client configuration and package uploads -- [[argocd]] - Deployment -- [[1password]] - Secrets management +- [[devpi-on-indri]] — Deploy, verify, and version-bump procedures +- [[use-pypi-proxy]] — Client configuration and package uploads +- [[1password]] — Secrets management diff --git a/docs/reference/storage/backups.md b/docs/reference/storage/backups.md index 9ca3bcb..14dbcea 100644 --- a/docs/reference/storage/backups.md +++ b/docs/reference/storage/backups.md @@ -62,7 +62,7 @@ Other data lives directly on [[sifaka]] (music via [[navidrome]], video via [[je | ZIM archives (`~/transmission/`) | Re-downloadable via torrent | | Prometheus metrics | Ephemeral, in k8s PVC | | Loki logs | Ephemeral, in k8s PVC | -| devpi cache | Re-fetchable from PyPI | +| devpi cache (`~/devpi/server-dir/` on indri) | Re-fetchable from PyPI on first request | ## Retention Policy diff --git a/pulumi/tailscale/__main__.py b/pulumi/tailscale/__main__.py index 2f5262b..3acbb62 100644 --- a/pulumi/tailscale/__main__.py +++ b/pulumi/tailscale/__main__.py @@ -37,7 +37,7 @@ acl = tailscale.Acl( # indri - Mac Mini M1, primary homelab server # Hosts forge, loki, zot registry, and the k8s control plane. -# Other services (grafana, kiwix, devpi, etc.) run in k8s with their own Tailscale devices. +# Other services (grafana, kiwix, etc.) run in k8s with their own Tailscale devices. 
indri = tailscale.get_device(name="indri.tail8d86e.ts.net") indri_tags = tailscale.DeviceTags( "indri-tags", diff --git a/pulumi/tailscale/policy.hujson b/pulumi/tailscale/policy.hujson index 84f1f17..88408ef 100644 --- a/pulumi/tailscale/policy.hujson +++ b/pulumi/tailscale/policy.hujson @@ -20,7 +20,8 @@ }, // --- Members: user-facing services only --- - // Kiwix, Forge, devpi, Miniflux, PostgreSQL + // Kiwix, Forge, Miniflux, PostgreSQL + // (devpi moved off-cluster to indri; reachable via Caddy on tag:flyio-target) { "src": ["autogroup:member"], "dst": ["tag:kiwix"], @@ -31,11 +32,6 @@ "dst": ["tag:forge"], "ip": ["tcp:443", "tcp:22"], }, - { - "src": ["autogroup:member"], - "dst": ["tag:devpi"], - "ip": ["tcp:443"], - }, { "src": ["autogroup:member"], "dst": ["tag:feed"], @@ -152,7 +148,6 @@ "tag:grafana": ["autogroup:admin", "tag:blumeops"], "tag:kiwix": ["autogroup:admin", "tag:blumeops"], "tag:forge": ["autogroup:admin", "tag:blumeops"], - "tag:devpi": ["autogroup:admin", "tag:blumeops"], "tag:loki": ["autogroup:admin", "tag:blumeops"], "tag:pg": ["autogroup:admin", "tag:blumeops"], "tag:feed": ["autogroup:admin", "tag:blumeops"], diff --git a/service-versions.yaml b/service-versions.yaml index 0a4fe93..e819c6c 100644 --- a/service-versions.yaml +++ b/service-versions.yaml @@ -214,10 +214,11 @@ services: upstream-source: https://github.com/kiwix/kiwix-tools/releases - name: devpi - type: argocd - last-reviewed: 2026-04-18 + type: ansible + last-reviewed: 2026-04-29 current-version: "6.19.3" upstream-source: https://github.com/devpi/devpi/releases + notes: Installed via uv into a venv on indri; version pinned in ansible/roles/devpi/defaults/main.yml - name: cv type: argocd From a529d60f60f0864c0f839930ef43f2830a9ac56b Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Wed, 29 Apr 2026 13:40:45 -0700 Subject: [PATCH 034/122] C0: remove containers/devpi/ build artifact Devpi now runs natively on indri (uv venv via ansible role), so the Dagger container build at 
containers/devpi/ is unused. Removing it. Also updated dagger.md examples to use 'miniflux' as the example container-name argument. Co-Authored-By: Claude Opus 4.7 (1M context) --- containers/devpi/container.py | 56 ------------------- containers/devpi/start.sh | 31 ---------- .../+remove-devpi-container-build.misc.md | 1 + docs/reference/tools/dagger.md | 8 +-- 4 files changed, 5 insertions(+), 91 deletions(-) delete mode 100644 containers/devpi/container.py delete mode 100644 containers/devpi/start.sh create mode 100644 docs/changelog.d/+remove-devpi-container-build.misc.md diff --git a/containers/devpi/container.py b/containers/devpi/container.py deleted file mode 100644 index 0067e95..0000000 --- a/containers/devpi/container.py +++ /dev/null @@ -1,56 +0,0 @@ -"""devpi PyPI server and caching proxy — native Dagger build. - -Single-stage build: install devpi-server and devpi-web into a Python slim image. -""" - -import dagger -from dagger import dag - -from blumeops.containers import oci_labels - -VERSION = "6.19.3" - -DEVPI_WEB_VERSION = "5.0.2" -PYTHON_BASE = "python:3.12-slim" - - -async def build(src: dagger.Directory) -> dagger.Container: - ctr = ( - dag.container() - .from_(PYTHON_BASE) - .with_exec( - [ - "pip", - "install", - "--no-cache-dir", - f"devpi-server=={VERSION}", - f"devpi-web=={DEVPI_WEB_VERSION}", - ] - ) - .with_exec( - [ - "useradd", - "-r", - "-u", - "1000", - "devpi", - ] - ) - .with_exec(["mkdir", "-p", "/devpi"]) - .with_exec(["chown", "devpi:devpi", "/devpi"]) - .with_file( - "/usr/local/bin/start.sh", - src.file("containers/devpi/start.sh"), - ) - .with_exec(["chmod", "+x", "/usr/local/bin/start.sh"]) - .with_user("devpi") - .with_workdir("/devpi") - .with_exposed_port(3141) - .with_entrypoint(["/usr/local/bin/start.sh"]) - ) - return oci_labels( - ctr, - title="devpi", - description="devpi PyPI server and caching proxy", - version=VERSION, - ) diff --git a/containers/devpi/start.sh b/containers/devpi/start.sh deleted file mode 100644 
index 8ed46a2..0000000 --- a/containers/devpi/start.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash -set -e - -SERVERDIR="${DEVPI_SERVERDIR:-/devpi}" -HOST="${DEVPI_HOST:-0.0.0.0}" -# Note: Can't use DEVPI_PORT - Kubernetes auto-sets it for service discovery -PORT="${DEVPI_LISTEN_PORT:-3141}" -OUTSIDE_URL="${DEVPI_OUTSIDE_URL:-}" - -# Check if devpi is initialized -if [ ! -f "$SERVERDIR/.serverversion" ]; then - echo "Initializing devpi server..." - - if [ -z "$DEVPI_ROOT_PASSWORD" ]; then - echo "ERROR: DEVPI_ROOT_PASSWORD environment variable must be set for initialization" - exit 1 - fi - - devpi-init --serverdir "$SERVERDIR" --root-passwd "$DEVPI_ROOT_PASSWORD" - echo "Devpi initialized successfully" -fi - -# Build command -CMD=(devpi-server --serverdir "$SERVERDIR" --host "$HOST" --port "$PORT") - -if [ -n "$OUTSIDE_URL" ]; then - CMD+=(--outside-url "$OUTSIDE_URL") -fi - -echo "Starting devpi-server..." -exec "${CMD[@]}" diff --git a/docs/changelog.d/+remove-devpi-container-build.misc.md b/docs/changelog.d/+remove-devpi-container-build.misc.md new file mode 100644 index 0000000..8ebec54 --- /dev/null +++ b/docs/changelog.d/+remove-devpi-container-build.misc.md @@ -0,0 +1 @@ +Removed the now-unused `containers/devpi/` Dagger build artifact. Devpi runs natively on indri via uv venv; the container image is no longer referenced anywhere. Doc examples in `docs/reference/tools/dagger.md` updated to use `miniflux` as the example container name. diff --git a/docs/reference/tools/dagger.md b/docs/reference/tools/dagger.md index 89be50c..81c5caf 100644 --- a/docs/reference/tools/dagger.md +++ b/docs/reference/tools/dagger.md @@ -50,16 +50,16 @@ New containers for indri (k8s runner) should use `container.py`. Ringtail contai ```bash # Build a container -dagger call build --src=. --container-name=devpi +dagger call build --src=. --container-name=miniflux # Drop into container shell for inspection -dagger call build --src=. 
--container-name=devpi terminal +dagger call build --src=. --container-name=miniflux terminal # Debug a failure interactively -dagger call --interactive build --src=. --container-name=devpi +dagger call --interactive build --src=. --container-name=miniflux # Publish a container to zot -dagger call publish --src=. --container-name=devpi --version=v1.1.0 +dagger call publish --src=. --container-name=miniflux --version=v1.1.0 # Build a nix container (no local nix required) dagger call build-nix --src=. --container-name=ntfy export --path=./ntfy.tar.gz From 8d634861f606b5571a2de72f1f377ec9da32d654 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Wed, 29 Apr 2026 14:55:11 -0700 Subject: [PATCH 035/122] C1: migrate cv + docs from minikube to indri-native (#342) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary Replace the cv (`cv.eblu.me`) and docs (`docs.eblu.me`) minikube Deployments with indri-native ansible roles. Caddy serves the extracted release tarballs directly via a new `kind: static` service-block — no daemon, no nginx pod, no ProxyGroup ingress on the request path. Mirrors the rationale of the recent devpi migration; part of the broader minikube wind-down. ## What's in this commit - `ansible/roles/{cv,docs}` — sentinel-gated tarball download + extract into `~/blumeops/{cv,docs}/content/` - `ansible/roles/caddy/` — new `kind: static` branch in the Caddyfile template (gzip encoding, immutable cache headers for fingerprinted assets, optional `try_html` for Quartz-style clean URLs, optional per-path `download_paths` for the resume PDF's `Content-Disposition`) - `ansible/playbooks/indri.yml` — wires `cv` and `docs` roles before `caddy` - `service-versions.yaml` — both services flip to `type: ansible`. 
`docs.current-version` stays at `1.28.2` for this commit so `container-version-check` keeps passing while `containers/quartz/Dockerfile` still exists; it moves to the docs release tag in the cleanup commit - `.forgejo/workflows/{cv-deploy,build-blumeops}.yaml` — deploy step now bumps `cv_version`/`docs_version` in the role defaults and pushes; running ansible + purging the Fly cache is manual from gilbert (matches devpi) - Docs: `docs/how-to/operations/{cv,docs}-on-indri.md`, updated `docs/reference/services/{cv,docs}.md`, changelog fragment ## What is not in this commit The dead artifacts. After PR review and successful cutover, a follow-up commit deletes: - `argocd/apps/{cv,docs}.yaml` and `argocd/manifests/{cv,docs}/` - `containers/cv/`, `containers/quartz/` - `CONTAINER_TO_SERVICE['quartz']` mapping in `mise-tasks/container-version-check` - bumps `docs.current-version` in `service-versions.yaml` to the release tag ## Cutover plan (manual, from gilbert, after review) 1. **Take down old:** - Remove the cv and docs Applications: `argocd app delete cv --cascade && argocd app delete docs --cascade` - Verify k8s namespaces gone: `kubectl --context=minikube-indri get ns | grep -E '^(cv|docs)\b'` (should be empty) - Verify tailnet MagicDNS no longer advertises the VIPs: `nslookup cv.tail8d86e.ts.net` and `nslookup docs.tail8d86e.ts.net` should both fail 2. **Bring up new:** - `mise run provision-indri -- --tags cv,docs,caddy --check --diff` (already validated on branch) - `mise run provision-indri -- --tags cv,docs,caddy` - `fly ssh console -a blumeops-proxy -C "sh -c 'rm -rf /tmp/cache && nginx -s reload'"` 3. **Verify:** `mise run services-check` and the curl checks listed in `docs/how-to/operations/{cv,docs}-on-indri.md` 4. **Cleanup commit + merge.** Total expected downtime: minutes (not the few-hour budget you authorized). 
## Test plan - [ ] `mise run provision-indri -- --tags cv,docs --check --diff` clean - [ ] `mise run provision-indri -- --tags caddy --check --diff` shows only the cv + docs blocks changing as previewed in the PR thread - [ ] After cutover: `cv.eblu.me`, `cv.ops.eblu.me`, `docs.eblu.me`, `docs.ops.eblu.me` all return 200 - [ ] `cv.eblu.me/resume.pdf` includes `Content-Disposition: attachment` - [ ] A clean Quartz URL (e.g. `docs.eblu.me/explanation/agent-change-process`) resolves to the right page - [ ] `mise run services-check` clean - [ ] `mise run service-review --type ansible` shows cv and docs 🤖 Generated with [Claude Code](https://claude.com/claude-code) Reviewed-on: https://forge.eblu.me/eblume/blumeops/pulls/342 --- .forgejo/workflows/build-blumeops.yaml | 58 +++++---------- .forgejo/workflows/cv-deploy.yaml | 58 +++++---------- ansible/playbooks/indri.yml | 4 ++ ansible/roles/caddy/defaults/main.yml | 10 ++- ansible/roles/caddy/templates/Caddyfile.j2 | 20 ++++++ ansible/roles/cv/defaults/main.yml | 10 +++ ansible/roles/cv/tasks/main.yml | 57 +++++++++++++++ ansible/roles/docs/defaults/main.yml | 11 +++ ansible/roles/docs/tasks/main.yml | 57 +++++++++++++++ argocd/manifests/homepage/services.yaml | 16 +++++ .../migrate-cv-docs-to-indri.infra.md | 1 + docs/how-to/operations/cv-on-indri.md | 72 +++++++++++++++++++ docs/how-to/operations/docs-on-indri.md | 66 +++++++++++++++++ docs/reference/services/cv.md | 43 ++++++----- docs/reference/services/docs.md | 46 ++++++------ service-versions.yaml | 22 ++++-- 16 files changed, 415 insertions(+), 136 deletions(-) create mode 100644 ansible/roles/cv/defaults/main.yml create mode 100644 ansible/roles/cv/tasks/main.yml create mode 100644 ansible/roles/docs/defaults/main.yml create mode 100644 ansible/roles/docs/tasks/main.yml create mode 100644 docs/changelog.d/migrate-cv-docs-to-indri.infra.md create mode 100644 docs/how-to/operations/cv-on-indri.md create mode 100644 docs/how-to/operations/docs-on-indri.md diff 
--git a/.forgejo/workflows/build-blumeops.yaml b/.forgejo/workflows/build-blumeops.yaml index 383542f..c6e6c3c 100644 --- a/.forgejo/workflows/build-blumeops.yaml +++ b/.forgejo/workflows/build-blumeops.yaml @@ -178,10 +178,11 @@ jobs: echo "## Documentation" echo "" - echo "Download \`$TARBALL\` and configure the quartz container with:" + echo "Download \`$TARBALL\` directly, or bump \`docs_version\`" + echo "in \`ansible/roles/docs/defaults/main.yml\` and run:" echo "" echo "\`\`\`" - echo "DOCS_RELEASE_URL=https://forge.eblu.me/eblume/blumeops/releases/download/$VERSION/$TARBALL" + echo "mise run provision-indri -- --tags docs" echo "\`\`\`" } > /tmp/release_body.txt @@ -223,18 +224,16 @@ jobs: echo "" echo "Release created successfully!" - - name: Update docs deployment + - name: Bump docs_version in ansible role run: | VERSION="${{ steps.version.outputs.version }}" - TARBALL="docs-${VERSION}.tar.gz" - DEPLOYMENT_FILE="argocd/manifests/docs/deployment.yaml" - RELEASE_URL="https://forge.eblu.me/eblume/blumeops/releases/download/${VERSION}/${TARBALL}" + DEFAULTS_FILE="ansible/roles/docs/defaults/main.yml" - echo "Updating $DEPLOYMENT_FILE with new release URL..." - yq -i "(.spec.template.spec.containers[0].env[] | select(.name == \"DOCS_RELEASE_URL\")).value = \"${RELEASE_URL}\"" "$DEPLOYMENT_FILE" + echo "Bumping docs_version in $DEFAULTS_FILE to ${VERSION}..." 
+ yq -i ".docs_version = \"${VERSION}\"" "$DEFAULTS_FILE" - echo "Updated deployment:" - grep -A1 "DOCS_RELEASE_URL" "$DEPLOYMENT_FILE" + echo "Updated defaults:" + grep -E "^docs_version:" "$DEFAULTS_FILE" - name: Commit release changes env: @@ -248,7 +247,7 @@ jobs: git config user.email "actions@forge.ops.eblu.me" # Stage deployment changes - git add argocd/manifests/docs/deployment.yaml + git add ansible/roles/docs/defaults/main.yml # Stage changelog changes if updated if [ "$CHANGELOG_UPDATED" = "true" ]; then @@ -270,34 +269,6 @@ jobs: echo "Changes committed and pushed" fi - - name: Deploy docs - env: - ARGOCD_AUTH_TOKEN: ${{ secrets.ARGOCD_AUTH_TOKEN }} - run: | - echo "Syncing docs app via ArgoCD..." - - # Sync docs app (uses ARGOCD_AUTH_TOKEN env var for auth) - argocd app sync docs \ - --server argocd.ops.eblu.me \ - --grpc-web \ - --prune - - # Wait for sync to complete - argocd app wait docs \ - --server argocd.ops.eblu.me \ - --grpc-web \ - --timeout 120 - - echo "Docs app synced successfully!" - - - name: Purge Fly.io proxy cache - env: - FLY_API_TOKEN: ${{ secrets.FLY_DEPLOY_TOKEN }} - run: | - echo "Purging nginx cache on Fly.io proxy..." 
- fly ssh console -a blumeops-proxy -C "sh -c 'rm -rf /tmp/cache && nginx -s reload'" - echo "Cache purged" - - name: Summary run: | VERSION="${{ steps.version.outputs.version }}" @@ -309,5 +280,12 @@ jobs: echo "Release URL:" echo " https://forge.eblu.me/eblume/blumeops/releases/tag/$VERSION" echo "" - echo "Asset URL (for DOCS_RELEASE_URL ConfigMap):" + echo "Asset URL:" echo " https://forge.eblu.me/eblume/blumeops/releases/download/$VERSION/$TARBALL" + echo "" + echo "To deploy on indri, run from gilbert:" + echo " mise run provision-indri -- --tags docs" + echo "" + echo "Then purge the Fly.io proxy cache:" + echo " fly ssh console -a blumeops-proxy -C \\" + echo " \"sh -c 'rm -rf /tmp/cache && nginx -s reload'\"" diff --git a/.forgejo/workflows/cv-deploy.yaml b/.forgejo/workflows/cv-deploy.yaml index f99352d..001aa36 100644 --- a/.forgejo/workflows/cv-deploy.yaml +++ b/.forgejo/workflows/cv-deploy.yaml @@ -1,12 +1,14 @@ # CV Deploy Workflow # -# Updates the CV deployment to a specific package version, commits -# the change, and syncs via ArgoCD. +# Bumps cv_version in ansible/roles/cv/defaults/main.yml and pushes the change. +# Deployment to indri is manual (runner has no SSH access to indri): +# mise run provision-indri -- --tags cv # # Usage: # 1. Release a new CV package from the cv repo first # 2. Go to Actions > Deploy CV > Run workflow # 3. Enter the version to deploy, or leave as "latest" +# 4. 
Run the command above on gilbert to apply name: Deploy CV @@ -60,18 +62,16 @@ jobs: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - - name: Update CV deployment + - name: Bump cv_version in ansible role run: | VERSION="${{ steps.version.outputs.version }}" - TARBALL="cv-${VERSION}.tar.gz" - DEPLOYMENT_FILE="argocd/manifests/cv/deployment.yaml" - RELEASE_URL="https://forge.eblu.me/api/packages/eblume/generic/cv/${VERSION}/${TARBALL}" + DEFAULTS_FILE="ansible/roles/cv/defaults/main.yml" - echo "Updating $DEPLOYMENT_FILE with CV_RELEASE_URL..." - yq -i "(.spec.template.spec.containers[0].env[] | select(.name == \"CV_RELEASE_URL\")).value = \"${RELEASE_URL}\"" "$DEPLOYMENT_FILE" + echo "Bumping cv_version in $DEFAULTS_FILE to ${VERSION}..." + yq -i ".cv_version = \"${VERSION}\"" "$DEFAULTS_FILE" - echo "Updated deployment:" - grep -A1 "CV_RELEASE_URL" "$DEPLOYMENT_FILE" + echo "Updated defaults:" + grep -E "^cv_version:" "$DEFAULTS_FILE" - name: Commit release changes env: @@ -82,7 +82,7 @@ jobs: git config user.name "Forgejo Actions" git config user.email "actions@forge.ops.eblu.me" - git add argocd/manifests/cv/deployment.yaml + git add ansible/roles/cv/defaults/main.yml if git diff --cached --quiet; then echo "No changes to commit (already at $VERSION)" @@ -94,38 +94,16 @@ jobs: echo "Changes committed and pushed" fi - - name: Deploy CV - env: - ARGOCD_AUTH_TOKEN: ${{ secrets.ARGOCD_AUTH_TOKEN }} - run: | - echo "Syncing CV app via ArgoCD..." - - argocd app sync cv \ - --server argocd.ops.eblu.me \ - --grpc-web \ - --prune - - argocd app wait cv \ - --server argocd.ops.eblu.me \ - --grpc-web \ - --timeout 120 - - echo "CV app synced successfully!" - - - name: Purge Fly.io proxy cache - env: - FLY_API_TOKEN: ${{ secrets.FLY_DEPLOY_TOKEN }} - run: | - echo "Purging nginx cache on Fly.io proxy..." 
- fly ssh console -a blumeops-proxy -C "sh -c 'rm -rf /tmp/cache && nginx -s reload'" - echo "Cache purged" - - name: Summary run: | VERSION="${{ steps.version.outputs.version }}" echo "================================================" - echo "CV Deployed: $VERSION" + echo "CV version bumped: $VERSION" echo "================================================" echo "" - echo "CV should now be live at:" - echo " https://cv.ops.eblu.me/" + echo "To deploy on indri, run from gilbert:" + echo " mise run provision-indri -- --tags cv" + echo "" + echo "Then purge the Fly.io proxy cache:" + echo " fly ssh console -a blumeops-proxy -C \\" + echo " \"sh -c 'rm -rf /tmp/cache && nginx -s reload'\"" diff --git a/ansible/playbooks/indri.yml b/ansible/playbooks/indri.yml index fa87b36..ddb57f8 100644 --- a/ansible/playbooks/indri.yml +++ b/ansible/playbooks/indri.yml @@ -256,5 +256,9 @@ tags: jellyfin_metrics - role: forgejo_metrics tags: forgejo_metrics + - role: cv + tags: cv + - role: docs + tags: docs - role: caddy tags: caddy diff --git a/ansible/roles/caddy/defaults/main.yml b/ansible/roles/caddy/defaults/main.yml index 80993ee..6eada76 100644 --- a/ansible/roles/caddy/defaults/main.yml +++ b/ansible/roles/caddy/defaults/main.yml @@ -72,10 +72,16 @@ caddy_services: backend: "https://go.tail8d86e.ts.net" - name: docs host: "docs.{{ caddy_domain }}" - backend: "https://docs.tail8d86e.ts.net" + kind: static + root: "{{ docs_content_dir }}" + try_html: true # Quartz: path → path/ → path.html → 404.html - name: cv host: "cv.{{ caddy_domain }}" - backend: "https://cv.tail8d86e.ts.net" + kind: static + root: "{{ cv_content_dir }}" + download_paths: + - path: /resume.pdf + filename: erich-blume-resume.pdf - name: nvr host: "nvr.{{ caddy_domain }}" backend: "https://nvr.tail8d86e.ts.net" diff --git a/ansible/roles/caddy/templates/Caddyfile.j2 b/ansible/roles/caddy/templates/Caddyfile.j2 index 4f103f1..b08f16a 100644 --- a/ansible/roles/caddy/templates/Caddyfile.j2 +++ 
b/ansible/roles/caddy/templates/Caddyfile.j2 @@ -31,6 +31,25 @@ {% for service in caddy_services %} @{{ service.name }} host {{ service.host }} handle @{{ service.name }} { +{% if service.kind | default('proxy') == 'static' %} + root * {{ service.root }} + encode gzip + # Long-cache fingerprinted assets; everything else stays default. + @{{ service.name }}_assets path_regexp \.(css|js|png|jpg|jpeg|gif|ico|svg|woff|woff2)$ + header @{{ service.name }}_assets Cache-Control "public, max-age=31536000, immutable" +{% for dl in service.download_paths | default([]) %} + @{{ service.name }}_dl{{ loop.index }} path {{ dl.path }} + header @{{ service.name }}_dl{{ loop.index }} Content-Disposition `attachment; filename="{{ dl.filename }}"` +{% endfor %} +{% if service.try_html | default(false) %} + try_files {path} {path}/ {path}.html + handle_errors 404 { + rewrite * /404.html + file_server + } +{% endif %} + file_server +{% else %} {% if service.cache_policy | default('') == 'spa' %} # SPA cache policy: hashed static assets are immutable, HTML must revalidate. # Prevents stale HTML from referencing chunk hashes that no longer exist. @@ -47,6 +66,7 @@ } {% else %} reverse_proxy {{ service.backend }} +{% endif %} {% endif %} } diff --git a/ansible/roles/cv/defaults/main.yml b/ansible/roles/cv/defaults/main.yml new file mode 100644 index 0000000..734e52b --- /dev/null +++ b/ansible/roles/cv/defaults/main.yml @@ -0,0 +1,10 @@ +--- +# CV / resume static site (native, replaces minikube Deployment) +# Caddy serves cv_content_dir directly via the static-kind service block. 
+ +cv_version: "v1.0.3" +cv_release_url: "https://forge.eblu.me/api/packages/eblume/generic/cv/{{ cv_version }}/cv-{{ cv_version }}.tar.gz" + +cv_home: /Users/erichblume/blumeops/cv +cv_content_dir: "{{ cv_home }}/content" +cv_version_sentinel: "{{ cv_home }}/.installed-version" diff --git a/ansible/roles/cv/tasks/main.yml b/ansible/roles/cv/tasks/main.yml new file mode 100644 index 0000000..c254325 --- /dev/null +++ b/ansible/roles/cv/tasks/main.yml @@ -0,0 +1,57 @@ +--- +# cv role — download and extract the CV release tarball into cv_content_dir. +# Caddy serves the directory directly; there is no daemon to manage. +# +# Idempotency: a sentinel file records the installed cv_version. The +# download/extract steps only run when the sentinel doesn't match cv_version. +# +# We use curl rather than ansible.builtin.get_url because the forge generic- +# packages endpoint returns 405 on HEAD requests, which get_url issues before +# downloading. + +- name: Ensure cv home exists + ansible.builtin.file: + path: "{{ cv_home }}" + state: directory + mode: '0755' + +- name: Read installed cv version sentinel + ansible.builtin.slurp: + src: "{{ cv_version_sentinel }}" + register: cv_installed_raw + failed_when: false + changed_when: false + +- name: Set installed cv version fact + ansible.builtin.set_fact: + cv_installed_version: >- + {{ (cv_installed_raw.content | b64decode).strip() + if (cv_installed_raw.content is defined) else '' }} + +- name: Recreate cv content dir + ansible.builtin.file: + path: "{{ cv_content_dir }}" + state: "{{ item }}" + mode: '0755' + loop: + - absent + - directory + when: cv_installed_version != cv_version + +- name: Download and extract cv release tarball + ansible.builtin.shell: + cmd: >- + set -euo pipefail; + curl -fsSL {{ cv_release_url | quote }} -o {{ cv_home }}/cv.tar.gz && + tar -xzf {{ cv_home }}/cv.tar.gz -C {{ cv_content_dir }} && + rm -f {{ cv_home }}/cv.tar.gz + executable: /bin/bash + when: cv_installed_version != cv_version + 
changed_when: true + +- name: Write cv version sentinel + ansible.builtin.copy: + content: "{{ cv_version }}\n" + dest: "{{ cv_version_sentinel }}" + mode: '0644' + when: cv_installed_version != cv_version diff --git a/ansible/roles/docs/defaults/main.yml b/ansible/roles/docs/defaults/main.yml new file mode 100644 index 0000000..f09221b --- /dev/null +++ b/ansible/roles/docs/defaults/main.yml @@ -0,0 +1,11 @@ +--- +# Docs (Quartz-built static site) — replaces minikube Deployment. +# Caddy serves docs_content_dir directly via the static-kind service block, +# with Quartz-style try_files (path → path/ → path.html → 404). + +docs_version: "v1.16.0" +docs_release_url: "https://forge.eblu.me/eblume/blumeops/releases/download/{{ docs_version }}/docs-{{ docs_version }}.tar.gz" + +docs_home: /Users/erichblume/blumeops/docs +docs_content_dir: "{{ docs_home }}/content" +docs_version_sentinel: "{{ docs_home }}/.installed-version" diff --git a/ansible/roles/docs/tasks/main.yml b/ansible/roles/docs/tasks/main.yml new file mode 100644 index 0000000..dec775e --- /dev/null +++ b/ansible/roles/docs/tasks/main.yml @@ -0,0 +1,57 @@ +--- +# docs role — download and extract the Quartz-built docs tarball into +# docs_content_dir. Caddy serves the directory directly with Quartz-style +# try_files; there is no daemon to manage. +# +# Idempotency: a sentinel file records the installed docs_version. The +# download/extract steps only run when the sentinel doesn't match docs_version. +# +# Mirrors the cv role's curl-based download for consistency, even though the +# forge releases endpoint here does support HEAD. 
+ +- name: Ensure docs home exists + ansible.builtin.file: + path: "{{ docs_home }}" + state: directory + mode: '0755' + +- name: Read installed docs version sentinel + ansible.builtin.slurp: + src: "{{ docs_version_sentinel }}" + register: docs_installed_raw + failed_when: false + changed_when: false + +- name: Set installed docs version fact + ansible.builtin.set_fact: + docs_installed_version: >- + {{ (docs_installed_raw.content | b64decode).strip() + if (docs_installed_raw.content is defined) else '' }} + +- name: Recreate docs content dir + ansible.builtin.file: + path: "{{ docs_content_dir }}" + state: "{{ item }}" + mode: '0755' + loop: + - absent + - directory + when: docs_installed_version != docs_version + +- name: Download and extract docs release tarball + ansible.builtin.shell: + cmd: >- + set -euo pipefail; + curl -fsSL {{ docs_release_url | quote }} -o {{ docs_home }}/docs.tar.gz && + tar -xzf {{ docs_home }}/docs.tar.gz -C {{ docs_content_dir }} && + rm -f {{ docs_home }}/docs.tar.gz + executable: /bin/bash + when: docs_installed_version != docs_version + changed_when: true + +- name: Write docs version sentinel + ansible.builtin.copy: + content: "{{ docs_version }}\n" + dest: "{{ docs_version_sentinel }}" + mode: '0644' + when: docs_installed_version != docs_version diff --git a/argocd/manifests/homepage/services.yaml b/argocd/manifests/homepage/services.yaml index 58b8bb7..211e043 100644 --- a/argocd/manifests/homepage/services.yaml +++ b/argocd/manifests/homepage/services.yaml @@ -12,6 +12,10 @@ href: https://registry.ops.eblu.me icon: zot-registry description: Container registry + - Devpi: + href: https://pypi.ops.eblu.me + icon: mdi-language-python + description: PyPI caching mirror - Sifaka NAS: href: https://nas.ops.eblu.me icon: synology @@ -77,3 +81,15 @@ href: https://ntfy.ops.eblu.me icon: ntfy.png description: Push notifications +- Services: + # CV and Docs were previously auto-discovered from k8s Ingresses; after + # the indri-native 
migration ([[cv-on-indri]], [[docs-on-indri]]) there + # is no Ingress to discover, so they live here as static entries. + - CV: + href: https://cv.eblu.me + icon: mdi-file-document + description: Resume / CV + - Docs: + href: https://docs.eblu.me + icon: mdi-book-open-page-variant + description: BlumeOps Documentation diff --git a/docs/changelog.d/migrate-cv-docs-to-indri.infra.md b/docs/changelog.d/migrate-cv-docs-to-indri.infra.md new file mode 100644 index 0000000..608a6b9 --- /dev/null +++ b/docs/changelog.d/migrate-cv-docs-to-indri.infra.md @@ -0,0 +1 @@ +Migrated CV (`cv.eblu.me`) and Docs (`docs.eblu.me`) from minikube Deployments to indri-native ansible roles. Caddy now serves the extracted release tarballs directly via a new `kind: static` service-block in the Caddy template — no daemon, no container — replacing the prior nginx-in-a-pod layer. Removes a network hop on every request and shrinks minikube's footprint. See [[cv-on-indri]] and [[docs-on-indri]]. Part of the broader minikube wind-down. diff --git a/docs/how-to/operations/cv-on-indri.md b/docs/how-to/operations/cv-on-indri.md new file mode 100644 index 0000000..432acab --- /dev/null +++ b/docs/how-to/operations/cv-on-indri.md @@ -0,0 +1,72 @@ +--- +title: CV on Indri +modified: 2026-04-29 +last-reviewed: 2026-04-29 +tags: + - how-to + - operations +--- + +# CV on Indri + +How the CV/resume static site (`cv.eblu.me`) is deployed on indri natively. Replaces the prior minikube Deployment; mirrors the rationale of [[devpi-on-indri]]. + +## Why native, not Kubernetes + +CV is a tiny static site (HTML + CSS + PDF). It needs no daemon, no database, no auth. Caddy on indri can serve the extracted tarball directly via `file_server`. Removing the minikube Deployment shrinks the cluster's footprint and removes a network hop (Fly → indri Caddy → ProxyGroup ingress → minikube pod becomes Fly → indri Caddy → local files). 
+ +## Layout + +| Concern | Path / detail | +|---|---| +| Content dir | `/Users/erichblume/blumeops/cv/content/` | +| Version sentinel | `/Users/erichblume/blumeops/cv/.installed-version` | +| Caddy entry | `cv` service in `ansible/roles/caddy/defaults/main.yml` (`kind: static`) | +| Public URL | `https://cv.eblu.me` (via [[flyio-proxy]]) | +| Private URL | `https://cv.ops.eblu.me` (Caddy on indri) | +| Tarball source | Forgejo generic package `cv` (`forge.eblu.me/eblume/-/packages`) | + +The role is driven by `cv_version` in `ansible/roles/cv/defaults/main.yml`. The download and extract steps only fire when the on-disk sentinel doesn't match `cv_version` — i.e. after a version bump. + +## Deploy + +Two paths: + +**From a release workflow** (most common): + +1. Run the `Release CV` workflow in the cv repo → produces a new generic package +2. Run the blumeops `Deploy CV` workflow → bumps `cv_version` in `ansible/roles/cv/defaults/main.yml` and pushes to main +3. From gilbert: `mise run provision-indri -- --tags cv` +4. From gilbert: `fly ssh console -a blumeops-proxy -C "sh -c 'rm -rf /tmp/cache && nginx -s reload'"` to purge the public-edge cache + +**Manual** (e.g., reverting): edit `cv_version` in the role defaults yourself, then steps 3–4. + +## Verify + +```fish +ssh indri 'cat ~/blumeops/cv/.installed-version' +ssh indri 'ls -la ~/blumeops/cv/content/' +curl -fsSI https://cv.ops.eblu.me/ # private +curl -fsSI https://cv.eblu.me/ # public +curl -fsSI https://cv.eblu.me/resume.pdf | grep -i disposition +``` + +The PDF response should include `content-disposition: attachment; filename="erich-blume-resume.pdf"`. + +## Bumping the cv version + +Edit `cv_version` in `ansible/roles/cv/defaults/main.yml` and re-run `mise run provision-indri -- --tags cv`. The role recreates the content dir from the new tarball; the sentinel update triggers the next idempotent skip. + +## Backup + +The content dir is **not** in `borgmatic_source_directories`. 
The tarball is re-downloadable from the Forgejo generic package store on every deploy, and the source is in the cv repo — recovery is just re-running the role. + +## Rollback + +If a bad version is published, set `cv_version` back to the previous tag in `ansible/roles/cv/defaults/main.yml` and re-run the role. The full minikube manifest set is preserved in git history (commits prior to the migration cleanup) for the worst case. + +## Related + +- [[devpi-on-indri]] — same shape, different upstream +- [[restart-indri]] — graceful indri restart procedure +- [[cv]] — service reference diff --git a/docs/how-to/operations/docs-on-indri.md b/docs/how-to/operations/docs-on-indri.md new file mode 100644 index 0000000..e683db5 --- /dev/null +++ b/docs/how-to/operations/docs-on-indri.md @@ -0,0 +1,66 @@ +--- +title: Docs on Indri +modified: 2026-04-29 +last-reviewed: 2026-04-29 +tags: + - how-to + - operations +--- + +# Docs on Indri + +How the Quartz documentation site (`docs.eblu.me`) is deployed on indri natively. Replaces the prior minikube Deployment; same shape as [[cv-on-indri]] with one extra wrinkle for Quartz's clean URLs. + +## Why native, not Kubernetes + +The docs site is fully static HTML produced by Quartz. Caddy can serve the extracted tarball directly. The Quartz-specific behavior the previous nginx container provided (`try_files $uri $uri/ $uri.html =404` and a custom `/404.html`) maps cleanly to Caddy's `try_files` and `handle_errors`. 
+ +## Layout + +| Concern | Path / detail | +|---|---| +| Content dir | `/Users/erichblume/blumeops/docs/content/` | +| Version sentinel | `/Users/erichblume/blumeops/docs/.installed-version` | +| Caddy entry | `docs` service in `ansible/roles/caddy/defaults/main.yml` (`kind: static`, `try_html: true`) | +| Public URL | `https://docs.eblu.me` (via [[flyio-proxy]]) | +| Private URL | `https://docs.ops.eblu.me` (Caddy on indri) | +| Tarball source | Forgejo release asset on the blumeops repo (`docs-<version>.tar.gz`) | + +`docs_version` in `ansible/roles/docs/defaults/main.yml` is the blumeops release tag (e.g. `v1.16.0`). The role's download/extract is gated by an on-disk sentinel. + +## Deploy + +1. Run the `Build BlumeOps` Forgejo workflow → builds the tarball, creates a release, bumps `docs_version` in the ansible role, pushes to main +2. From gilbert: `mise run provision-indri -- --tags docs` +3. From gilbert: `fly ssh console -a blumeops-proxy -C "sh -c 'rm -rf /tmp/cache && nginx -s reload'"` + +The Caddy block uses `try_files {path} {path}/ {path}.html` and a `handle_errors 404 → /404.html` rewrite, matching the original nginx behavior so Quartz's clean URLs continue to work. + +## Verify + +```fish +ssh indri 'cat ~/blumeops/docs/.installed-version' +ssh indri 'ls ~/blumeops/docs/content/' +curl -fsSI https://docs.ops.eblu.me/ # private +curl -fsSI https://docs.eblu.me/ # public +curl -fsSI https://docs.eblu.me/explanation/agent-change-process # clean URL → .html fallback +curl -fsSI https://docs.eblu.me/no-such-path-exists/ # → /404.html +``` + +## Bumping the docs version + +Normally driven by the workflow. If you need to pin manually, edit `docs_version` in `ansible/roles/docs/defaults/main.yml` and re-run `mise run provision-indri -- --tags docs`. + +## Backup + +Content dir is not borgmatic-backed. Source is in this repo; release tarballs are on the forge. + +## Rollback + +Set `docs_version` back to the previous release tag in the role defaults and re-run.
Older release tarballs remain available as Forgejo release assets. + +## Related + +- [[cv-on-indri]] — sibling service, simpler (no `try_html`) +- [[devpi-on-indri]] — pattern reference for indri-native services +- [[docs]] — service reference diff --git a/docs/reference/services/cv.md b/docs/reference/services/cv.md index 55805d6..1bc5f15 100644 --- a/docs/reference/services/cv.md +++ b/docs/reference/services/cv.md @@ -1,7 +1,7 @@ --- title: CV -modified: 2026-03-27 -last-reviewed: 2026-03-27 +modified: 2026-04-29 +last-reviewed: 2026-04-29 tags: - service - resume @@ -15,37 +15,36 @@ Personal resume/CV served as a static HTML page with PDF download, built from YA | Property | Value | |----------|-------| -| **URL** | `cv.eblu.me` (public, via [[flyio-proxy]]) | -| **Namespace** | `cv` | -| **Container** | `registry.ops.eblu.me/blumeops/cv` ([kustomization](https://forge.eblu.me/eblume/blumeops/src/branch/main/argocd/manifests/cv/kustomization.yaml)) | +| **Public URL** | `cv.eblu.me` (via [[flyio-proxy]]) | +| **Private URL** | `cv.ops.eblu.me` (Caddy on indri) | +| **Deployment** | Ansible role `cv` on indri (no daemon — Caddy serves files directly) | +| **Content dir** | `~/blumeops/cv/content/` on indri | | **Source repo** | `forge.eblu.me/eblume/cv` (private, not mirrored to GitHub) | | **Content packages** | `forge.eblu.me/eblume/-/packages` (generic package `cv`) | -| **ArgoCD App** | `cv` | + +Migrated from minikube to indri-native on 2026-04-29 (see [[cv-on-indri]]). ## Architecture 1. **Source**: `resume.yaml` (content) + `template.html` (Jinja2) + `style.css` in the cv repo 2. **Build**: `render.py` (uv script runner) generates `index.html`; WeasyPrint generates `resume.pdf` 3. **Release**: Dagger `build` function packages `index.html`, `style.css`, `resume.pdf` into a tarball, uploaded to Forgejo generic packages -4. **Deploy**: nginx container downloads the tarball at startup via `CV_RELEASE_URL` env var +4. 
**Deploy**: ansible role downloads the tarball into `~/blumeops/cv/content/` on indri; Caddy serves the directory directly ## Endpoints | Path | Description | |------|-------------| | `/` | Resume HTML page | -| `/resume.pdf` | PDF download (Content-Disposition: attachment) | -| `/healthz` | Health check (200 OK) | +| `/resume.pdf` | PDF download (Caddy adds `Content-Disposition: attachment`) | ## Configuration **Key files (blumeops):** -- `containers/cv/Dockerfile` — nginx:alpine container -- `containers/cv/start.sh` — tarball download + extraction -- `containers/cv/default.conf` — nginx config (gzip, caching, PDF headers) -- `argocd/manifests/cv/deployment.yaml` — `CV_RELEASE_URL` env var -- `argocd/apps/cv.yaml` — ArgoCD Application +- `ansible/roles/cv/defaults/main.yml` — pinned `cv_version` and tarball URL +- `ansible/roles/cv/tasks/main.yml` — sentinel-gated download + extract +- `ansible/roles/caddy/defaults/main.yml` — `cv` service entry (`kind: static`, `download_paths` for the PDF) **Key files (cv repo):** @@ -56,17 +55,15 @@ Personal resume/CV served as a static HTML page with PDF download, built from YA - `src/cv_ci/main.py` — Dagger pipeline (alpine + uv + WeasyPrint) - `.forgejo/workflows/cv-release.yaml` — Release workflow -## Secrets +## Release flow -| Secret | Repo | Source | Description | -|--------|------|--------|-------------| -| `FORGE_TOKEN` | cv | 1Password (via Ansible) | Forgejo API token for package uploads | - -Provisioned via `forgejo_actions_secrets` Ansible role. See [[create-release-artifact-workflow]]. +1. Release a new package from the cv repo (`Release CV` workflow) +2. Run the blumeops `Deploy CV` workflow → bumps `cv_version` in the ansible role and pushes +3. Run `mise run provision-indri -- --tags cv` from gilbert +4. 
Purge the Fly.io proxy cache so the new content is fetched ## Related -- [[docs]] — Similar architecture (nginx container + content tarball) +- [[cv-on-indri]] — Operations how-to +- [[docs]] — Similar architecture (Caddy serving a tarball-extracted dir) - [[flyio-proxy]] — Exposes `cv.eblu.me` publicly via Tailscale tunnel -- [[create-release-artifact-workflow]] — How to set up release artifact workflows -- [[deploy-k8s-service]] — General k8s deployment guide diff --git a/docs/reference/services/docs.md b/docs/reference/services/docs.md index 1361d02..8ca8310 100644 --- a/docs/reference/services/docs.md +++ b/docs/reference/services/docs.md @@ -1,7 +1,7 @@ --- title: Docs -modified: 2026-03-23 -last-reviewed: 2026-03-23 +modified: 2026-04-29 +last-reviewed: 2026-04-29 tags: - service - documentation @@ -9,44 +9,42 @@ tags: # Docs (Quartz) -Documentation site built with [Quartz](https://quartz.jzhao.xyz/) and served via nginx. +Documentation site built with [Quartz](https://quartz.jzhao.xyz/). ## Quick Reference | Property | Value | |----------|-------| -| **Public URL** | https://docs.eblu.me | -| **Private URL** | `docs.ops.eblu.me` (tailnet only, via [[caddy]]) | -| **Namespace** | `docs` | -| **Image** | `registry.ops.eblu.me/blumeops/quartz` (see `argocd/manifests/docs/kustomization.yaml` for current tag) | +| **Public URL** | https://docs.eblu.me (via [[flyio-proxy]]) | +| **Private URL** | `docs.ops.eblu.me` (Caddy on indri) | +| **Deployment** | Ansible role `docs` on indri (no daemon — Caddy serves files directly) | +| **Content dir** | `~/blumeops/docs/content/` on indri | | **Source** | `docs/` directory in blumeops repo | | **Build** | Forgejo workflow `build-blumeops.yaml` | -| **Public proxy** | [[flyio-proxy]] (Fly.io → Tailscale tunnel) | + +Migrated from minikube to indri-native on 2026-04-29 (see [[docs-on-indri]]). ## Architecture 1. **Source**: Markdown files in `docs/` with Obsidian-compatible wiki-links -2. 
**Build**: Forgejo workflow builds Quartz static site on push to main -3. **Release**: Built assets published as Forgejo release attachments -4. **Deploy**: Container downloads release bundle on startup, serves via nginx - -## Release Process - -Documentation is built and released via the `build-blumeops` Forgejo workflow (manual dispatch): - -1. Quartz builds static HTML/CSS/JS -2. Assets uploaded as Forgejo release attachment -3. Workflow updates `DOCS_RELEASE_URL` in `argocd/manifests/docs/deployment.yaml` and commits to main -4. ArgoCD syncs the updated deployment; new pod downloads the release bundle at startup +2. **Build**: `Build BlumeOps` Forgejo workflow runs towncrier + Quartz, uploads tarball as a release asset, and bumps `docs_version` in the ansible role +3. **Deploy**: ansible role downloads the tarball into `~/blumeops/docs/content/` on indri; Caddy serves the directory directly with Quartz-style `try_files` (path → path/ → path.html → 404.html) ## Configuration - **Quartz config**: `quartz.config.ts` - **Layout**: `quartz.layout.ts` -- **ArgoCD app**: `argocd/apps/docs.yaml` -- **Manifests**: `argocd/manifests/docs/` +- **Ansible role**: `ansible/roles/docs/` +- **Caddy entry**: `ansible/roles/caddy/defaults/main.yml` (`kind: static`, `try_html: true`) + +## Release flow + +1. Run the `Build BlumeOps` workflow → builds tarball, creates release, bumps `docs_version` in the ansible role and pushes +2. Run `mise run provision-indri -- --tags docs` from gilbert +3. 
Purge the Fly.io proxy cache so the new content is fetched ## Related -- [[argocd]] - Deployment management -- [[forgejo]] - Build workflows +- [[docs-on-indri]] — Operations how-to +- [[cv]] — Similar architecture +- [[forgejo]] — Build workflows diff --git a/service-versions.yaml b/service-versions.yaml index e819c6c..d77fa13 100644 --- a/service-versions.yaml +++ b/service-versions.yaml @@ -221,18 +221,26 @@ services: notes: Installed via uv into a venv on indri; version pinned in ansible/roles/devpi/defaults/main.yml - name: cv - type: argocd - last-reviewed: 2026-04-27 + type: ansible + last-reviewed: 2026-04-29 current-version: "1.0.3" upstream-source: https://forge.eblu.me/eblume/cv - notes: Personal static site; review build deps (WeasyPrint, Jinja2) in source repo + notes: >- + Static tarball downloaded by ansible/roles/cv into ~/blumeops/cv/content on indri; + served directly by Caddy (kind=static). Migrated from minikube 2026-04-29. + Review build deps (WeasyPrint, Jinja2) in source repo on upstream review. - name: docs - type: argocd - last-reviewed: 2026-03-07 + type: ansible + last-reviewed: 2026-04-29 current-version: "1.28.2" - upstream-source: https://github.com/jackyzha0/quartz/releases - notes: Quartz static site generator; container version tracks nginx base + upstream-source: https://forge.eblu.me/eblume/blumeops/releases + notes: >- + Quartz-built tarball downloaded by ansible/roles/docs into ~/blumeops/docs/content + on indri; served directly by Caddy (kind=static, try_html). Migrated from + minikube 2026-04-29. current-version still tracks the legacy quartz/nginx + base; will switch to the docs release tag (e.g. v1.16.0) once the dead + containers/quartz Dockerfile is removed in the cleanup commit. 
- name: forgejo-runner type: argocd From 2ee53fe3758cbeac769b005c0774b040158df2c6 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Wed, 29 Apr 2026 15:16:44 -0700 Subject: [PATCH 036/122] =?UTF-8?q?C0:=20fix=20Caddyfile=20try=5Fhtml=20?= =?UTF-8?q?=E2=80=94=20handle=5Ferrors=20can't=20nest=20inside=20handle{}?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The kind=static branch added in #342 put handle_errors inside the @host handle{} block. handle_errors is a top-level site-block directive, not an ordered HTTP handler, so Caddy refuses to load the config: parsing caddyfile tokens for 'handle': directive 'handle_errors' is not an ordered HTTP handler This crash-loops the whole reverse proxy and takes down every *.ops.eblu.me service. Tripped today during the live cv/docs cutover. Fix: drop handle_errors and append /404.html as the final try_files candidate. The 404 page is served with status 200 instead of 404, but that's acceptable for a human-facing curated 404 — the page renders correctly. Documented inline. The running Caddy on indri already has the fixed config (deployed manually during the cutover); this lands the fix in main so future provision-indri --tags caddy runs don't re-break it. Co-Authored-By: Claude Opus 4.7 (1M context) --- ansible/roles/caddy/templates/Caddyfile.j2 | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ansible/roles/caddy/templates/Caddyfile.j2 b/ansible/roles/caddy/templates/Caddyfile.j2 index b08f16a..f6b5f64 100644 --- a/ansible/roles/caddy/templates/Caddyfile.j2 +++ b/ansible/roles/caddy/templates/Caddyfile.j2 @@ -42,11 +42,11 @@ header @{{ service.name }}_dl{{ loop.index }} Content-Disposition `attachment; filename="{{ dl.filename }}"` {% endfor %} {% if service.try_html | default(false) %} - try_files {path} {path}/ {path}.html - handle_errors 404 { - rewrite * /404.html - file_server - } + # Quartz clean URLs: path → path/ → path.html → /404.html (200). 
+ # Caddy's handle_errors is a top-level directive and can't live in + # this nested handle, so the 404 page rides as the final try_files + # candidate (served with 200 — acceptable for a human-facing 404). + try_files {path} {path}/ {path}.html /404.html {% endif %} file_server {% else %} From 5096223b485308a6234eeb069a1cbb40d1c850b8 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Wed, 29 Apr 2026 15:18:39 -0700 Subject: [PATCH 037/122] C1: clean up cv + docs minikube artifacts (#343) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary Follow-up to #342. The cv and docs services are now live on indri (Caddy file_server backed by ansible-managed tarball extraction) and verified working. This PR removes the dead minikube artifacts and the tooling shims that referenced them. ## Changes **Deletions:** - ``argocd/apps/{cv,docs}.yaml`` - ``argocd/manifests/{cv,docs}/`` (deployment, service, ingress, pdb, kustomization) - ``containers/{cv,quartz}/`` (Dockerfiles + start scripts) **Tooling:** - ``mise-tasks/container-version-check``: remove the ``quartz``→``docs`` CONTAINER_TO_SERVICE mapping (containers/quartz no longer exists) - ``service-versions.yaml``: bump ``docs.current-version`` to ``v1.16.0`` (the blumeops docs release tag) and trim the migration-window comment ## Live state context The argocd Applications ``cv`` and ``docs`` were already deleted from the cluster manually as part of the cutover; this PR just removes the YAML files that the ``apps`` app-of-apps was still ingesting. After merge, ``argocd app sync apps`` will reconcile and the ``apps`` Application returns to Synced. The Caddyfile ``handle_errors`` bug that briefly crashed all ``*.ops.eblu.me`` services during cutover is fixed in a separate C0 (``2ee53fe``) on main, not here. 
## Test plan - [x] ``mise run container-version-check --all-files`` clean - [x] ``mise run service-review --type ansible`` shows cv at 1.0.3, docs at v1.16.0 - [ ] After merge: ``argocd app sync apps`` returns clean (cv/docs entries gone, no children to reconcile) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Reviewed-on: https://forge.eblu.me/eblume/blumeops/pulls/343 --- argocd/apps/cv.yaml | 18 ------- argocd/apps/docs.yaml | 18 ------- argocd/manifests/cv/deployment.yaml | 51 ------------------- argocd/manifests/cv/ingress-tailscale.yaml | 27 ---------- argocd/manifests/cv/kustomization.yaml | 12 ----- argocd/manifests/cv/pdb.yaml | 10 ---- argocd/manifests/cv/service.yaml | 13 ----- argocd/manifests/docs/deployment.yaml | 51 ------------------- argocd/manifests/docs/ingress-tailscale.yaml | 27 ---------- argocd/manifests/docs/kustomization.yaml | 12 ----- argocd/manifests/docs/pdb.yaml | 10 ---- argocd/manifests/docs/service.yaml | 13 ----- containers/cv/Dockerfile | 30 ----------- containers/cv/default.conf | 33 ------------ containers/cv/start.sh | 31 ----------- containers/quartz/Dockerfile | 31 ----------- containers/quartz/default.conf | 34 ------------- containers/quartz/start.sh | 31 ----------- ...cleanup-cv-docs-minikube-artifacts.misc.md | 1 + mise-tasks/container-version-check | 1 - service-versions.yaml | 8 ++- 21 files changed, 4 insertions(+), 458 deletions(-) delete mode 100644 argocd/apps/cv.yaml delete mode 100644 argocd/apps/docs.yaml delete mode 100644 argocd/manifests/cv/deployment.yaml delete mode 100644 argocd/manifests/cv/ingress-tailscale.yaml delete mode 100644 argocd/manifests/cv/kustomization.yaml delete mode 100644 argocd/manifests/cv/pdb.yaml delete mode 100644 argocd/manifests/cv/service.yaml delete mode 100644 argocd/manifests/docs/deployment.yaml delete mode 100644 argocd/manifests/docs/ingress-tailscale.yaml delete mode 100644 argocd/manifests/docs/kustomization.yaml delete mode 100644 
argocd/manifests/docs/pdb.yaml delete mode 100644 argocd/manifests/docs/service.yaml delete mode 100644 containers/cv/Dockerfile delete mode 100644 containers/cv/default.conf delete mode 100644 containers/cv/start.sh delete mode 100644 containers/quartz/Dockerfile delete mode 100644 containers/quartz/default.conf delete mode 100644 containers/quartz/start.sh create mode 100644 docs/changelog.d/cleanup-cv-docs-minikube-artifacts.misc.md diff --git a/argocd/apps/cv.yaml b/argocd/apps/cv.yaml deleted file mode 100644 index ad09a8d..0000000 --- a/argocd/apps/cv.yaml +++ /dev/null @@ -1,18 +0,0 @@ ---- -apiVersion: argoproj.io/v1alpha1 -kind: Application -metadata: - name: cv - namespace: argocd -spec: - project: default - source: - repoURL: ssh://forgejo@forge.ops.eblu.me:2222/eblume/blumeops.git - targetRevision: main - path: argocd/manifests/cv - destination: - server: https://kubernetes.default.svc - namespace: cv - syncPolicy: - syncOptions: - - CreateNamespace=true diff --git a/argocd/apps/docs.yaml b/argocd/apps/docs.yaml deleted file mode 100644 index cd8db35..0000000 --- a/argocd/apps/docs.yaml +++ /dev/null @@ -1,18 +0,0 @@ ---- -apiVersion: argoproj.io/v1alpha1 -kind: Application -metadata: - name: docs - namespace: argocd -spec: - project: default - source: - repoURL: ssh://forgejo@forge.ops.eblu.me:2222/eblume/blumeops.git - targetRevision: main - path: argocd/manifests/docs - destination: - server: https://kubernetes.default.svc - namespace: docs - syncPolicy: - syncOptions: - - CreateNamespace=true diff --git a/argocd/manifests/cv/deployment.yaml b/argocd/manifests/cv/deployment.yaml deleted file mode 100644 index f2b00e6..0000000 --- a/argocd/manifests/cv/deployment.yaml +++ /dev/null @@ -1,51 +0,0 @@ ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: cv - namespace: cv -spec: - replicas: 2 - strategy: - type: RollingUpdate - rollingUpdate: - maxUnavailable: 0 - maxSurge: 1 - selector: - matchLabels: - app: cv - template: - metadata: - 
labels: - app: cv - spec: - securityContext: - seccompProfile: - type: RuntimeDefault - containers: - - name: cv - image: registry.ops.eblu.me/blumeops/cv:kustomized - ports: - - containerPort: 80 - name: http - env: - - name: CV_RELEASE_URL - value: "https://forge.eblu.me/api/packages/eblume/generic/cv/v1.0.3/cv-v1.0.3.tar.gz" - resources: - requests: - memory: "64Mi" - cpu: "10m" - limits: - memory: "128Mi" - livenessProbe: - httpGet: - path: /healthz - port: 80 - initialDelaySeconds: 10 - periodSeconds: 30 - readinessProbe: - httpGet: - path: /healthz - port: 80 - initialDelaySeconds: 5 - periodSeconds: 10 diff --git a/argocd/manifests/cv/ingress-tailscale.yaml b/argocd/manifests/cv/ingress-tailscale.yaml deleted file mode 100644 index 489f95a..0000000 --- a/argocd/manifests/cv/ingress-tailscale.yaml +++ /dev/null @@ -1,27 +0,0 @@ ---- -apiVersion: networking.k8s.io/v1 -kind: Ingress -metadata: - name: cv-tailscale - namespace: cv - annotations: - tailscale.com/proxy-class: "default" - tailscale.com/proxy-group: "ingress" - tailscale.com/tags: "tag:k8s,tag:flyio-target" - gethomepage.dev/enabled: "true" - gethomepage.dev/name: "CV" - gethomepage.dev/group: "Services" - gethomepage.dev/icon: "mdi-file-document" - gethomepage.dev/description: "Resume / CV" - gethomepage.dev/href: "https://cv.eblu.me" - gethomepage.dev/pod-selector: "app=cv" -spec: - ingressClassName: tailscale - defaultBackend: - service: - name: cv - port: - number: 80 - tls: - - hosts: - - cv diff --git a/argocd/manifests/cv/kustomization.yaml b/argocd/manifests/cv/kustomization.yaml deleted file mode 100644 index 199108d..0000000 --- a/argocd/manifests/cv/kustomization.yaml +++ /dev/null @@ -1,12 +0,0 @@ ---- -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization -namespace: cv -resources: - - deployment.yaml - - service.yaml - - ingress-tailscale.yaml - - pdb.yaml -images: - - name: registry.ops.eblu.me/blumeops/cv - newTag: v1.0.3-613f05d diff --git a/argocd/manifests/cv/pdb.yaml 
b/argocd/manifests/cv/pdb.yaml deleted file mode 100644 index db5240d..0000000 --- a/argocd/manifests/cv/pdb.yaml +++ /dev/null @@ -1,10 +0,0 @@ ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - name: cv -spec: - minAvailable: 1 - selector: - matchLabels: - app: cv diff --git a/argocd/manifests/cv/service.yaml b/argocd/manifests/cv/service.yaml deleted file mode 100644 index 23e0e94..0000000 --- a/argocd/manifests/cv/service.yaml +++ /dev/null @@ -1,13 +0,0 @@ ---- -apiVersion: v1 -kind: Service -metadata: - name: cv - namespace: cv -spec: - selector: - app: cv - ports: - - name: http - port: 80 - targetPort: 80 diff --git a/argocd/manifests/docs/deployment.yaml b/argocd/manifests/docs/deployment.yaml deleted file mode 100644 index c477b83..0000000 --- a/argocd/manifests/docs/deployment.yaml +++ /dev/null @@ -1,51 +0,0 @@ ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: docs - namespace: docs -spec: - replicas: 2 - strategy: - type: RollingUpdate - rollingUpdate: - maxUnavailable: 0 - maxSurge: 1 - selector: - matchLabels: - app: docs - template: - metadata: - labels: - app: docs - spec: - securityContext: - seccompProfile: - type: RuntimeDefault - containers: - - name: docs - image: registry.ops.eblu.me/blumeops/quartz:kustomized - ports: - - containerPort: 80 - name: http - env: - - name: DOCS_RELEASE_URL - value: "https://forge.eblu.me/eblume/blumeops/releases/download/v1.16.0/docs-v1.16.0.tar.gz" - resources: - requests: - memory: "64Mi" - cpu: "10m" - limits: - memory: "128Mi" - livenessProbe: - httpGet: - path: /healthz - port: 80 - initialDelaySeconds: 10 - periodSeconds: 30 - readinessProbe: - httpGet: - path: /healthz - port: 80 - initialDelaySeconds: 5 - periodSeconds: 10 diff --git a/argocd/manifests/docs/ingress-tailscale.yaml b/argocd/manifests/docs/ingress-tailscale.yaml deleted file mode 100644 index 047e823..0000000 --- a/argocd/manifests/docs/ingress-tailscale.yaml +++ /dev/null @@ -1,27 +0,0 @@ ---- -apiVersion: 
networking.k8s.io/v1 -kind: Ingress -metadata: - name: docs-tailscale - namespace: docs - annotations: - tailscale.com/proxy-class: "default" - tailscale.com/proxy-group: "ingress" - tailscale.com/tags: "tag:k8s,tag:flyio-target" - gethomepage.dev/enabled: "true" - gethomepage.dev/name: "Docs" - gethomepage.dev/group: "Services" - gethomepage.dev/icon: "mdi-book-open-page-variant" - gethomepage.dev/description: "BlumeOps Documentation" - gethomepage.dev/href: "https://docs.eblu.me" - gethomepage.dev/pod-selector: "app=docs" -spec: - ingressClassName: tailscale - defaultBackend: - service: - name: docs - port: - number: 80 - tls: - - hosts: - - docs diff --git a/argocd/manifests/docs/kustomization.yaml b/argocd/manifests/docs/kustomization.yaml deleted file mode 100644 index a16185f..0000000 --- a/argocd/manifests/docs/kustomization.yaml +++ /dev/null @@ -1,12 +0,0 @@ ---- -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization -namespace: docs -resources: - - deployment.yaml - - service.yaml - - ingress-tailscale.yaml - - pdb.yaml -images: - - name: registry.ops.eblu.me/blumeops/quartz - newTag: v1.28.2-613f05d diff --git a/argocd/manifests/docs/pdb.yaml b/argocd/manifests/docs/pdb.yaml deleted file mode 100644 index a87b8e9..0000000 --- a/argocd/manifests/docs/pdb.yaml +++ /dev/null @@ -1,10 +0,0 @@ ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - name: docs -spec: - minAvailable: 1 - selector: - matchLabels: - app: docs diff --git a/argocd/manifests/docs/service.yaml b/argocd/manifests/docs/service.yaml deleted file mode 100644 index 62b0f83..0000000 --- a/argocd/manifests/docs/service.yaml +++ /dev/null @@ -1,13 +0,0 @@ ---- -apiVersion: v1 -kind: Service -metadata: - name: docs - namespace: docs -spec: - selector: - app: docs - ports: - - name: http - port: 80 - targetPort: 80 diff --git a/containers/cv/Dockerfile b/containers/cv/Dockerfile deleted file mode 100644 index 9bfebe0..0000000 --- a/containers/cv/Dockerfile +++ /dev/null 
@@ -1,30 +0,0 @@ -# CV/Resume Static Site Server -# Downloads and serves a CV site tarball (HTML+CSS+PDF) via nginx -# -# Configuration (via environment): -# CV_RELEASE_URL - URL to download the CV content tarball -# -# The container downloads the tarball on startup, extracts it, and serves with nginx. - -ARG CONTAINER_APP_VERSION=1.0.3 - -FROM nginx:alpine - -ARG CONTAINER_APP_VERSION -LABEL org.opencontainers.image.title="CV" -LABEL org.opencontainers.image.description="Static site server for CV/resume" -LABEL org.opencontainers.image.version="${CONTAINER_APP_VERSION}" -LABEL org.opencontainers.image.source="https://forge.eblu.me/eblume/blumeops" -LABEL org.opencontainers.image.vendor="blumeops" - -# Install curl for downloading release assets -RUN apk add --no-cache curl - -# Copy startup script and nginx config -COPY start.sh /start.sh -COPY default.conf /etc/nginx/conf.d/default.conf -RUN chmod +x /start.sh - -EXPOSE 80 - -CMD ["/start.sh"] diff --git a/containers/cv/default.conf b/containers/cv/default.conf deleted file mode 100644 index 7c89b08..0000000 --- a/containers/cv/default.conf +++ /dev/null @@ -1,33 +0,0 @@ -server { - listen 80; - server_name _; - root /usr/share/nginx/html; - index index.html; - - # Enable gzip compression - gzip on; - gzip_types text/plain text/css application/json application/javascript text/xml application/xml text/javascript; - - # Cache static assets - location ~* \.(css|png|jpg|jpeg|gif|ico|svg|woff|woff2)$ { - expires 1y; - add_header Cache-Control "public, immutable"; - } - - # Force PDF download - location = /resume.pdf { - add_header Content-Disposition 'attachment; filename="erich-blume-resume.pdf"'; - } - - # Serve files directly - location / { - try_files $uri $uri/ =404; - } - - # Health check endpoint - location /healthz { - access_log off; - return 200 "ok\n"; - add_header Content-Type text/plain; - } -} diff --git a/containers/cv/start.sh b/containers/cv/start.sh deleted file mode 100644 index bb81c20..0000000 --- 
a/containers/cv/start.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/sh -set -e - -HTML_DIR="/usr/share/nginx/html" - -# Check for required environment variable -if [ -z "$CV_RELEASE_URL" ]; then - echo "Error: CV_RELEASE_URL environment variable is required" - echo "Set it to the URL of the CV content tarball to serve" - exit 1 -fi - -echo "Downloading CV content from: $CV_RELEASE_URL" - -# Download the tarball -if ! curl -fsSL "$CV_RELEASE_URL" -o /tmp/cv.tar.gz; then - echo "Error: Failed to download CV content from $CV_RELEASE_URL" - exit 1 -fi - -# Clear existing content and extract -rm -rf "${HTML_DIR:?}"/* -echo "Extracting CV content to $HTML_DIR" -tar -xzf /tmp/cv.tar.gz -C "$HTML_DIR" -rm /tmp/cv.tar.gz - -echo "CV content extracted successfully" -echo "Starting nginx..." - -# Start nginx in foreground -exec nginx -g "daemon off;" diff --git a/containers/quartz/Dockerfile b/containers/quartz/Dockerfile deleted file mode 100644 index 8ffd44c..0000000 --- a/containers/quartz/Dockerfile +++ /dev/null @@ -1,31 +0,0 @@ -# Quartz Static Site Server -# Downloads and serves a Quartz-built static site from a release bundle -# -# Configuration (via environment): -# DOCS_RELEASE_URL - URL to download the static site tarball -# -# The container downloads the tarball on startup, extracts it, and serves with nginx. 
- -ARG CONTAINER_APP_VERSION=1.28.2 -ARG NGINX_VERSION=${CONTAINER_APP_VERSION} - -FROM nginx:${NGINX_VERSION}-alpine - -ARG CONTAINER_APP_VERSION -LABEL org.opencontainers.image.title="Quartz" -LABEL org.opencontainers.image.description="Static site server for Quartz-built documentation" -LABEL org.opencontainers.image.version="${CONTAINER_APP_VERSION}" -LABEL org.opencontainers.image.source="https://forge.eblu.me/eblume/blumeops" -LABEL org.opencontainers.image.vendor="blumeops" - -# Install curl for downloading release assets -RUN apk add --no-cache curl - -# Copy startup script and nginx config -COPY start.sh /start.sh -COPY default.conf /etc/nginx/conf.d/default.conf -RUN chmod +x /start.sh - -EXPOSE 80 - -CMD ["/start.sh"] diff --git a/containers/quartz/default.conf b/containers/quartz/default.conf deleted file mode 100644 index 64eec4e..0000000 --- a/containers/quartz/default.conf +++ /dev/null @@ -1,34 +0,0 @@ -server { - listen 80; - server_name _; - root /usr/share/nginx/html; - index index.html; - - # Enable gzip compression - gzip on; - gzip_types text/plain text/css application/json application/javascript text/xml application/xml text/javascript; - - # Cache static assets - location ~* \.(js|css|png|jpg|jpeg|gif|ico|svg|woff|woff2)$ { - expires 1y; - add_header Cache-Control "public, immutable"; - } - - # Static file serving — no SPA fallback. - # Quartz generates complete HTML for every page, so all valid URLs - # map to real files. Non-existent paths get 404.html (generated by - # Quartz's NotFoundPage plugin), preventing the spider-trap issue - # where crawlers would get index.html for fabricated URLs. 
- location / { - try_files $uri $uri/ $uri.html =404; - } - - error_page 404 /404.html; - - # Health check endpoint - location /healthz { - access_log off; - return 200 "ok\n"; - add_header Content-Type text/plain; - } -} diff --git a/containers/quartz/start.sh b/containers/quartz/start.sh deleted file mode 100644 index 778eeb1..0000000 --- a/containers/quartz/start.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/sh -set -e - -HTML_DIR="/usr/share/nginx/html" - -# Check for required environment variable -if [ -z "$DOCS_RELEASE_URL" ]; then - echo "Error: DOCS_RELEASE_URL environment variable is required" - echo "Set it to the URL of the static site tarball to serve" - exit 1 -fi - -echo "Downloading docs from: $DOCS_RELEASE_URL" - -# Download the tarball -if ! curl -fsSL "$DOCS_RELEASE_URL" -o /tmp/docs.tar.gz; then - echo "Error: Failed to download docs from $DOCS_RELEASE_URL" - exit 1 -fi - -# Clear existing content and extract -rm -rf "${HTML_DIR:?}"/* -echo "Extracting docs to $HTML_DIR" -tar -xzf /tmp/docs.tar.gz -C "$HTML_DIR" -rm /tmp/docs.tar.gz - -echo "Docs extracted successfully" -echo "Starting nginx..." - -# Start nginx in foreground -exec nginx -g "daemon off;" diff --git a/docs/changelog.d/cleanup-cv-docs-minikube-artifacts.misc.md b/docs/changelog.d/cleanup-cv-docs-minikube-artifacts.misc.md new file mode 100644 index 0000000..79a81cf --- /dev/null +++ b/docs/changelog.d/cleanup-cv-docs-minikube-artifacts.misc.md @@ -0,0 +1 @@ +Removed the dead minikube manifests, container builds, and tooling shims left behind after the cv + docs migration to indri-native (#342). Deletes `argocd/{apps,manifests}/{cv,docs}/`, `containers/{cv,quartz}/`, and the `quartz`→`docs` mapping in `mise-tasks/container-version-check`. Bumps `docs.current-version` to `v1.16.0` (the blumeops release tag) now that the legacy nginx-base version pin is gone. 
diff --git a/mise-tasks/container-version-check b/mise-tasks/container-version-check index 6270ae1..95cf6f0 100755 --- a/mise-tasks/container-version-check +++ b/mise-tasks/container-version-check @@ -42,7 +42,6 @@ BLACKLIST = {"kubectl"} # Container dir name → service-versions.yaml name (when they differ) CONTAINER_TO_SERVICE = { - "quartz": "docs", "kiwix-serve": "kiwix", } diff --git a/service-versions.yaml b/service-versions.yaml index d77fa13..f4f4a6a 100644 --- a/service-versions.yaml +++ b/service-versions.yaml @@ -233,14 +233,12 @@ services: - name: docs type: ansible last-reviewed: 2026-04-29 - current-version: "1.28.2" + current-version: "v1.16.0" upstream-source: https://forge.eblu.me/eblume/blumeops/releases notes: >- Quartz-built tarball downloaded by ansible/roles/docs into ~/blumeops/docs/content - on indri; served directly by Caddy (kind=static, try_html). Migrated from - minikube 2026-04-29. current-version still tracks the legacy quartz/nginx - base; will switch to the docs release tag (e.g. v1.16.0) once the dead - containers/quartz Dockerfile is removed in the cleanup commit. + on indri; served directly by Caddy (kind=static, try_html). current-version + tracks the blumeops docs release tag. - name: forgejo-runner type: argocd From f6e392b80cacda422f27af14af71c5a182bc4009 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 30 Apr 2026 16:51:43 -0700 Subject: [PATCH 038/122] C1: SHA-pin tooling dependencies (2026-04 cycle) (#344) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary Monthly tooling dependency refresh, with a one-time conversion from version-tag pins (`rev = "vX.Y.Z"`, `image:tag`, `>=`) to SHA / digest pins everywhere. ## Changes - **prek hooks**: all `rev = "vX.Y.Z"` → commit SHA + `# vX.Y.Z` comment. Bumped trufflehog (3.94.0→3.95.2), kingfisher (1.91.0→1.97.0), ruff (0.15.7→0.15.12), shfmt (3.13.0→3.13.1), prettier (3.8.1→3.8.3), actionlint (1.7.11→1.7.12). 
- **fly/Dockerfile**: tag pins → `image@sha256:...` digest pins. Bumped nginx (1.29.6→1.30.0-alpine), tailscale (v1.94.1→v1.94.2 — still inside the safe pre-1.96.5 range), alloy (v1.14.1→v1.16.0). - **mise-tasks**: PEP 723 inline deps converted from `>=` to `==` (PEP 508 doesn't support hashes inline). All scripts pinned to current latest: rich 15.0.0, typer 0.25.0, pyyaml 6.0.3, httpx 0.28.1. - **prek `additional_dependencies`**: ansible-lint==26.4.0, ansible-core==2.20.5. - **taplo-lint**: pass `--no-schema`. Upstream's `--default-schema-catalogs` returns a format taplo v0.9.3 can't parse — we don't validate against TOML schemas anyway, so this turns off the broken catalog fetch. - **docs/update-tooling-dependencies**: documents the SHA-pin convention, `docker buildx imagetools inspect` for digest lookup, and `prek clean` before re-verifying (cache grows to several GiB). Forgejo workflow `actions/checkout@v6.0.2` was already at the latest SHA — no change. ## Test plan - [x] `prek run --all-files` passes after `prek clean` - [x] `deploy-fly` workflow builds and deploys the new fly image on merge - [x] `fly status -a blumeops-proxy` healthy after deploy - [x] Spot-check a few mise tasks (`mise run blumeops-tasks`, `mise run docs-check-links`) to confirm pinned deps resolve cleanly Reviewed-on: https://forge.eblu.me/eblume/blumeops/pulls/344 --- .../update-tooling-deps-2026-04.doc.md | 1 + .../update-tooling-deps-2026-04.infra.md | 1 + .../configuration/rotate-fly-deploy-token.md | 108 ++++++++++++++++++ .../update-tooling-dependencies.md | 28 +++-- docs/how-to/operations/manage-flyio-proxy.md | 4 + fly/Dockerfile | 13 ++- mise-tasks/blumeops-tasks | 2 +- mise-tasks/branch-cleanup | 2 +- mise-tasks/container-build-and-release | 2 +- mise-tasks/container-list | 2 +- mise-tasks/container-version-check | 2 +- mise-tasks/dns-acme-cleanup | 2 +- mise-tasks/docs-check-frontmatter | 2 +- mise-tasks/docs-check-links | 2 +- mise-tasks/docs-mikado | 2 +- 
mise-tasks/docs-preview | 2 +- mise-tasks/docs-review | 2 +- mise-tasks/docs-review-stale | 2 +- mise-tasks/docs-review-tags | 2 +- mise-tasks/mikado-branch-invariant-check | 2 +- mise-tasks/op-backup | 2 +- mise-tasks/pr-comments | 2 +- mise-tasks/prune-ringtail-generations | 2 +- mise-tasks/review-compensating-controls | 2 +- mise-tasks/review-compliance-reports | 2 +- mise-tasks/runner-logs | 2 +- mise-tasks/service-review | 2 +- mise-tasks/spork-create | 2 +- prek.toml | 24 ++-- 29 files changed, 175 insertions(+), 48 deletions(-) create mode 100644 docs/changelog.d/update-tooling-deps-2026-04.doc.md create mode 100644 docs/changelog.d/update-tooling-deps-2026-04.infra.md create mode 100644 docs/how-to/configuration/rotate-fly-deploy-token.md diff --git a/docs/changelog.d/update-tooling-deps-2026-04.doc.md b/docs/changelog.d/update-tooling-deps-2026-04.doc.md new file mode 100644 index 0000000..141e975 --- /dev/null +++ b/docs/changelog.d/update-tooling-deps-2026-04.doc.md @@ -0,0 +1 @@ +New how-to: rotate-fly-deploy-token. Documents the 75-day rotation cadence, why we use `org`-scoped tokens (silences the cosmetic metrics-token warning on `fly status` with marginal blast-radius cost given the single-app personal org), and the procedure for rotation + Forgejo Actions secret sync. diff --git a/docs/changelog.d/update-tooling-deps-2026-04.infra.md b/docs/changelog.d/update-tooling-deps-2026-04.infra.md new file mode 100644 index 0000000..4731eca --- /dev/null +++ b/docs/changelog.d/update-tooling-deps-2026-04.infra.md @@ -0,0 +1 @@ +Monthly tooling dependency refresh: prek hooks (trufflehog, kingfisher, ruff, shfmt, prettier, actionlint, ansible-lint), fly proxy base images (nginx 1.30.0, tailscale v1.94.2, alloy v1.16.0), normalize pyyaml lower bound in mise-tasks. 
diff --git a/docs/how-to/configuration/rotate-fly-deploy-token.md b/docs/how-to/configuration/rotate-fly-deploy-token.md new file mode 100644 index 0000000..58aba21 --- /dev/null +++ b/docs/how-to/configuration/rotate-fly-deploy-token.md @@ -0,0 +1,108 @@ +--- +title: Rotate the Fly.io API Token +modified: 2026-04-30 +last-reviewed: 2026-04-30 +tags: + - how-to + - fly-io + - secrets +--- + +# Rotate the Fly.io API Token + +How to rotate the Fly.io API token used to deploy [[flyio-proxy]]. The token lives in 1Password at `op://blumeops/fly.io admin/add more/deploy-token` and is consumed by [`mise run fly-deploy`](../../../mise-tasks/fly-deploy) and the `deploy-fly` Forgejo workflow (via the `FLY_DEPLOY_TOKEN` secret). + +## When to rotate + +- Every 75 days (Todoist recurring task) +- After any compromise / accidental disclosure +- If `fly deploy` starts returning auth errors + +Fly.io tokens default to a 20-year expiry, but a short rotation cadence limits the blast radius of an undetected leak. Token expiry is set to **90 days** (longer than the rotation window), leaving a 15-day buffer if a rotation is delayed. + +## Scope + +Use **`fly tokens create org`**, not `deploy`. + +| Scope | What it grants | Practical blast radius (this org) | +|-------|---------------|-----------------------------------| +| `deploy` | Manage one app and its resources | Same single-app surface as `org` for current setup | +| `org` | Manage one org and its resources | Adds: ability to create new apps (billing abuse) and read org-level metadata | +| `readonly` | Read one org | Not enough to deploy | +| Personal access token | Full account | Excessive | + +The personal Fly org currently contains a single app (`blumeops-proxy`), so the marginal blast radius of `org` over `deploy` is small. The benefit of `org` is that `fly status` works without a `Metrics token unavailable: ... context canceled` warning. 
That warning happens because `fly status` always tries to fetch org-level metrics-token info, and an app-scoped `deploy` token can't query the org. The warning is benign but persistent and could mask a real future failure. + +If a second Fly app is ever added to this org, reconsider — at that point the marginal scope cost of `org` grows. + +## Procedure + +### 1. Authenticate flyctl with the current token + +```fish +fly auth login +``` + +(Browser-based. Required to mint a new token, since the existing deploy token can't create tokens.) + +### 2. Mint the new token + +```fish +fly tokens create org \ + --org personal \ + --name "blumeops-proxy deploy $(date +%Y-%m-%d)" \ + --expiry 2160h +``` + +(`2160h` = 90 days, paired with the 75-day rotation cadence for a 15-day buffer. Capture the output — it's the only time the token is shown.) + +### 3. Update 1Password + +```fish +op item edit on5slfaygtdjrxmdwezyhfmqsq 'add more.deploy-token=NEW_TOKEN' --vault vg6xf6vvfmoh5hqjjhlhbeoaie +``` + +### 4. Sync to Forgejo Actions + +The `deploy-fly` workflow reads the same token from a Forgejo Actions secret named `FLY_DEPLOY_TOKEN`, populated by the `forgejo_actions_secrets` ansible role: + +```fish +mise run provision-indri -- --tags forgejo_actions_secrets +``` + +### 5. Verify + +```fish +mise run fly-deploy +``` + +A successful deploy confirms the new token works locally. Watch for the metrics-token warning — it should be **absent** with an `org`-scoped token. If still present, the rotation produced a `deploy`-scoped token by mistake. + +Then trigger the CI workflow (push a no-op commit touching `fly/`, or dispatch manually) to confirm Forgejo Actions has the new secret. + +### 6. Revoke the old token + +```fish +fly tokens list +fly tokens revoke OLD_TOKEN_ID +``` + +## Debugging + +### `fly deploy` returns "unauthorized" + +Token is invalid (expired, revoked, or wrong scope). Repeat the procedure. + +### `Metrics token unavailable: ... 
context canceled` after rotation + +The new token was created with `deploy` scope, not `org`. Either accept it (cosmetic) or re-mint with `fly tokens create org`. + +### Forgejo Actions deploy fails but local works + +The Forgejo secret wasn't synced. Re-run `mise run provision-indri -- --tags forgejo_actions_secrets` and confirm the secret value in Forgejo matches 1Password. + +## Related + +- [[flyio-proxy]] — Service reference card +- [[manage-flyio-proxy]] — Day-to-day operations and Tailscale auth-key rotation (separate 90-day rotation) +- [[expose-service-publicly]] — Full setup architecture diff --git a/docs/how-to/configuration/update-tooling-dependencies.md b/docs/how-to/configuration/update-tooling-dependencies.md index 8b09e6d..2bfe887 100644 --- a/docs/how-to/configuration/update-tooling-dependencies.md +++ b/docs/how-to/configuration/update-tooling-dependencies.md @@ -28,33 +28,45 @@ Out of scope: ArgoCD-deployed service images, Ansible role versions, NixOS flake ### 1. Check prek hook versions -For each repo in `prek.toml` with a `rev =` value, check the upstream GitHub releases page for a newer tag. Update each `rev` to the latest release tag. Also check `additional_dependencies` entries for PyPI version bumps. - -Verify after updating: +For each repo in `prek.toml` with a `rev =` value, check the upstream GitHub releases page for a newer tag. Update each `rev` to the **commit SHA** of the latest release with a trailing `# vX.Y.Z` comment (matches the `additional_dependencies` and Forgejo workflow pinning style). Also check `additional_dependencies` entries for PyPI version bumps and pin them with `==`. ```fish +git ls-remote --tags https://github.com/OWNER/REPO.git 'refs/tags/v*' | sort -t/ -k3 -V | tail -5 +``` + +Clear the prek cache before verifying — it can grow to several GiB (one venv per hook per version) and old cached environments can mask resolution failures or stale catalogs: + +```fish +prek clean prek run --all-files ``` ### 2. 
Check Fly.io Dockerfile pins -Review `fly/Dockerfile` for pinned image tags: +Review `fly/Dockerfile` for pinned image digests. Each `FROM` and `COPY --from=` uses `image@sha256:...` digest pinning with a comment line above documenting the human-readable version. - **nginx** — check [Docker Hub](https://hub.docker.com/_/nginx) for latest stable alpine tag - **grafana/alloy** — check [GitHub releases](https://github.com/grafana/alloy/releases) -- **tailscale/tailscale** — uses `stable` rolling tag, no action needed +- **tailscale/tailscale** — pinned to a known-good version. Do not bump to v1.96.5 or later (MagicDNS regression breaks the proxy boot) + +To resolve a tag to a digest: + +```fish +docker buildx imagetools inspect docker.io/IMAGE:TAG +# Use the top-level "Digest:" line (multi-arch index) — not the per-platform sub-digest +``` After updating, the deploy-fly workflow will build and deploy on merge to main. Verify with `fly status -a blumeops-proxy` after deploy. -### 3. Normalize mise task dependency bounds +### 3. Pin mise task dependencies -Mise tasks use `uv run --script` with inline PEP 723 dependency metadata. Check that lower bounds are consistent across all scripts: +Mise tasks use `uv run --script` with inline PEP 723 dependency metadata. All packages are pinned with `==` (PEP 508 doesn't support hashes inline). Check that pinned versions are consistent across all scripts: ```fish grep -r 'dependencies' mise-tasks/ | grep '# dependencies' ``` -Ensure all scripts using the same package agree on the minimum version. When a package has a new major or breaking minor release, bump the lower bound across all scripts at once. +For each package in use (`httpx`, `rich`, `typer`, `pyyaml`), pick the latest PyPI version and update every script in lockstep — divergence between scripts is the failure mode this catches. Bump everything together; don't leave one script behind. ### 4. 
Pin Forgejo workflow action versions diff --git a/docs/how-to/operations/manage-flyio-proxy.md b/docs/how-to/operations/manage-flyio-proxy.md index 5cea783..d1a243d 100644 --- a/docs/how-to/operations/manage-flyio-proxy.md +++ b/docs/how-to/operations/manage-flyio-proxy.md @@ -76,6 +76,10 @@ The auth key expires every 90 days. To rotate: 2. Re-run setup to stage the new secret: `mise run fly-setup` 3. Deploy to pick up the new secret: `mise run fly-deploy` +## Rotate Fly.io API Token + +See [[rotate-fly-deploy-token]] for the full rotation procedure (75-day cadence, `org`-scoped). + ## Troubleshooting **502 Bad Gateway on fresh deploy**: MagicDNS may not be ready when nginx starts. The `start.sh` script polls `nslookup` before launching nginx, but if it still fails, check that `tailscale status` is healthy inside the container. diff --git a/fly/Dockerfile b/fly/Dockerfile index 8a6df31..eae8c35 100644 --- a/fly/Dockerfile +++ b/fly/Dockerfile @@ -1,9 +1,10 @@ -FROM nginx:1.29.6-alpine +# nginx 1.30.0-alpine +FROM nginx@sha256:0272e4604ed93c1792f03695a033a6e8546840f86e0de20a884bb17d2c924883 -# Copy tailscale binaries from official image -COPY --from=docker.io/tailscale/tailscale:v1.94.1 \ +# Copy tailscale binaries from official image (v1.94.2) +COPY --from=docker.io/tailscale/tailscale@sha256:95e528798bebe75f39b10e74e7051cf51188ee615934f232ba7ad06a3390ffa1 \ /usr/local/bin/tailscaled /usr/local/bin/tailscaled -COPY --from=docker.io/tailscale/tailscale:v1.94.1 \ +COPY --from=docker.io/tailscale/tailscale@sha256:95e528798bebe75f39b10e74e7051cf51188ee615934f232ba7ad06a3390ffa1 \ /usr/local/bin/tailscale /usr/local/bin/tailscale RUN mkdir -p /var/run/tailscale /var/lib/tailscale \ @@ -12,8 +13,8 @@ RUN mkdir -p /var/run/tailscale /var/lib/tailscale \ && apk add --no-cache fail2ban \ && rm -f /etc/fail2ban/jail.d/alpine-ssh.conf -# Copy Alloy binary from official image (Ubuntu-based, needs libc6-compat) -COPY --from=docker.io/grafana/alloy:v1.14.1 \ +# Copy Alloy binary 
from official image (v1.16.0, Ubuntu-based, needs libc6-compat) +COPY --from=docker.io/grafana/alloy@sha256:6e00cf7c5a692ff5f24844529416ed017d76fce922f8199004e73d5eca46b6b8 \ /bin/alloy /usr/local/bin/alloy RUN mkdir -p /var/log/nginx /etc/alloy /tmp/alloy-data diff --git a/mise-tasks/blumeops-tasks b/mise-tasks/blumeops-tasks index e07e9bf..035aa3b 100755 --- a/mise-tasks/blumeops-tasks +++ b/mise-tasks/blumeops-tasks @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["httpx>=0.28.1", "rich>=14.0.0"] +# dependencies = ["httpx==0.28.1", "rich==15.0.0"] # /// #MISE description="List Blumeops tasks from Todoist sorted by priority" """Fetch and display Blumeops tasks from Todoist, sorted by priority. diff --git a/mise-tasks/branch-cleanup b/mise-tasks/branch-cleanup index bd5ac66..575c9a1 100755 --- a/mise-tasks/branch-cleanup +++ b/mise-tasks/branch-cleanup @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["httpx>=0.28.1", "rich>=14.0.0", "typer>=0.24.0"] +# dependencies = ["httpx==0.28.1", "rich==15.0.0", "typer==0.25.0"] # /// #MISE description="Delete branches that have been merged into main (local and remote)" #MISE alias="bc" diff --git a/mise-tasks/container-build-and-release b/mise-tasks/container-build-and-release index afa970e..ba569e7 100755 --- a/mise-tasks/container-build-and-release +++ b/mise-tasks/container-build-and-release @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["typer>=0.24.0", "httpx>=0.28.1"] +# dependencies = ["typer==0.25.0", "httpx==0.28.1"] # /// #MISE description="Trigger container build workflows via Forgejo API" #USAGE arg "" help="Container name (directory under containers/)" diff --git a/mise-tasks/container-list b/mise-tasks/container-list index b1bd433..26639f2 100755 --- a/mise-tasks/container-list +++ b/mise-tasks/container-list @@ -1,7 +1,7 @@ 
#!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["httpx>=0.28.1", "rich>=14.0.0", "typer>=0.24.0"] +# dependencies = ["httpx==0.28.1", "rich==15.0.0", "typer==0.25.0"] # /// #MISE description="List available containers and their recent tags" #USAGE arg "[name]" help="Optional container name to filter output" diff --git a/mise-tasks/container-version-check b/mise-tasks/container-version-check index 95cf6f0..4ebe3b6 100755 --- a/mise-tasks/container-version-check +++ b/mise-tasks/container-version-check @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["pyyaml>=6.0.2", "rich>=14.0.0", "typer>=0.24.0"] +# dependencies = ["pyyaml==6.0.3", "rich==15.0.0", "typer==0.25.0"] # /// #MISE description="Validate container version consistency across container.py, Dockerfiles, nix derivations, and service-versions.yaml" #USAGE flag "--all-files" help="Check all containers, not just changed ones" diff --git a/mise-tasks/dns-acme-cleanup b/mise-tasks/dns-acme-cleanup index 5152ae2..432a6ce 100755 --- a/mise-tasks/dns-acme-cleanup +++ b/mise-tasks/dns-acme-cleanup @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["httpx>=0.28.1", "rich>=14.0.0", "typer>=0.24.0"] +# dependencies = ["httpx==0.28.1", "rich==15.0.0", "typer==0.25.0"] # /// #MISE description="Delete orphaned ACME challenge TXT records in eblu.me" #USAGE flag "--dry-run" help="List orphans without deleting" diff --git a/mise-tasks/docs-check-frontmatter b/mise-tasks/docs-check-frontmatter index 11d1a49..35e1879 100755 --- a/mise-tasks/docs-check-frontmatter +++ b/mise-tasks/docs-check-frontmatter @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["rich>=14.0.0"] +# dependencies = ["rich==15.0.0"] # /// #MISE description="Check that all docs have required frontmatter fields" """Validate that all 
documentation files have required YAML frontmatter. diff --git a/mise-tasks/docs-check-links b/mise-tasks/docs-check-links index 78e871a..9974fc7 100755 --- a/mise-tasks/docs-check-links +++ b/mise-tasks/docs-check-links @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["rich>=14.0.0"] +# dependencies = ["rich==15.0.0"] # /// #MISE description="Validate all wiki-links point to existing doc files" """Validate that all wiki-links in documentation point to existing files. diff --git a/mise-tasks/docs-mikado b/mise-tasks/docs-mikado index 0b37f51..eea052f 100755 --- a/mise-tasks/docs-mikado +++ b/mise-tasks/docs-mikado @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["httpx>=0.28.1", "pyyaml>=6.0.2", "rich>=14.0.0", "typer>=0.24.0"] +# dependencies = ["httpx==0.28.1", "pyyaml==6.0.3", "rich==15.0.0", "typer==0.25.0"] # /// #MISE description="View active Mikado dependency chains for C2 changes" #USAGE arg "[card]" help="Card stem to show chain for" diff --git a/mise-tasks/docs-preview b/mise-tasks/docs-preview index f63b1d1..faa79af 100755 --- a/mise-tasks/docs-preview +++ b/mise-tasks/docs-preview @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["pyyaml>=6.0.2", "rich>=14.0.0", "typer>=0.24.0"] +# dependencies = ["pyyaml==6.0.3", "rich==15.0.0", "typer==0.25.0"] # /// #MISE description="Build docs with Dagger and serve locally, opening to a specific card" #USAGE arg "" help="Card path relative to docs/, e.g. 
how-to/knowledgebase/review-documentation" diff --git a/mise-tasks/docs-review b/mise-tasks/docs-review index 49cf4d0..d07904d 100755 --- a/mise-tasks/docs-review +++ b/mise-tasks/docs-review @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["pyyaml>=6.0.2", "rich>=14.0.0", "typer>=0.24.0"] +# dependencies = ["pyyaml==6.0.3", "rich==15.0.0", "typer==0.25.0"] # /// #MISE description="Review the most stale documentation card by last-reviewed date" #USAGE flag "--limit " default="15" help="Number of docs to show in the table" diff --git a/mise-tasks/docs-review-stale b/mise-tasks/docs-review-stale index facbf6b..4449213 100755 --- a/mise-tasks/docs-review-stale +++ b/mise-tasks/docs-review-stale @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["rich>=14.0.0", "typer>=0.24.0"] +# dependencies = ["rich==15.0.0", "typer==0.25.0"] # /// #MISE description="Report docs by git-last-modified date, highlighting stale ones" #USAGE flag "--threshold " default="180" help="Days before a doc is considered stale" diff --git a/mise-tasks/docs-review-tags b/mise-tasks/docs-review-tags index 0e7f1d4..869e2f2 100755 --- a/mise-tasks/docs-review-tags +++ b/mise-tasks/docs-review-tags @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["pyyaml>=6.0.2", "rich>=14.0.0"] +# dependencies = ["pyyaml==6.0.3", "rich==15.0.0"] # /// #MISE description="Print frontmatter tag inventory across all docs" """Print every frontmatter tag with usage count and file list. 
diff --git a/mise-tasks/mikado-branch-invariant-check b/mise-tasks/mikado-branch-invariant-check index ca9f79a..1f0fbcf 100755 --- a/mise-tasks/mikado-branch-invariant-check +++ b/mise-tasks/mikado-branch-invariant-check @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["rich>=14.0.0", "typer>=0.24.0"] +# dependencies = ["rich==15.0.0", "typer==0.25.0"] # /// #MISE description="Validate Mikado Branch Invariant on mikado/* branches" #USAGE arg "[commit_msg_file]" help="Commit message file (passed by commit-msg hook)" diff --git a/mise-tasks/op-backup b/mise-tasks/op-backup index 6ffef14..37a97a6 100755 --- a/mise-tasks/op-backup +++ b/mise-tasks/op-backup @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["rich>=14.0.0", "typer>=0.24.0"] +# dependencies = ["rich==15.0.0", "typer==0.25.0"] # /// #MISE description="Encrypt a 1Password .1pux export and send to indri for borgmatic" #USAGE arg "[export_path]" help="Path to .1pux export file (prompted if omitted)" diff --git a/mise-tasks/pr-comments b/mise-tasks/pr-comments index a44a430..7205617 100755 --- a/mise-tasks/pr-comments +++ b/mise-tasks/pr-comments @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["httpx>=0.28.1", "rich>=14.0.0", "typer>=0.24.0"] +# dependencies = ["httpx==0.28.1", "rich==15.0.0", "typer==0.25.0"] # /// #MISE description="List unresolved comments on a PR" #USAGE arg "" help="Pull request number" diff --git a/mise-tasks/prune-ringtail-generations b/mise-tasks/prune-ringtail-generations index 8066f8b..2b8e3f9 100755 --- a/mise-tasks/prune-ringtail-generations +++ b/mise-tasks/prune-ringtail-generations @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["rich>=14.0.0", "typer>=0.24.0"] +# dependencies = ["rich==15.0.0", "typer==0.25.0"] # /// #MISE 
description="Prune old NixOS generations on ringtail, preserving rollback safety" #MISE alias="prg" diff --git a/mise-tasks/review-compensating-controls b/mise-tasks/review-compensating-controls index 09e2d16..e92d302 100755 --- a/mise-tasks/review-compensating-controls +++ b/mise-tasks/review-compensating-controls @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["pyyaml>=6.0.2", "rich>=14.0.0", "typer>=0.24.0"] +# dependencies = ["pyyaml==6.0.3", "rich==15.0.0", "typer==0.25.0"] # /// #MISE description="Review the most stale compensating control" #USAGE flag "--limit " default="10" help="Number of controls to show in the table" diff --git a/mise-tasks/review-compliance-reports b/mise-tasks/review-compliance-reports index 72f35cc..bcbe090 100755 --- a/mise-tasks/review-compliance-reports +++ b/mise-tasks/review-compliance-reports @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["rich>=14.0.0", "typer>=0.24.0", "pyyaml>=6.0"] +# dependencies = ["rich==15.0.0", "typer==0.25.0", "pyyaml==6.0.3"] # /// #MISE description="Summarize the latest Prowler and Kingfisher compliance reports from sifaka" #USAGE flag "--full" help="Show all unmuted failures, not just new ones" diff --git a/mise-tasks/runner-logs b/mise-tasks/runner-logs index 579a5fd..9c988ee 100755 --- a/mise-tasks/runner-logs +++ b/mise-tasks/runner-logs @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["httpx>=0.28.1", "rich>=14.0.0", "typer>=0.24.0"] +# dependencies = ["httpx==0.28.1", "rich==15.0.0", "typer==0.25.0"] # /// #MISE description="List recent Forgejo Actions runs or fetch logs for a specific job" #USAGE arg "[run_number]" help="Run number to show jobs for (omit to list recent runs)" diff --git a/mise-tasks/service-review b/mise-tasks/service-review index 01c4ce0..2d50e0b 100755 --- a/mise-tasks/service-review +++ 
b/mise-tasks/service-review @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["pyyaml>=6.0.2", "rich>=14.0.0", "typer>=0.24.0"] +# dependencies = ["pyyaml==6.0.3", "rich==15.0.0", "typer==0.25.0"] # /// #MISE description="Review the most stale service for version freshness" #USAGE flag "--limit " default="15" help="Number of services to show in the table" diff --git a/mise-tasks/spork-create b/mise-tasks/spork-create index 84d2999..92f4e5c 100755 --- a/mise-tasks/spork-create +++ b/mise-tasks/spork-create @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["httpx>=0.28.1", "rich>=14.0.0", "typer>=0.24.0"] +# dependencies = ["httpx==0.28.1", "rich==15.0.0", "typer==0.25.0"] # /// #MISE description="Create a spork (floating-branch soft-fork) of a mirrored upstream project" #USAGE arg "" help="Repository name in the mirrors/ org on forge (e.g. kingfisher)" diff --git a/prek.toml b/prek.toml index 28776c5..add7799 100644 --- a/prek.toml +++ b/prek.toml @@ -22,13 +22,13 @@ hooks = [ # check-yaml with --unsafe (builtin fast path doesn't support --unsafe yet) [[repos]] repo = "https://github.com/pre-commit/pre-commit-hooks" -rev = "v6.0.0" +rev = "3e8a8703264a2f4a69428a0aa4dcb512790b2c8c" # v6.0.0 hooks = [{ id = "check-yaml", args = ["--unsafe"] }] # Secret detection (running both tools in parallel to compare coverage) [[repos]] repo = "https://github.com/trufflesecurity/trufflehog" -rev = "v3.94.0" +rev = "17456f8c7d042d8c82c9a8ca9e937231f9f42e26" # v3.95.2 hooks = [ { id = "trufflehog", entry = "trufflehog git file://. 
--since-commit HEAD --no-verification --fail", stages = [ "pre-commit", @@ -38,7 +38,7 @@ hooks = [ [[repos]] repo = "https://github.com/mongodb/kingfisher" -rev = "v1.91.0" +rev = "9ddec4ab8b53653d4941e6b3fd4ff602ce91d81b" # v1.97.0 hooks = [ { id = "kingfisher", args = [ "scan", @@ -56,7 +56,7 @@ hooks = [ # YAML linting [[repos]] repo = "https://github.com/adrienverge/yamllint" -rev = "v1.38.0" +rev = "cba56bcde1fdd01c1deb3f945e69764c291a6530" # v1.38.0 hooks = [{ id = "yamllint", args = ["-c", ".yamllint.yaml"] }] # Ansible linting @@ -69,12 +69,12 @@ name = "ansible-lint" entry = "env ANSIBLE_ROLES_PATH=ansible/roles ansible-lint" language = "python" files = "^ansible/" -additional_dependencies = ["ansible-lint>=26.3.0", "ansible-core>=2.18"] +additional_dependencies = ["ansible-lint==26.4.0", "ansible-core==2.20.5"] # Python - ruff for linting and formatting [[repos]] repo = "https://github.com/astral-sh/ruff-pre-commit" -rev = "v0.15.7" +rev = "6fec9b7edb08fd9989088709d864a7826dc74e80" # v0.15.12 hooks = [{ id = "ruff", args = ["--fix"] }, { id = "ruff-format" }] # Python - ty type checker @@ -92,30 +92,30 @@ pass_filenames = false # Shell scripts - shellcheck and shfmt [[repos]] repo = "https://github.com/shellcheck-py/shellcheck-py" -rev = "v0.11.0.1" +rev = "745eface02aef23e168a8afb6b5737818efbea95" # v0.11.0.1 hooks = [{ id = "shellcheck", args = ["--severity=warning"] }] [[repos]] repo = "https://github.com/scop/pre-commit-shfmt" -rev = "v3.13.0-1" +rev = "05c1426671b9237fb5e1444dd63aa5731bec0dfb" # v3.13.1-1 hooks = [{ id = "shfmt", args = ["-i", "2", "-ci", "-bn"] }] # TOML - taplo [[repos]] repo = "https://github.com/ComPWA/taplo-pre-commit" -rev = "v0.9.3" -hooks = [{ id = "taplo-format" }, { id = "taplo-lint" }] +rev = "23eab0f0eedcbedebff420f5fdfb284744adc7b3" # v0.9.3 +hooks = [{ id = "taplo-format" }, { id = "taplo-lint", args = ["--no-schema"] }] # JSON formatting (prettier for consistent style) [[repos]] repo = 
"https://github.com/rbubley/mirrors-prettier" -rev = "v3.8.1" +rev = "515f543f5718ebfd6ce22e16708bb32c68ff96e1" # v3.8.3 hooks = [{ id = "prettier", types_or = ["json"], args = ["--tab-width", "2"] }] # GitHub/Forgejo Actions workflow linting [[repos]] repo = "https://github.com/rhysd/actionlint" -rev = "v1.7.11" +rev = "914e7df21a07ef503a81201c76d2b11c789d3fca" # v1.7.12 hooks = [ { id = "actionlint-system", args = [ "-config-file", From 7fed166c18639bf4ec5950c39fed748d978725e3 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 30 Apr 2026 16:55:08 -0700 Subject: [PATCH 039/122] Update ringtail flake inputs Co-Authored-By: Claude Opus 4.7 (1M context) --- nixos/ringtail/flake.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nixos/ringtail/flake.lock b/nixos/ringtail/flake.lock index 90fdff1..d6a85dc 100644 --- a/nixos/ringtail/flake.lock +++ b/nixos/ringtail/flake.lock @@ -43,11 +43,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1776434932, - "narHash": "sha256-gyqXNMgk3sh+ogY5svd2eNLJ6oEwzbAeaoBrrxD0lKk=", + "lastModified": 1777428379, + "narHash": "sha256-ypxFOeDz+CqADEQNL72haqGjvZQdBR5Vc7pyx2JDttI=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "c7f47036d3df2add644c46d712d14262b7d86c0c", + "rev": "755f5aa91337890c432639c60b6064bb7fe67769", "type": "github" }, "original": { From 9564435b11ec7285af83fef114a8658317389665 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Fri, 1 May 2026 08:05:37 -0700 Subject: [PATCH 040/122] Alloy V1.16.0 (#345) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bump Grafana Alloy v1.14.0 → v1.16.0 across all four services (alloy-k8s, alloy-ringtail, alloy-tracing-ringtail; alloy native ansible). Also migrate the indri build path from `Dockerfile` to a native Dagger `container.py` per the build-container-image migration playbook. 
## Highlights from upstream - v1.15: database observability promoted to stable, OTel Collector → v0.147.0 - v1.16: clustering for `loki.source.kubernetes_events`, MySQL exporter 0.19.0 - One pre-existing breaking change in v1.15 (`loki.source.awsfirehose` undocumented metric prefix rename) — not used here. ## Build infra Alloy v1.16.0's go.mod requires Go 1.26.2. The nix derivation now uses `pkgs.go_1_26` with `GOTOOLCHAIN=local` to avoid auto-downloading a toolchain blob that violated the fixed-output rule. ## Test plan - [ ] CI: `mise run container-build-and-release alloy --ref alloy-v1.16.0` (dispatched as run 522; nix job to be re-triggered with the v1.16.0 goModules outputHash once the local ringtail build surfaces it) - [ ] After CI green, bump `images[].newTag` in three kustomizations to the new `-` and `--nix` tags, deploy from this branch via `argocd app set --revision alloy-v1.16.0 && argocd app sync ` - [ ] Manual rebuild of macOS native binary on gilbert (per ansible/roles/alloy README) and `mise run provision-indri -- --tags alloy --check --diff` - [ ] `mise run services-check` after merge & redeploy Reviewed-on: https://forge.eblu.me/eblume/blumeops/pulls/345 --- argocd/manifests/alloy-k8s/kustomization.yaml | 2 +- .../alloy-ringtail/kustomization.yaml | 2 +- .../alloy-tracing-ringtail/kustomization.yaml | 2 +- containers/alloy/Dockerfile | 68 ------------- containers/alloy/container.py | 95 +++++++++++++++++++ containers/alloy/default.nix | 16 ++-- docs/changelog.d/alloy-v1.16.0.infra.md | 5 + service-versions.yaml | 16 ++-- 8 files changed, 120 insertions(+), 86 deletions(-) delete mode 100644 containers/alloy/Dockerfile create mode 100644 containers/alloy/container.py create mode 100644 docs/changelog.d/alloy-v1.16.0.infra.md diff --git a/argocd/manifests/alloy-k8s/kustomization.yaml b/argocd/manifests/alloy-k8s/kustomization.yaml index f51bd3a..0326190 100644 --- a/argocd/manifests/alloy-k8s/kustomization.yaml +++ 
b/argocd/manifests/alloy-k8s/kustomization.yaml @@ -10,7 +10,7 @@ resources: images: - name: registry.ops.eblu.me/blumeops/alloy - newTag: v1.14.0-fd0bebb + newTag: v1.16.0-26a3ab5 configMapGenerator: - name: alloy-config diff --git a/argocd/manifests/alloy-ringtail/kustomization.yaml b/argocd/manifests/alloy-ringtail/kustomization.yaml index df472aa..cecae35 100644 --- a/argocd/manifests/alloy-ringtail/kustomization.yaml +++ b/argocd/manifests/alloy-ringtail/kustomization.yaml @@ -10,7 +10,7 @@ resources: images: - name: registry.ops.eblu.me/blumeops/alloy - newTag: v1.14.0-fd0bebb-nix + newTag: v1.16.0-26a3ab5-nix configMapGenerator: - name: alloy-config diff --git a/argocd/manifests/alloy-tracing-ringtail/kustomization.yaml b/argocd/manifests/alloy-tracing-ringtail/kustomization.yaml index 5c8e683..ac25f4a 100644 --- a/argocd/manifests/alloy-tracing-ringtail/kustomization.yaml +++ b/argocd/manifests/alloy-tracing-ringtail/kustomization.yaml @@ -9,7 +9,7 @@ resources: images: - name: registry.ops.eblu.me/blumeops/alloy - newTag: v1.14.0-fd0bebb-nix + newTag: v1.16.0-26a3ab5-nix configMapGenerator: - name: alloy-tracing-config diff --git a/containers/alloy/Dockerfile b/containers/alloy/Dockerfile deleted file mode 100644 index f2f30f6..0000000 --- a/containers/alloy/Dockerfile +++ /dev/null @@ -1,68 +0,0 @@ -# Grafana Alloy telemetry collector -# Three-stage build: Web UI (Node), server (Go), runtime (Alpine) - -ARG CONTAINER_APP_VERSION=1.14.0 -ARG ALLOY_VERSION=v${CONTAINER_APP_VERSION} -ARG ALLOY_COMMIT=626a738319812d58ebc25ca6d71651f4925b8b18 - -FROM node:22-alpine AS ui-build - -ARG ALLOY_COMMIT -RUN apk add --no-cache git - -RUN mkdir /app && cd /app \ - && git init \ - && git remote add origin https://forge.ops.eblu.me/mirrors/alloy.git \ - && git fetch --depth 1 origin ${ALLOY_COMMIT} \ - && git checkout FETCH_HEAD - -WORKDIR /app/internal/web/ui -RUN npm ci -RUN npx tsc -b && npx vite build - -FROM golang:1.25-alpine3.22 AS build - -ARG ALLOY_VERSION -ARG 
ALLOY_COMMIT -RUN apk add --no-cache build-base git - -RUN mkdir /app && cd /app \ - && git init \ - && git remote add origin https://forge.ops.eblu.me/mirrors/alloy.git \ - && git fetch --depth 1 origin ${ALLOY_COMMIT} \ - && git checkout FETCH_HEAD - -WORKDIR /app - -# Copy pre-built web UI assets -COPY --from=ui-build /app/internal/web/ui/dist /app/internal/web/ui/dist - -ENV CGO_ENABLED=1 - -# promtail_journal_enabled omitted: requires systemd headers (libsystemd-dev) -# and our k8s deployments read pod logs from the filesystem, not journald -RUN RELEASE_BUILD=1 VERSION=${ALLOY_VERSION} \ - GO_TAGS="netgo embedalloyui" \ - SKIP_UI_BUILD=1 \ - make alloy - -FROM alpine:3.22 - -ARG CONTAINER_APP_VERSION -LABEL org.opencontainers.image.title="Alloy" -LABEL org.opencontainers.image.description="Grafana Alloy is an OpenTelemetry Collector distribution" -LABEL org.opencontainers.image.version="${CONTAINER_APP_VERSION}" -LABEL org.opencontainers.image.source="https://forge.eblu.me/eblume/blumeops" -LABEL org.opencontainers.image.vendor="blumeops" - -RUN apk --no-cache add ca-certificates tzdata \ - && addgroup -g 473 alloy \ - && adduser -D -u 473 -G alloy alloy \ - && mkdir -p /var/lib/alloy/data \ - && chown -R alloy:alloy /var/lib/alloy - -COPY --from=build --chown=473:473 /app/build/alloy /bin/alloy - -ENTRYPOINT ["/bin/alloy"] -ENV ALLOY_DEPLOY_MODE=docker -CMD ["run", "/etc/alloy/config.alloy", "--storage.path=/var/lib/alloy/data"] diff --git a/containers/alloy/container.py b/containers/alloy/container.py new file mode 100644 index 0000000..41d3995 --- /dev/null +++ b/containers/alloy/container.py @@ -0,0 +1,95 @@ +"""Grafana Alloy — telemetry collector, native Dagger build. + +Three-stage build: Node (UI), Go (server via upstream Makefile with embedded +UI assets), Alpine (runtime). Source cloned from forge mirror. 
+ +Notes: + - Builds via `make alloy` rather than plain `go build` so version stamping, + release flags, and the netgo+embedalloyui tags match upstream releases. + - promtail_journal_enabled is intentionally omitted: it requires + libsystemd-dev and our k8s deployments read pod logs from the filesystem, + not journald. + - Uses golang:alpine3.23 (currently Go 1.26.2 — matches alloy v1.16.0's + go.mod toolchain requirement and the go_build helper's image choice). +""" + +import dagger +from dagger import dag + +from blumeops.containers import ( + alpine_runtime, + clone_from_forge, + node_build, + oci_labels, +) + +VERSION = "v1.16.0" + + +async def build(src: dagger.Directory) -> dagger.Container: + source = clone_from_forge("alloy", VERSION) + + # Stage 1: Build the web UI (tsc + vite, not the package.json default). + ui = node_build( + source, + "internal/web/ui", + build_cmd=["sh", "-c", "npx tsc -b && npx vite build"], + ) + + # Stage 2: Build alloy via the upstream Makefile with embedded UI assets. + builder = ( + dag.container() + .from_("golang:alpine3.23") + .with_exec(["apk", "add", "--no-cache", "build-base", "git", "make"]) + .with_directory("/app", source) + .with_directory( + "/app/internal/web/ui/dist", + ui.directory("/app/internal/web/ui/dist"), + ) + .with_workdir("/app") + .with_env_variable("CGO_ENABLED", "1") + .with_env_variable("RELEASE_BUILD", "1") + .with_env_variable("VERSION", VERSION) + .with_env_variable("GO_TAGS", "netgo embedalloyui") + .with_env_variable("SKIP_UI_BUILD", "1") + .with_exec(["make", "alloy"]) + ) + + # Stage 3: Runtime as uid/gid 473 alloy. 
+ runtime = alpine_runtime( + extra_apk=["ca-certificates", "tzdata"], + uid=473, + gid=473, + username="alloy", + ) + runtime = oci_labels( + runtime, + title="Alloy", + description="Grafana Alloy is an OpenTelemetry Collector distribution", + version=VERSION, + ) + return ( + runtime.with_file( + "/bin/alloy", + builder.file("/app/build/alloy"), + permissions=0o555, + ) + .with_exec( + [ + "sh", + "-c", + "mkdir -p /var/lib/alloy/data && chown -R alloy:alloy /var/lib/alloy", + ] + ) + .with_env_variable("ALLOY_DEPLOY_MODE", "docker") + .with_exposed_port(12345) + .with_user("alloy") + .with_entrypoint(["/bin/alloy"]) + .with_default_args( + args=[ + "run", + "/etc/alloy/config.alloy", + "--storage.path=/var/lib/alloy/data", + ] + ) + ) diff --git a/containers/alloy/default.nix b/containers/alloy/default.nix index e508a10..c884704 100644 --- a/containers/alloy/default.nix +++ b/containers/alloy/default.nix @@ -1,24 +1,24 @@ # Nix-built Grafana Alloy telemetry collector -# Builds v1.14.0 from forge mirror with embedded web UI +# Builds v1.16.0 from forge mirror with embedded web UI # Uses stdenv + make (not buildGoModule) due to multi-module workspace # with local replace directives (collector/ -> ../, ../syntax, ../extension) # Built with dockerTools.buildLayeredImage for efficient layer caching { pkgs ? 
import { } }: let - version = "1.14.0"; + version = "1.16.0"; src = pkgs.fetchgit { url = "https://forge.ops.eblu.me/mirrors/alloy.git"; rev = "v${version}"; - hash = "sha256-gxNz4XDE8XSl6LsP3k8DERqDdMLcmbWKfXZGGyRULkg="; + hash = "sha256-q5R2noxBZ3OPyZqmB+bx3iJKWFxC2WIprcgh9RwjLzk="; }; ui = pkgs.buildNpmPackage { inherit version; pname = "alloy-ui"; src = "${src}/internal/web/ui"; - npmDepsHash = "sha256-GT0yisPn+3FCtWL3he0i5zPMlaWNparQDefU69G4Yis="; + npmDepsHash = "sha256-vResNUT4auDsK9ngnJYfMUUOYr/ikPhrvakqCjGq2Q8="; buildPhase = '' runHook preBuild @@ -40,11 +40,12 @@ let pname = "alloy-go-modules"; inherit src version; - nativeBuildInputs = with pkgs; [ go git cacert ]; + nativeBuildInputs = with pkgs; [ go_1_26 git cacert ]; buildPhase = '' export GOPATH=$TMPDIR/go export GOFLAGS=-modcacherw + export GOTOOLCHAIN=local # Download modules for all three go.mod files go mod download cd syntax && go mod download && cd .. @@ -56,7 +57,7 @@ let ''; outputHashMode = "recursive"; - outputHash = "sha256-rD7zqomSVv4d8NaC7jXXgihuQvK8guaAN0KrsBRWMVQ="; + outputHash = "sha256-9/v85HyDInJB+9qHauKVuDol6Yf5mkXfMWgCr7RdRTk="; outputHashAlgo = "sha256"; }; @@ -65,7 +66,7 @@ let pname = "alloy"; nativeBuildInputs = with pkgs; [ - go + go_1_26 git gnumake cacert @@ -77,6 +78,7 @@ let export HOME=$TMPDIR export GOPATH=$TMPDIR/go export GOFLAGS=-modcacherw + export GOTOOLCHAIN=local # Populate module cache from pre-fetched modules mkdir -p $GOPATH/pkg diff --git a/docs/changelog.d/alloy-v1.16.0.infra.md b/docs/changelog.d/alloy-v1.16.0.infra.md new file mode 100644 index 0000000..cd9a1ef --- /dev/null +++ b/docs/changelog.d/alloy-v1.16.0.infra.md @@ -0,0 +1,5 @@ +Upgrade Grafana Alloy v1.14.0 → v1.16.0 across all four service deployments +(alloy-k8s, alloy-ringtail, alloy-tracing-ringtail on k8s; alloy native on +indri). Pulls in stable database observability (v1.15) and the OTel Collector +v0.147.0 bump. 
Container build also migrated from Dockerfile to native Dagger +`container.py` per the build-container-image migration playbook. diff --git a/service-versions.yaml b/service-versions.yaml index f4f4a6a..42f9c77 100644 --- a/service-versions.yaml +++ b/service-versions.yaml @@ -72,22 +72,22 @@ services: - name: alloy-tracing-ringtail type: argocd - last-reviewed: 2026-03-13 - current-version: "v1.14.0" + last-reviewed: 2026-04-30 + current-version: "v1.16.0" upstream-source: https://github.com/grafana/alloy/releases notes: Privileged DaemonSet with Beyla eBPF for HTTP tracing on ringtail - name: alloy-ringtail type: argocd - last-reviewed: 2026-03-13 - current-version: "v1.14.0" + last-reviewed: 2026-04-30 + current-version: "v1.16.0" upstream-source: https://github.com/grafana/alloy/releases notes: DaemonSet on ringtail for host metrics and pod logs - name: alloy-k8s type: argocd - last-reviewed: 2026-03-13 - current-version: "v1.14.0" + last-reviewed: 2026-04-30 + current-version: "v1.16.0" upstream-source: https://github.com/grafana/alloy/releases - name: tailscale-operator @@ -338,8 +338,8 @@ services: - name: alloy type: ansible - last-reviewed: 2026-03-13 - current-version: "v1.14.0" + last-reviewed: 2026-04-30 + current-version: "v1.16.0" upstream-source: https://github.com/grafana/alloy/releases notes: Built from source on indri From 55563afc7e15e179806808ba106e5c0e11bf033e Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Fri, 1 May 2026 08:31:27 -0700 Subject: [PATCH 041/122] =?UTF-8?q?C0:=20alloy=20=E2=80=94=20bump=20kustom?= =?UTF-8?q?ization=20tags=20to=20main-branch=20SHA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per the build-container-image squash-merge convention, rebuild alloy v1.16.0 container images from the main SHA (9564435) and update the three alloy kustomizations to reference :v1.16.0-9564435[-nix] instead of the branch SHA :v1.16.0-26a3ab5[-nix] left over from #345. 
Both images were rebuilt locally on gilbert (dagger) and ringtail (nix) because indri is still under heavy macOS memory-compressor pressure (see separate ticket); CI on indri can't reliably run the dagger publish step. Co-Authored-By: Claude Opus 4.7 (1M context) --- argocd/manifests/alloy-k8s/kustomization.yaml | 2 +- argocd/manifests/alloy-ringtail/kustomization.yaml | 2 +- argocd/manifests/alloy-tracing-ringtail/kustomization.yaml | 2 +- docs/changelog.d/+alloy-main-sha-rebuild.infra.md | 5 +++++ 4 files changed, 8 insertions(+), 3 deletions(-) create mode 100644 docs/changelog.d/+alloy-main-sha-rebuild.infra.md diff --git a/argocd/manifests/alloy-k8s/kustomization.yaml b/argocd/manifests/alloy-k8s/kustomization.yaml index 0326190..3503ead 100644 --- a/argocd/manifests/alloy-k8s/kustomization.yaml +++ b/argocd/manifests/alloy-k8s/kustomization.yaml @@ -10,7 +10,7 @@ resources: images: - name: registry.ops.eblu.me/blumeops/alloy - newTag: v1.16.0-26a3ab5 + newTag: v1.16.0-9564435 configMapGenerator: - name: alloy-config diff --git a/argocd/manifests/alloy-ringtail/kustomization.yaml b/argocd/manifests/alloy-ringtail/kustomization.yaml index cecae35..526fec5 100644 --- a/argocd/manifests/alloy-ringtail/kustomization.yaml +++ b/argocd/manifests/alloy-ringtail/kustomization.yaml @@ -10,7 +10,7 @@ resources: images: - name: registry.ops.eblu.me/blumeops/alloy - newTag: v1.16.0-26a3ab5-nix + newTag: v1.16.0-9564435-nix configMapGenerator: - name: alloy-config diff --git a/argocd/manifests/alloy-tracing-ringtail/kustomization.yaml b/argocd/manifests/alloy-tracing-ringtail/kustomization.yaml index ac25f4a..b1e6338 100644 --- a/argocd/manifests/alloy-tracing-ringtail/kustomization.yaml +++ b/argocd/manifests/alloy-tracing-ringtail/kustomization.yaml @@ -9,7 +9,7 @@ resources: images: - name: registry.ops.eblu.me/blumeops/alloy - newTag: v1.16.0-26a3ab5-nix + newTag: v1.16.0-9564435-nix configMapGenerator: - name: alloy-tracing-config diff --git 
a/docs/changelog.d/+alloy-main-sha-rebuild.infra.md b/docs/changelog.d/+alloy-main-sha-rebuild.infra.md new file mode 100644 index 0000000..42a7b37 --- /dev/null +++ b/docs/changelog.d/+alloy-main-sha-rebuild.infra.md @@ -0,0 +1,5 @@ +Rebuild and retag alloy v1.16.0 container images from the main-branch SHA +following the squash-merge of #345, per the build-container-image +squash-merge convention. Both images (`registry.ops.eblu.me/blumeops/alloy`) +now reference `9564435` rather than the branch SHA `26a3ab5`, restoring +source traceability after branch cleanup. From 2d5530321357f496a0b607a56966b202bc59b206 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Fri, 1 May 2026 10:36:38 -0700 Subject: [PATCH 042/122] =?UTF-8?q?C0:=20alloy=20native=20macOS=20on=20ind?= =?UTF-8?q?ri=20=E2=80=94=20upgrade=20to=20v1.16.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Completes the v1.16.0 fleet upgrade for the fourth alloy service (type: ansible, built from source on indri). Binary built on gilbert with Go 1.26.2 + CGO, scp'd to indri, codesigned, LaunchAgent reloaded. Service reports clean WAL replay and resumed metric/log shipping. Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/changelog.d/+alloy-native-macos-v1.16.0.infra.md | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 docs/changelog.d/+alloy-native-macos-v1.16.0.infra.md diff --git a/docs/changelog.d/+alloy-native-macos-v1.16.0.infra.md b/docs/changelog.d/+alloy-native-macos-v1.16.0.infra.md new file mode 100644 index 0000000..471990f --- /dev/null +++ b/docs/changelog.d/+alloy-native-macos-v1.16.0.infra.md @@ -0,0 +1,6 @@ +Upgrade native macOS Alloy on indri to v1.16.0. Built on gilbert with Go +1.26.2 + CGO (required for the macOS native DNS resolver, which Tailscale +MagicDNS depends on), scp'd to `~/.local/bin/alloy` on indri, codesigned, +and the LaunchAgent reloaded. 
Completes the v1.16.0 fleet upgrade started +in #345 — all four Alloy services (alloy-k8s, alloy-ringtail, +alloy-tracing-ringtail, alloy ansible) now run v1.16.0. From 4aa08729493522e2b82181b38df040412e2a52c2 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Fri, 1 May 2026 10:42:33 -0700 Subject: [PATCH 043/122] =?UTF-8?q?C0:=20review=20ollama=20doc=20=E2=80=94?= =?UTF-8?q?=20refresh=20image,=20models,=20last-reviewed?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumped documented image tag to 0.20.4 (matches kustomization newTag), added the two qwen3.5 models from models.txt, and stamped the card. Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/changelog.d/+review-ollama-doc.doc.md | 1 + docs/reference/services/ollama.md | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) create mode 100644 docs/changelog.d/+review-ollama-doc.doc.md diff --git a/docs/changelog.d/+review-ollama-doc.doc.md b/docs/changelog.d/+review-ollama-doc.doc.md new file mode 100644 index 0000000..05ef23e --- /dev/null +++ b/docs/changelog.d/+review-ollama-doc.doc.md @@ -0,0 +1 @@ +Review and refresh the Ollama reference card: add `last-reviewed`, bump the documented image tag to 0.20.4, and add the two `qwen3.5` models now declared in `models.txt`. diff --git a/docs/reference/services/ollama.md b/docs/reference/services/ollama.md index 75480cb..b749cf2 100644 --- a/docs/reference/services/ollama.md +++ b/docs/reference/services/ollama.md @@ -1,6 +1,7 @@ --- title: Ollama -modified: 2026-03-04 +modified: 2026-05-01 +last-reviewed: 2026-05-01 tags: - service - ai @@ -18,7 +19,7 @@ LLM inference server with GPU acceleration. 
Runs on [[ringtail]] with declarativ | **Tailscale URL** | https://ollama.tail8d86e.ts.net | | **Namespace** | `ollama` | | **Cluster** | ringtail k3s | -| **Image** | `ollama/ollama:0.17.5` | +| **Image** | `ollama/ollama:0.20.4` | | **Upstream** | https://github.com/ollama/ollama | | **Manifests** | `argocd/manifests/ollama/` | | **API Port** | 11434 | @@ -50,6 +51,8 @@ Declared in `argocd/manifests/ollama/models.txt`. The model-sync sidecar pulls m | `deepseek-r1:14b` | 14B | | `phi4:14b` | 14B | | `gemma3:12b` | 12B | +| `qwen3.5:9b` | 9B | +| `qwen3.5:27b` | 27B | To add or remove models, edit `models.txt` and sync via ArgoCD. From f84f5f02b3e10efa8468460c21a4e027cd91ff68 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Fri, 1 May 2026 10:49:22 -0700 Subject: [PATCH 044/122] C0: review compensating control trusted-ci-only Verified Forgejo runner is registered only to forge.ops.eblu.me and the forge has registration disabled, so no untrusted users can trigger privileged CI. Tightened notes to reflect the closed-forge mechanism (not a per-repo allow-list). Co-Authored-By: Claude Opus 4.7 (1M context) --- compensating-controls.yaml | 13 ++++++++++--- docs/changelog.d/+review-cc-trusted-ci-only.misc.md | 1 + 2 files changed, 11 insertions(+), 3 deletions(-) create mode 100644 docs/changelog.d/+review-cc-trusted-ci-only.misc.md diff --git a/compensating-controls.yaml b/compensating-controls.yaml index fb5450d..a6dbc56 100644 --- a/compensating-controls.yaml +++ b/compensating-controls.yaml @@ -110,10 +110,17 @@ controls: forge (forge.ops.eblu.me). No external or untrusted repos can trigger privileged CI jobs. created: 2026-03-30 - last-reviewed: 2026-03-30 + last-reviewed: 2026-05-01 notes: >- - Verify runner registration is limited to the forge instance. - Check Forgejo runner config for repo allow-lists. + Verification: (1) Runner config (argocd/manifests/forgejo-runner/ + config.yaml) connects only to https://forge.ops.eblu.me/. 
(2) Forge + app.ini has DISABLE_REGISTRATION=true and ALLOW_ONLY_EXTERNAL_REGISTRATION + =true (ansible/roles/forgejo/defaults/main.yml) — no untrusted users + can sign up or create repos. The runner registers at instance scope + (repo_id=0/owner_id=0 in action_runner table), but the instance itself + is closed, so no per-repo allow-list is needed. Re-evaluate if the + forge ever opens to additional users or if the runner is repointed + to an external forge. - id: init-container-isolation description: >- diff --git a/docs/changelog.d/+review-cc-trusted-ci-only.misc.md b/docs/changelog.d/+review-cc-trusted-ci-only.misc.md new file mode 100644 index 0000000..89dc653 --- /dev/null +++ b/docs/changelog.d/+review-cc-trusted-ci-only.misc.md @@ -0,0 +1 @@ +Reviewed compensating control `trusted-ci-only`: Forgejo runner is registered only to the private forge, which has registration disabled — no untrusted users can create repos or trigger privileged CI. Tightened the notes to reflect that the closed-forge property (not a per-repo allow-list) is what actually mitigates the risk. From fabca0477188b3e4713b27c71f11a7022e8a3add Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Fri, 1 May 2026 17:40:03 -0700 Subject: [PATCH 045/122] Mirror valkey 8.1 locally for paperless and immich (#346) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary - Add native Dagger build of valkey 8.1.6-r0 on Alpine 3.22 at `containers/valkey/` - Swap paperless redis sidecar and immich-valkey from `docker.io/valkey/valkey:8.1-alpine` to `registry.ops.eblu.me/blumeops/valkey:v8.1.6-r0-946fa75` - Resolves the DR-2026-04 TODO in paperless kustomization about multi-arch redis ## Why Move toward fully locally-built containers for supply chain control. Paperless and immich both pulled the same upstream tag — one mirror serves both. Authentik's nix-built Redis stays separate (different image entirely). ## Risk Low. 
Both sidecars are stateless caches: - paperless redis: no volumeMount (in-pod localhost, pure memory) - immich-valkey: `emptyDir` (cache only) Pod restart rebuilds the cache. Smoke-tested locally (PING/SET/GET roundtrip on `valkey 8.1.6` with `--bind 0.0.0.0 --protected-mode no`). ## Test plan - [ ] After merge: `mise run container-build-and-release valkey` to rebuild with main SHA - [ ] Update kustomizations to the `[main]` SHA tag (C0 follow-up) - [ ] `argocd app sync paperless` and `argocd app sync immich` - [ ] Verify pods come up healthy (paperless OCR queue functional, immich job queue functional) - [ ] `mise run services-check` 🤖 Generated with [Claude Code](https://claude.com/claude-code) Reviewed-on: https://forge.eblu.me/eblume/blumeops/pulls/346 --- argocd/manifests/immich/kustomization.yaml | 3 +- argocd/manifests/paperless/kustomization.yaml | 7 +-- containers/valkey/container.py | 47 +++++++++++++++++++ docs/changelog.d/valkey-mirror.infra.md | 1 + service-versions.yaml | 12 +++++ 5 files changed, 64 insertions(+), 6 deletions(-) create mode 100644 containers/valkey/container.py create mode 100644 docs/changelog.d/valkey-mirror.infra.md diff --git a/argocd/manifests/immich/kustomization.yaml b/argocd/manifests/immich/kustomization.yaml index c7c54e1..399a975 100644 --- a/argocd/manifests/immich/kustomization.yaml +++ b/argocd/manifests/immich/kustomization.yaml @@ -19,4 +19,5 @@ images: - name: ghcr.io/immich-app/immich-machine-learning newTag: v2.6.3 - name: docker.io/valkey/valkey - newTag: "8.1-alpine" + newName: registry.ops.eblu.me/blumeops/valkey + newTag: v8.1.6-r0-946fa75 diff --git a/argocd/manifests/paperless/kustomization.yaml b/argocd/manifests/paperless/kustomization.yaml index 3e65578..4e1f658 100644 --- a/argocd/manifests/paperless/kustomization.yaml +++ b/argocd/manifests/paperless/kustomization.yaml @@ -14,9 +14,6 @@ resources: images: - name: registry.ops.eblu.me/blumeops/paperless newTag: v2.20.13-07f52e9 - # TODO(DR-2026-04): 
authentik-redis is amd64-only (nix-built on ringtail). - # Was running under QEMU emulation before. Switched to upstream valkey - # during DR recovery. Build a multi-arch blumeops/redis or keep upstream. - name: docker.io/library/redis - newName: docker.io/valkey/valkey - newTag: "8.1-alpine" + newName: registry.ops.eblu.me/blumeops/valkey + newTag: v8.1.6-r0-946fa75 diff --git a/containers/valkey/container.py b/containers/valkey/container.py new file mode 100644 index 0000000..5d150e7 --- /dev/null +++ b/containers/valkey/container.py @@ -0,0 +1,47 @@ +"""Valkey — native Dagger build. + +Alpine 3.22 base with the `valkey` apk package (8.1.x — Redis-compatible). +Mirrors `docker.io/valkey/valkey:8.1-alpine`, used by paperless and immich +as a cache/queue sidecar. +""" + +import dagger +from dagger import dag + +from blumeops.containers import oci_labels + +# Alpine 3.22 ships valkey 8.1.6-r0. Alpine 3.23 jumps to 9.0 — hold on 3.22 +# to keep this a 1:1 swap for the upstream `valkey:8.1-alpine` image. +VERSION = "8.1.6-r0" + +ALPINE_BASE = "alpine:3.22" + + +async def build(src: dagger.Directory) -> dagger.Container: + ctr = ( + dag.container() + .from_(ALPINE_BASE) + .with_exec(["apk", "add", "--no-cache", f"valkey={VERSION}"]) + .with_exec(["mkdir", "-p", "/data"]) + .with_exec(["chown", "valkey:valkey", "/data"]) + .with_workdir("/data") + .with_exposed_port(6379) + .with_user("valkey") + .with_default_args( + args=[ + "valkey-server", + "--bind", + "0.0.0.0", + "--protected-mode", + "no", + "--dir", + "/data", + ] + ) + ) + return oci_labels( + ctr, + title="Valkey", + description="Valkey high-performance key/value datastore (Redis-compatible)", + version=VERSION, + ) diff --git a/docs/changelog.d/valkey-mirror.infra.md b/docs/changelog.d/valkey-mirror.infra.md new file mode 100644 index 0000000..06f8d98 --- /dev/null +++ b/docs/changelog.d/valkey-mirror.infra.md @@ -0,0 +1 @@ +Mirror Valkey 8.1 locally as `registry.ops.eblu.me/blumeops/valkey`. 
Replaces direct pulls of `docker.io/valkey/valkey:8.1-alpine` for paperless and immich sidecars. Built via native Dagger pipeline on Alpine 3.22. Stateless swap — no data migration. Authentik's nix-built Redis remains separate. diff --git a/service-versions.yaml b/service-versions.yaml index 42f9c77..76c0655 100644 --- a/service-versions.yaml +++ b/service-versions.yaml @@ -125,6 +125,18 @@ services: upstream-source: https://github.com/immich-app/immich/releases notes: Kustomize manifests with upstream images + - name: valkey + type: argocd + last-reviewed: 2026-05-01 + current-version: "8.1.6-r0" + upstream-source: https://pkgs.alpinelinux.org/package/v3.22/community/aarch64/valkey + notes: >- + Shared Alpine-built valkey image, used as a sidecar/cache by paperless + (sidecar) and immich (separate Deployment). Mirrors the upstream + docker.io/valkey/valkey:8.1-alpine. Pinned to Alpine 3.22 for valkey 8.1.x; + Alpine 3.23 jumps to 9.0. Distinct from authentik-redis (nix-built Redis + 8.x) which has its own entry. + - name: external-secrets type: argocd last-reviewed: 2026-03-25 From 2c0917b266cc0f6715a733a92e106705d83584c9 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Fri, 1 May 2026 17:47:16 -0700 Subject: [PATCH 046/122] =?UTF-8?q?C0:=20valkey=20=E2=80=94=20bump=20kusto?= =?UTF-8?q?mization=20tags=20to=20main-branch=20SHA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Routine post-merge follow-up after #346. Branch SHA tag (946fa75) replaced with the main-SHA-built tag (fabca04) so paperless and immich reference an image traceable to a commit on main. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- argocd/manifests/immich/kustomization.yaml | 2 +- argocd/manifests/paperless/kustomization.yaml | 2 +- docs/changelog.d/+valkey-main-tag-bump.infra.md | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) create mode 100644 docs/changelog.d/+valkey-main-tag-bump.infra.md diff --git a/argocd/manifests/immich/kustomization.yaml b/argocd/manifests/immich/kustomization.yaml index 399a975..5f8d02b 100644 --- a/argocd/manifests/immich/kustomization.yaml +++ b/argocd/manifests/immich/kustomization.yaml @@ -20,4 +20,4 @@ images: newTag: v2.6.3 - name: docker.io/valkey/valkey newName: registry.ops.eblu.me/blumeops/valkey - newTag: v8.1.6-r0-946fa75 + newTag: v8.1.6-r0-fabca04 diff --git a/argocd/manifests/paperless/kustomization.yaml b/argocd/manifests/paperless/kustomization.yaml index 4e1f658..9c6a086 100644 --- a/argocd/manifests/paperless/kustomization.yaml +++ b/argocd/manifests/paperless/kustomization.yaml @@ -16,4 +16,4 @@ images: newTag: v2.20.13-07f52e9 - name: docker.io/library/redis newName: registry.ops.eblu.me/blumeops/valkey - newTag: v8.1.6-r0-946fa75 + newTag: v8.1.6-r0-fabca04 diff --git a/docs/changelog.d/+valkey-main-tag-bump.infra.md b/docs/changelog.d/+valkey-main-tag-bump.infra.md new file mode 100644 index 0000000..cd19f60 --- /dev/null +++ b/docs/changelog.d/+valkey-main-tag-bump.infra.md @@ -0,0 +1 @@ +Bump paperless and immich kustomizations to the main-SHA-built valkey tag (`v8.1.6-r0-fabca04`). Routine post-merge follow-up to keep production manifests pointing at images built from a commit on main. 
From a2c61b625d45311f1ebaaec90cd81e37c04a283a Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Mon, 4 May 2026 13:42:57 -0700 Subject: [PATCH 047/122] =?UTF-8?q?C0:=20rotate-fly-deploy-token=20?= =?UTF-8?q?=E2=80=94=20fish+bash=20one-shot,=20op=20validator=20gotcha?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Combine mint+store into a single command with both fish and bash forms (the doc previously only showed manual paste). Document the 1Password CLI "Password item requires ps value" validator error and the placeholder-password workaround for Password-category items with empty primary password fields. Co-Authored-By: Claude Opus 4.7 (1M context) --- ...ate-fly-deploy-token-shell-examples.doc.md | 1 + .../configuration/rotate-fly-deploy-token.md | 38 +++++++++++++------ 2 files changed, 27 insertions(+), 12 deletions(-) create mode 100644 docs/changelog.d/+rotate-fly-deploy-token-shell-examples.doc.md diff --git a/docs/changelog.d/+rotate-fly-deploy-token-shell-examples.doc.md b/docs/changelog.d/+rotate-fly-deploy-token-shell-examples.doc.md new file mode 100644 index 0000000..24ffcb9 --- /dev/null +++ b/docs/changelog.d/+rotate-fly-deploy-token-shell-examples.doc.md @@ -0,0 +1 @@ +rotate-fly-deploy-token: combine mint+store into one command with both fish and bash forms; document the `op item edit` "Password item requires ps value" validator gotcha and the placeholder-password workaround. diff --git a/docs/how-to/configuration/rotate-fly-deploy-token.md b/docs/how-to/configuration/rotate-fly-deploy-token.md index 58aba21..5863f54 100644 --- a/docs/how-to/configuration/rotate-fly-deploy-token.md +++ b/docs/how-to/configuration/rotate-fly-deploy-token.md @@ -1,7 +1,7 @@ --- title: Rotate the Fly.io API Token -modified: 2026-04-30 -last-reviewed: 2026-04-30 +modified: 2026-05-04 +last-reviewed: 2026-05-04 tags: - how-to - fly-io @@ -45,24 +45,38 @@ fly auth login (Browser-based. 
Required to mint a new token, since the existing deploy token can't create tokens.) -### 2. Mint the new token +### 2. Mint the new token and store it + +The token is shown only once at creation, so combine the mint and the 1Password write into a single command. Pick the form for your shell. + +`fish`: ```fish -fly tokens create org \ - --org personal \ - --name "blumeops-proxy deploy $(date +%Y-%m-%d)" \ - --expiry 2160h +op item edit on5slfaygtdjrxmdwezyhfmqsq "add more.deploy-token="(fly tokens create org --org personal --name "blumeops-proxy deploy "(date +%Y-%m-%d) --expiry 2160h) --vault vg6xf6vvfmoh5hqjjhlhbeoaie ``` -(`2160h` = 90 days, paired with the 75-day rotation cadence for a 15-day buffer. Capture the output — it's the only time the token is shown.) +`bash` / `zsh`: -### 3. Update 1Password +```bash +op item edit on5slfaygtdjrxmdwezyhfmqsq "add more.deploy-token=$(fly tokens create org --org personal --name "blumeops-proxy deploy $(date +%Y-%m-%d)" --expiry 2160h)" --vault vg6xf6vvfmoh5hqjjhlhbeoaie +``` + +(`2160h` = 90 days, paired with the 75-day rotation cadence for a 15-day buffer.) + +If you'd rather paste manually: ```fish +fly tokens create org --org personal --name "blumeops-proxy deploy $(date +%Y-%m-%d)" --expiry 2160h op item edit on5slfaygtdjrxmdwezyhfmqsq 'add more.deploy-token=<token>' --vault vg6xf6vvfmoh5hqjjhlhbeoaie ``` -### 4. Sync to Forgejo Actions +> **op validator gotcha:** If `op item edit` returns `Password item requires ps value`, the item's primary `password` field is empty. The 1Password CLI validator rejects edits to a Password-category item with no primary password, even when you're only touching a section field. Set a placeholder once and future rotations will work: +> +> ```fish +> op item edit on5slfaygtdjrxmdwezyhfmqsq 'password=unused - see deploy-token field' --vault vg6xf6vvfmoh5hqjjhlhbeoaie +> ``` + +### 3. 
Sync to Forgejo Actions The `deploy-fly` workflow reads the same token from a Forgejo Actions secret named `FLY_DEPLOY_TOKEN`, populated by the `forgejo_actions_secrets` ansible role: @@ -70,7 +84,7 @@ The `deploy-fly` workflow reads the same token from a Forgejo Actions secret nam mise run provision-indri -- --tags forgejo_actions_secrets ``` -### 5. Verify +### 4. Verify ```fish mise run fly-deploy @@ -80,7 +94,7 @@ A successful deploy confirms the new token works locally. Watch for the metrics- Then trigger the CI workflow (push a no-op commit touching `fly/`, or dispatch manually) to confirm Forgejo Actions has the new secret. -### 6. Revoke the old token +### 5. Revoke the old token ```fish fly tokens list From f16e1c81f1957e16f9f4fb611d1a618856585b53 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Mon, 4 May 2026 17:41:07 -0700 Subject: [PATCH 048/122] =?UTF-8?q?C0:=20zot=20=E2=80=94=20upgrade=20indri?= =?UTF-8?q?=20registry=20to=20v2.1.16?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Security fixes only (TLS verification on metrics client, CORS Allow-Credentials suppression on wildcard origin, manifest/API-key body-size limits, dependabot bumps). No config changes required; re-built from source on indri and bounced launchagent. Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/changelog.d/+zot-v2.1.16.infra.md | 1 + service-versions.yaml | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) create mode 100644 docs/changelog.d/+zot-v2.1.16.infra.md diff --git a/docs/changelog.d/+zot-v2.1.16.infra.md b/docs/changelog.d/+zot-v2.1.16.infra.md new file mode 100644 index 0000000..f007164 --- /dev/null +++ b/docs/changelog.d/+zot-v2.1.16.infra.md @@ -0,0 +1 @@ +Upgraded zot on indri from v2.1.15 to v2.1.16 (security fixes: TLS verification on metrics client, CORS Allow-Credentials suppression on wildcard origins, manifest/API-key body size limits). 
diff --git a/service-versions.yaml b/service-versions.yaml index 76c0655..792f4eb 100644 --- a/service-versions.yaml +++ b/service-versions.yaml @@ -357,8 +357,8 @@ services: - name: zot type: ansible - last-reviewed: 2026-03-14 - current-version: "v2.1.15" + last-reviewed: 2026-05-04 + current-version: "v2.1.16" upstream-source: https://github.com/project-zot/zot/releases notes: Built from source on indri From 9fb5442ccd831f4a02ec7ceb8be3433d51495d0c Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Mon, 4 May 2026 17:46:16 -0700 Subject: [PATCH 049/122] =?UTF-8?q?C0:=20kiwix=20=E2=80=94=20doc=20review,?= =?UTF-8?q?=20fix=20Adding=20Archives=20source=20path?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/reference/services/kiwix.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/reference/services/kiwix.md b/docs/reference/services/kiwix.md index 6806a5e..04fe0f6 100644 --- a/docs/reference/services/kiwix.md +++ b/docs/reference/services/kiwix.md @@ -1,6 +1,7 @@ --- title: Kiwix -modified: 2026-03-05 +modified: 2026-05-04 +last-reviewed: 2026-05-04 tags: - service - knowledge @@ -41,7 +42,7 @@ Full list: `argocd/manifests/kiwix/torrents.txt` ## Adding Archives -1. Edit `configmap-zim-torrents.yaml` +1. Edit `argocd/manifests/kiwix/torrents.txt` (rendered into a ConfigMap by `configMapGenerator`) 2. Add torrent URL from https://download.kiwix.org/zim/ 3. Sync: `argocd app sync kiwix` 4. 
Torrent-sync adds to [[transmission]] From 074887cd571eaf777baa134a8deaec55275fc82e Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Mon, 4 May 2026 18:19:53 -0700 Subject: [PATCH 050/122] =?UTF-8?q?C0:=20docs=20=E2=80=94=20explanation=20?= =?UTF-8?q?article=20on=20compliance=20mute=20categories?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Captures the CC vs NA vs RA distinction surfaced during the 2026-05-03 weekly compliance review (CVE-2026-31789), and the image-scan mutelist gap that blocks acting on it. Links the new article from the review-compensating-controls how-to so it isn't orphaned. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../+compliance-mute-categories.doc.md | 1 + .../explanation/compliance-mute-categories.md | 99 +++++++++++++++++++ .../review-compensating-controls.md | 2 + 3 files changed, 102 insertions(+) create mode 100644 docs/changelog.d/+compliance-mute-categories.doc.md create mode 100644 docs/explanation/compliance-mute-categories.md diff --git a/docs/changelog.d/+compliance-mute-categories.doc.md b/docs/changelog.d/+compliance-mute-categories.doc.md new file mode 100644 index 0000000..c776e46 --- /dev/null +++ b/docs/changelog.d/+compliance-mute-categories.doc.md @@ -0,0 +1 @@ +New explanation article [[compliance-mute-categories]] documenting the gap between current `CC:`-only mute tagging and the three structurally distinct categories (compensating control, not-applicable, risk-accepted) needed for real PCI DSS / SOC2 practice. Captures the current image-scan mutelist gap (`cronjob-image-scan.yaml` doesn't pass `--mutelist-file`) and proposes an order-of-operations for wiring it up alongside the new tag conventions. Triggered by CVE-2026-31789, an OpenSSL 32-bit-only finding that surfaced the need for an NA category. 
diff --git a/docs/explanation/compliance-mute-categories.md b/docs/explanation/compliance-mute-categories.md new file mode 100644 index 0000000..4c5f3a3 --- /dev/null +++ b/docs/explanation/compliance-mute-categories.md @@ -0,0 +1,99 @@ +--- +title: Compliance Mute Categories +modified: 2026-05-04 +last-reviewed: 2026-05-04 +tags: + - explanation + - security + - compliance +--- + +# Compliance Mute Categories + +> **Note:** This article was drafted by AI and reviewed by Erich. I plan to rewrite all explanatory content in my own words - these serve as placeholders to establish the documentation structure. + +How BlumeOps should categorize muted compliance findings, why a single "compensating control" tag is not enough, and what tooling work is needed to support multiple categories cleanly. + +## Why this matters + +When a compliance scanner ([[prowler]], Trivy via Prowler IaC, Kingfisher) reports a failing finding, there are three structurally different reasons we might suppress it: + +1. **Compensating control (CC)** — the requirement applies and we *do not* meet it directly, but an alternative control mitigates the same risk. +2. **Not applicable (NA)** — the requirement's preconditions cannot be satisfied in our environment, so the finding is structurally inert (e.g. a 32-bit-only CVE on 64-bit-only hosts). +3. **Risk accepted (RA)** — the requirement applies, we do not meet it, no compensating control exists, and we have explicitly chosen to accept the residual risk for a bounded period. + +Today every muted finding in BlumeOps uses the `CC: <control-id>` convention. That conflates all three categories. In a real PCI DSS or SOC2 environment, auditors treat them very differently: + +- A CC requires documentation of the constraint, the alternative measure, and recurring validation that the measure still works. +- An NA requires documentation of *why* the precondition cannot be met, with periodic verification that the environmental fact still holds. 
+- An RA requires an explicit decision-maker, an expiry date, and a scheduled re-decision. + +Mixing them under one tag means stale CCs hide stale RAs, and NAs that should be revisited when the environment changes get treated as permanent fixtures. + +## Trigger case: CVE-2026-31789 + +The 2026-05-03 weekly compliance review surfaced [CVE-2026-31789](https://nvd.nist.gov/vuln/detail/CVE-2026-31789), an OpenSSL heap buffer overflow during X.509 certificate processing on **32-bit systems**. Prowler's image scanner flagged 216 findings across 106 BlumeOps images carrying `libssl3` / `libcrypto3` below the fixed versions. + +The CVE is genuine, but its preconditions cannot be satisfied in our environment: indri is Apple Silicon (arm64), ringtail is x86_64, and we run no 32-bit containers. This is the canonical NA case — not a CC, because there is no "alternative measure mitigating the risk." The risk does not exist for us at all. + +A CC like `no-32bit-runtimes` would technically work, but conflates the categories: if we ever introduce a 32-bit runtime we would have to remember that this CC was load-bearing for the mute, retire or scope it down, and reopen the muted findings. An NA tag with a short justification makes the precondition explicit and self-documents the conditions under which it must be revisited. + +## Current tooling state + +Three Prowler scans run weekly. Their mute paths today: + +| Scan | Mute mechanism | File(s) | +|------|----------------|---------| +| K8s CIS (Sunday) | Prowler `--mutelist-file`, merged from ConfigMap | `argocd/manifests/prowler/mutelist/*.yaml` | +| IaC (Saturday) | Trivy `--ignorefile` shim (Prowler's `--mutelist-file` is a no-op for IaC) | `argocd/manifests/prowler/mutelist/trivyignore.yaml` | +| Container Images (Saturday) | **None — `cronjob-image-scan.yaml` does not pass `--mutelist-file`** | n/a | + +The image scan has never been wired to a mutelist. 
The CSV reports do contain a `MUTED` column, but it is always `False` because no mutelist is supplied. All 14k+ image findings flow through to `review-compliance-reports` unfiltered. + +The mute tag convention is consistent across the two configured scans: each entry's `Description:` (or `statement:` for trivyignore) starts with `CC: <control-id>. <justification>`. `mise run review-compensating-controls` greps for those IDs to find every file that depends on each control. There is no NA tag, no RA tag, and no expiry field. + +## Proposed model + +### Tag prefixes + +Extend the description-prefix convention: + +- `CC: <control-id>. <justification>` — references an entry in `compensating-controls.yaml`. Existing convention, unchanged. +- `NA: <reason>. <justification>` — environmental precondition fails. Reason should be specific enough that a reviewer can verify it (e.g. `NA: no 32-bit runtimes`, not `NA: doesn't apply`). +- `RA: <decision-maker>; expires <YYYY-MM-DD>. <justification>` — explicit risk acceptance with a hard expiry. Past the expiry, re-review is mandatory. + +Tag choice is exclusive: a given mute is one of CC, NA, or RA. If two reasons apply, pick the strongest — CC > RA > NA. + +### Tooling changes required + +1. **Wire the image scan to a mutelist.** Add `argocd/manifests/prowler/mutelist/image-cves.yaml`, mount-and-merge it the same way `cronjob.yaml` mounts its mutelist parts, and pass `--mutelist-file` to `prowler image`. Verify experimentally that `prowler image` honors the flag — Prowler's behavior across providers is inconsistent, and the IaC provider notably does not. If `prowler image` ignores it, fall back to post-scan filtering inside `review-compliance-reports`. + +2. **Teach `review-compensating-controls` (or a sibling) to surface NA and RA entries.** CCs already get a staleness queue. NAs should appear in a separate queue keyed on the reason text — when an NA reason becomes false (e.g. we do introduce a 32-bit runtime), every NA mute citing that reason must be reopened. RAs should sort by expiry date, with anything past expiry flagged red. + +3. 
**Expiry parsing.** RA tags carry a hard date. The simplest path is to parse it from the description string at review time. A more durable path is to extend the mutelist YAML schema with a structured `expires:` field and a small wrapper that strips it before passing the file to Prowler. Either works; the structured field is friendlier to editors. + +### Out of scope (for now) + +- Changing the underlying Prowler mutelist YAML schema. Stay within the `Mutelist:` shape Prowler expects. +- Migrating existing `CC:` entries. The current set is genuinely CCs and should stay tagged that way. +- Building an issue-tracker integration. Todoist is the source of truth for "remember to re-review this" until that scales painfully. + +## Order of operations + +When this work is picked up, the suggested sequence is: + +1. **Scope and confirm.** Re-read this article, confirm the model still fits, adjust if not. +2. **Wire the image-scan mutelist.** Smallest atomic change; produces immediate value (the CVE-2026-31789 mute can land as the first NA entry). +3. **Add the NA convention.** Update [[read-compliance-reports]] and [[review-compensating-controls]] how-tos to describe the three tag prefixes. The convention can land before tooling supports it — review will just be manual until tooling catches up. +4. **Extend the review tools.** Add NA and RA queues to `review-compensating-controls` (or a new task). At this point, parse expiry from RA descriptions. +5. **Optionally: structured expiry.** If RA entries become common, migrate to a structured `expires:` YAML field with a wrapper that filters it out before Prowler reads the file. + +The first three steps are a coherent C1. Steps 4–5 can be split off if scope creeps. 
+ +## Related + +- [[read-compliance-reports]] — the weekly review process this feeds into +- [[review-compensating-controls]] — current CC review tooling +- [[security-model]] — overall security posture +- [[prowler]] — scanner reference +- [[agent-change-process]] — how to scope and execute the implementation diff --git a/docs/how-to/operations/review-compensating-controls.md b/docs/how-to/operations/review-compensating-controls.md index b05958e..8a32d98 100644 --- a/docs/how-to/operations/review-compensating-controls.md +++ b/docs/how-to/operations/review-compensating-controls.md @@ -38,6 +38,8 @@ A compensating control is a security measure that mitigates the risk a finding w Controls are documented in `compensating-controls.yaml` and referenced from security tool configurations (Prowler mutelist files, Kingfisher config, etc.) using the format `CC: <control-id>`. +A compensating control is only one of three structurally distinct ways to suppress a finding — see [[compliance-mute-categories]] for when to reach for a CC versus a not-applicable (`NA:`) or risk-accepted (`RA:`) tag instead. + ## Review Process For each control up for review: From 24e549025952e9eb17ab58fe2c1b9db2ac3b857f Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Mon, 4 May 2026 18:31:13 -0700 Subject: [PATCH 051/122] =?UTF-8?q?C0:=20review=20CC=20init-container-isol?= =?UTF-8?q?ation=20=E2=80=94=20defer=20retirement=20to=20post-ringtail?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Runtime grafana pod matches the manifest and the CC's claim; bumped last-reviewed. Noted that retiring init-chown-data in favor of fsGroup alone should wait until grafana migrates to ringtail's k3s, since the storage backend will change. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- compensating-controls.yaml | 7 ++++++- .../+review-cc-init-container-isolation.misc.md | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) create mode 100644 docs/changelog.d/+review-cc-init-container-isolation.misc.md diff --git a/compensating-controls.yaml b/compensating-controls.yaml index a6dbc56..658c99d 100644 --- a/compensating-controls.yaml +++ b/compensating-controls.yaml @@ -129,11 +129,16 @@ controls: containers run as non-root (UID 472) with all capabilities dropped. created: 2026-03-30 - last-reviewed: 2026-03-30 + last-reviewed: 2026-05-04 notes: >- Verify by inspecting grafana deployment.yaml securityContext for both init and runtime containers. If fsGroup alone can handle PVC ownership, remove init-chown-data and this control. + Retirement deferred until grafana lands on ringtail's k3s + (see [[indri-k8s-migration]]) — storage backend will change, + and removing init-chown-data right before that migration + trades a real safety net for marginal cleanup. Revisit + post-migration. - id: node-config-automated-verification description: >- diff --git a/docs/changelog.d/+review-cc-init-container-isolation.misc.md b/docs/changelog.d/+review-cc-init-container-isolation.misc.md new file mode 100644 index 0000000..295e7f8 --- /dev/null +++ b/docs/changelog.d/+review-cc-init-container-isolation.misc.md @@ -0,0 +1 @@ +Reviewed compensating control `init-container-isolation` (35 days stale). Grafana's running pod matches the manifest and the CC's claim — only `init-chown-data` runs as root with `CHOWN`; runtime containers all run as UID 472 with all caps dropped. Retirement (replacing init-chown-data with `fsGroup` alone) is plausible given the in-tree minikube-hostpath provisioner, but deferred until grafana lands on ringtail's k3s — note added to the CC. 
From 39b042e6383a59581a650035de0cb06a7c8ad8d2 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Wed, 6 May 2026 06:11:15 -0700 Subject: [PATCH 052/122] =?UTF-8?q?C0:=20service=20review=20=E2=80=94=20ca?= =?UTF-8?q?ddy=20v2.11.2=20(current=20latest,=20healthy)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Opus 4.7 (1M context) --- service-versions.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/service-versions.yaml b/service-versions.yaml index 792f4eb..f7f0f4e 100644 --- a/service-versions.yaml +++ b/service-versions.yaml @@ -364,7 +364,7 @@ services: - name: caddy type: ansible - last-reviewed: 2026-03-15 + last-reviewed: 2026-05-06 current-version: "v2.11.2" upstream-source: https://github.com/caddyserver/caddy/releases notes: Built from source with Gandi DNS and Layer 4 plugins From 6f0d80ca1e5013b5e176d81adab6403dff052964 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Wed, 6 May 2026 06:14:40 -0700 Subject: [PATCH 053/122] =?UTF-8?q?C0:=20doc=20review=20=E2=80=94=20index.?= =?UTF-8?q?md,=20add=20ringtail=20to=20infra=20overview?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/changelog.d/+review-index-doc.doc.md | 1 + docs/index.md | 8 +++++--- 2 files changed, 6 insertions(+), 3 deletions(-) create mode 100644 docs/changelog.d/+review-index-doc.doc.md diff --git a/docs/changelog.d/+review-index-doc.doc.md b/docs/changelog.d/+review-index-doc.doc.md new file mode 100644 index 0000000..7016a7a --- /dev/null +++ b/docs/changelog.d/+review-index-doc.doc.md @@ -0,0 +1 @@ +Reviewed `index.md`; added ringtail to the infrastructure overview and stamped `last-reviewed`. 
diff --git a/docs/index.md b/docs/index.md index 6da90a4..fb04c47 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,6 +1,7 @@ --- title: BlumeOps -modified: 2026-02-08 +modified: 2026-05-06 +last-reviewed: 2026-05-06 aliases: [] id: index tags: [] @@ -22,8 +23,9 @@ raft I built for myself as I went, and you can see it all from within your editor of choice. (I recommend vim.) These services run on my home [[hosts|infrastructure]], primarily an m1 mac -mini named [[indri]] and a Synology NAS called [[sifaka]]. The infrastructure -is networked via [[tailscale]], with the domain `eblu.me` hosted via [[gandi]], +mini named [[indri]], a NixOS GPU host called [[ringtail]] running a k3s +cluster, and a Synology NAS called [[sifaka]]. The infrastructure is networked +via [[tailscale]], with the domain `eblu.me` hosted via [[gandi]], [[caddy]] providing a private reverse proxy for tailnet devices, and [[flyio-proxy|Fly.io]] serving public-facing services like [this documentation site](https://docs.eblu.me). From 0108b687693a3bbca2c43189fce819c20b25fabe Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Wed, 6 May 2026 06:50:31 -0700 Subject: [PATCH 054/122] C1: mirror tailscale container locally for ringtail proxyclass (#347) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary Adds the first cut of a local nix build for `docker.io/tailscale/tailscale` and rewires only the ringtail tailscale-operator overlay to use it. Indri's overlay continues pulling upstream — minikube on indri is being decommissioned in favor of ringtail's k3s, so investing in dual-cluster routing here would be wasted churn. ## Changes - `containers/tailscale/default.nix` — `buildGoModule` over `cmd/tailscale`, `cmd/tailscaled`, `cmd/containerboot`; packaged via `dockerTools.buildLayeredImage` with `cacert`, `iptables` (legacy symlink to match upstream Synology compat), `iproute2`, `tzdata`, `busybox`. 
- `argocd/manifests/tailscale-operator-ringtail/kustomization.yaml` and `proxyclass-image.yaml` — strategic merge patch on the `default` ProxyClass swapping `docker.io/tailscale/tailscale` → `registry.ops.eblu.me/blumeops/tailscale:v1.94.2-67af7a8-nix` (kustomize's `images:` directive does not rewrite image fields inside custom resources). - `docs/changelog.d/mirror-tailscale-container.infra.md` — fragment. ## Pin rationale v1.94.2 matches `service-versions.yaml:96` and the current ProxyClass exactly — this PR is "make it local," not "upgrade tailscale." Version bumps come as follow-up C0/C1 changes once we decide to test newer (v1.96.x had a Fly-side MagicDNS regression; v1.98.0 is current upstream stable). ## Test plan - [x] Image built successfully on ringtail nix-container-builder (run #528). - [x] Image visible in registry: `registry.ops.eblu.me/blumeops/tailscale:v1.94.2-67af7a8-nix`. - [ ] Deploy from branch: `argocd app set tailscale-operator-ringtail --revision mirror-tailscale-container && argocd app sync tailscale-operator-ringtail`. - [ ] Verify proxy pods restart with new image and existing tailnet ingresses (e.g., authentik, immich, tempo) keep resolving. - [ ] After merge: rebuild on main SHA, update the image tag in `proxyclass-image.yaml`, run `services-check`. 
🤖 Generated with [Claude Code](https://claude.com/claude-code) Reviewed-on: https://forge.eblu.me/eblume/blumeops/pulls/347 --- .../kustomization.yaml | 14 ++++ .../proxyclass-image.yaml | 11 +++ containers/tailscale/default.nix | 77 +++++++++++++++++++ .../mirror-tailscale-container.infra.md | 1 + 4 files changed, 103 insertions(+) create mode 100644 argocd/manifests/tailscale-operator-ringtail/proxyclass-image.yaml create mode 100644 containers/tailscale/default.nix create mode 100644 docs/changelog.d/mirror-tailscale-container.infra.md diff --git a/argocd/manifests/tailscale-operator-ringtail/kustomization.yaml b/argocd/manifests/tailscale-operator-ringtail/kustomization.yaml index a14ca81..2d9ceb2 100644 --- a/argocd/manifests/tailscale-operator-ringtail/kustomization.yaml +++ b/argocd/manifests/tailscale-operator-ringtail/kustomization.yaml @@ -8,3 +8,17 @@ resources: - ../tailscale-operator-base - proxygroup-ingress.yaml - external-secret.yaml + +# Rewrite the proxyclass image to our local nix-built mirror. +# Scoped to ringtail only; indri's tailscale-operator/kustomization.yaml still +# pulls from upstream docker.io. A strategic merge patch is used instead of +# kustomize's `images:` directive because that directive only rewrites images +# in standard k8s container fields, not custom-resource fields like +# ProxyClass.spec.statefulSet.pod.tailscaleContainer.image. 
+patches: + - path: proxyclass-image.yaml + target: + group: tailscale.com + version: v1alpha1 + kind: ProxyClass + name: default diff --git a/argocd/manifests/tailscale-operator-ringtail/proxyclass-image.yaml b/argocd/manifests/tailscale-operator-ringtail/proxyclass-image.yaml new file mode 100644 index 0000000..b585e22 --- /dev/null +++ b/argocd/manifests/tailscale-operator-ringtail/proxyclass-image.yaml @@ -0,0 +1,11 @@ +apiVersion: tailscale.com/v1alpha1 +kind: ProxyClass +metadata: + name: default +spec: + statefulSet: + pod: + tailscaleContainer: + image: registry.ops.eblu.me/blumeops/tailscale:v1.94.2-67af7a8-nix + tailscaleInitContainer: + image: registry.ops.eblu.me/blumeops/tailscale:v1.94.2-67af7a8-nix diff --git a/containers/tailscale/default.nix b/containers/tailscale/default.nix new file mode 100644 index 0000000..8e87f76 --- /dev/null +++ b/containers/tailscale/default.nix @@ -0,0 +1,77 @@ +# Nix-built tailscale container for ringtail's tailscale-operator ProxyClass +# Builds v1.94.2 from forge mirror; mirrors upstream Dockerfile contents. +# Built with dockerTools.buildLayeredImage on the ringtail nix-container-builder. +{ pkgs ? 
import <nixpkgs> { } }: + +let + version = "1.94.2"; + + src = pkgs.fetchgit { + url = "https://forge.ops.eblu.me/mirrors/tailscale.git"; + rev = "v${version}"; + hash = "sha256-qjWVB8xWVgIVUgrf27F6hwiFIE+4ERXWeHv26ugg/x4="; + }; + + tailscale = pkgs.buildGoModule { + inherit src version; + pname = "tailscale"; + vendorHash = "sha256-WeMTOkERj4hvdg4yPaZ1gRgKnhRIBXX55kUVbX/k/xM="; + + subPackages = [ + "cmd/tailscale" + "cmd/tailscaled" + "cmd/containerboot" + ]; + + ldflags = [ + "-s" + "-w" + "-X tailscale.com/version.longStamp=${version}" + "-X tailscale.com/version.shortStamp=${version}" + ]; + + doCheck = false; + + meta = with pkgs.lib; { + description = "The easiest, most secure way to use WireGuard"; + homepage = "https://tailscale.com"; + license = licenses.bsd3; + }; + }; +in + +pkgs.dockerTools.buildLayeredImage { + name = "blumeops/tailscale"; + tag = "v${version}"; + + contents = [ + tailscale + pkgs.cacert + pkgs.iptables + pkgs.iproute2 + pkgs.tzdata + pkgs.busybox + ]; + + # Match upstream Dockerfile: symlink iptables-legacy over iptables. + # Synology NAS and similar hosts don't support nftables. + # Also recreate the /tailscale/run.sh compat symlink. 
+ extraCommands = '' + rm -f usr/sbin/iptables usr/sbin/ip6tables + ln -s ${pkgs.iptables}/bin/iptables-legacy usr/sbin/iptables || true + ln -s ${pkgs.iptables}/bin/ip6tables-legacy usr/sbin/ip6tables || true + mkdir -p tailscale + ln -s /bin/containerboot tailscale/run.sh + mkdir -p tmp + chmod 1777 tmp + ''; + + config = { + Entrypoint = [ "/bin/containerboot" ]; + Env = [ + "SSL_CERT_FILE=${pkgs.cacert}/etc/ssl/certs/ca-bundle.crt" + "TZDIR=${pkgs.tzdata}/share/zoneinfo" + "PATH=/bin:/usr/bin:/usr/sbin" + ]; + }; +} diff --git a/docs/changelog.d/mirror-tailscale-container.infra.md b/docs/changelog.d/mirror-tailscale-container.infra.md new file mode 100644 index 0000000..54ca3ba --- /dev/null +++ b/docs/changelog.d/mirror-tailscale-container.infra.md @@ -0,0 +1 @@ +Add local nix container build for `tailscale` (`containers/tailscale/default.nix`) so ringtail's tailscale-operator ProxyClass proxy pods pull from the forge mirror instead of `docker.io/tailscale/tailscale`. Pinned at v1.94.2 to match `service-versions.yaml`. Indri's tailscale-operator continues to use upstream during the k8s-to-ringtail migration. From 8bc19fa46037faa3b564ceb4d1f13b268c6cffae Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Wed, 6 May 2026 06:52:39 -0700 Subject: [PATCH 055/122] C0: tailscale main-SHA rebuild for ringtail proxyclass Routine post-squash-merge cleanup. Bumps the ProxyClass image tag from the now-orphaned PR branch SHA (67af7a8) to the merge commit SHA (0108b68) so the deployed image stays traceable after branch cleanup. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../tailscale-operator-ringtail/proxyclass-image.yaml | 4 ++-- docs/changelog.d/+tailscale-main-sha-rebuild.infra.md | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) create mode 100644 docs/changelog.d/+tailscale-main-sha-rebuild.infra.md diff --git a/argocd/manifests/tailscale-operator-ringtail/proxyclass-image.yaml b/argocd/manifests/tailscale-operator-ringtail/proxyclass-image.yaml index b585e22..d1bf2a4 100644 --- a/argocd/manifests/tailscale-operator-ringtail/proxyclass-image.yaml +++ b/argocd/manifests/tailscale-operator-ringtail/proxyclass-image.yaml @@ -6,6 +6,6 @@ spec: statefulSet: pod: tailscaleContainer: - image: registry.ops.eblu.me/blumeops/tailscale:v1.94.2-67af7a8-nix + image: registry.ops.eblu.me/blumeops/tailscale:v1.94.2-0108b68-nix tailscaleInitContainer: - image: registry.ops.eblu.me/blumeops/tailscale:v1.94.2-67af7a8-nix + image: registry.ops.eblu.me/blumeops/tailscale:v1.94.2-0108b68-nix diff --git a/docs/changelog.d/+tailscale-main-sha-rebuild.infra.md b/docs/changelog.d/+tailscale-main-sha-rebuild.infra.md new file mode 100644 index 0000000..24bb81c --- /dev/null +++ b/docs/changelog.d/+tailscale-main-sha-rebuild.infra.md @@ -0,0 +1 @@ +Update `tailscale-operator-ringtail` ProxyClass to reference the `0108b68` main-SHA build of the tailscale container. Routine post-merge cleanup so the deployed image traces to a commit that survives PR branch cleanup. From b87f62e0f57c22b8975adda029a96e6a10b1a6e1 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Sun, 10 May 2026 20:32:38 -0700 Subject: [PATCH 056/122] C1: nix-build homepage container for amd64 ringtail migration Replace Dockerfile (arm64-only, indri-built) with a nix derivation adapted from nixpkgs pkgs/by-name/ho/homepage-dashboard. Built via the nix-container-builder runner on ringtail, producing an amd64 image suitable for k3s. 
Includes the upstream Next.js file-system-cache patch to avoid prerender cache write failures on a read-only nix store path (nixpkgs issues #328621 and #458494). Pinned to v1.11.0 (current production version). --- containers/homepage/Dockerfile | 47 ------------- containers/homepage/default.nix | 119 ++++++++++++++++++++++++++++++++ 2 files changed, 119 insertions(+), 47 deletions(-) delete mode 100644 containers/homepage/Dockerfile create mode 100644 containers/homepage/default.nix diff --git a/containers/homepage/Dockerfile b/containers/homepage/Dockerfile deleted file mode 100644 index 6e53e1c..0000000 --- a/containers/homepage/Dockerfile +++ /dev/null @@ -1,47 +0,0 @@ -# Homepage - self-hosted services dashboard -# Two-stage build: Node.js build, Alpine runtime - -ARG CONTAINER_APP_VERSION=v1.11.0 -ARG HOMEPAGE_VERSION=${CONTAINER_APP_VERSION} - -FROM node:24-slim AS builder - -ARG HOMEPAGE_VERSION -RUN apt-get update && apt-get install -y --no-install-recommends git ca-certificates \ - && rm -rf /var/lib/apt/lists/* - -RUN git clone --depth 1 --branch ${HOMEPAGE_VERSION} \ - https://forge.ops.eblu.me/mirrors/homepage.git /app - -WORKDIR /app -RUN mkdir -p config \ - && corepack enable && corepack prepare pnpm@latest --activate \ - && pnpm install --frozen-lockfile \ - && NEXT_TELEMETRY_DISABLED=1 pnpm run build - -FROM node:24-alpine - -ARG CONTAINER_APP_VERSION -LABEL org.opencontainers.image.title="Homepage" -LABEL org.opencontainers.image.description="A self-hosted services landing page" -LABEL org.opencontainers.image.version="${CONTAINER_APP_VERSION}" -LABEL org.opencontainers.image.source="https://forge.eblu.me/eblume/blumeops" -LABEL org.opencontainers.image.vendor="blumeops" - -WORKDIR /app - -COPY --from=builder --chown=1000:1000 /app/public ./public -COPY --from=builder --chown=1000:1000 /app/.next/standalone/ ./ -COPY --from=builder --chown=1000:1000 /app/.next/static/ ./.next/static - -RUN mkdir -p /app/config && chown 1000:1000 /app/config - -ENV 
NODE_ENV=production -ENV PORT=3000 -EXPOSE 3000 - -HEALTHCHECK --interval=10s --timeout=3s --start-period=20s \ - CMD wget --no-verbose --tries=1 --spider http://127.0.0.1:3000/api/healthcheck || exit 1 - -USER 1000 -CMD ["node", "server.js"] diff --git a/containers/homepage/default.nix b/containers/homepage/default.nix new file mode 100644 index 0000000..7b4becb --- /dev/null +++ b/containers/homepage/default.nix @@ -0,0 +1,119 @@ +# Nix-built gethomepage/homepage dashboard +# Builds v1.11.0 from forge mirror. +# +# Adapted from nixpkgs pkgs/by-name/ho/homepage-dashboard (commit master), +# changed to fetch from our forge mirror and wrap with dockerTools for an +# amd64 image runnable on ringtail's k3s. +# +# The preBuild substitutions are not optional — without them Next.js writes +# its file-system-cache to a read-only path and prerender state breaks after +# restart (nixpkgs issues #328621 and #458494). +{ pkgs ? import <nixpkgs> { } }: + +let + version = "1.11.0"; + + homepage = pkgs.stdenv.mkDerivation (finalAttrs: { + pname = "homepage-dashboard"; + inherit version; + + src = pkgs.fetchgit { + url = "https://forge.ops.eblu.me/mirrors/homepage.git"; + rev = "v${version}"; + hash = "sha256-jnv9PnClm/jIQ4uU6c4A1UiAmwoihG0l6k3fUbD47I4="; + }; + + pnpmDeps = pkgs.fetchPnpmDeps { + inherit (finalAttrs) pname version src; + pnpm = pkgs.pnpm_10; + fetcherVersion = 3; + hash = "sha256-X5j9XppbcasGuC7fUsj4XzbaQFM9WcRcXjgJHN/inR8="; + }; + + nativeBuildInputs = [ + pkgs.makeBinaryWrapper + pkgs.nodejs_24 + pkgs.pnpmConfigHook + pkgs.pnpm_10 + ]; + + buildInputs = [ + pkgs.nodePackages.node-gyp-build + ]; + + env.PYTHON = "${pkgs.python3}/bin/python"; + + preBuild = '' + substituteInPlace node_modules/next/dist/server/lib/incremental-cache/file-system-cache.js \ + --replace-fail 'this.serverDistDir = ctx.serverDistDir;' \ + 'this.serverDistDir = require("path").join((process.env.NIXPKGS_HOMEPAGE_CACHE_DIR || "/tmp/homepage-cache"), "homepage");' + + for bundle in 
node_modules/next/dist/compiled/next-server/*.runtime.prod.js; do + substituteInPlace "$bundle" \ + --replace-fail 'this.serverDistDir=e.serverDistDir' \ + 'this.serverDistDir=(process.env.NIXPKGS_HOMEPAGE_CACHE_DIR||"/tmp/homepage-cache")+"/homepage"' + done + ''; + + buildPhase = '' + runHook preBuild + mkdir -p config + pnpm build + runHook postBuild + ''; + + installPhase = '' + runHook preInstall + + mkdir -p $out/{bin,share} + cp -r .next/standalone $out/share/homepage/ + cp -r public $out/share/homepage/public + chmod +x $out/share/homepage/server.js + + mkdir -p $out/share/homepage/.next + cp -r .next/static $out/share/homepage/.next/static + + makeWrapper "${pkgs.lib.getExe pkgs.nodejs_24}" $out/bin/homepage \ + --set-default PORT 3000 \ + --set-default HOMEPAGE_CONFIG_DIR /app/config \ + --set-default NIXPKGS_HOMEPAGE_CACHE_DIR /tmp/homepage-cache \ + --add-flags "$out/share/homepage/server.js" \ + --prefix PATH : "${pkgs.lib.makeBinPath [ pkgs.unixtools.ping ]}" + + runHook postInstall + ''; + + doDist = false; + }); +in + +pkgs.dockerTools.buildLayeredImage { + name = "blumeops/homepage"; + contents = [ + homepage + pkgs.cacert + pkgs.tzdata + ]; + + extraCommands = '' + mkdir -p tmp + chmod 1777 tmp + ''; + + config = { + Entrypoint = [ "${homepage}/bin/homepage" ]; + Env = [ + "SSL_CERT_FILE=${pkgs.cacert}/etc/ssl/certs/ca-bundle.crt" + "TZDIR=${pkgs.tzdata}/share/zoneinfo" + "TMPDIR=/tmp" + "NIXPKGS_HOMEPAGE_CACHE_DIR=/tmp/homepage-cache" + "HOMEPAGE_CONFIG_DIR=/app/config" + "NEXT_TELEMETRY_DISABLED=1" + "PORT=3000" + ]; + ExposedPorts = { + "3000/tcp" = { }; + }; + User = "1000"; + }; +} From be54cc341179c60e1518aef194275ea3689d0447 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Sun, 10 May 2026 20:37:03 -0700 Subject: [PATCH 057/122] C1: migrate homepage dashboard to ringtail k3s Repoint the ArgoCD Application destination from minikube to ringtail and bump the image tag to the new amd64 nix-built v1.11.0-b87f62e-nix. 
Rework services.yaml for the autodiscovery shift: 11 services that previously auto-populated via minikube Ingress annotations (ArgoCD, Immich, Kiwix, Mealie, Miniflux, Grafana, Prometheus, Navidrome, Paperless, TeslaMate, Transmission) become explicit static entries with their widget configs preserved. Conversely, the ringtail services that will now auto-populate (Frigate/NVR, Authentik, Ntfy) are removed from the static list to avoid duplicates; Ollama becomes newly visible. Add a Content group for Immich/Kiwix/Miniflux which previously lived under the autodiscovered "Content" group from annotations. --- argocd/apps/homepage.yaml | 2 +- argocd/manifests/homepage/kustomization.yaml | 2 +- argocd/manifests/homepage/services.yaml | 77 ++++++++++++++++--- .../changelog.d/homepage-to-ringtail.infra.md | 8 ++ 4 files changed, 75 insertions(+), 14 deletions(-) create mode 100644 docs/changelog.d/homepage-to-ringtail.infra.md diff --git a/argocd/apps/homepage.yaml b/argocd/apps/homepage.yaml index 86a0f8d..22147f2 100644 --- a/argocd/apps/homepage.yaml +++ b/argocd/apps/homepage.yaml @@ -14,7 +14,7 @@ spec: targetRevision: main path: argocd/manifests/homepage destination: - server: https://kubernetes.default.svc + server: https://ringtail.tail8d86e.ts.net:6443 namespace: homepage syncPolicy: syncOptions: diff --git a/argocd/manifests/homepage/kustomization.yaml b/argocd/manifests/homepage/kustomization.yaml index 27de0eb..ce627ac 100644 --- a/argocd/manifests/homepage/kustomization.yaml +++ b/argocd/manifests/homepage/kustomization.yaml @@ -17,7 +17,7 @@ resources: images: - name: registry.ops.eblu.me/blumeops/homepage - newTag: v1.11.0-e375859 + newTag: v1.11.0-b87f62e-nix configMapGenerator: - name: homepage-config diff --git a/argocd/manifests/homepage/services.yaml b/argocd/manifests/homepage/services.yaml index 211e043..d552ff2 100644 --- a/argocd/manifests/homepage/services.yaml +++ b/argocd/manifests/homepage/services.yaml @@ -1,3 +1,6 @@ +# Homepage runs on 
ringtail (k3s) — its k8s autodiscovery only sees ringtail +# Ingresses (frigate→NVR, authentik, ntfy, ollama). Services that live on +# minikube (and indri-native) need explicit static entries here. - Host Services: - Forgejo: href: https://forge.eblu.me @@ -57,10 +60,6 @@ # type: caddy # url: http://indri.tail8d86e.ts.net:2019 - Home: - - NVR: - href: https://nvr.ops.eblu.me - icon: frigate.png - description: Network video recorder - Jellyfin: href: https://jellyfin.ops.eblu.me icon: jellyfin @@ -72,15 +71,61 @@ enableBlocks: true enableNowPlaying: false fields: ["movies", "series", "episodes"] + - Mealie: + href: https://meals.ops.eblu.me + icon: mealie.png + description: Recipe manager + - DJ: + href: https://dj.ops.eblu.me + icon: navidrome.png + description: Music streaming server + widget: + type: navidrome + url: https://dj.ops.eblu.me + user: "{{HOMEPAGE_VAR_NAVIDROME_USER}}" + token: "{{HOMEPAGE_VAR_NAVIDROME_TOKEN}}" + salt: "{{HOMEPAGE_VAR_NAVIDROME_SALT}}" + - Paperless: + href: https://paperless.ops.eblu.me + icon: paperless-ngx.png + description: Document management +- Content: + - Immich: + href: https://photos.ops.eblu.me + icon: immich.png + description: Photo management + - Kiwix: + href: https://kiwix.ops.eblu.me + icon: kiwix.png + description: Offline Wikipedia + - Miniflux: + href: https://feed.ops.eblu.me + icon: miniflux.png + description: RSS reader + widget: + type: miniflux + url: https://feed.ops.eblu.me + key: "{{HOMEPAGE_VAR_MINIFLUX_API_KEY}}" + fields: ["unread"] - Infrastructure: - - Authentik: - href: https://authentik.ops.eblu.me - icon: authentik - description: Identity provider - - Ntfy: - href: https://ntfy.ops.eblu.me - icon: ntfy.png - description: Push notifications + - ArgoCD: + href: https://argocd.ops.eblu.me + icon: argo-cd.png + description: GitOps CD + - Grafana: + href: https://grafana.ops.eblu.me + icon: grafana.png + description: Metrics dashboards + widget: + type: grafana + url: https://grafana.ops.eblu.me + 
username: "{{HOMEPAGE_VAR_GRAFANA_USERNAME}}" + password: "{{HOMEPAGE_VAR_GRAFANA_PASSWORD}}" + fields: ["dashboards", "totalalerts", "alertstriggered"] + - Prometheus: + href: https://prometheus.ops.eblu.me + icon: prometheus.png + description: Metrics storage - Services: # CV and Docs were previously auto-discovered from k8s Ingresses; after # the indri-native migration ([[cv-on-indri]], [[docs-on-indri]]) there @@ -93,3 +138,11 @@ href: https://docs.eblu.me icon: mdi-book-open-page-variant description: BlumeOps Documentation + - TeslaMate: + href: https://tesla.ops.eblu.me + icon: teslamate.png + description: Tesla data logger + - Transmission: + href: https://torrent.ops.eblu.me + icon: transmission.png + description: Torrent client diff --git a/docs/changelog.d/homepage-to-ringtail.infra.md b/docs/changelog.d/homepage-to-ringtail.infra.md new file mode 100644 index 0000000..1e3e795 --- /dev/null +++ b/docs/changelog.d/homepage-to-ringtail.infra.md @@ -0,0 +1,8 @@ +Migrated homepage dashboard from minikube (indri/arm64) to k3s (ringtail/amd64). +The container is now built via nix (`containers/homepage/default.nix`), adapted +from nixpkgs `homepage-dashboard` with the upstream Next.js cache patches and +wrapped with `dockerTools.buildLayeredImage`. Autodiscovery shifts: services on +minikube (ArgoCD, Immich, Kiwix, Mealie, Miniflux, Grafana, Prometheus, +Navidrome, Paperless, TeslaMate, Transmission) become explicit static entries +in `services.yaml`; ringtail services (Authentik, Frigate/NVR, Ntfy, Ollama) +auto-populate via Ingress annotations. From 678f26b0e7335d498549cdbb85e68ca62f2654ab Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Sun, 10 May 2026 20:48:48 -0700 Subject: [PATCH 058/122] C0: fix homepage container /app/config write permissions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous Dockerfile chowned /app/config to 1000:1000 so the runtime user could seed missing skeleton configs (e.g. 
proxmox.yaml) and write /app/config/logs. The nix derivation didn't replicate that, so the new amd64 image crashed with EACCES on cold start (fixed-forward — caught during ringtail cutover, ArgoCD #348). Add fakeRootCommands to dockerTools to create /app and /app/config and chown them at build time. The deployment's ConfigMap subPath mounts leave the parent directory as image filesystem, so its ownership has to be set at build time, not at runtime. --- containers/homepage/default.nix | 11 +++++++++++ docs/changelog.d/+homepage-config-perms-fix.bugfix.md | 5 +++++ 2 files changed, 16 insertions(+) create mode 100644 docs/changelog.d/+homepage-config-perms-fix.bugfix.md diff --git a/containers/homepage/default.nix b/containers/homepage/default.nix index 7b4becb..6217847 100644 --- a/containers/homepage/default.nix +++ b/containers/homepage/default.nix @@ -100,6 +100,17 @@ pkgs.dockerTools.buildLayeredImage { chmod 1777 tmp ''; + # /app/config must be writable by the runtime user (1000): homepage seeds + # missing skeleton configs (proxmox.yaml, etc.) and writes /app/config/logs. + # The deployment mounts ConfigMap files at /app/config/<name>.yaml via + # subPath, which leaves the parent dir as image filesystem — so its + # ownership has to be set at build time. + fakeRootCommands = '' + mkdir -p app/config + chown -R 1000:1000 app + ''; + enableFakechroot = true; + config = { Entrypoint = [ "${homepage}/bin/homepage" ]; Env = [ diff --git a/docs/changelog.d/+homepage-config-perms-fix.bugfix.md b/docs/changelog.d/+homepage-config-perms-fix.bugfix.md new file mode 100644 index 0000000..20e1135 --- /dev/null +++ b/docs/changelog.d/+homepage-config-perms-fix.bugfix.md @@ -0,0 +1,5 @@ +Fixed homepage container EACCES on cold start: the nix-built image now chowns +`/app/config` to uid 1000 at build time via `fakeRootCommands`, matching the +behavior of the old Dockerfile. Without this, homepage couldn't seed missing +skeleton configs (proxmox.yaml etc.) 
or create `/app/config/logs`, crashing on +its first uncached request. Caught during the ringtail cutover. From eceb2b99ce9c3edd041625e39f8ce051a81fb268 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Sun, 10 May 2026 21:16:34 -0700 Subject: [PATCH 059/122] C0: bump homepage image to fixed-perms build (v1.11.0-678f26b-nix) Pulls in 678f26b0 (chowned /app/config). Resolves the EACCES crash loop on ringtail. --- argocd/manifests/homepage/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/argocd/manifests/homepage/kustomization.yaml b/argocd/manifests/homepage/kustomization.yaml index ce627ac..31b6847 100644 --- a/argocd/manifests/homepage/kustomization.yaml +++ b/argocd/manifests/homepage/kustomization.yaml @@ -17,7 +17,7 @@ resources: images: - name: registry.ops.eblu.me/blumeops/homepage - newTag: v1.11.0-b87f62e-nix + newTag: v1.11.0-678f26b-nix configMapGenerator: - name: homepage-config From 292d354902eb47b16dc0aed18044f7ccac3381a4 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Mon, 11 May 2026 13:47:18 -0700 Subject: [PATCH 060/122] C1: deploy adelaide-baby-shower-app to ringtail k3s (#349) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary Brings up the Adelaide / Heidi / Addie baby shower app on ringtail k3s with the public/private split that the app's hosting contract calls for: `shower.eblu.me` (public, via Fly proxy) and `shower.ops.eblu.me` (tailnet). App is consumed as a wheel from the Forgejo PyPI index — source lives at [`adelaide-baby-shower-app`](https://forge.eblu.me/eblume/adelaide-baby-shower-app). ### What's included - **ArgoCD app + manifests** under `argocd/manifests/shower/` (deployment, service, ProxyGroup ingress, ConfigMap for `DJANGO_DEBUG`/`DJANGO_ADMIN_URL`, ExternalSecret for `DJANGO_SECRET_KEY` from 1Password item `Shower (blumeops)`, NFS PV on sifaka, RWX media PVC, RWO local-path data PVC for SQLite). 
Recreate rollout because SQLite is single-writer. - **Public surface** (`fly/`): new `shower.eblu.me` server block proxying to `shower.ops.eblu.me`. `/admin/` returns 403 at the edge except `/admin/login/` and `/admin/logout/`, which are rate-limited via a new `shower_auth` zone. `X-Clacks-Overhead` on. GNU Terry Pratchett. - **fail2ban** filter (`shower-admin-login.conf`) matching 401/403/429 on `/admin/login/` and jail (`shower.conf`) with `maxretry=5/findtime=600/bantime=3600`. The `nginx-deny` action was generalized to take a per-jail `nginx_deny_file` so the shower has its own deny list (forge keeps using the legacy default). - **Caddy** route on indri (`shower.ops.eblu.me` → `https://shower.tail8d86e.ts.net`). - **Pulumi** Gandi CNAME `shower.eblu.me → blumeops-proxy.fly.dev.`. - **Grafana** APM dashboard `configmap-shower-apm.yaml` (request rate, error rate, failed admin login count, latency percentiles, bandwidth, access logs) mirroring `docs-apm.json` with a `host="shower.eblu.me"` filter. - **Container** `containers/shower/default.nix` — `dockerTools.buildLayeredImage` with a nixpkgs Python and a startup wrapper that creates `/app/data/.venv`, pip-installs `adelaide-baby-shower-app==1.0.0` from the forge PyPI index on first boot, runs migrations + collectstatic, and execs gunicorn. A `local_settings.py` shim pins `DATABASES.NAME`/`MEDIA_ROOT`/`STATIC_ROOT` to absolute paths so they don't end up in site-packages. - **Docs** runbook at `docs/how-to/operations/shower-app.md` linked from the apps registry, plus changelog fragments. ### Defense layers on the public surface 1. fly nginx geo+fail2ban `$shower_banned` (per-service deny list) 2. fly nginx `limit_req zone=shower_auth` (3 r/s per Fly-Client-IP) 3. django-axes (5 fails / 1h, keyed on username+ip_address) 4. 
edge `/admin/` block (returns 403 for anything that isn't login/logout) ## Prerequisites for the user to do (NOT in this PR) Halted on these per request — they touch shared/manual systems: - [x] **NFS share** on sifaka: `/volume1/shower`, NFS rule for ringtail RW, `chown 1000:1000` - [ ] **1Password item** `Shower (blumeops)` in the blumeops vault with a freshly minted `secret-key` field (`openssl rand -base64 48`) — do NOT reuse anything that has lived in git - [ ] **Container build**: `mise run container-build-and-release shower`, then update `images[].newTag` in `argocd/manifests/shower/kustomization.yaml` to the resulting `v1.0.0-<sha>-nix` - [x] **DNS**: `mise run dns-up` after merge - [x] **Fly cert**: `fly certs add shower.eblu.me -a blumeops-proxy` - [ ] **Caddy push**: `mise run provision-indri -- --tags caddy` - [ ] **Fly redeploy** to pick up the new nginx block + fail2ban jail: `mise run fly-deploy` - [ ] **ArgoCD sync**: `argocd app set shower --revision shower-app-deploy && argocd app sync shower` to test from this branch before merging ## Test plan - [ ] Container builds successfully on nix-container-builder runner - [ ] Pod starts, migrations run, gunicorn answers on :8000 - [ ] `kubectl --context=k3s-ringtail -n shower logs deploy/shower` clean - [ ] `curl -sf https://shower.ops.eblu.me/` returns the splash page (tailnet) - [ ] `curl -I -H "Host: shower.eblu.me" https://blumeops-proxy.fly.dev/` returns 200 (pre-DNS verification) - [ ] `curl -I -H "Host: shower.eblu.me" https://blumeops-proxy.fly.dev/admin/users/` returns 403 (edge block) - [ ] `curl -I -H "Host: shower.eblu.me" https://blumeops-proxy.fly.dev/admin/login/` returns a Django login response - [ ] After DNS is up: `curl -I https://shower.eblu.me/` returns 200 with `X-Clacks-Overhead` - [ ] Grafana dashboard "Shower APM" appears and starts showing traffic - [ ] `mise run services-check` passes Reviewed-on: https://forge.eblu.me/eblume/blumeops/pulls/349 --- 
ansible/roles/borgmatic/defaults/main.yml | 8 + ansible/roles/caddy/defaults/main.yml | 3 + ansible/roles/cv/defaults/main.yml | 2 +- argocd/apps/shower.yaml | 20 ++ .../dashboards/configmap-shower-apm.yaml | 229 ++++++++++++++++ .../grafana-config/kustomization.yaml | 1 + argocd/manifests/shower/configmap.yaml | 22 ++ argocd/manifests/shower/deployment.yaml | 81 ++++++ argocd/manifests/shower/external-secret.yaml | 19 ++ .../manifests/shower/ingress-tailscale.yaml | 30 ++ argocd/manifests/shower/kustomization.yaml | 17 ++ argocd/manifests/shower/pv-nfs.yaml | 24 ++ argocd/manifests/shower/pvc.yaml | 30 ++ argocd/manifests/shower/service.yaml | 13 + containers/shower/default.nix | 259 ++++++++++++++++++ docs/changelog.d/shower-app-deploy.bugfix.md | 13 + docs/changelog.d/shower-app-deploy.feature.md | 4 + docs/changelog.d/shower-app-deploy.infra.md | 9 + docs/how-to/operations/shower-on-ringtail.md | 245 +++++++++++++++++ docs/reference/kubernetes/apps.md | 1 + docs/reference/services/shower-app.md | 55 ++++ docs/tutorials/expose-service-publicly.md | 36 ++- fly/fail2ban/action.d/nginx-deny.conf | 13 +- fly/nginx.conf | 160 +++++++++++ fly/start.sh | 1 + mise-tasks/fly-setup | 1 + pulumi/gandi/__main__.py | 10 + service-versions.yaml | 19 ++ 28 files changed, 1315 insertions(+), 10 deletions(-) create mode 100644 argocd/apps/shower.yaml create mode 100644 argocd/manifests/grafana-config/dashboards/configmap-shower-apm.yaml create mode 100644 argocd/manifests/shower/configmap.yaml create mode 100644 argocd/manifests/shower/deployment.yaml create mode 100644 argocd/manifests/shower/external-secret.yaml create mode 100644 argocd/manifests/shower/ingress-tailscale.yaml create mode 100644 argocd/manifests/shower/kustomization.yaml create mode 100644 argocd/manifests/shower/pv-nfs.yaml create mode 100644 argocd/manifests/shower/pvc.yaml create mode 100644 argocd/manifests/shower/service.yaml create mode 100644 containers/shower/default.nix create mode 100644 
docs/changelog.d/shower-app-deploy.bugfix.md create mode 100644 docs/changelog.d/shower-app-deploy.feature.md create mode 100644 docs/changelog.d/shower-app-deploy.infra.md create mode 100644 docs/how-to/operations/shower-on-ringtail.md create mode 100644 docs/reference/services/shower-app.md diff --git a/ansible/roles/borgmatic/defaults/main.yml b/ansible/roles/borgmatic/defaults/main.yml index 25d0149..123cb0f 100644 --- a/ansible/roles/borgmatic/defaults/main.yml +++ b/ansible/roles/borgmatic/defaults/main.yml @@ -27,6 +27,9 @@ borgmatic_source_directories: - /Users/erichblume/.config/borgmatic - /Users/erichblume/Documents - /Users/erichblume/.local/share/borgmatic/k8s-dumps + # Shower app prize-photo uploads (sifaka SMB mount). Mounted manually + # on indri via Finder — see docs/how-to/operations/shower-on-ringtail.md. + - /Volumes/shower # Backup repositories borgmatic_repositories: @@ -54,6 +57,11 @@ borgmatic_k8s_sqlite_dumps: label_selector: app=mealie db_path: /app/data/mealie.db context: minikube + - name: shower + namespace: shower + label_selector: app=shower + db_path: /app/data/db.sqlite3 + context: k3s-ringtail # Exclude patterns borgmatic_exclude_patterns: [] diff --git a/ansible/roles/caddy/defaults/main.yml b/ansible/roles/caddy/defaults/main.yml index 6eada76..da6f3f9 100644 --- a/ansible/roles/caddy/defaults/main.yml +++ b/ansible/roles/caddy/defaults/main.yml @@ -101,6 +101,9 @@ caddy_services: - name: paperless host: "paperless.{{ caddy_domain }}" backend: "https://paperless.tail8d86e.ts.net" + - name: shower + host: "shower.{{ caddy_domain }}" + backend: "https://shower.tail8d86e.ts.net" - name: sifaka host: "nas.{{ caddy_domain }}" backend: "http://sifaka:5000" diff --git a/ansible/roles/cv/defaults/main.yml b/ansible/roles/cv/defaults/main.yml index 734e52b..a18cc82 100644 --- a/ansible/roles/cv/defaults/main.yml +++ b/ansible/roles/cv/defaults/main.yml @@ -3,7 +3,7 @@ # Caddy serves cv_content_dir directly via the static-kind service block. 
cv_version: "v1.0.3" -cv_release_url: "https://forge.eblu.me/api/packages/eblume/generic/cv/{{ cv_version }}/cv-{{ cv_version }}.tar.gz" +cv_release_url: "https://forge.ops.eblu.me/api/packages/eblume/generic/cv/{{ cv_version }}/cv-{{ cv_version }}.tar.gz" cv_home: /Users/erichblume/blumeops/cv cv_content_dir: "{{ cv_home }}/content" diff --git a/argocd/apps/shower.yaml b/argocd/apps/shower.yaml new file mode 100644 index 0000000..c4a7a62 --- /dev/null +++ b/argocd/apps/shower.yaml @@ -0,0 +1,20 @@ +# Adelaide / Heidi / Addie baby shower app — Django guest/raffle/prize system. +# Public landing page at shower.eblu.me (via fly proxy), staff console + admin +# at shower.ops.eblu.me (tailnet only). Built from forge PyPI wheel. +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: shower + namespace: argocd +spec: + project: default + source: + repoURL: ssh://forgejo@forge.ops.eblu.me:2222/eblume/blumeops.git + targetRevision: main + path: argocd/manifests/shower + destination: + server: https://ringtail.tail8d86e.ts.net:6443 + namespace: shower + syncPolicy: + syncOptions: + - CreateNamespace=true diff --git a/argocd/manifests/grafana-config/dashboards/configmap-shower-apm.yaml b/argocd/manifests/grafana-config/dashboards/configmap-shower-apm.yaml new file mode 100644 index 0000000..96348e8 --- /dev/null +++ b/argocd/manifests/grafana-config/dashboards/configmap-shower-apm.yaml @@ -0,0 +1,229 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-shower-apm + namespace: monitoring + labels: + grafana_dashboard: "1" +data: + shower-apm.json: | + { + "annotations": { "list": [] }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisLabel": "req/s", + "drawStyle": "line", + "fillOpacity": 20, + 
"lineInterpolation": "linear", + "lineWidth": 1, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "normal" } + }, + "mappings": [], + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 16, "x": 0, "y": 0 }, + "id": 1, + "options": { + "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "right", "showLegend": true }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (status) (rate(flyio_nginx_http_requests_total{host=\"shower.eblu.me\"}[5m]))", "legendFormat": "{{status}}", "refId": "A" } + ], + "title": "Request Rate by Status", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 0.01 }, { "color": "red", "value": 0.05 }] }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 8, "x": 16, "y": 0 }, + "id": 2, + "options": { + "colorMode": "background", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum(rate(flyio_nginx_http_requests_total{host=\"shower.eblu.me\",status=~\"5..\"}[5m])) / sum(rate(flyio_nginx_http_requests_total{host=\"shower.eblu.me\"}[5m]))", "refId": "A" } + ], + "title": "Error Rate (5xx)", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": 
"green", "value": null }, { "color": "yellow", "value": 1 }, { "color": "red", "value": 5 }] }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 16, "y": 4 }, + "id": 3, + "options": { + "colorMode": "background", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum(increase(flyio_nginx_http_requests_total{host=\"shower.eblu.me\",request_uri=~\"/admin/login.*\",status=~\"4..\"}[$__range]))", "refId": "A" } + ], + "title": "Failed admin logins (range)", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 20, "y": 4 }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum(rate(flyio_nginx_http_requests_total{host=\"shower.eblu.me\"}[5m]))", "refId": "A" } + ], + "title": "Current RPS", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisLabel": "seconds", + "drawStyle": "line", + "fillOpacity": 10, + "lineInterpolation": "linear", + "lineWidth": 1, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" } + }, + "mappings": [], + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] 
}, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, + "id": 5, + "options": { + "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "right", "showLegend": true }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "histogram_quantile(0.50, sum by (le) (rate(flyio_nginx_http_request_duration_seconds_bucket{host=\"shower.eblu.me\"}[5m])))", "legendFormat": "p50", "refId": "A" }, + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "histogram_quantile(0.90, sum by (le) (rate(flyio_nginx_http_request_duration_seconds_bucket{host=\"shower.eblu.me\"}[5m])))", "legendFormat": "p90", "refId": "B" }, + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "histogram_quantile(0.99, sum by (le) (rate(flyio_nginx_http_request_duration_seconds_bucket{host=\"shower.eblu.me\"}[5m])))", "legendFormat": "p99", "refId": "C" } + ], + "title": "Latency Percentiles", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisLabel": "", + "drawStyle": "line", + "fillOpacity": 20, + "lineInterpolation": "linear", + "lineWidth": 1, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" } + }, + "mappings": [], + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 }, + "id": 6, + "options": { + "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "right", "showLegend": true }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": 
"sum(rate(flyio_nginx_http_response_bytes_total{host=\"shower.eblu.me\"}[5m]))", "legendFormat": "Bandwidth", "refId": "A" } + ], + "title": "Bandwidth", + "type": "timeseries" + }, + { + "datasource": { "type": "loki", "uid": "loki" }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 16 }, + "id": 7, + "options": { + "dedupStrategy": "none", + "enableLogDetails": true, + "prettifyLogMessage": false, + "showCommonLabels": false, + "showLabels": false, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": false + }, + "targets": [ + { "datasource": { "type": "loki", "uid": "loki" }, "expr": "{instance=\"flyio-proxy\", job=\"flyio-nginx\"} |= \"shower.eblu.me\" | json | line_format \"{{.client_ip}} {{.request_method}} {{.request_uri}} {{.status}} {{.request_time}}s\"", "refId": "A" } + ], + "title": "Recent Access Logs", + "type": "logs" + } + ], + "refresh": "30s", + "schemaVersion": 38, + "tags": ["shower", "flyio", "apm"], + "templating": { "list": [] }, + "time": { "from": "now-6h", "to": "now" }, + "timepicker": {}, + "timezone": "", + "title": "Shower APM", + "uid": "shower-apm", + "version": 1, + "weekStart": "" + } diff --git a/argocd/manifests/grafana-config/kustomization.yaml b/argocd/manifests/grafana-config/kustomization.yaml index a6e8000..b518043 100644 --- a/argocd/manifests/grafana-config/kustomization.yaml +++ b/argocd/manifests/grafana-config/kustomization.yaml @@ -22,6 +22,7 @@ resources: - dashboards/configmap-transmission.yaml - dashboards/configmap-cv-apm.yaml - dashboards/configmap-docs-apm.yaml + - dashboards/configmap-shower-apm.yaml - dashboards/configmap-flyio.yaml - dashboards/configmap-sifaka-disks.yaml - dashboards/configmap-forgejo.yaml diff --git a/argocd/manifests/shower/configmap.yaml b/argocd/manifests/shower/configmap.yaml new file mode 100644 index 0000000..6102c1e --- /dev/null +++ b/argocd/manifests/shower/configmap.yaml @@ -0,0 +1,22 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: shower-app-config + 
namespace: shower +data: + DJANGO_DEBUG: "0" + # The app's settings.py hardcodes ALLOWED_HOSTS = ["shower.eblu.me", + # "localhost", "127.0.0.1"] and exposes this env var as a comma-separated + # extras list. shower.ops.eblu.me is what Caddy on indri and the + # Tailscale ProxyGroup both send as the Host header, so the app needs to + # accept it. + DJANGO_ALLOWED_HOSTS: "shower.ops.eblu.me" + # /host/, /admin/, and Django's login surface are all tailnet-only — the + # public proxy 403s everything outside of `/` and `/prizes//`. + # /host/'s "Django admin" link follows DJANGO_ADMIN_URL. + DJANGO_ADMIN_URL: "https://shower.ops.eblu.me/admin/" + # /host/ is served on shower.ops.eblu.me (tailnet), but the QR codes it + # generates need to point at the public WAN hostname so guest phones can + # reach them. PUBLIC_URL_BASE overrides Django's request.build_absolute_uri() + # in the QR views — see shower/views.py:_public_url. Added in app v1.0.1. + DJANGO_PUBLIC_URL_BASE: "https://shower.eblu.me" diff --git a/argocd/manifests/shower/deployment.yaml b/argocd/manifests/shower/deployment.yaml new file mode 100644 index 0000000..70547aa --- /dev/null +++ b/argocd/manifests/shower/deployment.yaml @@ -0,0 +1,81 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: shower + namespace: shower +spec: + replicas: 1 + # SQLite + RWO data PVC: only one writer at a time. Recreate ensures the + # old pod's lock on the local-path volume is released before the new one + # mounts it. 
+ strategy: + type: Recreate + selector: + matchLabels: + app: shower + template: + metadata: + labels: + app: shower + spec: + securityContext: + runAsUser: 1000 + runAsGroup: 1000 + fsGroup: 1000 + seccompProfile: + type: RuntimeDefault + containers: + - name: shower + image: registry.ops.eblu.me/blumeops/shower:kustomized + securityContext: + runAsNonRoot: true + allowPrivilegeEscalation: false + ports: + - containerPort: 8000 + name: http + envFrom: + - configMapRef: + name: shower-app-config + - secretRef: + name: shower-app-secrets + volumeMounts: + - name: media + mountPath: /app/media + - name: data + mountPath: /app/data + resources: + requests: + memory: "128Mi" + cpu: "50m" + limits: + memory: "512Mi" + cpu: "500m" + livenessProbe: + httpGet: + path: / + port: 8000 + httpHeaders: + - name: Host + value: shower.ops.eblu.me + - name: X-Forwarded-Proto + value: https + initialDelaySeconds: 30 + periodSeconds: 30 + readinessProbe: + httpGet: + path: / + port: 8000 + httpHeaders: + - name: Host + value: shower.ops.eblu.me + - name: X-Forwarded-Proto + value: https + initialDelaySeconds: 10 + periodSeconds: 10 + volumes: + - name: media + persistentVolumeClaim: + claimName: shower-media + - name: data + persistentVolumeClaim: + claimName: shower-data diff --git a/argocd/manifests/shower/external-secret.yaml b/argocd/manifests/shower/external-secret.yaml new file mode 100644 index 0000000..005a7e9 --- /dev/null +++ b/argocd/manifests/shower/external-secret.yaml @@ -0,0 +1,19 @@ +--- +apiVersion: external-secrets.io/v1 +kind: ExternalSecret +metadata: + name: shower-app-secrets + namespace: shower +spec: + refreshInterval: 1h + secretStoreRef: + kind: ClusterSecretStore + name: onepassword-blumeops + target: + name: shower-app-secrets + creationPolicy: Owner + data: + - secretKey: DJANGO_SECRET_KEY + remoteRef: + key: "Shower (blumeops)" + property: secret-key diff --git a/argocd/manifests/shower/ingress-tailscale.yaml 
b/argocd/manifests/shower/ingress-tailscale.yaml new file mode 100644 index 0000000..d09a696 --- /dev/null +++ b/argocd/manifests/shower/ingress-tailscale.yaml @@ -0,0 +1,30 @@ +# Tailscale Ingress for shower app. +# Exposes at shower.tail8d86e.ts.net. +# Caddy on indri proxies shower.ops.eblu.me here. The fly proxy then proxies +# shower.eblu.me through Caddy to this same endpoint (fly does not contact +# the k8s service directly — all traffic routes through indri's Caddy). +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: shower-tailscale + namespace: shower + annotations: + tailscale.com/proxy-class: "default" + tailscale.com/proxy-group: "ingress" + gethomepage.dev/enabled: "true" + gethomepage.dev/name: "Shower" + gethomepage.dev/group: "Home" + gethomepage.dev/icon: "mdi-baby" + gethomepage.dev/description: "Adelaide baby shower" + gethomepage.dev/href: "https://shower.ops.eblu.me" + gethomepage.dev/pod-selector: "app=shower" +spec: + ingressClassName: tailscale + defaultBackend: + service: + name: shower + port: + number: 8000 + tls: + - hosts: + - shower diff --git a/argocd/manifests/shower/kustomization.yaml b/argocd/manifests/shower/kustomization.yaml new file mode 100644 index 0000000..0afc8e3 --- /dev/null +++ b/argocd/manifests/shower/kustomization.yaml @@ -0,0 +1,17 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: shower + +resources: + - configmap.yaml + - external-secret.yaml + - pv-nfs.yaml + - pvc.yaml + - service.yaml + - ingress-tailscale.yaml + - deployment.yaml + +images: + - name: registry.ops.eblu.me/blumeops/shower + newTag: v1.0.2-039d9b9-nix diff --git a/argocd/manifests/shower/pv-nfs.yaml b/argocd/manifests/shower/pv-nfs.yaml new file mode 100644 index 0000000..7354fb5 --- /dev/null +++ b/argocd/manifests/shower/pv-nfs.yaml @@ -0,0 +1,24 @@ +# NFS PersistentVolume for shower app media uploads (prize photos). 
+# +# Requires the `shower` share on sifaka with NFS exports matching the +# blumeops standard (192.168.1.0/24 + 100.64.0.0/10, all_squash → admin). +# See docs/how-to/operations/shower-on-ringtail.md for the Synology web-UI walk +# and docs/reference/storage/sifaka.md for the exports table. +# +# Because all_squash rewrites every NFS write to admin:users (1024:100), +# the in-pod runAsUser does NOT have to match an on-disk uid. Mode 0777 +# on /volume1/shower lets the pod read back what it wrote. +apiVersion: v1 +kind: PersistentVolume +metadata: + name: shower-media-nfs-pv +spec: + capacity: + storage: 10Gi + accessModes: + - ReadWriteMany + persistentVolumeReclaimPolicy: Retain + storageClassName: "" + nfs: + server: sifaka + path: /volume1/shower diff --git a/argocd/manifests/shower/pvc.yaml b/argocd/manifests/shower/pvc.yaml new file mode 100644 index 0000000..47fee54 --- /dev/null +++ b/argocd/manifests/shower/pvc.yaml @@ -0,0 +1,30 @@ +# Media PVC — RWX NFS share for /app/media (prize photo uploads). +# SQLite DB lives in a separate local-path PVC; NFS file locking is not +# reliable enough for SQLite's WAL/journal. +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: shower-media + namespace: shower +spec: + accessModes: + - ReadWriteMany + storageClassName: "" + volumeName: shower-media-nfs-pv + resources: + requests: + storage: 10Gi +--- +# Database PVC — k3s local-path (default storage class) for SQLite. +# RWO is fine: the deployment runs with a single replica. 
+apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: shower-data + namespace: shower +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 2Gi diff --git a/argocd/manifests/shower/service.yaml b/argocd/manifests/shower/service.yaml new file mode 100644 index 0000000..0a73aab --- /dev/null +++ b/argocd/manifests/shower/service.yaml @@ -0,0 +1,13 @@ +apiVersion: v1 +kind: Service +metadata: + name: shower + namespace: shower +spec: + selector: + app: shower + ports: + - name: http + port: 8000 + targetPort: 8000 + protocol: TCP diff --git a/containers/shower/default.nix b/containers/shower/default.nix new file mode 100644 index 0000000..d9863e1 --- /dev/null +++ b/containers/shower/default.nix @@ -0,0 +1,259 @@ +# Nix-built shower app container — Adelaide / Heidi / Addie baby shower. +# +# The app is published as a wheel to the Forgejo PyPI index at +# https://forge.eblu.me/api/packages/eblume/pypi/. The wheel + its +# transitive Python deps are baked in at build time via a fixed-output +# derivation that runs `pip install --target` against forge PyPI (proxied +# through pypi.ops.eblu.me for upstream packages). Build runs on the +# nix-container-builder runner (ringtail, amd64) so the image is native. +# +# Going through pip-install-target rather than nixpkgs Python packages +# sidesteps two issues we hit going through `python.pkgs.buildPythonPackage`: +# 1. python314Packages.django still aliases to Django 4.2 LTS, which +# doesn't support Python 3.14 at all. +# 2. django-axes pulls selenium + browser fonts into its check phase +# and the nix sandbox can't provide those. +# +# To bump the version: +# 1. Update `version` below. +# 2. Set `outputHash` to `pkgs.lib.fakeHash`, run the build, copy the +# real hash out of the error, and commit it. +{ pkgs ? 
import { } }: + +let + version = "1.0.2"; + + python = pkgs.python314; + + # The repo's top-level static/ directory (vendored Sortable + cropper + # JS/CSS, prize placeholder SVG) isn't shipped in the wheel — hatchling + # only packages config/ and shower/, leaving the repo-root static/ + # behind. Pull the sdist (which contains the full source tree) and + # extract just the static/ subtree into the image as /app/static. + # local_settings adds it to STATICFILES_DIRS so collectstatic at boot + # picks it up alongside the Django admin's static files. + # + # Fetched from forge.ops.eblu.me (tailnet) because /api/packages/* is + # blocked at the fly edge — see fly/nginx.conf forge.eblu.me block. + # Hash is the upstream sha256 from forge PyPI's simple index. + showerSdist = pkgs.fetchurl { + name = "adelaide_baby_shower_app-${version}.tar.gz"; + url = "https://forge.ops.eblu.me/api/packages/eblume/pypi/files/adelaide-baby-shower-app/${version}/adelaide_baby_shower_app-${version}.tar.gz"; + hash = "sha256-nlCtlx9zuYaLoJZSckybLV5YPpA8vZamN96O3RXOstM="; + }; + + staticAssets = pkgs.runCommand "shower-static-assets-${version}" { } '' + ${pkgs.gnutar}/bin/tar -xzf ${showerSdist} -C $TMPDIR + cp -r $TMPDIR/adelaide_baby_shower_app-${version}/static $out + ''; + + # Fixed-output derivation: pip-installs the app wheel + every transitive + # dep into a single target dir. FODs get network access in exchange for + # a pinned output hash, which means the whole dependency closure is + # immutable across rebuilds. 
+ pyDepsFOD = pkgs.stdenv.mkDerivation { + pname = "shower-python-deps-fod"; + inherit version; + + dontUnpack = true; + + nativeBuildInputs = [ python pkgs.cacert pkgs.removeReferencesTo ]; + + buildPhase = '' + runHook preBuild + + export HOME=$TMPDIR + export SSL_CERT_FILE=${pkgs.cacert}/etc/ssl/certs/ca-bundle.crt + export PIP_DISABLE_PIP_VERSION_CHECK=1 + + ${python}/bin/python -m venv "$TMPDIR/venv" + "$TMPDIR/venv/bin/pip" install --upgrade pip + "$TMPDIR/venv/bin/pip" install \ + --no-cache-dir \ + --index-url=https://pypi.ops.eblu.me/root/pypi/+simple/ \ + --extra-index-url=https://forge.ops.eblu.me/api/packages/eblume/pypi/simple/ \ + "adelaide-baby-shower-app==${version}" \ + gunicorn + + runHook postBuild + ''; + + installPhase = '' + runHook preInstall + + mkdir -p $out/lib/python3.14 $out/bin + cp -r "$TMPDIR/venv/lib/python3.14/site-packages" $out/lib/python3.14/site-packages + + for script in "$TMPDIR/venv/bin/"*; do + [ -f "$script" ] || continue + name=$(basename "$script") + case "$name" in + python*|pip*|activate*) continue ;; + esac + cp "$script" "$out/bin/$name" + chmod +x "$out/bin/$name" + done + + # --- Strip Nix store references (FOD outputs must be self-contained) --- + # The wrapper derivation below restores them via autoPatchelfHook + a + # python wrapper that points pyc-less imports at the on-image python. + + # Strip bytecode entirely — pyc files embed compile-time paths. + find $out -type f -name '*.pyc' -delete + find $out -type d -name '__pycache__' -exec rm -rf {} + 2>/dev/null || true + + # Dynamically discover all nix store references and strip them. We + # don't have a static list because pip pulls in stdenv via Python's + # build env (gcc-lib, libstdc++, etc.) and the closure is opaque. 
+ { find $out -type f -print0 \ + | xargs -0 grep -aohE '/nix/store/[a-z0-9]{32}-[^/"[:space:]]+' 2>/dev/null \ + || true; } | sort -u > $TMPDIR/store-refs.txt + echo "Found $(wc -l < $TMPDIR/store-refs.txt) unique store path references to strip" + + refs_args="" + while IFS= read -r ref; do + refs_args="$refs_args -t $ref" + done < $TMPDIR/store-refs.txt + + if [ -n "$refs_args" ]; then + find $out -type f -exec remove-references-to $refs_args {} + 2>/dev/null || true + fi + + remaining=$({ find $out -type f -print0 | xargs -0 grep -cl '/nix/store/' 2>/dev/null || true; } | wc -l) + echo "Files with remaining store references: $remaining" + + runHook postInstall + ''; + + outputHashMode = "recursive"; + outputHashAlgo = "sha256"; + # Pinned dep closure — reproducible until version bumps. To recompute, + # set to pkgs.lib.fakeHash and read the failure. + outputHash = "sha256-tSTH/HaDY7M0qxlauBTM+JekZAgF++K2lGP3PLvym/o="; + + dontFixup = true; + }; + + # Non-FOD wrapper: re-applies RPATHs to pre-built .so files (pillow, + # scipy) so they find libstdc++ / libz / etc. at runtime. autoPatchelfHook + # discovers needed libraries from buildInputs. + pyDeps = pkgs.stdenv.mkDerivation { + pname = "shower-python-deps"; + inherit version; + + dontUnpack = true; + + nativeBuildInputs = [ pkgs.autoPatchelfHook ]; + + buildInputs = with pkgs; [ + python + stdenv.cc.cc.lib # libstdc++, libgcc_s + zlib + libjpeg + libwebp + libtiff + openjpeg + lcms2 + freetype + ]; + + installPhase = '' + cp -r ${pyDepsFOD} $out + chmod -R u+w $out + ''; + }; + + sitePackages = "${pyDeps}/lib/python3.14/site-packages"; + + # Settings shim — config/settings.py's `BASE_DIR = parent.parent` would + # otherwise resolve to site-packages, scattering db.sqlite3 / media / + # staticfiles into the venv. Pin them to /app/{data,media,data/staticfiles}. 
+ localSettings = pkgs.writeText "local_settings.py" '' + from pathlib import Path + + from config.settings import * # noqa: F401,F403 + + DATABASES["default"]["NAME"] = "/app/data/db.sqlite3" + MEDIA_ROOT = "/app/media" + STATIC_ROOT = "/app/data/staticfiles" + # /app/static comes from the repo-root static/ subtree of the sdist + # (see default.nix staticAssets). Added because the wheel doesn't + # ship vendored Sortable/cropper assets. + STATICFILES_DIRS = [Path("/app/static")] + ''; + + # PYTHONPATH, DJANGO_SETTINGS_MODULE, PATH, and HOME live in the image's + # `Env` block below — that way `kubectl exec deploy/shower -- python -m + # django ` Just Works without an inline `env` ceremony. + # The entrypoint just changes directory and runs the boot sequence. + entrypoint = pkgs.writeShellScript "shower-entrypoint" '' + set -eu + + cd /app + + mkdir -p /app/data /app/media + + echo "shower: running migrations" + python -m django migrate --noinput + + echo "shower: collecting static files" + python -m django collectstatic --noinput --clear + + echo "shower: starting gunicorn" + exec gunicorn \ + --bind 0.0.0.0:8000 \ + --workers 2 \ + --forwarded-allow-ips='*' \ + config.wsgi:application + ''; +in + +pkgs.dockerTools.buildLayeredImage { + name = "blumeops/shower"; + contents = [ + python + pyDeps + pkgs.cacert + pkgs.tzdata + pkgs.bashInteractive + pkgs.coreutils + ]; + + extraCommands = '' + mkdir -p app/data app/media tmp + chmod 1777 tmp + cp ${localSettings} app/local_settings.py + cp -r ${staticAssets} app/static + chmod -R u+w app/static + ''; + + fakeRootCommands = '' + chown -R 1000:1000 app + ''; + enableFakechroot = true; + + config = { + Entrypoint = [ "${entrypoint}" ]; + Env = [ + "SSL_CERT_FILE=${pkgs.cacert}/etc/ssl/certs/ca-bundle.crt" + "TZDIR=${pkgs.tzdata}/share/zoneinfo" + "TZ=America/Los_Angeles" + "TMPDIR=/tmp" + "LANG=C.UTF-8" + "LC_ALL=C.UTF-8" + "PYTHONDONTWRITEBYTECODE=1" + "HOME=/app/data" + "PATH=${pyDeps}/bin:${python}/bin:/bin" + # /app 
first so local_settings.py is importable; sitePackages second so + # django, gunicorn, etc. resolve. Inherited by entrypoint + any + # `kubectl exec` so manual django subcommands work without ceremony. + "PYTHONPATH=/app:${sitePackages}" + "DJANGO_SETTINGS_MODULE=local_settings" + ]; + ExposedPorts = { + "8000/tcp" = { }; + }; + User = "1000"; + WorkingDir = "/app"; + }; +} diff --git a/docs/changelog.d/shower-app-deploy.bugfix.md b/docs/changelog.d/shower-app-deploy.bugfix.md new file mode 100644 index 0000000..91d2b3b --- /dev/null +++ b/docs/changelog.d/shower-app-deploy.bugfix.md @@ -0,0 +1,13 @@ +Shower app container now bakes the wheel + Python deps into the image +at build time via a fixed-output `pip install` derivation instead of pip-installing on +first boot. Boots are deterministic and don't depend on forge PyPI +being reachable from the pod. The sdist `hash` in +`containers/shower/default.nix` is the sha256 sourced from the +[forge PyPI simple index](https://forge.eblu.me/api/packages/eblume/pypi/simple/adelaide-baby-shower-app/); +bumping the version means bumping that hash and the deps `outputHash` too. + +Borgmatic now covers the shower app: SQLite is dumped from the live +pod via `kubectl exec` (mirroring the existing mealie entry, with +`context: k3s-ringtail`), and the prize-photo media share is picked up +through `/Volumes/shower` (sifaka SMB mount on indri, same pattern as +`/Volumes/photos`). diff --git a/docs/changelog.d/shower-app-deploy.feature.md b/docs/changelog.d/shower-app-deploy.feature.md new file mode 100644 index 0000000..96218be --- /dev/null +++ b/docs/changelog.d/shower-app-deploy.feature.md @@ -0,0 +1,4 @@ +Deploy the Adelaide / Heidi / Addie baby shower app — guest splash, raffle +picker, and prize assignment console — on ringtail k3s with `shower.eblu.me` +as the public entry and `shower.ops.eblu.me` as the tailnet admin host. App +source: [`adelaide-baby-shower-app`](https://forge.eblu.me/eblume/adelaide-baby-shower-app). 
diff --git a/docs/changelog.d/shower-app-deploy.infra.md b/docs/changelog.d/shower-app-deploy.infra.md new file mode 100644 index 0000000..157a068 --- /dev/null +++ b/docs/changelog.d/shower-app-deploy.infra.md @@ -0,0 +1,9 @@ +Wire shower app for public exposure: fly nginx `shower.eblu.me` server +block as a guest-only surface — splash page, `/prizes//`, static +assets, media. Everything authenticated (`/admin/`, `/host/`, +`/accounts/`) returns 403 with a "tailnet only" pointer. Staff hit +`shower.ops.eblu.me` for the operator console + admin; the app's +v1.0.1 `DJANGO_PUBLIC_URL_BASE` setting makes QR codes generated on +the tailnet point back at the WAN host for guests. Plus a Caddy route +on indri, Pulumi Gandi CNAME, and a Grafana APM dashboard tracking +request rate, error rate, latency, bandwidth, and access logs. diff --git a/docs/how-to/operations/shower-on-ringtail.md b/docs/how-to/operations/shower-on-ringtail.md new file mode 100644 index 0000000..daf1046 --- /dev/null +++ b/docs/how-to/operations/shower-on-ringtail.md @@ -0,0 +1,245 @@ +--- +title: Shower App on Ringtail +modified: 2026-05-10 +last-reviewed: 2026-05-10 +tags: + - how-to + - operations + - kubernetes + - django +--- + +# Shower App on Ringtail + +How the Adelaide / Heidi / Addie baby shower app is deployed. The app is a +Django project ([`adelaide-baby-shower-app`](https://forge.eblu.me/eblume/adelaide-baby-shower-app)) +released as a wheel to the Forgejo Packages PyPI index and run on +[[ringtail]]'s k3s cluster. Public landing page at `shower.eblu.me`, staff +console + admin UI at `shower.ops.eblu.me` (tailnet only). + +The contract this deploy implements is defined in the app repo's +`docs/how-to/hosting.md` — read that for the env-var contract, security +model, and storage requirements before changing anything here. 
+ +## Routing + +``` +Internet → shower.eblu.me + │ (Fly.io nginx — public) + ▼ + Caddy on indri (shower.ops.eblu.me) + │ + ▼ + Tailscale ProxyGroup ingress (shower.tail8d86e.ts.net) + │ + ▼ + Service shower:8000 → Pod (Django + gunicorn) +``` + +| Hostname | Reachable from | Notes | +|---|---|---| +| `shower.eblu.me` | Public internet | Guest surface only — splash, `/prizes//`, `/static/`, `/media/`. Everything authenticated 403s with a tailnet pointer. | +| `shower.ops.eblu.me` | Tailnet | Full app surface — `/host/`, `/admin/`, the works | +| `shower.tail8d86e.ts.net` | Tailnet | Bare ProxyGroup endpoint Caddy proxies to | + +## Defense layers (public side) + +The public surface is guest-only, so the threat model collapses: there +is no credential-accepting endpoint reachable from WAN, and nothing on +WAN that requires authentication. + +1. **edge auth lockout** — fly nginx 403s `/admin/`, `/host/`, and + anything that would redirect into them. Anyone hitting an auth URL + on WAN gets a "tailnet only" message. +2. **fly nginx `limit_req zone=general`** — 10 r/s per Fly-Client-IP + cushion for the splash form. +3. **django-axes** — 5 fails / 1 hour lockout per `(username, ip_address)`, + running on the tailnet-side login. Provides the only credential + defense, since brute-force is only reachable to tailnet members. + +The QR codes that `/host/` (on tailnet) generates for guests embed +`https://shower.eblu.me/...` even though the QR view is served from +the tailnet host. The app's `PUBLIC_URL_BASE` setting (added in v1.0.1) +overrides Django's `request.build_absolute_uri()` for those URLs. 
+ +## Persistent storage + +| Mount | PVC | Type | Why | +|---|---|---|---| +| `/app/media` | `shower-media` | NFS RWX on sifaka (`/volume1/shower`) | Prize photos survive pod rescheduling | +| `/app/data` | `shower-data` | k3s `local-path` RWO | SQLite DB; NFS file locking can't be trusted for WAL/journal | + +The container has the app + its Python deps baked in at nix build time +(a fixed-output `pip install` of the wheel published on forge PyPI). The +entrypoint runs migrations, runs `collectstatic`, and `exec`s gunicorn — +no pip-at-boot. A `local_settings.py` shim overrides `DATABASES.NAME`, +`MEDIA_ROOT`, and `STATIC_ROOT` to absolute paths under `/app/`, +sidestepping the wheel's `BASE_DIR = parent.parent` of an +in-site-packages settings module. + +## Backups + +[[borgmatic]] (running on indri) captures both halves of the persistent +state on its daily 2 a.m. run: + +- **`/app/data/db.sqlite3`** — dumped via `kubectl exec`'s + `sqlite3.backup()` against the live pod (entry in + `borgmatic_k8s_sqlite_dumps`, context `k3s-ringtail`). The dumped + file lands in `borgmatic_k8s_dump_dir` on indri and is picked up by + the main source-directory sweep. +- **`/app/media`** — picked up via `/Volumes/shower`, the SMB mount of + `sifaka:/volume1/shower` on indri. The same Synology share is exposed + via SMB *and* NFS simultaneously; ringtail's pod uses the NFS export, + while indri reads the SMB side for the borgmatic source. + +Both archive to [[sifaka]] (`borg-backups`) and BorgBase offsite, with +retention `keep_daily=7 / keep_monthly=12 / keep_yearly=1000`. + +The SMB mount on indri is set up manually once via Finder (Cmd-K → +`smb://sifaka/shower`, save credentials, "Always log in" so it +reconnects after reboot). If `/Volumes/shower` is missing at backup +time borgmatic will fail loudly — `source_directories_must_exist: true` +applies to all entries. 
+ +## One-time setup steps + +These steps are required the first time the service is deployed and are +not encoded in the manifests. + +### 1. NFS + SMB share on sifaka + +On the Synology DSM web UI: + +1. **Control Panel → Shared Folder → Create**. Name: `shower`, + Location: Volume 1. Leave the rest at default. +2. **Control Panel → File Services → NFS → NFS Rules** (on the + `shower` row's *Permissions* tab). Add a rule mirroring the other + shares' pattern: Hostname/IP=`192.168.1.0/24` and again for + `100.64.0.0/10`, Privilege=Read/Write, Squash=`Map all users to + admin` (= `all_squash`), and tick *Allow connections from + non-privileged ports*. (See [[sifaka#NFS Exports]] — the existing + `frigate`, `paperless`, etc. shares use this exact pattern.) +3. **Control Panel → File Services → SMB**: leave SMB enabled + globally. No per-share rule required — the share inherits the + default `eblume` access. +4. The directory ownership at `/volume1/shower` will end up + `root:root`, mode `0777` (DSM default) — which is fine because + `all_squash` rewrites every NFS write to `admin:users`, and the + `0777` lets pods read what other pods wrote. No `chown` needed. + +After the share exists, mount it on indri for borgmatic: + +- In Finder, **Cmd-K → `smb://sifaka/shower`**, sign in as `eblume`, + and tick **Remember in Keychain** + **Always log in** so it + reconnects on reboot. This produces `/Volumes/shower`, which the + borgmatic source-directory list points at. + +### 2. 1Password item + +Item name: **`Shower (blumeops)`** in the `blumeops` vault. +Required property: + +| Field | Value | +|---|---| +| `secret-key` | Output of `openssl rand -base64 48` | + +The `ExternalSecret` `shower-app-secrets` will sync this into the +`shower` namespace as a `Secret` and `envFrom` exposes it as +`DJANGO_SECRET_KEY` to the container. 
+ +**Never reuse a key that has ever been in git history.** Per the app's +hosting.md, an early dev key was committed before being replaced with +the `django-insecure-...` placeholder; the production key must be +freshly generated. + +### 3. Container image + +Built by the `build-container` Forgejo Actions workflow on the +`nix-container-builder` runner (ringtail, amd64). The wheel is fetched +from forge PyPI at nix build time and baked into the image — no +pip-at-runtime. To bump the version, change `version` in +`containers/shower/default.nix` and update `outputHash` (or set it to +`pkgs.lib.fakeHash` and let the next build print the correct one). + +Trigger with: + +```fish +mise run container-build-and-release shower +``` + +After the workflow finishes, update `images[].newTag` in +`argocd/manifests/shower/kustomization.yaml` to the resulting +`vX.Y.Z-<sha>-nix` tag, then commit (C0). + +### 4. DNS + +`pulumi/gandi/__main__.py` declares the `shower-public` CNAME pointing +at `blumeops-proxy.fly.dev.`. Apply with: + +```fish +mise run dns-preview +mise run dns-up +``` + +### 5. Fly.io certificate + +```fish +fly certs add shower.eblu.me -a blumeops-proxy +``` + +(Add to `mise-tasks/fly-setup` so re-runs of the one-time setup pick +it up.) + +### 6. Caddy on indri + +`shower` is in `ansible/roles/caddy/defaults/main.yml`. Push with: + +```fish +mise run provision-indri -- --tags caddy +``` + +### 7. Create the admin user + +The container's entrypoint runs `migrate --noinput` + `collectstatic +--noinput --clear` before gunicorn, so a fresh `db.sqlite3` is schema- +ready as soon as the pod boots. It does *not* create a Django superuser +— that has to happen once, interactively, after the first pod is up: + +```fish +kubectl --context=k3s-ringtail -n shower exec -it deploy/shower -- \ + python -m django createsuperuser +``` + +Use `erich` / your usual email. The same account doubles as the +`@staff_member_required` login for `/host/`. 
Subsequent staff accounts +can be created from `/admin/auth/user/` once you're signed in. + +## Deploying a new version + +1. Bump the wheel version in the app repo (`adelaide-baby-shower-app`) + and release it to Forgejo PyPI. +2. Bump `appVersion` in `containers/shower/default.nix` to match. +3. `mise run container-build-and-release shower`. Verify the build + with `mise run runner-logs`. +4. Update the `newTag` in `argocd/manifests/shower/kustomization.yaml` + to the new `[main]` SHA tag. +5. Commit (C0 after PR merge — see [[build-container-image#Squash-merge and container tags]]). +6. `argocd app sync shower`. + +## Verifying after a deploy + +```fish +kubectl --context=k3s-ringtail -n shower get pods +kubectl --context=k3s-ringtail -n shower logs deploy/shower +curl -sf https://shower.ops.eblu.me/ # tailnet +curl -sf https://shower.eblu.me/ # public +curl -I https://shower.eblu.me/admin/users/ # expect 403 (edge block) +curl -I https://shower.ops.eblu.me/admin/ # expect 200 / 302 (login) +``` + +## Related + +- [[expose-service-publicly]] — Fly.io proxy + Tailscale pattern +- [[deploy-k8s-service]] — generic ArgoCD service onboarding +- [[ringtail]] — the cluster +- [`hosting.md`](https://forge.eblu.me/eblume/adelaide-baby-shower-app/src/branch/main/docs/how-to/hosting.md) — app's deployment contract diff --git a/docs/reference/kubernetes/apps.md b/docs/reference/kubernetes/apps.md index 80ea72e..fd5c06f 100644 --- a/docs/reference/kubernetes/apps.md +++ b/docs/reference/kubernetes/apps.md @@ -41,6 +41,7 @@ Registry of all applications deployed via [[argocd]]. 
| `ollama` | ollama | `argocd/manifests/ollama/` | [[ollama]] | | `mealie` | mealie | `argocd/manifests/mealie/` | [[mealie]] | | `paperless` | paperless | `argocd/manifests/paperless/` | [[paperless]] | +| `shower` | shower | `argocd/manifests/shower/` | [[shower-app]] | | `prowler` | prowler | `argocd/manifests/prowler/` | [[prowler]] | ## Sync Policies diff --git a/docs/reference/services/shower-app.md b/docs/reference/services/shower-app.md new file mode 100644 index 0000000..26d1764 --- /dev/null +++ b/docs/reference/services/shower-app.md @@ -0,0 +1,55 @@ +--- +title: Shower App +modified: 2026-05-10 +last-reviewed: 2026-05-10 +tags: + - service + - django +--- + +# Shower App + +Django web app for Adelaide / Heidi / Addie's baby shower — guest splash with +a "what did you bring?" form, raffle picker, contest-prize ranking via +QR-coded `/prizes/<id>/` URLs, and a `/host/` operator console with +drag-rank assignment solving via scipy. + +## Quick Reference + +| Property | Value | +|----------|-------| +| **Public URL** | `shower.eblu.me` (guest surface only — via [[flyio-proxy]]) | +| **Private URL** | `shower.ops.eblu.me` (admin + `/host/` console — Caddy on indri) | +| **Cluster** | [[ringtail]] k3s, namespace `shower` | +| **Container** | `registry.ops.eblu.me/blumeops/shower` (built from `containers/shower/default.nix`) | +| **App source** | `forge.eblu.me/eblume/adelaide-baby-shower-app` (wheel on Forgejo PyPI) | +| **Database** | SQLite on a local-path PVC (`shower-data`, RWO 2 Gi) | +| **Media (prize photos)** | NFS RWX PVC `shower-media` → `sifaka:/volume1/shower` | +| **Secrets** | `Shower (blumeops)` 1Password item → `DJANGO_SECRET_KEY` | + +## Routing + +``` +Internet → shower.eblu.me (Fly nginx, guest-only 403s on /admin/ /host/) + │ + ▼ + Caddy on indri (shower.ops.eblu.me — full surface) + │ + ▼ + Tailscale ProxyGroup → k3s Service → Deployment +``` + +## Backups + +- **SQLite** dumped via `kubectl exec` to indri's `borgmatic_k8s_dump_dir` on every
2 a.m. run (mealie-pattern entry in `borgmatic_k8s_sqlite_dumps`) +- **Media** picked up via `/Volumes/shower` (sifaka SMB mount on indri) in the main `borgmatic_source_directories` list + +Both archive to sifaka + BorgBase. + +## Related + +- [[shower-on-ringtail]] — onboarding + day-of runbook +- [[expose-service-publicly]] — Fly proxy + tailnet pattern this rides on +- [[ringtail]] — host cluster +- [[sifaka#NFS Exports]] — NFS share table +- [[borgmatic]] — backup system diff --git a/docs/tutorials/expose-service-publicly.md b/docs/tutorials/expose-service-publicly.md index 6bc8fae..886cad4 100644 --- a/docs/tutorials/expose-service-publicly.md +++ b/docs/tutorials/expose-service-publicly.md @@ -176,17 +176,39 @@ Indri carries `tag:flyio-target` so the Fly proxy can reach Caddy. No per-servic Deploy: `mise run tailnet-preview` then `mise run tailnet-up`. -After deploying, extract the auth key and set it as a Fly.io secret: +After deploying, push the auth key to Fly.io. The simplest path is +`mise run fly-setup`, which reads the current value from Pulumi state +and stages it as a Fly.io secret: ```bash -# Get the key from Pulumi state -cd pulumi/tailscale && pulumi stack output flyio_authkey --show-secrets - -# Set it in Fly.io -fly secrets set TS_AUTHKEY="tskey-auth-..." -a blumeops-proxy +mise run fly-setup ``` -Store the auth key in 1Password as well for the `fly-setup` mise task. +Manual equivalent for reference: + +```bash +cd pulumi/tailscale && pulumi stack output flyio_authkey --show-secrets +# then in fly/: +fly secrets set TS_AUTHKEY="tskey-auth-..." -a blumeops-proxy --stage +``` + +**Pulumi state is the only source of truth for this key.** No other +process (mise tasks, ansible, scripts) reads it from anywhere else — +in particular, the key is not stored in 1Password. 
To rotate +(every 90 days, or after a compromise), force-replace the resource +and re-run `fly-setup`: + +```bash +mise run tailnet-up -- \ + --replace='urn:pulumi:tail8d86e::blumeops-tailnet::tailscale:index/tailnetKey:TailnetKey::flyio-proxy-key' +mise run fly-setup +mise run fly-deploy +``` + +Pulumi destroys the old key and mints a new 90-day one in a single +operation. Older fly machines that already authed against the old key +are unaffected (they don't need it after the initial join); only +*new* machine starts read the rotated value. ### Step 4: Mise tasks diff --git a/fly/fail2ban/action.d/nginx-deny.conf b/fly/fail2ban/action.d/nginx-deny.conf index 1d3737b..bab8abb 100644 --- a/fly/fail2ban/action.d/nginx-deny.conf +++ b/fly/fail2ban/action.d/nginx-deny.conf @@ -2,13 +2,22 @@ # Standard iptables banning won't work in Fly.io because $remote_addr # is Fly's internal proxy IP. Instead, we write banned IPs to a file # that nginx checks via a geo directive keyed on $http_fly_client_ip. +# +# The deny file is per-service: each jail sets `nginx_deny_file = ...` +# (see jail.d/*.conf) and a matching `geo $http_fly_client_ip $..._banned` +# block in nginx.conf includes the same path. [Definition] -actionban = echo "<ip> 1;" >> /etc/nginx/forge-deny.conf && nginx -s reload +actionban = echo "<ip> 1;" >> <nginx_deny_file> && nginx -s reload -actionunban = sed -i '/<ip> 1;/d' /etc/nginx/forge-deny.conf && nginx -s reload +actionunban = sed -i '/<ip> 1;/d' <nginx_deny_file> && nginx -s reload actionstart = actionstop = actioncheck = + +[Init] + +# Default for jails that don't override (preserves forge behaviour). +nginx_deny_file = /etc/nginx/forge-deny.conf diff --git a/fly/nginx.conf b/fly/nginx.conf index 5e49d88..570e6c9 100644 --- a/fly/nginx.conf +++ b/fly/nginx.conf @@ -34,6 +34,15 @@ http { # bucket. $http_fly_client_ip has the actual client IP.
limit_req_zone $http_fly_client_ip zone=forge_auth:10m rate=3r/s; + # Shower-specific zone: loose enough that ~30 guests sharing a single + # venue-wifi NAT'd public IP can all scan the QR and load the splash + # (HTML + a handful of static asset hits each) without anyone tripping + # the limit. 50r/s + burst=200 covers the simultaneous-load spike; + # exploit scanners still trip it (e.g. the .env-sweeping bot we saw + # fired ~30 req in 2s — that pattern stays caught). See the + # shower.eblu.me server block for the matching `limit_req`. + limit_req_zone $http_fly_client_ip zone=shower_general:10m rate=50r/s; + # fail2ban deny list — banned IPs are written here by fail2ban and # checked via the $forge_banned variable. The file is touched at # container start to ensure it exists. @@ -184,6 +193,23 @@ http { return 200 "User-agent: *\nDisallow: /mirrors/\nDisallow: /user/\nDisallow: /users/\nDisallow: /*/archive/\nDisallow: /*/releases/download/\n"; } + # Block the package registry at the public edge. Forgejo's per-user + # visibility model treats packages as world-readable when the owner + # has Visibility=Public — which means anyone on the internet can + # enumerate and download every wheel/sdist/generic artifact, even + # for private-repo releases (the sdist contains full source). We + # like keeping eblume's profile public, so we close the hole here + # at the proxy instead: WAN sees 403, tailnet (forge.ops.eblu.me) + # stays open for legitimate consumers (CI workflows, gilbert). + # See docs/tutorials/expose-service-publicly.md for the broader + # threat model on this proxy. 
+ location /api/packages/ { + return 403 "Package downloads are tailnet-only — use forge.ops.eblu.me.\n"; + } + location /api/v1/packages { + return 403 "Package enumeration is tailnet-only — use forge.ops.eblu.me.\n"; + } + # Block swagger API docs — use forge.ops.eblu.me from tailnet location /swagger { return 403 "API documentation is only available at forge.ops.eblu.me (tailnet).\n"; @@ -288,6 +314,140 @@ http { } } + # --- shower.eblu.me (Adelaide baby shower — guest-only public surface) --- + # Only the guest paths (`/`, `/prizes/<id>/`, /static/, /media/) are + # exposed on WAN. /host/, /admin/, and Django's login views are blocked + # at the edge with a 403 pointing at the tailnet hostname — staff sign + # in on shower.ops.eblu.me, which is reachable from any device with + # Tailscale installed. Defense layers reduce to: general per-IP rate + # limit + django-axes (5 fails / 1h) on the tailnet-side login. No + # fail2ban needed here because the public surface no longer takes + # credentials of any kind. + server { + listen 8080; + server_name shower.eblu.me; + + # Per-IP rate limit. shower_general (50r/s, burst=200) instead of + # the global `general` zone because at the party, guests on the + # venue's wifi all NAT through a single Fly-Client-IP — 30 guests + # scanning the QR at once would each fetch HTML + a few static + # assets, easily clearing 20 burst on `general`. Exploit scanners + # still trip it (sustained ≫ 50r/s patterns). + limit_req zone=shower_general burst=200 nodelay; + + # Image uploads from /host/'s prize cropper are ~150-300 KiB JPEGs. + # The host page itself isn't reachable here, and this directive caps + # request bodies only (not /media/ reads); 5 MiB matches Django. + client_max_body_size 5m; + + # Security headers — HSTS matches Django's SECURE_HSTS_SECONDS.
+ add_header X-Frame-Options "DENY" always; + add_header X-Content-Type-Options "nosniff" always; + add_header Strict-Transport-Security "max-age=31536000; includeSubDomains" always; + add_header Referrer-Policy "same-origin" always; + # GNU Terry Pratchett — keep the name moving. + add_header X-Clacks-Overhead "GNU Terry Pratchett" always; + + error_page 502 503 504 /error.html; + location = /error.html { + root /usr/share/nginx/html; + internal; + } + + # Reject indexers — there's nothing here we want crawled. + location = /robots.txt { + default_type text/plain; + return 200 "User-agent: *\nDisallow: /\n"; + } + + # Admin surface: tailnet-only. Anything under /admin/ — login, + # logout, CRUD UI, password reset — returns 403 with a pointer to + # the tailnet host. Django's `staff_member_required` will redirect + # /host/ to /admin/login/, which lands on this 403 if a guest + # device wanders into it. Staff hit the tailnet host directly. + location /admin/ { + return 403 "Authentication is tailnet-only — visit shower.ops.eblu.me.\n"; + } + + # Operator console: tailnet-only. Same rationale as /admin/. + location /host/ { + return 403 "The host console is tailnet-only — visit shower.ops.eblu.me.\n"; + } + + # Static assets — WhiteNoise + CompressedManifestStaticFilesStorage + # gives content-hashed filenames, so cache aggressively. Hashed + # names make cache invalidation automatic on app upgrades. 
+ location /static/ { + proxy_pass https://indri_backend$request_uri; + proxy_ssl_verify off; + proxy_ssl_server_name on; + proxy_ssl_name shower.ops.eblu.me; + + proxy_http_version 1.1; + proxy_set_header Connection $connection_upgrade; + proxy_set_header Host shower.ops.eblu.me; + proxy_set_header X-Real-IP $http_fly_client_ip; + proxy_set_header X-Forwarded-For $http_fly_client_ip; + proxy_set_header X-Forwarded-Proto $scheme; + + proxy_cache services; + proxy_cache_valid 200 1y; + proxy_cache_valid 404 1m; + proxy_cache_use_stale error timeout updating; + proxy_cache_lock on; + proxy_cache_key $host$uri; + proxy_ignore_headers Cache-Control Set-Cookie; + + add_header X-Cache-Status $upstream_cache_status; + } + + # Prize photo uploads. Shorter TTL than /static/ because filenames + # aren't content-hashed — operators can re-upload a prize photo + # and we want guests to see the new image within a day. + location /media/ { + proxy_pass https://indri_backend$request_uri; + proxy_ssl_verify off; + proxy_ssl_server_name on; + proxy_ssl_name shower.ops.eblu.me; + + proxy_http_version 1.1; + proxy_set_header Connection $connection_upgrade; + proxy_set_header Host shower.ops.eblu.me; + proxy_set_header X-Real-IP $http_fly_client_ip; + proxy_set_header X-Forwarded-For $http_fly_client_ip; + proxy_set_header X-Forwarded-Proto $scheme; + + proxy_cache services; + proxy_cache_valid 200 1d; + proxy_cache_valid 404 1m; + proxy_cache_use_stale error timeout updating; + proxy_cache_lock on; + proxy_cache_key $host$uri; + proxy_ignore_headers Cache-Control Set-Cookie; + + add_header X-Cache-Status $upstream_cache_status; + } + + location / { + proxy_pass https://indri_backend$request_uri; + proxy_ssl_verify off; + proxy_ssl_server_name on; + proxy_ssl_name shower.ops.eblu.me; + proxy_intercept_errors on; + + # No proxy_cache — dynamic content with sessions and CSRF. 
+ + proxy_set_header Host shower.ops.eblu.me; + proxy_set_header X-Real-IP $http_fly_client_ip; + proxy_set_header X-Forwarded-For $http_fly_client_ip; + proxy_set_header X-Forwarded-Proto $scheme; + + proxy_http_version 1.1; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection $connection_upgrade; + } + } + # Catch-all: reject unknown hosts, but serve health check server { listen 8080 default_server; diff --git a/fly/start.sh b/fly/start.sh index 1f2acaa..a924849 100644 --- a/fly/start.sh +++ b/fly/start.sh @@ -20,6 +20,7 @@ done echo "MagicDNS ready" # Ensure fail2ban deny file exists before nginx starts +# (the geo directive's `include` fails if the file is missing). touch /etc/nginx/forge-deny.conf # Start nginx — MagicDNS is available, upstreams resolved. diff --git a/mise-tasks/fly-setup b/mise-tasks/fly-setup index 0c5cb56..be797e5 100755 --- a/mise-tasks/fly-setup +++ b/mise-tasks/fly-setup @@ -23,6 +23,7 @@ echo "IPs allocated" fly certs add docs.eblu.me -a "$APP" 2>/dev/null || true fly certs add cv.eblu.me -a "$APP" 2>/dev/null || true fly certs add forge.eblu.me -a "$APP" 2>/dev/null || true +fly certs add shower.eblu.me -a "$APP" 2>/dev/null || true echo "Certificates configured" echo "Done. Run 'mise run fly-deploy' to deploy." 
diff --git a/pulumi/gandi/__main__.py b/pulumi/gandi/__main__.py index bda7a8a..25fd0f7 100644 --- a/pulumi/gandi/__main__.py +++ b/pulumi/gandi/__main__.py @@ -85,6 +85,15 @@ forge_public = gandi.livedns.Record( values=["blumeops-proxy.fly.dev."], ) +shower_public = gandi.livedns.Record( + "shower-public", + zone=domain, + name="shower", + type="CNAME", + ttl=300, + values=["blumeops-proxy.fly.dev."], +) + # ============== Exports ============== pulumi.export("domain", domain) pulumi.export("wildcard_fqdn", f"*.{subdomain}.{domain}") @@ -93,3 +102,4 @@ pulumi.export("target_ip", tailscale_ip) pulumi.export("docs_public_fqdn", f"docs.{domain}") pulumi.export("cv_public_fqdn", f"cv.{domain}") pulumi.export("forge_public_fqdn", f"forge.{domain}") +pulumi.export("shower_public_fqdn", f"shower.{domain}") diff --git a/service-versions.yaml b/service-versions.yaml index f7f0f4e..74d467e 100644 --- a/service-versions.yaml +++ b/service-versions.yaml @@ -44,6 +44,16 @@ services: upstream-source: https://github.com/gethomepage/homepage/releases notes: Custom container, kustomize manifests + - name: shower + type: argocd + last-reviewed: 2026-05-10 + current-version: "1.0.2" + upstream-source: https://forge.eblu.me/eblume/adelaide-baby-shower-app + notes: | + Django app for Adelaide / Heidi / Addie's baby shower. Wheel + published to Forgejo Packages PyPI; runs on ringtail k3s. Public + at shower.eblu.me (fly proxy), tailnet admin at shower.ops.eblu.me. + - name: nvidia-device-plugin type: argocd last-reviewed: 2026-03-27 @@ -96,6 +106,15 @@ services: current-version: "v1.94.2" upstream-source: https://github.com/tailscale/tailscale/releases + - name: tailscale + type: container + last-reviewed: 2026-05-10 + current-version: "1.94.2" + upstream-source: https://github.com/tailscale/tailscale/releases + notes: | + Locally mirrored tailscale image used by ringtail's tailscale-operator + ProxyClass. Built via containers/tailscale/default.nix. 
+ - name: grafana type: argocd last-reviewed: 2026-04-02 From 40d9a1ef9e1e1f18128877b671025ea5b1d89e04 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Mon, 11 May 2026 13:55:25 -0700 Subject: [PATCH 061/122] =?UTF-8?q?C0:=20shower=20=E2=80=94=20rebuild=20fr?= =?UTF-8?q?om=20main=20SHA=20(post-PR-349=20retag)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Standard squash-merge dance per docs/how-to/deployment/build-container-image.md#Squash-merge-and-container-tags — retags from v1.0.2-039d9b9-nix (branch SHA) to v1.0.2-292d354-nix ([main] tag from run 544 built off the merge commit). Functionally identical; preserves source traceability. Co-Authored-By: Claude Opus 4.7 (1M context) --- argocd/manifests/shower/kustomization.yaml | 2 +- docs/changelog.d/+shower-main-sha-rebuild.infra.md | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) create mode 100644 docs/changelog.d/+shower-main-sha-rebuild.infra.md diff --git a/argocd/manifests/shower/kustomization.yaml b/argocd/manifests/shower/kustomization.yaml index 0afc8e3..d2ce83c 100644 --- a/argocd/manifests/shower/kustomization.yaml +++ b/argocd/manifests/shower/kustomization.yaml @@ -14,4 +14,4 @@ resources: images: - name: registry.ops.eblu.me/blumeops/shower - newTag: v1.0.2-039d9b9-nix + newTag: v1.0.2-292d354-nix diff --git a/docs/changelog.d/+shower-main-sha-rebuild.infra.md b/docs/changelog.d/+shower-main-sha-rebuild.infra.md new file mode 100644 index 0000000..f1751b5 --- /dev/null +++ b/docs/changelog.d/+shower-main-sha-rebuild.infra.md @@ -0,0 +1,5 @@ +Rebuild shower from the post-merge commit on main so the container's +SHA tag points at a commit that will still exist after the 30-day +branch-cleanup window. Functionally identical to the branch-tag image +already deployed, just preserves source traceability per +[[build-container-image#Squash-merge and container tags]]. 
From f83be3bf370105b6ad896353b117da34b78285c1 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Mon, 11 May 2026 16:10:39 -0700 Subject: [PATCH 062/122] C1: review CC observability-stack-audit (extend to k3s) (#353) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary - Recurring compensating-control review (oldest stale control: 42 days). - Verified the control is in effect on both clusters: - `alloy-k8s` on minikube-indri — Synced/Healthy, DaemonSet 1/1 ready - `alloy-ringtail` on k3s-ringtail — Synced/Healthy - `loki` (`monitoring/loki-0`) — Running, receiving logs (52 restarts in 18h is worth watching but not blocking review) - Generalized the description: previously named only minikube, but the indri→ringtail migration means we now operate two clusters and both rely on this control. - Added a follow-up note: enabling native apiserver audit logging is far more tractable on k3s (`--audit-log-path` / `--audit-policy-file`) than it was on minikube — worth revisiting once the migration concludes. ## Test plan - [x] `prek` hooks pass - [x] Verified alloy + loki status via `kubectl --context=minikube-indri` and `argocd app get` ## Notes - No deployment changes. Reviewed-on: https://forge.eblu.me/eblume/blumeops/pulls/353 --- compensating-controls.yaml | 12 ++++++++---- ...-cc-observability-stack-audit-2026-05-11.infra.md | 1 + 2 files changed, 9 insertions(+), 4 deletions(-) create mode 100644 docs/changelog.d/review-cc-observability-stack-audit-2026-05-11.infra.md diff --git a/compensating-controls.yaml b/compensating-controls.yaml index 658c99d..01b3cfd 100644 --- a/compensating-controls.yaml +++ b/compensating-controls.yaml @@ -196,11 +196,15 @@ controls: description: >- Alloy collects pod logs and ships them to Loki, providing an audit trail for cluster activity. Compensates for missing - apiserver audit logging which minikube does not configure. 
+ apiserver audit logging which neither minikube (indri) nor + k3s (ringtail) configures by default. created: 2026-03-30 - last-reviewed: 2026-03-30 + last-reviewed: 2026-05-11 notes: >- - Verify Alloy DaemonSet is running and Loki is receiving logs. + Verify Alloy DaemonSet is running on each cluster (alloy-k8s on + minikube, alloy-ringtail on k3s) and Loki is receiving logs. Note this is weaker than native apiserver audit logs — it captures pod stdout/stderr, not API request-level auditing. - Consider enabling minikube audit logging if supported. + Consider enabling apiserver audit logging on k3s post-migration + (`--audit-log-path` / `--audit-policy-file`) — minikube made it + hard, k3s makes it straightforward. diff --git a/docs/changelog.d/review-cc-observability-stack-audit-2026-05-11.infra.md b/docs/changelog.d/review-cc-observability-stack-audit-2026-05-11.infra.md new file mode 100644 index 0000000..8100c6a --- /dev/null +++ b/docs/changelog.d/review-cc-observability-stack-audit-2026-05-11.infra.md @@ -0,0 +1 @@ +Reviewed compensating control `observability-stack-audit`. Updated description to cover ringtail's k3s as well as indri's minikube; both Alloy DaemonSets and Loki are healthy. From bb7efa850ac8f07ba8ab8f86ddee47ef0726d70e Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Mon, 11 May 2026 16:11:35 -0700 Subject: [PATCH 063/122] =?UTF-8?q?C1:=20doc=20review=20=E2=80=94=20replic?= =?UTF-8?q?ating-blumeops=20tutorial=20(#350)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary - Periodic doc review of `tutorials/replicating-blumeops.md` (was never reviewed). - Fixed 4 instances of "BluemeOps" → "BlumeOps" (also caught 1 in `contributing.md`). - Added `last-reviewed: 2026-05-11` and bumped `modified`. - Verified all wiki-link targets resolve. 
## Test plan - [x] `prek` hooks pass (link checker, frontmatter checker) - [ ] Optional: `mise run docs-preview docs/tutorials/replicating-blumeops.md` Reviewed-on: https://forge.eblu.me/eblume/blumeops/pulls/350 --- .../doc-review-replicating-blumeops.doc.md | 1 + docs/tutorials/contributing.md | 2 +- docs/tutorials/replicating-blumeops.md | 11 ++++++----- 3 files changed, 8 insertions(+), 6 deletions(-) create mode 100644 docs/changelog.d/doc-review-replicating-blumeops.doc.md diff --git a/docs/changelog.d/doc-review-replicating-blumeops.doc.md b/docs/changelog.d/doc-review-replicating-blumeops.doc.md new file mode 100644 index 0000000..e9e6d0f --- /dev/null +++ b/docs/changelog.d/doc-review-replicating-blumeops.doc.md @@ -0,0 +1 @@ +Reviewed `replicating-blumeops` tutorial: fixed "BluemeOps" typos (also in `contributing.md`) and added `last-reviewed` frontmatter. diff --git a/docs/tutorials/contributing.md b/docs/tutorials/contributing.md index a2a7069..0d48e8f 100644 --- a/docs/tutorials/contributing.md +++ b/docs/tutorials/contributing.md @@ -11,7 +11,7 @@ tags: > **Audiences:** Contributor -This tutorial walks through making your first contribution to BluemeOps - from understanding the codebase to submitting a pull request. +This tutorial walks through making your first contribution to BlumeOps - from understanding the codebase to submitting a pull request. ## Prerequisites diff --git a/docs/tutorials/replicating-blumeops.md b/docs/tutorials/replicating-blumeops.md index f2ed8ca..e54ecb2 100644 --- a/docs/tutorials/replicating-blumeops.md +++ b/docs/tutorials/replicating-blumeops.md @@ -1,6 +1,7 @@ --- title: Replicating BlumeOps -modified: 2026-02-07 +modified: 2026-05-11 +last-reviewed: 2026-05-11 tags: - tutorials - replication @@ -10,7 +11,7 @@ tags: > **Audiences:** Replicator -This tutorial provides a roadmap for building your own homelab GitOps environment inspired by BluemeOps. It links to detailed component tutorials for each major piece. 
+This tutorial provides a roadmap for building your own homelab GitOps environment inspired by BlumeOps. It links to detailed component tutorials for each major piece. ## What You'll Build @@ -23,7 +24,7 @@ By following this guide, you'll have: ## Hardware Requirements -BluemeOps runs on modest hardware. At minimum: +BlumeOps runs on modest hardware. At minimum: | Component | BlumeOps Uses | Minimum Alternative | |-----------|---------------|---------------------| @@ -94,7 +95,7 @@ Without observability, you're flying blind. ### Phase 6: Your First Services -With the foundation in place, deploy actual workloads. BluemeOps runs: +With the foundation in place, deploy actual workloads. BlumeOps runs: - [[miniflux]] - RSS reader - [[jellyfin]] - Media server - [[immich]] - Photo management @@ -118,7 +119,7 @@ Protect your data. ## Alternative Approaches -BluemeOps makes specific choices that may not suit everyone: +BlumeOps makes specific choices that may not suit everyone: | BlumeOps Choice | Alternative | |-----------------|-------------| From 145df76d062b1a9757322cae4eab4199c3e1309c Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Mon, 11 May 2026 16:12:36 -0700 Subject: [PATCH 064/122] =?UTF-8?q?C1:=20service=20review=20=E2=80=94=20me?= =?UTF-8?q?alie=20(v3.12.0=20deployed;=20upstream=20v3.17.0)=20(#351)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary - Recurring service review for `mealie`. - Upstream is at **v3.17.0** (released 2026-05-06); deployed image is **v3.12.0** — 5 minor versions behind. - Container is built locally from the forge mirror (`containers/mealie/Dockerfile`), so upgrade requires a fresh build + changelog review for breaking changes between v3.12 and v3.17. - Deferring the actual upgrade to a separate task; this PR just refreshes `last-reviewed` and captures the gap in `notes`. 
## Test plan - [x] `prek` hooks pass - [ ] Follow-up: open task to bump `containers/mealie/Dockerfile` `CONTAINER_APP_VERSION`, build, and update kustomization tag ## Notes - No deployment changes in this PR. Reviewed-on: https://forge.eblu.me/eblume/blumeops/pulls/351 --- .../changelog.d/service-review-mealie-2026-05-11.infra.md | 1 + service-versions.yaml | 8 ++++++-- 2 files changed, 7 insertions(+), 2 deletions(-) create mode 100644 docs/changelog.d/service-review-mealie-2026-05-11.infra.md diff --git a/docs/changelog.d/service-review-mealie-2026-05-11.infra.md b/docs/changelog.d/service-review-mealie-2026-05-11.infra.md new file mode 100644 index 0000000..074cd21 --- /dev/null +++ b/docs/changelog.d/service-review-mealie-2026-05-11.infra.md @@ -0,0 +1 @@ +Reviewed `mealie` service version freshness; upstream is 5 minor versions ahead (v3.17.0 vs deployed v3.12.0). Marked reviewed; upgrade deferred. diff --git a/service-versions.yaml b/service-versions.yaml index 74d467e..56000df 100644 --- a/service-versions.yaml +++ b/service-versions.yaml @@ -327,10 +327,14 @@ services: - name: mealie type: argocd - last-reviewed: 2026-03-16 + last-reviewed: 2026-05-11 current-version: "v3.12.0" upstream-source: https://github.com/mealie-recipes/mealie/releases - notes: Recipe manager; built from source via forge mirror + notes: >- + Recipe manager; built from source via forge mirror. + Upstream is at v3.17.0 as of 2026-05-11 (5 minor versions ahead). + Container/manifest still pinned to v3.12.0 — upgrade deferred to a + separate task (build new image, review changelog for breaking changes). 
- name: paperless type: argocd From 4133785119587335f570a406cefc09411699e0d6 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Mon, 11 May 2026 16:13:07 -0700 Subject: [PATCH 065/122] =?UTF-8?q?C1:=20ringtail=20=E2=80=94=20weekly=20f?= =?UTF-8?q?lake.lock=20update=20(#352)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary - Recurring weekly lockfile refresh for `nixos/ringtail/flake.lock`. - Inputs updated: `disko`, `home-manager`, `nixpkgs`. - `nixpkgs-services` was deliberately skipped (per overlay convention — pinned services bump only on intentional update). - Generated via `dagger call flake-update --src=. --flake-path=nixos/ringtail`. ## Test plan - [x] `prek` hooks pass - [ ] After merge: `mise run provision-ringtail` to deploy - [ ] Then check for kernel update per [[manage-lockfile]] ## Notes - Not deployed from this PR — provisioning is a follow-up. Reviewed-on: https://forge.eblu.me/eblume/blumeops/pulls/352 --- .../review-ringtail-flake-2026-05-11.infra.md | 1 + nixos/ringtail/flake.lock | 18 +++++++++--------- 2 files changed, 10 insertions(+), 9 deletions(-) create mode 100644 docs/changelog.d/review-ringtail-flake-2026-05-11.infra.md diff --git a/docs/changelog.d/review-ringtail-flake-2026-05-11.infra.md b/docs/changelog.d/review-ringtail-flake-2026-05-11.infra.md new file mode 100644 index 0000000..f39f9f4 --- /dev/null +++ b/docs/changelog.d/review-ringtail-flake-2026-05-11.infra.md @@ -0,0 +1 @@ +Updated `nixos/ringtail/flake.lock` (weekly cadence): `disko`, `home-manager`, and `nixpkgs` inputs refreshed. `nixpkgs-services` skipped per overlay convention. 
diff --git a/nixos/ringtail/flake.lock b/nixos/ringtail/flake.lock index d6a85dc..0f53d0e 100644 --- a/nixos/ringtail/flake.lock +++ b/nixos/ringtail/flake.lock @@ -7,11 +7,11 @@ ] }, "locked": { - "lastModified": 1776613567, - "narHash": "sha256-gC9Cp5ibBmGD5awCA9z7xy6MW6iJufhazTYJOiGlCUI=", + "lastModified": 1777713215, + "narHash": "sha256-8GzXDOXckDWwST8TY5DbwYFjdvQLlP7K9CLSVx6iTTo=", "owner": "nix-community", "repo": "disko", - "rev": "32f4236bfc141ae930b5ba2fb604f561fed5219d", + "rev": "63b4e7e6cf75307c1d26ac3762b886b5b0247267", "type": "github" }, "original": { @@ -27,11 +27,11 @@ ] }, "locked": { - "lastModified": 1775425411, - "narHash": "sha256-KY6HsebJHEe5nHOWP7ur09mb0drGxYSzE3rQxy62rJo=", + "lastModified": 1778401693, + "narHash": "sha256-OVHdCqXXUF5UdGkH+FF2ZL06OLZjj2kvP2dIUmzVWoo=", "owner": "nix-community", "repo": "home-manager", - "rev": "0d02ec1d0a05f88ef9e74b516842900c41f0f2fe", + "rev": "389b83002efc26f1145e89a6a8e6edc5a6435948", "type": "github" }, "original": { @@ -43,11 +43,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1777428379, - "narHash": "sha256-ypxFOeDz+CqADEQNL72haqGjvZQdBR5Vc7pyx2JDttI=", + "lastModified": 1778430510, + "narHash": "sha256-Ti+ZBvW6yrWWAg2szExVTwCd4qOJ3KlVr1tFHfyfi8Q=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "755f5aa91337890c432639c60b6064bb7fe67769", + "rev": "8fd9daa3db09ced9700431c5b7ad0e8ba199b575", "type": "github" }, "original": { From fbc1f7720ee4c907112ab2621d5f1966b4a143ad Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Mon, 11 May 2026 18:37:29 -0700 Subject: [PATCH 066/122] C0: gitignore .claude/scheduled_tasks.lock Transient lock file written by the ScheduleWakeup harness tool when Claude paces its own work between long-running operations. Not config, not state worth checking in. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 48c4b97..09e937c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ .claude/settings.local.json .claude/agent-memory/ +.claude/scheduled_tasks.lock # Python __pycache__/ From 3c7967e44507137e997fa9edf3c649954ef7807f Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Mon, 11 May 2026 20:08:03 -0700 Subject: [PATCH 067/122] C1: deploy shower v1.1.0 (phases + guest memories) (#354) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary Deploys `adelaide-baby-shower-app` **v1.1.0** to ringtail k3s. ### App changes (since v1.0.2) - **Four-phase `ShowerState`** replaces the boolean `locked` flag — `pre_event` → `party` → `prizes_locked` → `event_locked` — with a backfill migration that maps `locked=True → pre_event`, `locked=False → party`. - **Guest memories**: append-only photos + comments panel where guests can leave notes for the baby. Adds `GuestPhoto` + `GuestComment` models with file-extension validators and a max-size validator; new `shower.imaging` module for thumbnail generation. - **Admin + QR polish**: configurable host link, fixed "View Site" URL, guest-facing QR copy improvements, contest tweaks. Three Django migrations run automatically in the entrypoint against the SQLite PV: - `0009_shower_phase` - `0010_guest_memories` - `0011_book_description` No ConfigMap / env-var changes. The deploy uses `strategy: Recreate` with a single replica, so the old pod releases the data PVC before the new one mounts it and runs migrations. ### Container build changes The v1.1.0 tag exposed a latent issue with the Forgejo PyPI install path: - The recent commit [2d38418e](https://forge.eblu.me/eblume/blumeops/commit/2d38418e) closed the forge package leak at the Fly edge by blocking `/api/packages/*` publicly. 
- Forgejo's PyPI simple index returns absolute file URLs hardcoded to its public `ROOT_URL` (`forge.eblu.me`), so pip-installing from the tailnet index URL still tries to download from `forge.eblu.me` → 403. - Previous shower builds escaped this because their FOD outputs were already in the nix store; bumping to a new version forced a fresh pip run that hit the block. Fix mirrors what we already do for the sdist: both wheel and sdist are pulled via direct `fetchurl` against `forge.ops.eblu.me`, then the wheel is copied to TMPDIR under its clean filename (nix store path's hash prefix breaks pip's wheel-filename parser) and handed to pip as a local path. The forge `--extra-index-url` is no longer needed. FOD outputHash pinned to `sha256-kTNOswobtkgyQmmqbQM8XO4vvaGg57nCuuZGbNXb0NM=` from run 547. Image: `registry.ops.eblu.me/blumeops/shower:v1.1.0-444ff91-nix`. ### Adjacent finding (already handled) The ringtail `gitea-runner-nix_container_builder` systemd unit was left `inactive` after the recent `provision-ringtail` (matches the known `sshd-restart-hangs-mux` lesson — the rebuild changed the unit's PATH closure + config.yaml, systemd stopped it, then the playbook hung before the activation could restart it). Manually started; the existing memory `lesson_provision_ringtail_ssh_hang.md` was extended to mention the runner as the canary service to check after provisions. 
## Test plan - [ ] `argocd app diff shower --revision shower-v1.1.0` — review the manifest change - [ ] `argocd app set shower --revision shower-v1.1.0 && argocd app sync shower` - [ ] `kubectl --context=k3s-ringtail logs -n shower deploy/shower` — confirm migrations 0009/0010/0011 applied, no errors - [ ] Hit `https://shower.ops.eblu.me/` (tailnet) — splash page renders, phase indicator visible - [ ] Hit `https://shower.ops.eblu.me/host/` — host console loads, phase dropdown shows the four states - [ ] Hit `https://shower.eblu.me/` (public via Fly) — splash page still served - [ ] After merge: `argocd app set shower --revision main && argocd app sync shower` Reviewed-on: https://forge.eblu.me/eblume/blumeops/pulls/354 --- argocd/manifests/shower/kustomization.yaml | 2 +- containers/shower/default.nix | 39 ++++++++++++++++------ docs/changelog.d/shower-v1.1.0.feature.md | 15 +++++++++ service-versions.yaml | 4 +-- 4 files changed, 47 insertions(+), 13 deletions(-) create mode 100644 docs/changelog.d/shower-v1.1.0.feature.md diff --git a/argocd/manifests/shower/kustomization.yaml b/argocd/manifests/shower/kustomization.yaml index d2ce83c..6fe641f 100644 --- a/argocd/manifests/shower/kustomization.yaml +++ b/argocd/manifests/shower/kustomization.yaml @@ -14,4 +14,4 @@ resources: images: - name: registry.ops.eblu.me/blumeops/shower - newTag: v1.0.2-292d354-nix + newTag: v1.1.0-444ff91-nix diff --git a/containers/shower/default.nix b/containers/shower/default.nix index d9863e1..e2d369d 100644 --- a/containers/shower/default.nix +++ b/containers/shower/default.nix @@ -1,11 +1,15 @@ # Nix-built shower app container — Adelaide / Heidi / Addie baby shower. # # The app is published as a wheel to the Forgejo PyPI index at -# https://forge.eblu.me/api/packages/eblume/pypi/. 
The wheel + its -# transitive Python deps are baked in at build time via a fixed-output -# derivation that runs `pip install --target` against forge PyPI (proxied -# through pypi.ops.eblu.me for upstream packages). Build runs on the -# nix-container-builder runner (ringtail, amd64) so the image is native. +# https://forge.ops.eblu.me/api/packages/eblume/pypi/ (tailnet-only — the +# public forge.eblu.me /api/packages/* surface is blocked at the Fly edge). +# We can't point pip at Forgejo's simple index even from the tailnet, +# because Forgejo's index returns absolute file URLs hardcoded to its +# public ROOT_URL (forge.eblu.me), which then 403s. So both the wheel and +# the sdist are pulled by direct `fetchurl` against forge.ops.eblu.me, and +# the wheel is then handed to `pip install` as a local path; transitive +# deps come from pypi.ops.eblu.me. Build runs on the nix-container-builder +# runner (ringtail, amd64) so the image is native. # # Going through pip-install-target rather than nixpkgs Python packages # sidesteps two issues we hit going through `python.pkgs.buildPythonPackage`: @@ -21,7 +25,7 @@ { pkgs ? import <nixpkgs> { } }: let - version = "1.0.2"; + version = "1.1.0"; python = pkgs.python314; @@ -39,7 +43,17 @@ let showerSdist = pkgs.fetchurl { name = "adelaide_baby_shower_app-${version}.tar.gz"; url = "https://forge.ops.eblu.me/api/packages/eblume/pypi/files/adelaide-baby-shower-app/${version}/adelaide_baby_shower_app-${version}.tar.gz"; - hash = "sha256-nlCtlx9zuYaLoJZSckybLV5YPpA8vZamN96O3RXOstM="; + hash = "sha256-5dp+0u4metOIC6s6/nPlT4cdpFBCV6S3+Z/3RO0sX5U="; + }; + + # Wheel pulled from forge.ops.eblu.me (tailnet) for the same reason the + # sdist is: Forgejo's PyPI simple index would return forge.eblu.me URLs + # that the Fly edge 403s on /api/packages/*. We hand this path to pip + # below so it never touches the forge index at all.
+ showerWheel = pkgs.fetchurl { + name = "adelaide_baby_shower_app-${version}-py3-none-any.whl"; + url = "https://forge.ops.eblu.me/api/packages/eblume/pypi/files/adelaide-baby-shower-app/${version}/adelaide_baby_shower_app-${version}-py3-none-any.whl"; + hash = "sha256-7orFbycON9dQxEIb6q45Xx2rFlEZ8xXSrC2tnrO5uug="; }; staticAssets = pkgs.runCommand "shower-static-assets-${version}" { } '' @@ -68,11 +82,16 @@ let ${python}/bin/python -m venv "$TMPDIR/venv" "$TMPDIR/venv/bin/pip" install --upgrade pip + + # Nix store paths embed a 32-char hash prefix, which pip's wheel + # filename parser rejects ("Invalid wheel filename"). Copy to a + # clean filename in TMPDIR before installing. + cp ${showerWheel} "$TMPDIR/${showerWheel.name}" + "$TMPDIR/venv/bin/pip" install \ --no-cache-dir \ --index-url=https://pypi.ops.eblu.me/root/pypi/+simple/ \ - --extra-index-url=https://forge.ops.eblu.me/api/packages/eblume/pypi/simple/ \ - "adelaide-baby-shower-app==${version}" \ + "$TMPDIR/${showerWheel.name}" \ gunicorn runHook postBuild @@ -129,7 +148,7 @@ let outputHashAlgo = "sha256"; # Pinned dep closure — reproducible until version bumps. To recompute, # set to pkgs.lib.fakeHash and read the failure. - outputHash = "sha256-tSTH/HaDY7M0qxlauBTM+JekZAgF++K2lGP3PLvym/o="; + outputHash = "sha256-kTNOswobtkgyQmmqbQM8XO4vvaGg57nCuuZGbNXb0NM="; dontFixup = true; }; diff --git a/docs/changelog.d/shower-v1.1.0.feature.md b/docs/changelog.d/shower-v1.1.0.feature.md new file mode 100644 index 0000000..d2c3400 --- /dev/null +++ b/docs/changelog.d/shower-v1.1.0.feature.md @@ -0,0 +1,15 @@ +Deploy adelaide-baby-shower-app v1.1.0 to ringtail k3s. Replaces the +boolean lock with a four-phase `ShowerState` (`pre_event` → `party` → +`prizes_locked` → `event_locked`), adds an append-only "guest memories" +panel where guests can leave photos and comments for the baby, and +polishes the admin and QR views. 
Three Django migrations +(`0009_shower_phase`, `0010_guest_memories`, `0011_book_description`) +run automatically in the entrypoint against the SQLite PV. No config +or env-var changes. + +Container build also gains a Forgejo-PyPI workaround: Forgejo's simple +index returns absolute file URLs hardcoded to the public ROOT_URL +(`forge.eblu.me`), which the Fly edge 403s on `/api/packages/*`. The +wheel and sdist are now both pulled via direct `fetchurl` against +`forge.ops.eblu.me` (tailnet-only) and the wheel is handed to pip as +a local path. diff --git a/service-versions.yaml b/service-versions.yaml index 56000df..63bc5df 100644 --- a/service-versions.yaml +++ b/service-versions.yaml @@ -46,8 +46,8 @@ services: - name: shower type: argocd - last-reviewed: 2026-05-10 - current-version: "1.0.2" + last-reviewed: 2026-05-11 + current-version: "1.1.0" upstream-source: https://forge.eblu.me/eblume/adelaide-baby-shower-app notes: | Django app for Adelaide / Heidi / Addie's baby shower. Wheel From dc0916a548db2017fb271cb42f3f3233b5bae279 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Mon, 11 May 2026 20:20:39 -0700 Subject: [PATCH 068/122] =?UTF-8?q?C0:=20shower=20=E2=80=94=20rebuild=20fr?= =?UTF-8?q?om=20main=20SHA=20(post-merge=20retag)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #354 was squash-merged so the branch commit 444ff91 baked into the prior image tag isn't reachable from main's history. Rebuild from main HEAD (3c7967e) and retag. Image content is byte-identical (FOD is content-addressed, inputs unchanged); only the SHA in the tag changes so future provenance tracing stays on main. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- argocd/manifests/shower/kustomization.yaml | 2 +- docs/changelog.d/+shower-rebuild-from-main-sha.misc.md | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) create mode 100644 docs/changelog.d/+shower-rebuild-from-main-sha.misc.md diff --git a/argocd/manifests/shower/kustomization.yaml b/argocd/manifests/shower/kustomization.yaml index 6fe641f..b6de844 100644 --- a/argocd/manifests/shower/kustomization.yaml +++ b/argocd/manifests/shower/kustomization.yaml @@ -14,4 +14,4 @@ resources: images: - name: registry.ops.eblu.me/blumeops/shower - newTag: v1.1.0-444ff91-nix + newTag: v1.1.0-3c7967e-nix diff --git a/docs/changelog.d/+shower-rebuild-from-main-sha.misc.md b/docs/changelog.d/+shower-rebuild-from-main-sha.misc.md new file mode 100644 index 0000000..a9495cd --- /dev/null +++ b/docs/changelog.d/+shower-rebuild-from-main-sha.misc.md @@ -0,0 +1,6 @@ +Rebuild shower v1.1.0 container from main HEAD (`3c7967e`) and bump the +kustomization tag to `v1.1.0-3c7967e-nix`. The PR was squash-merged, so +the branch commit `444ff91` baked into the prior tag isn't reachable +from main's history. The new tag points at a commit that exists on +main; image content is byte-identical because the FOD output is content +addressed and the inputs didn't change. From d0b54231351d70f2bc1c87c206bab2dddc2708e0 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Tue, 12 May 2026 09:33:57 -0700 Subject: [PATCH 069/122] C1: pin ringtail wired IP to 192.168.1.21 (static) Removes DHCP lease renewal as a failure mode on ringtail after an outage on 2026-05-12 where the IP and routes silently disappeared from enp5s0 without any kernel link event. NetworkManager stays enabled for wireless fallback but no longer manages the wired interface. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/changelog.d/ringtail-static-ip.infra.md | 1 + docs/reference/infrastructure/ringtail.md | 13 +++++++++++++ nixos/ringtail/configuration.nix | 14 +++++++++++++- 3 files changed, 27 insertions(+), 1 deletion(-) create mode 100644 docs/changelog.d/ringtail-static-ip.infra.md diff --git a/docs/changelog.d/ringtail-static-ip.infra.md b/docs/changelog.d/ringtail-static-ip.infra.md new file mode 100644 index 0000000..5137f48 --- /dev/null +++ b/docs/changelog.d/ringtail-static-ip.infra.md @@ -0,0 +1 @@ +Pin ringtail's wired IP to `192.168.1.21` via NixOS scripted networking; NetworkManager no longer manages `enp5s0`. Removes DHCP lease renewal as a failure mode after a silent lease teardown took ringtail offline. diff --git a/docs/reference/infrastructure/ringtail.md b/docs/reference/infrastructure/ringtail.md index 8b93d4d..a4e6837 100644 --- a/docs/reference/infrastructure/ringtail.md +++ b/docs/reference/infrastructure/ringtail.md @@ -25,6 +25,19 @@ Service host and gaming PC. Custom-built PC running NixOS. | **OS** | NixOS 25.11 (Sway/Wayland) | | **Tailscale hostname** | `ringtail.tail8d86e.ts.net` | +## Networking + +| Property | Value | +|----------|-------| +| **Interface (wired)** | `enp5s0` | +| **IP** | `192.168.1.21/24` (static, set by NixOS scripted networking) | +| **Gateway** | `192.168.1.1` (UX7) | +| **DNS** | `192.168.1.1`, `1.1.1.1` (used as Tailscale's upstream resolvers; `/etc/resolv.conf` is owned by Tailscale's MagicDNS at `100.100.100.100`) | +| **DHCP reservation** | UniFi "Fixed IP" tied to ringtail's MAC; belt-and-suspenders so the UX7 won't lease `192.168.1.21` to anyone else even though ringtail no longer asks for it | +| **Wireless** | `wlp6s0` still managed by NetworkManager as a fallback path | + +NetworkManager is enabled but explicitly excluded from managing `enp5s0` via `networking.networkmanager.unmanaged = [ "interface-name:enp5s0" ]`. 
The wired address is configured by a deterministic `network-addresses-enp5s0.service` oneshot — no daemon, no lease, no renewal. + ## Software Managed declaratively via `nixos/ringtail/configuration.nix`. Home-manager handles ringtail-specific sway/waybar config; chezmoi manages cross-platform dotfiles. diff --git a/nixos/ringtail/configuration.nix b/nixos/ringtail/configuration.nix index 2cc5280..bd46222 100644 --- a/nixos/ringtail/configuration.nix +++ b/nixos/ringtail/configuration.nix @@ -16,8 +16,20 @@ in systemd.tpm2.enable = false; # Networking + # Wired interface (enp5s0) uses a static IP configured by NixOS scripted + # networking; NetworkManager is left enabled for the wireless fallback only. networking.hostName = "ringtail"; - networking.networkmanager.enable = true; + networking.networkmanager = { + enable = true; + unmanaged = [ "interface-name:enp5s0" ]; + }; + networking.useDHCP = false; + networking.interfaces.enp5s0.ipv4.addresses = [{ + address = "192.168.1.21"; + prefixLength = 24; + }]; + networking.defaultGateway = "192.168.1.1"; + networking.nameservers = [ "192.168.1.1" "1.1.1.1" ]; # Time zone time.timeZone = "America/Los_Angeles"; From a4a30aad448fb0b43f4a2e8d553015d6af379a32 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Tue, 12 May 2026 09:51:16 -0700 Subject: [PATCH 070/122] fix(ringtail): explicitly enable net.ipv4.ip_forward MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After the static IP change, k3s/flannel pod networking broke because ip_forward was 0. NixOS doesn't enable IP forwarding by default — it was previously being set implicitly somewhere in the NM-managed / scripted-DHCP path. With static networking we have to set it ourselves. Verified at runtime via sysctl -w before adding here; pod outbound came back immediately and Tailscale VIP services recovered without any pod restarts. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/changelog.d/ringtail-static-ip.infra.md | 2 +- nixos/ringtail/configuration.nix | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/docs/changelog.d/ringtail-static-ip.infra.md b/docs/changelog.d/ringtail-static-ip.infra.md index 5137f48..8474b0a 100644 --- a/docs/changelog.d/ringtail-static-ip.infra.md +++ b/docs/changelog.d/ringtail-static-ip.infra.md @@ -1 +1 @@ -Pin ringtail's wired IP to `192.168.1.21` via NixOS scripted networking; NetworkManager no longer manages `enp5s0`. Removes DHCP lease renewal as a failure mode after a silent lease teardown took ringtail offline. +Pin ringtail's wired IP to `192.168.1.21` via NixOS scripted networking; NetworkManager no longer manages `enp5s0`. Removes DHCP lease renewal as a failure mode after a silent lease teardown took ringtail offline. Also explicitly enables `net.ipv4.ip_forward` (previously set implicitly by scripted-DHCP) so k3s pod networking and Tailscale routing continue to work with static networking. diff --git a/nixos/ringtail/configuration.nix b/nixos/ringtail/configuration.nix index bd46222..e8c634a 100644 --- a/nixos/ringtail/configuration.nix +++ b/nixos/ringtail/configuration.nix @@ -31,6 +31,12 @@ in networking.defaultGateway = "192.168.1.1"; networking.nameservers = [ "192.168.1.1" "1.1.1.1" ]; + # K3s pod networking and Tailscale tunnel routing require IP forwarding. + # NixOS leaves this off by default; previously it was being enabled + # implicitly by NM/scripted-DHCP setup, but with static networking we + # have to set it explicitly. 
+ boot.kernel.sysctl."net.ipv4.ip_forward" = 1; + # Time zone time.timeZone = "America/Los_Angeles"; From 947e4310c306c36e1096f98f5431cf910554d823 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Wed, 13 May 2026 16:46:17 -0700 Subject: [PATCH 071/122] C2: migrate immich from minikube to ringtail (mikado chain) (#356) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary C2 Mikado chain to move the entire Immich stack (server, ML, valkey, postgres) off `minikube-indri` and onto `k3s-ringtail`. Immich is the largest single tenant on minikube (~1.5 GiB resident) and minikube is currently memory-saturated (97% RAM, swapping). This is the first concrete chain in the broader indri-k8s decommission effort. This PR contains the planning layer only — 7 cards (1 goal + 6 prerequisites). Implementation cycles follow per the Mikado Branch Invariant. ## Goal end-state - Immich `server`, `machine-learning`, `valkey` on ringtail. - ML pod uses ringtail's RTX 4080 (performance win — currently CPU-only). - CNPG `immich-pg` (PG17 + VectorChord) runs on ringtail. - Library still on sifaka NFS — ringtail mounts the same path. - `photos.ops.eblu.me` reroutes through Caddy → ringtail ingress. - Minikube `immich` and `immich-pg` are removed. ## Cards | Card | Depends on | |---|---| | `migrate-immich-to-ringtail` (goal) | all six below | | `cnpg-on-ringtail` | — | | `immich-pg-on-ringtail` | cnpg-on-ringtail | | `immich-pg-data-migration` | immich-pg-on-ringtail | | `sifaka-nfs-from-ringtail` | — | | `immich-app-on-ringtail` | immich-pg-on-ringtail, sifaka-nfs-from-ringtail | | `immich-cutover-and-decommission` | immich-pg-data-migration, immich-app-on-ringtail | ## Key constraints - **No data loss.** Downtime is acceptable; data loss is not. Two surfaces matter: postgres (ML embeddings, face data — slow to re-derive) and the library files (don't move, but NFS access from ringtail must be verified). 
- **Migration method:** Option A is a CNPG `externalCluster` basebackup → promote. Option B is `pg_dump`/`pg_restore` as a documented fallback. Either way, dry-run against a scratch cluster first. - **Why pg moves too** (not cross-cluster): keeping pg on minikube would block the whole decommission, and Immich is chatty with pg so tailnet round-trips would hurt. ## Test plan - [ ] Plan review — does the dependency graph make sense? - [ ] `mise run docs-mikado migrate-immich-to-ringtail` shows the chain correctly. - [ ] Per-card implementation cycles land separately (commit convention enforced by hook). Reviewed-on: https://forge.eblu.me/eblume/blumeops/pulls/356 --- argocd/apps/cloudnative-pg-ringtail.yaml | 27 ++++ argocd/apps/databases-ringtail.yaml | 26 ++++ argocd/apps/immich-ringtail.yaml | 31 ++++ argocd/apps/immich.yaml | 30 ---- .../external-secret-immich-borgmatic.yaml | 15 +- .../databases-ringtail/immich-pg.yaml | 53 +++++++ .../databases-ringtail/kustomization.yaml | 9 ++ .../service-immich-pg-tailscale.yaml | 8 +- argocd/manifests/databases/immich-pg.yaml | 69 --------- argocd/manifests/databases/kustomization.yaml | 3 - .../deployment-ml.yaml | 6 + .../deployment-server.yaml | 0 .../deployment-valkey.yaml | 0 .../ingress-tailscale.yaml | 15 +- .../kustomization.yaml | 13 +- argocd/manifests/immich-ringtail/pv-nfs.yaml | 29 ++++ .../pvc-ml-cache.yaml | 0 .../{immich => immich-ringtail}/pvc.yaml | 6 +- .../service-ml.yaml | 0 .../service-valkey.yaml | 0 .../{immich => immich-ringtail}/service.yaml | 0 argocd/manifests/immich/README.md | 115 --------------- argocd/manifests/immich/pv-nfs.yaml | 22 --- .../time-slicing-config.yaml | 2 +- .../migrate-immich-to-ringtail.infra.md | 13 ++ docs/how-to/immich/cnpg-on-ringtail.md | 52 +++++++ docs/how-to/immich/immich-app-on-ringtail.md | 91 ++++++++++++ .../immich/immich-cutover-and-decommission.md | 103 ++++++++++++++ .../how-to/immich/immich-pg-data-migration.md | 79 +++++++++++ 
docs/how-to/immich/immich-pg-on-ringtail.md | 69 +++++++++ .../immich/migrate-immich-to-ringtail.md | 132 ++++++++++++++++++ .../how-to/immich/sifaka-nfs-from-ringtail.md | 67 +++++++++ 32 files changed, 820 insertions(+), 265 deletions(-) create mode 100644 argocd/apps/cloudnative-pg-ringtail.yaml create mode 100644 argocd/apps/databases-ringtail.yaml create mode 100644 argocd/apps/immich-ringtail.yaml delete mode 100644 argocd/apps/immich.yaml rename argocd/manifests/{databases => databases-ringtail}/external-secret-immich-borgmatic.yaml (65%) create mode 100644 argocd/manifests/databases-ringtail/immich-pg.yaml create mode 100644 argocd/manifests/databases-ringtail/kustomization.yaml rename argocd/manifests/{databases => databases-ringtail}/service-immich-pg-tailscale.yaml (57%) delete mode 100644 argocd/manifests/databases/immich-pg.yaml rename argocd/manifests/{immich => immich-ringtail}/deployment-ml.yaml (83%) rename argocd/manifests/{immich => immich-ringtail}/deployment-server.yaml (100%) rename argocd/manifests/{immich => immich-ringtail}/deployment-valkey.yaml (100%) rename argocd/manifests/{immich => immich-ringtail}/ingress-tailscale.yaml (62%) rename argocd/manifests/{immich => immich-ringtail}/kustomization.yaml (61%) create mode 100644 argocd/manifests/immich-ringtail/pv-nfs.yaml rename argocd/manifests/{immich => immich-ringtail}/pvc-ml-cache.yaml (100%) rename argocd/manifests/{immich => immich-ringtail}/pvc.yaml (54%) rename argocd/manifests/{immich => immich-ringtail}/service-ml.yaml (100%) rename argocd/manifests/{immich => immich-ringtail}/service-valkey.yaml (100%) rename argocd/manifests/{immich => immich-ringtail}/service.yaml (100%) delete mode 100644 argocd/manifests/immich/README.md delete mode 100644 argocd/manifests/immich/pv-nfs.yaml create mode 100644 docs/changelog.d/migrate-immich-to-ringtail.infra.md create mode 100644 docs/how-to/immich/cnpg-on-ringtail.md create mode 100644 docs/how-to/immich/immich-app-on-ringtail.md create 
mode 100644 docs/how-to/immich/immich-cutover-and-decommission.md create mode 100644 docs/how-to/immich/immich-pg-data-migration.md create mode 100644 docs/how-to/immich/immich-pg-on-ringtail.md create mode 100644 docs/how-to/immich/migrate-immich-to-ringtail.md create mode 100644 docs/how-to/immich/sifaka-nfs-from-ringtail.md diff --git a/argocd/apps/cloudnative-pg-ringtail.yaml b/argocd/apps/cloudnative-pg-ringtail.yaml new file mode 100644 index 0000000..fa7bba0 --- /dev/null +++ b/argocd/apps/cloudnative-pg-ringtail.yaml @@ -0,0 +1,27 @@ +# CloudNativePG Operator for ringtail k3s cluster +# Deploys the operator only; PostgreSQL clusters are created separately +# +# Sibling of cloudnative-pg.yaml (minikube). Same mirror, same release, +# different destination. Both apps will coexist during the immich +# migration; the minikube one is removed at the end of the broader +# indri-k8s decommission. +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: cloudnative-pg-ringtail + namespace: argocd +spec: + project: default + source: + repoURL: ssh://forgejo@forge.ops.eblu.me:2222/mirrors/cloudnative-pg.git + targetRevision: v1.27.1 + path: releases + directory: + include: 'cnpg-1.27.1.yaml' + destination: + server: https://ringtail.tail8d86e.ts.net:6443 + namespace: cnpg-system + syncPolicy: + syncOptions: + - CreateNamespace=true + - ServerSideApply=true # Required for large CRDs that exceed annotation size limit diff --git a/argocd/apps/databases-ringtail.yaml b/argocd/apps/databases-ringtail.yaml new file mode 100644 index 0000000..00de4e3 --- /dev/null +++ b/argocd/apps/databases-ringtail.yaml @@ -0,0 +1,26 @@ +# Databases on ringtail k3s. +# +# Today: only immich-pg (CNPG Cluster) + its borgmatic ExternalSecret. +# More databases may move here as the indri-k8s decommission proceeds. 
+# +# Prerequisites: +# - cloudnative-pg-ringtail (operator must exist before the Cluster CR) +# - external-secrets-ringtail + 1password-connect-ringtail (for the +# immich-pg-borgmatic ExternalSecret to sync) +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: databases-ringtail + namespace: argocd +spec: + project: default + source: + repoURL: ssh://forgejo@forge.ops.eblu.me:2222/eblume/blumeops.git + targetRevision: main + path: argocd/manifests/databases-ringtail + destination: + server: https://ringtail.tail8d86e.ts.net:6443 + namespace: databases + syncPolicy: + syncOptions: + - CreateNamespace=true diff --git a/argocd/apps/immich-ringtail.yaml b/argocd/apps/immich-ringtail.yaml new file mode 100644 index 0000000..c93cbee --- /dev/null +++ b/argocd/apps/immich-ringtail.yaml @@ -0,0 +1,31 @@ +# Immich on ringtail k3s. +# +# Staging deployment; the minikube `immich` app remains in parallel +# until cutover. See [[immich-cutover-and-decommission]] for the +# routing flip + minikube cleanup. +# +# Prerequisites: +# - cnpg-on-ringtail + databases-ringtail (postgres) +# - 1password-connect-ringtail + external-secrets-ringtail (not used +# by this app today — immich-db Secret is created manually, +# matching the minikube pattern) +# - The immich-db Secret in the immich namespace, holding the +# password for the `immich` postgres role (copied from the source +# immich-pg-app Secret at migration time). 
+apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: immich-ringtail + namespace: argocd +spec: + project: default + source: + repoURL: ssh://forgejo@forge.ops.eblu.me:2222/eblume/blumeops.git + targetRevision: main + path: argocd/manifests/immich-ringtail + destination: + server: https://ringtail.tail8d86e.ts.net:6443 + namespace: immich + syncPolicy: + syncOptions: + - CreateNamespace=true diff --git a/argocd/apps/immich.yaml b/argocd/apps/immich.yaml deleted file mode 100644 index 7efd263..0000000 --- a/argocd/apps/immich.yaml +++ /dev/null @@ -1,30 +0,0 @@ -# Immich - Self-hosted photo and video management -# High-performance Google Photos/iCloud alternative with AI features -# -# Kustomize manifests in argocd/manifests/immich/ -# Components: server, machine-learning, valkey (Redis) -# -# Prerequisites: -# 1. Create immich namespace and secrets: -# kubectl create namespace immich -# kubectl --context=minikube-indri create secret generic immich-db -n immich \ -# --from-literal=password="$(kubectl --context=minikube-indri -n databases get secret immich-pg-app -o jsonpath='{.data.password}' | base64 -d)" -# 2. Create immich-pg database and user (see immich-pg app) -# 3. 
NFS share on sifaka at /volume1/photos with read/write for indri -apiVersion: argoproj.io/v1alpha1 -kind: Application -metadata: - name: immich - namespace: argocd -spec: - project: default - source: - repoURL: ssh://forgejo@forge.ops.eblu.me:2222/eblume/blumeops.git - targetRevision: main - path: argocd/manifests/immich - destination: - server: https://kubernetes.default.svc - namespace: immich - syncPolicy: - syncOptions: - - CreateNamespace=true diff --git a/argocd/manifests/databases/external-secret-immich-borgmatic.yaml b/argocd/manifests/databases-ringtail/external-secret-immich-borgmatic.yaml similarity index 65% rename from argocd/manifests/databases/external-secret-immich-borgmatic.yaml rename to argocd/manifests/databases-ringtail/external-secret-immich-borgmatic.yaml index 8801c1a..3d1fc14 100644 --- a/argocd/manifests/databases/external-secret-immich-borgmatic.yaml +++ b/argocd/manifests/databases-ringtail/external-secret-immich-borgmatic.yaml @@ -1,9 +1,12 @@ # ExternalSecret for borgmatic backup user password on immich-pg cluster +# (ringtail k3s). +# +# Mirror of argocd/manifests/databases/external-secret-immich-borgmatic.yaml. +# The onepassword-blumeops ClusterSecretStore exists on ringtail via the +# external-secrets-ringtail app. # -# Reuses the same 1Password item as blumeops-pg-borgmatic. 
# 1Password item: "borgmatic" in blumeops vault # Field: "db-password" -# apiVersion: external-secrets.io/v1 kind: ExternalSecret metadata: @@ -23,7 +26,7 @@ spec: username: borgmatic password: "{{ .password }}" data: - - secretKey: password - remoteRef: - key: borgmatic - property: db-password + - secretKey: password + remoteRef: + key: borgmatic + property: db-password diff --git a/argocd/manifests/databases-ringtail/immich-pg.yaml b/argocd/manifests/databases-ringtail/immich-pg.yaml new file mode 100644 index 0000000..982bc43 --- /dev/null +++ b/argocd/manifests/databases-ringtail/immich-pg.yaml @@ -0,0 +1,53 @@ +# PostgreSQL Cluster for Immich on ringtail k3s. +# +# Initially bootstrapped via CNPG pg_basebackup from the minikube +# immich-pg cluster on 2026-05-13, then promoted to primary. The +# externalClusters + bootstrap.pg_basebackup blocks have been pruned +# from this manifest now that the migration is complete — leaving +# them around is a footgun (re-enabling replica.enabled=true would +# try to demote this cluster against a stale source). See +# [[immich-pg-data-migration]] for the procedure used. 
+apiVersion: postgresql.cnpg.io/v1 +kind: Cluster +metadata: + name: immich-pg + namespace: databases +spec: + instances: 1 + imageName: ghcr.io/tensorchord/cloudnative-vectorchord:17-0.5.0 + + storage: + size: 10Gi + storageClass: local-path + + # Managed roles + managed: + roles: + - name: borgmatic + login: true + connectionLimit: -1 + ensure: present + inherit: true + inRoles: + - pg_read_all_data + passwordSecret: + name: immich-pg-borgmatic + + resources: + requests: + memory: "256Mi" + cpu: "100m" + limits: + memory: "1Gi" + cpu: "500m" + + postgresql: + shared_preload_libraries: + - "vchord.so" + parameters: + max_connections: "50" + shared_buffers: "128MB" + password_encryption: "scram-sha-256" + pg_hba: + - host all all 0.0.0.0/0 scram-sha-256 + - host all all ::/0 scram-sha-256 diff --git a/argocd/manifests/databases-ringtail/kustomization.yaml b/argocd/manifests/databases-ringtail/kustomization.yaml new file mode 100644 index 0000000..971e2d4 --- /dev/null +++ b/argocd/manifests/databases-ringtail/kustomization.yaml @@ -0,0 +1,9 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: databases + +resources: + - immich-pg.yaml + - external-secret-immich-borgmatic.yaml + - service-immich-pg-tailscale.yaml diff --git a/argocd/manifests/databases/service-immich-pg-tailscale.yaml b/argocd/manifests/databases-ringtail/service-immich-pg-tailscale.yaml similarity index 57% rename from argocd/manifests/databases/service-immich-pg-tailscale.yaml rename to argocd/manifests/databases-ringtail/service-immich-pg-tailscale.yaml index 78891dd..92deb14 100644 --- a/argocd/manifests/databases/service-immich-pg-tailscale.yaml +++ b/argocd/manifests/databases-ringtail/service-immich-pg-tailscale.yaml @@ -1,6 +1,8 @@ -# Tailscale LoadBalancer for immich-pg PostgreSQL access -# Canonical hostname: immich-pg.tail8d86e.ts.net -# Caddy L4 proxies pg.ops.eblu.me:5433 → this service for borgmatic backups +# Tailscale LoadBalancer for immich-pg 
PostgreSQL access on ringtail. +# Canonical hostname: immich-pg.tail8d86e.ts.net (claimed from the +# minikube side after the minikube service was removed during the +# immich-to-ringtail migration). Borgmatic on indri uses this +# hostname for nightly backups. apiVersion: v1 kind: Service metadata: diff --git a/argocd/manifests/databases/immich-pg.yaml b/argocd/manifests/databases/immich-pg.yaml deleted file mode 100644 index 74c6f4e..0000000 --- a/argocd/manifests/databases/immich-pg.yaml +++ /dev/null @@ -1,69 +0,0 @@ -# PostgreSQL Cluster for Immich -# Uses VectorChord (successor to pgvecto.rs) for AI-powered vector search -# See: https://github.com/immich-app/immich/discussions/9060 -# Managed by CloudNativePG operator -apiVersion: postgresql.cnpg.io/v1 -kind: Cluster -metadata: - name: immich-pg - namespace: databases -spec: - instances: 1 - # VectorChord image for PostgreSQL 17 with VectorChord 0.5.0 - # Immich v2.4.1 requires VectorChord >=0.3 <0.6 - # See: https://github.com/tensorchord/VectorChord - imageName: ghcr.io/tensorchord/cloudnative-vectorchord:17-0.5.0 - - storage: - size: 10Gi - storageClass: standard - - # Bootstrap creates initial database and owner - bootstrap: - initdb: - database: immich - owner: immich - postInitSQL: - # Extensions required by Immich - - CREATE EXTENSION IF NOT EXISTS vector; - - CREATE EXTENSION IF NOT EXISTS vchord CASCADE; - - CREATE EXTENSION IF NOT EXISTS cube CASCADE; - - CREATE EXTENSION IF NOT EXISTS earthdistance CASCADE; - - # Managed roles - # Note: connectionLimit, ensure, inherit are CNPG defaults added to prevent ArgoCD drift - managed: - roles: - # borgmatic read-only user for backups - - name: borgmatic - login: true - connectionLimit: -1 - ensure: present - inherit: true - inRoles: - - pg_read_all_data - passwordSecret: - name: immich-pg-borgmatic - - # Resource limits for minikube environment - resources: - requests: - memory: "256Mi" - cpu: "100m" - limits: - memory: "1Gi" - cpu: "500m" - - # PostgreSQL 
configuration - postgresql: - # VectorChord requires vchord.so in shared_preload_libraries - shared_preload_libraries: - - "vchord.so" - parameters: - max_connections: "50" - shared_buffers: "128MB" - password_encryption: "scram-sha-256" - pg_hba: - # Allow connections from k8s pods - - host all all 0.0.0.0/0 scram-sha-256 - - host all all ::/0 scram-sha-256 diff --git a/argocd/manifests/databases/kustomization.yaml b/argocd/manifests/databases/kustomization.yaml index b25e09e..692285a 100644 --- a/argocd/manifests/databases/kustomization.yaml +++ b/argocd/manifests/databases/kustomization.yaml @@ -5,13 +5,10 @@ namespace: databases resources: - blumeops-pg.yaml - - immich-pg.yaml - service-tailscale.yaml - - service-immich-pg-tailscale.yaml - service-metrics-tailscale.yaml - external-secret-eblume.yaml - external-secret-borgmatic.yaml - - external-secret-immich-borgmatic.yaml - external-secret-teslamate.yaml - external-secret-authentik.yaml - external-secret-paperless.yaml diff --git a/argocd/manifests/immich/deployment-ml.yaml b/argocd/manifests/immich-ringtail/deployment-ml.yaml similarity index 83% rename from argocd/manifests/immich/deployment-ml.yaml rename to argocd/manifests/immich-ringtail/deployment-ml.yaml index 57c4242..5ea8035 100644 --- a/argocd/manifests/immich/deployment-ml.yaml +++ b/argocd/manifests/immich-ringtail/deployment-ml.yaml @@ -16,11 +16,16 @@ spec: app: immich component: machine-learning spec: + runtimeClassName: nvidia securityContext: seccompProfile: type: RuntimeDefault containers: - name: machine-learning + # ringtail uses the -cuda tag (set in kustomization.yaml) + # to take advantage of the RTX 4080 via the nvidia + # device plugin. Time-slicing is configured for 4 replicas + # so frigate + ollama + this pod can share. 
image: ghcr.io/immich-app/immich-machine-learning:kustomized ports: - name: http @@ -57,6 +62,7 @@ spec: cpu: "100m" limits: memory: "4Gi" + nvidia.com/gpu: "1" volumes: - name: cache persistentVolumeClaim: diff --git a/argocd/manifests/immich/deployment-server.yaml b/argocd/manifests/immich-ringtail/deployment-server.yaml similarity index 100% rename from argocd/manifests/immich/deployment-server.yaml rename to argocd/manifests/immich-ringtail/deployment-server.yaml diff --git a/argocd/manifests/immich/deployment-valkey.yaml b/argocd/manifests/immich-ringtail/deployment-valkey.yaml similarity index 100% rename from argocd/manifests/immich/deployment-valkey.yaml rename to argocd/manifests/immich-ringtail/deployment-valkey.yaml diff --git a/argocd/manifests/immich/ingress-tailscale.yaml b/argocd/manifests/immich-ringtail/ingress-tailscale.yaml similarity index 62% rename from argocd/manifests/immich/ingress-tailscale.yaml rename to argocd/manifests/immich-ringtail/ingress-tailscale.yaml index 59a4c05..f0b5fe1 100644 --- a/argocd/manifests/immich/ingress-tailscale.yaml +++ b/argocd/manifests/immich-ringtail/ingress-tailscale.yaml @@ -1,6 +1,9 @@ -# Tailscale Ingress for Immich -# Exposes Immich at photos.tail8d86e.ts.net -# Caddy will proxy photos.ops.eblu.me to this endpoint +# Tailscale ProxyGroup Ingress for Immich on ringtail. +# +# Production hostname: photos.tail8d86e.ts.net +# (during the cutover window this was photos-ringtail; the minikube +# ingress was torn down before this was renamed to photos to avoid +# the Tailscale device-name collision.) 
apiVersion: networking.k8s.io/v1 kind: Ingress metadata: @@ -16,12 +19,6 @@ metadata: gethomepage.dev/description: "Photo management" gethomepage.dev/href: "https://photos.ops.eblu.me" gethomepage.dev/pod-selector: "app=immich,component=server" - # TODO: Add Immich widget - requires API key from Account Settings > API Keys - # See: https://gethomepage.dev/widgets/services/immich/ - # gethomepage.dev/widget.type: "immich" - # gethomepage.dev/widget.url: "https://photos.ops.eblu.me" - # gethomepage.dev/widget.key: "{{HOMEPAGE_VAR_IMMICH_API_KEY}}" - # gethomepage.dev/widget.version: "2" spec: ingressClassName: tailscale rules: diff --git a/argocd/manifests/immich/kustomization.yaml b/argocd/manifests/immich-ringtail/kustomization.yaml similarity index 61% rename from argocd/manifests/immich/kustomization.yaml rename to argocd/manifests/immich-ringtail/kustomization.yaml index 5f8d02b..c1f639e 100644 --- a/argocd/manifests/immich/kustomization.yaml +++ b/argocd/manifests/immich-ringtail/kustomization.yaml @@ -1,7 +1,8 @@ ---- apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization + namespace: immich + resources: - deployment-server.yaml - deployment-ml.yaml @@ -13,11 +14,15 @@ resources: - pv-nfs.yaml - pvc.yaml - ingress-tailscale.yaml + images: - name: ghcr.io/immich-app/immich-server newTag: v2.6.3 - name: ghcr.io/immich-app/immich-machine-learning - newTag: v2.6.3 + # CUDA variant of the same release — ringtail has an RTX 4080 + newTag: v2.6.3-cuda + # Using upstream multi-arch valkey image directly; the + # registry.ops.eblu.me/blumeops/valkey mirror is arm64-only (built + # on indri) and would crashloop on ringtail. 
- name: docker.io/valkey/valkey - newName: registry.ops.eblu.me/blumeops/valkey - newTag: v8.1.6-r0-fabca04 + newTag: "8.1.6" diff --git a/argocd/manifests/immich-ringtail/pv-nfs.yaml b/argocd/manifests/immich-ringtail/pv-nfs.yaml new file mode 100644 index 0000000..3d5a682 --- /dev/null +++ b/argocd/manifests/immich-ringtail/pv-nfs.yaml @@ -0,0 +1,29 @@ +# NFS PersistentVolume for Immich photo library on ringtail k3s. +# +# Mirror of argocd/manifests/immich/pv-nfs.yaml (minikube) but with +# a distinct name (minikube and ringtail are separate clusters, so PV +# names don't collide cluster-side, but using the same name in two +# manifests is confusing). +# +# The sifaka NFS export for /volume1/photos already permits +# 192.168.1.0/24 + 100.64.0.0/10. Ringtail's wired IP (192.168.1.21) +# falls in the first CIDR, so no DSM rule changes are needed. +# +# Verified 2026-05-13: ringtail pod can read existing dirs, write +# new files, and delete them. DNS resolves sifaka to 192.168.1.203 +# (LAN), so NFS traffic stays off the tailnet — avoids the known +# sifaka-tailscale-userspace bite. 
+apiVersion: v1 +kind: PersistentVolume +metadata: + name: immich-library-nfs-pv-ringtail +spec: + capacity: + storage: 2Ti + accessModes: + - ReadWriteMany + persistentVolumeReclaimPolicy: Retain + storageClassName: "" + nfs: + server: sifaka + path: /volume1/photos diff --git a/argocd/manifests/immich/pvc-ml-cache.yaml b/argocd/manifests/immich-ringtail/pvc-ml-cache.yaml similarity index 100% rename from argocd/manifests/immich/pvc-ml-cache.yaml rename to argocd/manifests/immich-ringtail/pvc-ml-cache.yaml diff --git a/argocd/manifests/immich/pvc.yaml b/argocd/manifests/immich-ringtail/pvc.yaml similarity index 54% rename from argocd/manifests/immich/pvc.yaml rename to argocd/manifests/immich-ringtail/pvc.yaml index c764636..5bfc052 100644 --- a/argocd/manifests/immich/pvc.yaml +++ b/argocd/manifests/immich-ringtail/pvc.yaml @@ -1,5 +1,5 @@ -# PersistentVolumeClaim for Immich photo library -# Binds to the NFS PV for sifaka:/volume1/photos +# PersistentVolumeClaim for Immich photo library on ringtail. +# Binds to immich-library-nfs-pv-ringtail (sifaka:/volume1/photos). 
apiVersion: v1 kind: PersistentVolumeClaim metadata: @@ -9,7 +9,7 @@ spec: accessModes: - ReadWriteMany storageClassName: "" - volumeName: immich-library-nfs-pv + volumeName: immich-library-nfs-pv-ringtail resources: requests: storage: 2Ti diff --git a/argocd/manifests/immich/service-ml.yaml b/argocd/manifests/immich-ringtail/service-ml.yaml similarity index 100% rename from argocd/manifests/immich/service-ml.yaml rename to argocd/manifests/immich-ringtail/service-ml.yaml diff --git a/argocd/manifests/immich/service-valkey.yaml b/argocd/manifests/immich-ringtail/service-valkey.yaml similarity index 100% rename from argocd/manifests/immich/service-valkey.yaml rename to argocd/manifests/immich-ringtail/service-valkey.yaml diff --git a/argocd/manifests/immich/service.yaml b/argocd/manifests/immich-ringtail/service.yaml similarity index 100% rename from argocd/manifests/immich/service.yaml rename to argocd/manifests/immich-ringtail/service.yaml diff --git a/argocd/manifests/immich/README.md b/argocd/manifests/immich/README.md deleted file mode 100644 index a82a856..0000000 --- a/argocd/manifests/immich/README.md +++ /dev/null @@ -1,115 +0,0 @@ -# Immich - -Self-hosted photo and video management solution with AI-powered search and face recognition. - -## Prerequisites - -1. **NFS Share**: Create `/volume1/photos` on sifaka with NFS permissions for indri -2. **PostgreSQL**: The `immich-pg` cluster (with pgvecto.rs) must be healthy -3. **Secrets**: Create the database password secret - -## Deployment Order - -1. Sync `blumeops-pg` (to get CloudNativePG operator if not already running) -2. Wait for `immich-pg` cluster to be healthy -3. Create secrets (see below) -4. Sync `immich` (deploys all resources: storage, services, deployments) -5. 
Run `mise run provision-indri -- --tags caddy` to update Caddy config - -## Components - -| Component | Deployment | Service | Port | -|-----------|------------|---------|------| -| Server (web/API) | `immich-server` | `immich-server` | 2283 | -| Machine Learning | `immich-machine-learning` | `immich-machine-learning` | 3003 | -| Valkey (Redis) | `immich-valkey` | `immich-valkey` | 6379 | - -## Secret Setup - -The `immich-db` secret contains the database password, which is auto-generated by CloudNativePG -in the `immich-pg-app` secret. To create or regenerate the secret: - -```bash -# Create namespace if needed -kubectl --context=minikube-indri create namespace immich - -# Copy password from CNPG secret to immich namespace -kubectl --context=minikube-indri create secret generic immich-db -n immich \ - --from-literal=password="$(kubectl --context=minikube-indri -n databases get secret immich-pg-app -o jsonpath='{.data.password}' | base64 -d)" -``` - -Note: This secret is not managed by ExternalSecrets since the source of truth is the CNPG-generated secret. - -## Access - -- **URL**: https://photos.ops.eblu.me (after Caddy is updated) -- **Tailscale**: https://photos.tail8d86e.ts.net (direct) - -## First-Time Setup - -1. Navigate to https://photos.ops.eblu.me -2. Create an admin account -3. Configure external library (optional - for importing existing photos) - -## External Library (iCloud Photos) - -To import existing photos from iCloud sync on indri: - -1. In Immich Admin > External Libraries, create a new library -2. Set the import path to the location where iCloud photos sync -3. 
Configure scan schedule or trigger manual scan - -## Architecture - -``` -┌─────────────────┐ ┌─────────────────┐ -│ immich-server │────▶│ immich-pg │ -│ (web/api) │ │ (PostgreSQL │ -└────────┬────────┘ │ + pgvecto.rs) │ - │ └─────────────────┘ - │ -┌────────▼────────┐ ┌─────────────────┐ -│ immich-ml │ │ valkey │ -│ (ML inference) │ │ (Redis cache) │ -└─────────────────┘ └─────────────────┘ - │ -┌────────▼────────┐ -│ sifaka NFS │ -│ /volume1/photos│ -└─────────────────┘ -``` - -## Version Management - -Image versions are controlled via `kustomization.yaml`: - -```yaml -images: - - name: ghcr.io/immich-app/immich-server - newTag: v2.6.3 - - name: ghcr.io/immich-app/immich-machine-learning - newTag: v2.6.3 - - name: docker.io/valkey/valkey - newTag: "8.1-alpine" -``` - -To upgrade, update `newTag` values and sync via ArgoCD. - -## Troubleshooting - -```bash -# Check pods -kubectl --context=minikube-indri -n immich get pods - -# Check immich-pg cluster -kubectl --context=minikube-indri -n databases get cluster immich-pg - -# View server logs -kubectl --context=minikube-indri -n immich logs -l app=immich,component=server - -# View ML logs -kubectl --context=minikube-indri -n immich logs -l app=immich,component=machine-learning - -# Check PVC binding -kubectl --context=minikube-indri -n immich get pvc -``` diff --git a/argocd/manifests/immich/pv-nfs.yaml b/argocd/manifests/immich/pv-nfs.yaml deleted file mode 100644 index 0bd6ee2..0000000 --- a/argocd/manifests/immich/pv-nfs.yaml +++ /dev/null @@ -1,22 +0,0 @@ -# NFS PersistentVolume for Immich photo library -# Requires: NFS share on sifaka at /volume1/photos with NFS permissions for indri -# -# To create on Synology: -# 1. Control Panel > Shared Folder > Create -# 2. Name: photos, Location: Volume 1 -# 3. Control Panel > File Services > NFS > NFS Rules -# 4. 
Add rule for "photos" share: Hostname=indri, Privilege=Read/Write, Squash=No mapping -apiVersion: v1 -kind: PersistentVolume -metadata: - name: immich-library-nfs-pv -spec: - capacity: - storage: 2Ti - accessModes: - - ReadWriteMany - persistentVolumeReclaimPolicy: Retain - storageClassName: "" - nfs: - server: sifaka - path: /volume1/photos diff --git a/argocd/manifests/nvidia-device-plugin/time-slicing-config.yaml b/argocd/manifests/nvidia-device-plugin/time-slicing-config.yaml index dee2fd7..100e7a9 100644 --- a/argocd/manifests/nvidia-device-plugin/time-slicing-config.yaml +++ b/argocd/manifests/nvidia-device-plugin/time-slicing-config.yaml @@ -11,4 +11,4 @@ data: timeSlicing: resources: - name: nvidia.com/gpu - replicas: 2 + replicas: 4 diff --git a/docs/changelog.d/migrate-immich-to-ringtail.infra.md b/docs/changelog.d/migrate-immich-to-ringtail.infra.md new file mode 100644 index 0000000..b47742f --- /dev/null +++ b/docs/changelog.d/migrate-immich-to-ringtail.infra.md @@ -0,0 +1,13 @@ +Move the entire Immich stack — server, machine-learning, valkey, +and the PostgreSQL+VectorChord cluster — off `minikube-indri` and +onto `k3s-ringtail`. Postgres data migrated zero-loss via CNPG +`pg_basebackup` (replica catch-up then promote); row counts on +`asset`, `user`, `album`, `smart_search`, `activity`, `asset_face` +verified equal between source and replica before cutover. The ML +pod now uses ringtail's RTX 4080 via the nvidia-device-plugin +(time-slicing bumped 2 → 4 to share with frigate + ollama). Caddy +routing at `photos.ops.eblu.me` is unchanged (still +`photos.tail8d86e.ts.net`, the device just lives on ringtail now). +Borgmatic backups continue against the same `immich-pg` tailnet +hostname. First concrete chain in the broader indri-k8s +decommission effort. 
diff --git a/docs/how-to/immich/cnpg-on-ringtail.md b/docs/how-to/immich/cnpg-on-ringtail.md new file mode 100644 index 0000000..153e674 --- /dev/null +++ b/docs/how-to/immich/cnpg-on-ringtail.md @@ -0,0 +1,52 @@ +--- +title: CNPG Operator on Ringtail +modified: 2026-05-13 +last-reviewed: 2026-05-13 +tags: + - how-to + - operations + - postgres + - ringtail +--- + +# CNPG Operator on Ringtail + +Bring up the `cloudnative-pg` operator on `k3s-ringtail`. Today the +operator only exists on `minikube-indri` (see +`argocd/apps/cloudnative-pg.yaml`, destination `kubernetes.default.svc`). + +Prerequisite of [[migrate-immich-to-ringtail]]; consumed by +[[immich-pg-on-ringtail]]. + +## What to do + +- Add a sibling `argocd/apps/cloudnative-pg-ringtail.yaml` pointing + at the same mirror (`mirrors/cloudnative-pg`, tag `v1.27.1`), + destination `https://ringtail.tail8d86e.ts.net:6443`, + namespace `cnpg-system`. +- Mirror the `ServerSideApply=true` and `CreateNamespace=true` sync + options (the CRDs exceed the annotation size limit). +- Sync `apps` then `cloudnative-pg-ringtail`. Verify the operator + pod is running on ringtail. + +## Verification + +```fish +kubectl --context=k3s-ringtail -n cnpg-system get pods +kubectl --context=k3s-ringtail get crd clusters.postgresql.cnpg.io +``` + +## Why a separate app + +Each ArgoCD app targets a single cluster via `destination.server`. +We could parameterize with ApplicationSets, but blumeops' convention +is to duplicate the manifest with a `-ringtail` suffix (see +`alloy-ringtail`, `external-secrets-ringtail`, etc.). Keep the +convention. + +## Out of scope + +- Postgres clusters themselves (`immich-pg`, etc.) — those come from + [[immich-pg-on-ringtail]]. +- Removing the minikube cnpg operator. That happens at the very end + of the indri-k8s decommission, not in this chain. 
diff --git a/docs/how-to/immich/immich-app-on-ringtail.md b/docs/how-to/immich/immich-app-on-ringtail.md new file mode 100644 index 0000000..51b619d --- /dev/null +++ b/docs/how-to/immich/immich-app-on-ringtail.md @@ -0,0 +1,91 @@ +--- +title: Immich App on Ringtail +modified: 2026-05-13 +last-reviewed: 2026-05-13 +tags: + - how-to + - operations + - immich +--- + +# Immich App on Ringtail + +Bring up `immich-server`, `immich-machine-learning`, and +`immich-valkey` on ringtail. This card stands the stack up against +the *new* pg cluster — it does not move user traffic. Cutover lives +in [[immich-cutover-and-decommission]]. + +## What to do + +- New manifest dir `argocd/manifests/immich-ringtail/` (the suffix + matches the `-ringtail` convention used by other apps). Port from + `argocd/manifests/immich/`: + - `deployment-server.yaml` — point `DB_HOSTNAME` at the ringtail + pg service. + - `deployment-ml.yaml` — use `runtimeClassName: nvidia` + a + `resources.limits` for `nvidia.com/gpu: 1`. Use the `-cuda` tag + of the immich-ml image (set in kustomization). Ringtail is + single-node, so no node selector needed. See + `argocd/manifests/frigate/` for the existing GPU pod pattern. + + **GPU contention discovery:** ringtail's `nvidia-device-plugin` + is configured with `timeSlicing.replicas: 2`. Frigate + Ollama + already consume both virtual slices. Adding immich-ml requires + bumping the count to >= 3. Edit + `argocd/manifests/nvidia-device-plugin/time-slicing-config.yaml` (the + device-plugin time-slicing config) and re-sync the + `nvidia-device-plugin` ArgoCD app. The plugin pod restarts and + the new advertised count appears as the node's + `nvidia.com/gpu` allocatable. + - `deployment-valkey.yaml` — straight port, BUT use the upstream + multi-arch `docker.io/valkey/valkey:<tag>` image — do NOT + use the `registry.ops.eblu.me/blumeops/valkey` rewrite in the + kustomization. 
That mirror was built on indri (arm64) and is + single-arch; pulling it on ringtail (amd64) gets `exec format + error` in CrashLoopBackOff. The mirror should eventually carry + a multi-arch tag, at which point the rewrite can return. + - `service*.yaml` — straight port. + - `pvc-ml-cache.yaml` — straight port (empty `local-path` PVC). + - `pv-nfs.yaml` + `pvc.yaml` — already covered by + [[sifaka-nfs-from-ringtail]] (may live in this dir or theirs). + - `ingress-tailscale.yaml` — ProxyGroup ingress, **must not** set + an explicit `host:` (or use `host: *`) per the lesson on + ProxyGroup VIP routing. + **Hostname collision warning:** the minikube ingress claims the + Tailscale device name `photos` (`tls.hosts: [photos]`). Two + devices on the tailnet cannot share that name. While the + ringtail deployment is being staged it must use a *different* + `tls.hosts` value (e.g. `photos-ringtail`) so it can coexist + with the running minikube one. The flip to `photos` happens at + cutover time, *after* the minikube ingress has been removed. + See [[immich-cutover-and-decommission#Cutover sequence]]. + - `kustomization.yaml` — same `images:` block (server, ML, valkey). +- New ArgoCD app `argocd/apps/immich-ringtail.yaml` targeting + ringtail, namespace `immich`. **Manual sync only** until the + cutover. +- Existing `argocd/apps/immich.yaml` (minikube) stays untouched + during this card — both apps exist briefly. + +## Bring it up against a copy of the DB + +Use the throwaway/test path from [[immich-pg-data-migration#Dry run +before real cutover]]: point the ringtail immich at the *test* pg +cluster first, verify the pod boots, the web UI loads (via +`kubectl port-forward`), assets list, ML embeddings query. Then +tear it down. + +## Verification + +- All three pods Ready. +- ML pod has a GPU attached: `nvidia-smi` inside the container shows + the 4080. +- `immich-server` connects to pg and valkey (no `ECONNREFUSED` in + logs). 
+- A `kubectl port-forward` to the server service shows the Immich + web UI. + +## Out of scope + +- Public/tailnet routing flip. Caddy still points at the minikube + Tailscale ingress until [[immich-cutover-and-decommission]]. +- Removing the minikube immich. Same. diff --git a/docs/how-to/immich/immich-cutover-and-decommission.md b/docs/how-to/immich/immich-cutover-and-decommission.md new file mode 100644 index 0000000..b44fddd --- /dev/null +++ b/docs/how-to/immich/immich-cutover-and-decommission.md @@ -0,0 +1,103 @@ +--- +title: Immich Cutover and Decommission +modified: 2026-05-13 +last-reviewed: 2026-05-13 +tags: + - how-to + - operations + - immich + - migration +--- + +# Immich Cutover and Decommission + +The user-visible flip. By the time this card opens, the ringtail +stack has been proven against a copy of the data. This card does the +real cutover. + +## Pre-cutover checklist + +- [[immich-pg-data-migration]] dry-run succeeded; method is chosen. +- Ringtail immich stack has been brought up against the test pg, + pods healthy, UI loaded ([[immich-app-on-ringtail#Verification]]). +- Borgmatic just ran successfully (a fresh nightly archive is a + belt-and-suspenders fallback, on top of the live source pg). +- User has been told to stop uploading from the iOS app for the + cutover window. + +## Cutover sequence + +1. **Quiesce source.** `kubectl --context=minikube-indri -n immich + scale deploy/immich-server --replicas=0` and same for ML. Leave + valkey + pg running. Confirm no client traffic on the source pg + via `pg_stat_activity`. +2. **Tear down the minikube Tailscale ingress.** The `photos` + Tailscale device name must be freed before ringtail's ingress can + claim it (Tailscale enforces uniqueness across the tailnet). + `kubectl --context=minikube-indri -n immich delete ingress + immich-tailscale` and wait for the corresponding `tailscale`-LB + StatefulSet pod to terminate. 
Verify the `photos` device is gone: + `tailscale status | grep -i photos` from any tailnet host. +3. **Final sync.** Per chosen method in + [[immich-pg-data-migration]]: + - Option A: promote the ringtail replica. + - Option B: take final `pg_dump`, restore to ringtail + `immich-pg`. +4. **Verify.** Run the row-count and schema-diff checks from + [[immich-pg-data-migration#Verification on the real run]]. +5. **Flip the ringtail ingress to `photos`.** Update + `argocd/manifests/immich-ringtail/ingress-tailscale.yaml`: + `tls.hosts: [photos]` (was `[photos-ringtail]` during staging per + [[immich-app-on-ringtail]]). Commit, `argocd app sync + immich-ringtail`. Wait for the `photos` device to register on the + tailnet again. +6. **Bring up ringtail immich** against the now-promoted pg + (`argocd app sync immich-ringtail`). Wait for Ready. +7. **Flip routing.** Update Caddy on indri + (`ansible/roles/caddy/defaults/main.yml`): `photos.ops.eblu.me` + upstream changes to the ringtail Tailscale ingress hostname + (`photos` — same MagicDNS name, now pointing to the ringtail + proxy). `mise run provision-indri -- --tags caddy`. +8. **Smoke test.** Open `photos.ops.eblu.me` in a browser. Sign in. + Scroll the timeline. Open an album. Trigger an ML search. +9. **Update borgmatic.** If the Tailscale hostname for pg changed, + update `borgmatic.cfg` on indri to point at the ringtail + `immich-pg-tailscale` service. Run a manual backup to verify. + +## After cutover + +- `argocd app set immich --revision <branch>` is no longer relevant; + the minikube `immich` app gets deleted entirely. +- Delete `argocd/apps/immich.yaml`, `argocd/manifests/immich/`, and + the minikube `argocd/manifests/databases/immich-pg.yaml` + + `external-secret-immich-borgmatic.yaml` + + `service-immich-pg-tailscale.yaml`. +- Rename `immich-ringtail` back to `immich` (the `-ringtail` suffix + was scaffolding for the dual-cluster window; once minikube is + empty of immich, the unsuffixed name is clean). 
+- Confirm the minikube `immich-pg` PVC is no longer used, then + delete it (the PV with `Retain` policy will persist — clean that + up too). + +## Verification (definition of done) + +- `photos.ops.eblu.me` works for a real session, including ML search. +- Source minikube has no `immich` pods, no `immich-pg`, no PVCs. +- Memory pressure on minikube has dropped (≥1.5 GiB reclaimed). Check + `docker stats minikube` on indri. +- Nightly borgmatic run after the cutover completes successfully, + with the immich-pg archive showing the new source. + +## Rollback (within the cutover window) + +If smoke test fails: flip Caddy back, scale ringtail immich to 0, +scale source immich back up. Source pg was never destroyed. File a +plan reset on the relevant prerequisite card and try again next +session. + +## Out of scope + +- Decommissioning all of minikube. This chain just removes immich. + Other tenants migrate in their own chains as part of the broader + indri-k8s decommission. See [[migrate-immich-to-ringtail]] for + context. diff --git a/docs/how-to/immich/immich-pg-data-migration.md b/docs/how-to/immich/immich-pg-data-migration.md new file mode 100644 index 0000000..fb87783 --- /dev/null +++ b/docs/how-to/immich/immich-pg-data-migration.md @@ -0,0 +1,79 @@ +--- +title: Immich Postgres Data Migration +modified: 2026-05-13 +last-reviewed: 2026-05-13 +tags: + - how-to + - operations + - postgres + - immich + - critical +--- + +# Immich Postgres Data Migration + +**This is the data-loss surface of the migration.** Pick a method, +prove it on a throwaway copy first, then run the real cutover. + +## Decision: pick one + +### Option A — CNPG `externalCluster` bootstrap (preferred) + +Stand the ringtail cluster up as a streaming replica of the minikube +cluster via `bootstrap.pg_basebackup.source`. Replica catches up +online; when ready, promote it and point Immich at it. 
This is +CNPG's documented PG-to-PG migration path and gives near-zero data +loss (the WAL position at promote == the position at app stop). + +Requires: network path from ringtail to minikube's pg over the +tailnet (the existing `immich-pg-tailscale` Service works), and a +superuser secret minikube-side exposed to ringtail's basebackup. + +Pitfall to plan around: the ringtail Cluster CR will need its +`bootstrap` block rewritten *after* promotion (CNPG doesn't +gracefully drop the externalCluster reference). Account for this in +[[immich-pg-on-ringtail]] — it may force a reset of that card. + +### Option B — pg_dump / pg_restore + +Stop immich, `pg_dump -Fc` from minikube, scp to ringtail, restore. +Simpler but full downtime for the whole dump+restore window +(measure on a copy first — VectorChord indexes are slow to rebuild). +Smaller blast radius; no streaming-replication moving parts. + +Use this if Option A hits any blocker. Data loss should still be +zero if the source is stopped first. + +### Option C — leave pg on minikube + +Rejected. See goal card [[migrate-immich-to-ringtail#Why postgres on +ringtail (not cross-cluster)]]. + +## Dry run before real cutover + +Whichever option wins: + +1. Snapshot the minikube `immich-pg` PVC or take a fresh `pg_dump` + into a scratch location. +2. Restore into a *separate* ringtail CNPG cluster (different name, + e.g. `immich-pg-test`) and point a scratch immich-server pod at + it. +3. Verify: pod boots, can list assets, ML embeddings query without + error, face thumbnails render. VectorChord-backed queries should + not error. +4. Tear the scratch cluster down before doing the real one. + +## Verification on the real run + +- Row counts match for `assets`, `albums`, `users`, `face`, + `asset_face`, `smart_search` (the embedding table) — script this. +- `pg_dump --schema-only --no-owner` diff between source and dest + should be empty modulo CNPG-managed roles. 
+- Immich `/api/server-info/version` and `/api/server-info/statistics` + return sane numbers. + +## Rollback + +If the cutover fails verification: stop the ringtail immich, repoint +ArgoCD `immich.destination` back to minikube, re-sync. Source pg was +never deleted. Document what failed and reset the chain. diff --git a/docs/how-to/immich/immich-pg-on-ringtail.md b/docs/how-to/immich/immich-pg-on-ringtail.md new file mode 100644 index 0000000..10c7072 --- /dev/null +++ b/docs/how-to/immich/immich-pg-on-ringtail.md @@ -0,0 +1,69 @@ +--- +title: Immich Postgres Cluster on Ringtail +modified: 2026-05-13 +last-reviewed: 2026-05-13 +tags: + - how-to + - operations + - postgres + - immich +--- + +# Immich Postgres Cluster on Ringtail + +Stand up a fresh `immich-pg` CNPG Cluster on ringtail, ready to receive +data. **No data import yet** — that's [[immich-pg-data-migration]]. + +## What to do + +- Create `argocd/manifests/databases-ringtail/` (or pick another + namespace name — verify what other ringtail pg clusters will use; + if none yet, `databases` is fine). +- Port these from the minikube side: + - `immich-pg.yaml` — CNPG Cluster CR. Same image + (`ghcr.io/tensorchord/cloudnative-vectorchord:17-0.5.0`), same + extensions, same managed `borgmatic` role. Bump `storage.size` if + the minikube 10 GiB looks tight (check actual usage first). + `storageClass: local-path` on ringtail (default). + - `external-secret-immich-borgmatic.yaml` — same 1Password item, + same field, but referencing the ringtail `ClusterSecretStore` + (`onepassword-blumeops` already exists per the + `external-secrets-ringtail` app). + - Service for in-cluster access (the operator creates `immich-pg-rw` + etc. automatically; verify the app deployment uses those names). + - A Tailscale Service if we want backups to keep working via the + same hostname during the transition — see "Borgmatic" below. +- New ArgoCD app `argocd/apps/databases-ringtail.yaml` pointing at + the new path, destination ringtail. 
+ +## Verification + +- Cluster reaches `Ready`. +- `borgmatic` role exists, `rolcanlogin=t`, and is a member of + `pg_read_all_data` (via `managed.roles[].inRoles`). +- ExternalSecret `immich-pg-borgmatic` syncs from 1Password + (`Ready: True`) and the rendered Secret has `username=borgmatic`. +- The `vchord`, `vector`, `cube`, `earthdistance` extensions show + installed in the `postgres` database (`\dx` from + `psql -U postgres`). They are NOT installed in the `immich` + database at this point — `postInitSQL` in CNPG's `initdb` block + runs against the `postgres` superuser database. The Immich app + itself creates the extensions in its own `immich` database at + startup; do not be alarmed by their absence pre-immich-deploy. + The `vchord.so` library is preloaded via + `shared_preload_libraries` regardless, so `CREATE EXTENSION` at + app startup just registers it in the right database. + +## Borgmatic implications + +`borgmatic.cfg` on indri targets `immich-pg-tailscale` over the +tailnet. During migration both clusters will exist briefly. Decide +upfront: backup the *source* pg until cutover, then flip borgmatic +to the ringtail Tailscale service. Document the flip in +[[immich-cutover-and-decommission]]. + +## Out of scope + +- Importing data. That is [[immich-pg-data-migration]], which may + drive a reset on this card if the migration approach (e.g. CNPG + `externalCluster` bootstrap) requires changes to this Cluster CR. diff --git a/docs/how-to/immich/migrate-immich-to-ringtail.md b/docs/how-to/immich/migrate-immich-to-ringtail.md new file mode 100644 index 0000000..cd23384 --- /dev/null +++ b/docs/how-to/immich/migrate-immich-to-ringtail.md @@ -0,0 +1,132 @@ +--- +title: Migrate Immich to Ringtail +modified: 2026-05-13 +last-reviewed: 2026-05-13 +tags: + - how-to + - operations + - immich + - migration +--- + +# Migrate Immich to Ringtail + +Move the entire Immich stack (server, ML, valkey, postgres) off +`minikube-indri` and onto `k3s-ringtail`. 
This is the first concrete +chain in the broader indri-k8s decommission: minikube is +memory-saturated (97% RAM, swapping), and Immich is the single +largest tenant (~1.5 GiB resident). + +## End state + +- Immich `server`, `machine-learning`, and `valkey` Deployments run on + ringtail k3s in the `immich` namespace. +- The `immich-machine-learning` pod uses ringtail's RTX 4080 via the + `nvidia-device-plugin` (performance win — currently CPU-only on + minikube). +- A CNPG `immich-pg` Cluster (PostgreSQL 17 + VectorChord) runs in a + `databases` namespace on ringtail, owned by the `cnpg-system` + operator on ringtail. +- The photo library still lives on [[sifaka]] at `/volume1/photos`, + mounted via NFS from ringtail pods (RWX). +- Routing: `photos.ops.eblu.me` (Caddy on indri) proxies to a + Tailscale ProxyGroup ingress on ringtail. No public surface today. +- The ArgoCD `immich` app's `destination.server` points at + `https://ringtail.tail8d86e.ts.net:6443`. The old minikube + manifests are removed. + +## Non-goals + +- Public exposure via Fly. Immich stays tailnet-only. +- Changing the immich version or runtime configuration. This is a + lift-and-shift; bumps come later. +- Backing up to a different target. [[borgmatic]] keeps running on + indri (it pulls via Tailscale and uses sifaka SMB for the library). + +## Critical constraint: no data loss + +Downtime is acceptable (Immich is a single-user system; we can take +it offline for the cutover). **Data loss is not.** Two surfaces matter: + +1. **Postgres** — face data, ML embeddings (vectors), album state, + sharing, etc. Re-derivable in theory; weeks of recompute in + practice. See [[immich-pg-data-migration]]. +2. **Library files** — `/volume1/photos`. Not moving, but the NFS + path must be verified accessible from ringtail before cutover. + See [[sifaka-nfs-from-ringtail]]. + +[[borgmatic]] backs both up to sifaka + BorgBase nightly; restore is +possible but slow. Treat it as a fallback, not a plan. 
+ +## Why postgres on ringtail (not cross-cluster) + +`immich-pg` already has a Tailscale Service we could point ringtail +at, leaving the DB on minikube. We're not doing that because: + +- The whole goal is to retire minikube — keeping pg there blocks it. +- Immich is chatty against pg; tailnet round-trips would hurt. +- CNPG is the same operator on both sides — a Cluster CR on ringtail + is mechanically equivalent. + +## Approach + +This is a C2 Mikado chain. The prerequisite cards each represent a +distinct surface that has to work before cutover. See +[[agent-change-process#C2 — Mikado Chain]] for the discipline. + +## Workflow note: registering new ArgoCD apps during the chain + +This chain adds three new ArgoCD `Application` definitions in +`argocd/apps/`: `cloudnative-pg-ringtail`, `databases-ringtail`, +and (later) `immich-ringtail`. The usual C1/C2 pattern of +`argocd app set <app> --revision <branch> && argocd app sync <app>` +does NOT work for the app-of-apps `apps` Application itself, because +`apps` self-manages: it re-reads `apps.yaml` (which declares +`targetRevision: main`) on every sync and reverts the override. As a +result, new app definitions added on a feature branch are never +visible to the cluster via `apps`. + +**Use `kubectl apply` to register each new Application directly:** + +```fish +kubectl --context=minikube-indri apply -f argocd/apps/<app>.yaml +``` + +This creates the Application resource out-of-band, bypassing `apps`. + +For apps whose source lives in **this** repo (e.g. +`databases-ringtail`, `immich-ringtail` — manifest paths exist only +on the branch until merge), follow the apply with a branch override: + +```fish +argocd app set <app> --revision mikado/migrate-immich-to-ringtail +argocd app sync <app> +``` + +For apps whose source is an **external** repo at a pinned tag (e.g. +`cloudnative-pg-ringtail` → `mirrors/cloudnative-pg` `v1.27.1`), no +override is needed — the source revision is independent of this PR.
+ +After PR merge: + +```fish +argocd app set <app> --revision main +argocd app sync <app> +``` + +`apps` itself, on its next sync from `main`, will discover the new +Application definitions in `argocd/apps/` and adopt the already-running +resources without disruption — provided their in-cluster spec matches +the on-disk definitions (which it does because we applied the same +file). + +## Related + +- [[shower-on-ringtail]] — a previous migration to ringtail (simpler: + no upstream cluster, SQLite, no GPU) +- [[connect-to-postgres]] — getting a psql session against CNPG +- [[ringtail]] — the target cluster +- [[cnpg-on-ringtail]], [[immich-pg-on-ringtail]], + [[immich-pg-data-migration]], [[sifaka-nfs-from-ringtail]], + [[immich-app-on-ringtail]], [[immich-cutover-and-decommission]] — + the prerequisite cards diff --git a/docs/how-to/immich/sifaka-nfs-from-ringtail.md b/docs/how-to/immich/sifaka-nfs-from-ringtail.md new file mode 100644 index 0000000..2c490c1 --- /dev/null +++ b/docs/how-to/immich/sifaka-nfs-from-ringtail.md @@ -0,0 +1,67 @@ +--- +title: Sifaka NFS Photos from Ringtail +modified: 2026-05-13 +last-reviewed: 2026-05-13 +tags: + - how-to + - operations + - storage + - nfs + - sifaka +--- + +# Sifaka NFS Photos from Ringtail + +The Immich library lives at `sifaka:/volume1/photos` and is mounted +into the pod via an NFS PV (see `argocd/manifests/immich/pv-nfs.yaml`). +That PV is currently scoped to indri. We need ringtail to mount the +same path with the same RWX semantics, without breaking the existing +indri mount during the transition. + +## What to verify / do + +- Check `sifaka` DSM NFS rules for the `photos` share. Per + [[shower-on-ringtail#NFS + SMB share on sifaka]] convention, rules + use `192.168.1.0/24` + `100.64.0.0/10` with + `all_squash`/`Map all users to admin`. The existing rule may + already cover ringtail (it's on `192.168.1.21` per the recent + static-IP pin). If so this card is a verification card.
+- If the rule is locked to indri's IP: add an entry for ringtail + (192.168.1.21) or widen to the subnet pattern above. +- Test mount from a ringtail debug pod (busybox or alpine with + nfs-utils) against the `photos` share. Read a file. Write a temp + file. Delete it. +- Watch for the known sifaka NFS-over-Tailscale gotcha: sifaka's + Tailscale must be in TUN mode (not userspace) for NFS to work + reliably over the tailnet. The NFS path here goes over the LAN + (not tailnet), so this shouldn't bite, but worth confirming the + NFS traffic is on `192.168.1.x` not `100.x`. + +## PV + PVC on ringtail + +- New `pv-nfs.yaml` mirroring the minikube one (name can be shared + if the PV is cluster-scoped — but PVs are per-cluster, so just + duplicate). Same `server: sifaka`, same path, same + `accessModes: [ReadWriteMany]`, `persistentVolumeReclaimPolicy: + Retain`. +- New `pvc.yaml` in the ringtail `immich` namespace bound to it. +- The minikube PVC stays bound and active until cutover — both + clusters can have the share NFS-mounted simultaneously (NFS RWX + permits this). Immich itself must not be running on both sides + at once. + +## Verification + +- A pod on ringtail can `ls /mnt/photos/` and see the same files + as the indri pod. +- File written from ringtail pod is visible from indri pod and + vice versa (proves there's no caching surprise). + +## Out of scope + +- Migrating photo files. Nothing moves; this is just adding a second + NFS client. +- The `pvc-ml-cache.yaml` PVC (a separate ML model cache). That's + not on NFS — it's a regular PVC. Recreated empty on ringtail in + [[immich-app-on-ringtail]]; the first ML pod boot will repopulate + it. 
From dc69b8c68be6d158f15178a08f9f09603de50381 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Wed, 13 May 2026 18:55:50 -0700 Subject: [PATCH 072/122] C1: fix borgmatic shower SQLite dump (ssh to ringtail) (#357) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary Nightly borgmatic backups have been failing for 2 days. Root cause: the shower SQLite dump `before_backup` hook (added in PR #349) referenced `kubectl --context=k3s-ringtail`, but indri's kubeconfig deliberately doesn't carry the ringtail credentials. The hook's failure aborted the entire run, taking out *both* the local sifaka repo and the BorgBase offsite. Verified the last good archive was `indri-2026-05-11T02:00`. ## Approach ssh into ringtail and run `k3s kubectl` there — no indri-side kubeconfig needed. `/etc/rancher/k3s/k3s.yaml` is mode 644 so no sudo required, and the existing ssh access from indri to ringtail works. Inline-shell quoting got hairy fast (fish on ringtail rejected `POD=...` bash syntax; the nix shower image lacks `tar` so `kubectl cp` fails). Pulled the dump logic into `~/bin/borgmatic-k8s-sqlite-dump`, deployed by the ansible role. Each dump entry now declares a `target`: - `local:` — local kubectl with explicit context (mealie) - `ssh:` — ssh + `k3s kubectl` on the cluster host (shower) Bytes come back via `kubectl exec ... -- cat` instead of `kubectl cp` since `cp` needs `tar` in the pod (nix-built containers don't bundle it). 
## Test plan - [x] `mise run provision-indri -- --tags borgmatic --check --diff` shows expected diff - [x] Apply, helper script deployed at `~/bin/borgmatic-k8s-sqlite-dump` - [x] Helper invoked directly with `ssh:eblume@ringtail` produces a valid 288 KB SQLite file - [x] Full `borgmatic create` completes without errors — both mealie.db (1.7 MB) and shower.db (288 KB) appear in `~/.local/share/borgmatic/k8s-dumps/`, archive `indri-2026-05-13T17:31:02` written to sifaka borg repo 🤖 Generated with [Claude Code](https://claude.com/claude-code) Reviewed-on: https://forge.eblu.me/eblume/blumeops/pulls/357 --- ansible/roles/borgmatic/defaults/main.yml | 8 ++- ansible/roles/borgmatic/tasks/main.yml | 14 ++++ .../roles/borgmatic/templates/config.yaml.j2 | 14 +++- .../borgmatic/templates/k8s-sqlite-dump.sh.j2 | 71 +++++++++++++++++++ .../fix-borgmatic-shower-via-ssh.bugfix.md | 14 ++++ 5 files changed, 116 insertions(+), 5 deletions(-) create mode 100644 ansible/roles/borgmatic/templates/k8s-sqlite-dump.sh.j2 create mode 100644 docs/changelog.d/fix-borgmatic-shower-via-ssh.bugfix.md diff --git a/ansible/roles/borgmatic/defaults/main.yml b/ansible/roles/borgmatic/defaults/main.yml index 123cb0f..3a89a09 100644 --- a/ansible/roles/borgmatic/defaults/main.yml +++ b/ansible/roles/borgmatic/defaults/main.yml @@ -56,12 +56,16 @@ borgmatic_k8s_sqlite_dumps: namespace: mealie label_selector: app=mealie db_path: /app/data/mealie.db - context: minikube + # local kubectl, --context=minikube (indri's only configured ctx) + target: local:minikube - name: shower namespace: shower label_selector: app=shower db_path: /app/data/db.sqlite3 - context: k3s-ringtail + # ssh to ringtail and run k3s kubectl there — avoids needing a + # ringtail kubeconfig on indri. k3s.yaml on ringtail is + # world-readable (mode 644), so no sudo required. 
+ target: ssh:eblume@ringtail # Exclude patterns borgmatic_exclude_patterns: [] diff --git a/ansible/roles/borgmatic/tasks/main.yml b/ansible/roles/borgmatic/tasks/main.yml index eacefa5..4ac242c 100644 --- a/ansible/roles/borgmatic/tasks/main.yml +++ b/ansible/roles/borgmatic/tasks/main.yml @@ -49,6 +49,20 @@ mode: '0700' when: borgmatic_k8s_sqlite_dumps | length > 0 +- name: Ensure ~/bin exists + ansible.builtin.file: + path: "{{ ansible_env.HOME }}/bin" + state: directory + mode: '0755' + when: borgmatic_k8s_sqlite_dumps | length > 0 + +- name: Deploy k8s SQLite dump helper script + ansible.builtin.template: + src: k8s-sqlite-dump.sh.j2 + dest: "{{ ansible_env.HOME }}/bin/borgmatic-k8s-sqlite-dump" + mode: '0755' + when: borgmatic_k8s_sqlite_dumps | length > 0 + - name: Deploy borgmatic configuration ansible.builtin.template: src: config.yaml.j2 diff --git a/ansible/roles/borgmatic/templates/config.yaml.j2 b/ansible/roles/borgmatic/templates/config.yaml.j2 index 85804b7..0893dbc 100644 --- a/ansible/roles/borgmatic/templates/config.yaml.j2 +++ b/ansible/roles/borgmatic/templates/config.yaml.j2 @@ -32,12 +32,20 @@ exclude_patterns: encryption_passcommand: {{ borgmatic_encryption_passcommand }} {% if borgmatic_k8s_sqlite_dumps %} -# Pre-backup: dump SQLite databases from k8s pods -# Uses sqlite3 .backup for a safe, consistent copy (no corruption from concurrent writes) +# Pre-backup: dump SQLite databases from k8s pods. +# Uses sqlite3.backup() for a safe, consistent copy. +# +# Quoting/escaping is delegated to ~/bin/borgmatic-k8s-sqlite-dump +# (deployed by the borgmatic ansible role). Each entry's `target` +# is either: +# - local: -> local kubectl with --context (mealie etc.) +# - ssh: -> ssh + k3s kubectl on the cluster host, +# used for ringtail since indri's kubeconfig +# deliberately doesn't carry that context. 
before_backup: - mkdir -p {{ borgmatic_k8s_dump_dir }} {% for db in borgmatic_k8s_sqlite_dumps %} - - /opt/homebrew/bin/kubectl --context={{ db.context }} exec -n {{ db.namespace }} deploy/{{ db.name }} -- python3 -c "import sqlite3; sqlite3.connect('{{ db.db_path }}').backup(sqlite3.connect('/tmp/{{ db.name }}-backup.db'))" && /opt/homebrew/bin/kubectl --context={{ db.context }} cp {{ db.namespace }}/$(/opt/homebrew/bin/kubectl --context={{ db.context }} get pod -n {{ db.namespace }} -l {{ db.label_selector }} -o jsonpath='{.items[0].metadata.name}'):/tmp/{{ db.name }}-backup.db {{ borgmatic_k8s_dump_dir }}/{{ db.name }}.db + - {{ ansible_env.HOME }}/bin/borgmatic-k8s-sqlite-dump {{ db.target }} {{ db.namespace }} {{ db.label_selector }} {{ db.db_path }} {{ db.name }} {{ borgmatic_k8s_dump_dir }}/{{ db.name }}.db {% endfor %} {% endif %} diff --git a/ansible/roles/borgmatic/templates/k8s-sqlite-dump.sh.j2 b/ansible/roles/borgmatic/templates/k8s-sqlite-dump.sh.j2 new file mode 100644 index 0000000..323e717 --- /dev/null +++ b/ansible/roles/borgmatic/templates/k8s-sqlite-dump.sh.j2 @@ -0,0 +1,71 @@ +#!/usr/bin/env bash +# {{ ansible_managed }} +# +# Helper script invoked by borgmatic's before_backup hook to capture a +# k8s pod's SQLite database. Keeps the borgmatic config readable by +# pulling all the quoting out of YAML. +# +# Usage: +# borgmatic-k8s-sqlite-dump <target> <namespace> <selector> \ +# <db_path> <name> <dump_target> +# +# <target> is one of: +# local:<context> - run local kubectl with --context=<context> +# ssh:<host> - ssh to host and run k3s kubectl there +# (no indri-side kubeconfig needed) +# +# <namespace> - k8s namespace of the pod +# <selector> - label selector to find the pod (e.g.
app=shower) +# <db_path> - absolute path inside the pod to the SQLite DB +# <name> - short name used for temp filenames +# <dump_target> - file on this host to receive the dump +set -euo pipefail + +target=${1:?missing target} +namespace=${2:?missing namespace} +selector=${3:?missing selector} +db_path=${4:?missing db path} +name=${5:?missing name} +dump_target=${6:?missing dump target} + +pod_tmp="/tmp/${name}-backup.db" + +python_backup='import sqlite3; sqlite3.connect("'"$db_path"'").backup(sqlite3.connect("'"$pod_tmp"'"))' + +mode=${target%%:*} +ref=${target#*:} + +case "$mode" in + local) + # Pulls dump bytes out via "kubectl exec -- cat" rather than + # "kubectl cp", which would otherwise need tar inside the pod + # (nix-built images like shower don't bundle tar). + context=$ref + kubectl="/opt/homebrew/bin/kubectl --context=$context -n $namespace" + pod=$($kubectl get pod -l "$selector" \ + -o jsonpath='{.items[0].metadata.name}') + $kubectl exec "$pod" -- python3 -c "$python_backup" + $kubectl exec "$pod" -- cat "$pod_tmp" > "$dump_target" + $kubectl exec "$pod" -- rm -f "$pod_tmp" + ;; + ssh) + host=$ref + # Force bash on the remote (user's login shell on ringtail is + # fish). Pipe the script via stdin to dodge nested quoting. + # The dump bytes come back over the ssh stdout stream — no + # intermediate scp, no tar requirement in the pod.
+ ssh "$host" bash <<EOF > "$dump_target" +set -euo pipefail +export KUBECONFIG=/etc/rancher/k3s/k3s.yaml +pod=\$(k3s kubectl -n "$namespace" get pod -l "$selector" -o jsonpath='{.items[0].metadata.name}') +k3s kubectl -n "$namespace" exec "\$pod" -- python3 -c '$python_backup' 1>&2 +k3s kubectl -n "$namespace" exec "\$pod" -- cat "$pod_tmp" +k3s kubectl -n "$namespace" exec "\$pod" -- rm -f "$pod_tmp" 1>&2 +EOF + ;; + *) + echo "borgmatic-k8s-sqlite-dump: unknown target mode: $mode" >&2 + echo " expected local: or ssh:" >&2 + exit 1 + ;; +esac diff --git a/docs/changelog.d/fix-borgmatic-shower-via-ssh.bugfix.md b/docs/changelog.d/fix-borgmatic-shower-via-ssh.bugfix.md new file mode 100644 index 0000000..e18272c --- /dev/null +++ b/docs/changelog.d/fix-borgmatic-shower-via-ssh.bugfix.md @@ -0,0 +1,14 @@ +Fix nightly borgmatic backups failing for 2 days. The shower SQLite +dump hook referenced `kubectl --context=k3s-ringtail`, but indri's +kubeconfig deliberately doesn't carry the ringtail credentials. The +`before_backup` hook's failure aborted the entire run, taking out +*both* the local sifaka repo and the BorgBase offsite. Replaced +the inline-shell dump with a `~/bin/borgmatic-k8s-sqlite-dump` +helper deployed by the ansible role. Each dump entry now declares a +`target` of either `local:` (mealie — kubectl uses indri's +kubeconfig) or `ssh:` (shower — ssh into ringtail and +run `k3s kubectl` there, no indri-side kubeconfig needed; k3s.yaml +on ringtail is mode 644 so no sudo required). Bytes stream back via +`kubectl exec ... -- cat` rather than `kubectl cp`, since `kubectl +cp` requires `tar` inside the pod and nix-built images like shower +don't bundle it.
From 6e90c4c3631ec593b0b59d97ecad9bc5b92aea15 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Wed, 13 May 2026 20:12:00 -0700 Subject: [PATCH 073/122] C0: bump shower to v1.1.1 (probe FOD hash) Co-Authored-By: Claude Opus 4.7 (1M context) --- containers/shower/default.nix | 8 ++++---- docs/changelog.d/+shower-1.1.1.infra.md | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) create mode 100644 docs/changelog.d/+shower-1.1.1.infra.md diff --git a/containers/shower/default.nix b/containers/shower/default.nix index e2d369d..242d873 100644 --- a/containers/shower/default.nix +++ b/containers/shower/default.nix @@ -25,7 +25,7 @@ { pkgs ? import <nixpkgs> { } }: let - version = "1.1.0"; + version = "1.1.1"; python = pkgs.python314; @@ -43,7 +43,7 @@ let showerSdist = pkgs.fetchurl { name = "adelaide_baby_shower_app-${version}.tar.gz"; url = "https://forge.ops.eblu.me/api/packages/eblume/pypi/files/adelaide-baby-shower-app/${version}/adelaide_baby_shower_app-${version}.tar.gz"; - hash = "sha256-5dp+0u4metOIC6s6/nPlT4cdpFBCV6S3+Z/3RO0sX5U="; + hash = "sha256-muvjkcKnLrrQTb8HZ4cH9SD0pab05JSFSgwheqb0AyM="; }; # Wheel pulled from forge.ops.eblu.me (tailnet) for the same reason the @@ -53,7 +53,7 @@ let showerWheel = pkgs.fetchurl { name = "adelaide_baby_shower_app-${version}-py3-none-any.whl"; url = "https://forge.ops.eblu.me/api/packages/eblume/pypi/files/adelaide-baby-shower-app/${version}/adelaide_baby_shower_app-${version}-py3-none-any.whl"; - hash = "sha256-7orFbycON9dQxEIb6q45Xx2rFlEZ8xXSrC2tnrO5uug="; + hash = "sha256-dorrwHhZhOn9Qq6Wk3Su24HckgaWtWbkMY7RtAvomv4="; }; staticAssets = pkgs.runCommand "shower-static-assets-${version}" { } '' @@ -148,7 +148,7 @@ let outputHashAlgo = "sha256"; # Pinned dep closure — reproducible until version bumps. To recompute, # set to pkgs.lib.fakeHash and read the failure.
- outputHash = "sha256-kTNOswobtkgyQmmqbQM8XO4vvaGg57nCuuZGbNXb0NM="; + outputHash = pkgs.lib.fakeHash; dontFixup = true; }; diff --git a/docs/changelog.d/+shower-1.1.1.infra.md b/docs/changelog.d/+shower-1.1.1.infra.md new file mode 100644 index 0000000..eb9476c --- /dev/null +++ b/docs/changelog.d/+shower-1.1.1.infra.md @@ -0,0 +1 @@ +Bump shower container to v1.1.1 (probe FOD hash). From 4e117dc921f4106e7c243e8eed86953bb1f025b4 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Wed, 13 May 2026 20:40:22 -0700 Subject: [PATCH 074/122] C0: pin shower v1.1.1 FOD outputHash (probed on ringtail) Co-Authored-By: Claude Opus 4.7 (1M context) --- containers/shower/default.nix | 2 +- docs/changelog.d/+shower-1.1.1-fod-pin.infra.md | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 docs/changelog.d/+shower-1.1.1-fod-pin.infra.md diff --git a/containers/shower/default.nix b/containers/shower/default.nix index 242d873..4f807ed 100644 --- a/containers/shower/default.nix +++ b/containers/shower/default.nix @@ -148,7 +148,7 @@ let outputHashAlgo = "sha256"; # Pinned dep closure — reproducible until version bumps. To recompute, # set to pkgs.lib.fakeHash and read the failure. - outputHash = pkgs.lib.fakeHash; + outputHash = "sha256-HTTmAldIijG03pYZNyO72LBNPCrjmyJQKgW+gU9NplI="; dontFixup = true; }; diff --git a/docs/changelog.d/+shower-1.1.1-fod-pin.infra.md b/docs/changelog.d/+shower-1.1.1-fod-pin.infra.md new file mode 100644 index 0000000..a19b578 --- /dev/null +++ b/docs/changelog.d/+shower-1.1.1-fod-pin.infra.md @@ -0,0 +1 @@ +Pin shower v1.1.1 FOD outputHash (probed locally on ringtail). 
From 4d2bc9975fc8c0ab18294d71cd5be790bfb8b926 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Wed, 13 May 2026 20:51:10 -0700 Subject: [PATCH 075/122] C0: deploy shower v1.1.1 (kustomize newTag bump) Co-Authored-By: Claude Opus 4.7 (1M context) --- argocd/manifests/shower/kustomization.yaml | 2 +- docs/changelog.d/+shower-1.1.1-deploy.infra.md | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 docs/changelog.d/+shower-1.1.1-deploy.infra.md diff --git a/argocd/manifests/shower/kustomization.yaml b/argocd/manifests/shower/kustomization.yaml index b6de844..c0cf4c8 100644 --- a/argocd/manifests/shower/kustomization.yaml +++ b/argocd/manifests/shower/kustomization.yaml @@ -14,4 +14,4 @@ resources: images: - name: registry.ops.eblu.me/blumeops/shower - newTag: v1.1.0-3c7967e-nix + newTag: v1.1.1-4e117dc-nix diff --git a/docs/changelog.d/+shower-1.1.1-deploy.infra.md b/docs/changelog.d/+shower-1.1.1-deploy.infra.md new file mode 100644 index 0000000..61244ac --- /dev/null +++ b/docs/changelog.d/+shower-1.1.1-deploy.infra.md @@ -0,0 +1 @@ +Deploy shower v1.1.1 to ringtail (kustomize newTag bump). From 12314857d8b9fdc17c5dd97b1b92a36d8463c386 Mon Sep 17 00:00:00 2001 From: Erich Blume <725328+eblume@users.noreply.github.com> Date: Fri, 15 May 2026 06:27:43 -0700 Subject: [PATCH 076/122] C0: add GE-Proton to ringtail Steam extraCompatPackages Lets Subnautica 2 (and any other game) opt into the GE-Proton build via Steam's per-game compatibility tool override, as a workaround for the Proton Experimental + DXVK D3D12 Mercuna hang. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/changelog.d/+ringtail-proton-ge.infra.md | 4 ++++ nixos/ringtail/gaming.nix | 1 + 2 files changed, 5 insertions(+) create mode 100644 docs/changelog.d/+ringtail-proton-ge.infra.md diff --git a/docs/changelog.d/+ringtail-proton-ge.infra.md b/docs/changelog.d/+ringtail-proton-ge.infra.md new file mode 100644 index 0000000..0d8bc04 --- /dev/null +++ b/docs/changelog.d/+ringtail-proton-ge.infra.md @@ -0,0 +1,4 @@ +Add GE-Proton (`pkgs.proton-ge-bin`) to `programs.steam.extraCompatPackages` +on ringtail. Subnautica 2 hangs at Mercuna plugin init under Proton +Experimental + DXVK D3D12; GE-Proton is available as a Steam per-game +compatibility option to work around it. diff --git a/nixos/ringtail/gaming.nix b/nixos/ringtail/gaming.nix index d84ef9b..c526857 100644 --- a/nixos/ringtail/gaming.nix +++ b/nixos/ringtail/gaming.nix @@ -5,6 +5,7 @@ programs.steam = { enable = true; dedicatedServer.openFirewall = true; + extraCompatPackages = [ pkgs.proton-ge-bin ]; }; # Proton Experimental ships an accessibility bridge (xalia) that hangs during From a33fa47b8063f7ae47ada6f10feb8030f2c69426 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Fri, 15 May 2026 06:50:46 -0700 Subject: [PATCH 077/122] C1: deploy shower v1.1.2 (#358) ## Summary Deploys `adelaide-baby-shower-app` **v1.1.2** to ringtail k3s. - Bumps `containers/shower/default.nix` `version` to 1.1.2. - Refreshes sdist + wheel `fetchurl` hashes against the forge PyPI artifacts. - Re-probed FOD `outputHash` on the nix-container-builder runner (ringtail) and pinned the new closure hash. - Bumps kustomize `newTag` to `v1.1.2-b8c7783-nix` (built from this branch's tip). - Bumps `service-versions.yaml` entry for shower to `1.1.2` / `last-reviewed: 2026-05-15`. ## Build provenance Built by Forgejo Actions run #553 on `nix-container-builder` (ringtail) at commit `b8c7783`. After merge a C0 follow-on will rebuild from main and retag so future provenance points at main history. 
## Test plan - [ ] `argocd app set shower --revision shower-v1.1.2 && argocd app sync shower` deploys cleanly - [ ] Pod migrates the SQLite PV and serves at `shower.ops.eblu.me` / `shower.eblu.me` - [ ] No new errors in pod logs after `collectstatic` + gunicorn boot Reviewed-on: https://forge.eblu.me/eblume/blumeops/pulls/358 --- argocd/manifests/shower/kustomization.yaml | 2 +- containers/shower/default.nix | 8 ++++---- docs/changelog.d/shower-v1.1.2.infra.md | 1 + service-versions.yaml | 4 ++-- 4 files changed, 8 insertions(+), 7 deletions(-) create mode 100644 docs/changelog.d/shower-v1.1.2.infra.md diff --git a/argocd/manifests/shower/kustomization.yaml b/argocd/manifests/shower/kustomization.yaml index c0cf4c8..2c4dadb 100644 --- a/argocd/manifests/shower/kustomization.yaml +++ b/argocd/manifests/shower/kustomization.yaml @@ -14,4 +14,4 @@ resources: images: - name: registry.ops.eblu.me/blumeops/shower - newTag: v1.1.1-4e117dc-nix + newTag: v1.1.2-b8c7783-nix diff --git a/containers/shower/default.nix b/containers/shower/default.nix index 4f807ed..f7115bc 100644 --- a/containers/shower/default.nix +++ b/containers/shower/default.nix @@ -25,7 +25,7 @@ { pkgs ? 
import <nixpkgs> { } }: let - version = "1.1.1"; + version = "1.1.2"; python = pkgs.python314; @@ -43,7 +43,7 @@ let showerSdist = pkgs.fetchurl { name = "adelaide_baby_shower_app-${version}.tar.gz"; url = "https://forge.ops.eblu.me/api/packages/eblume/pypi/files/adelaide-baby-shower-app/${version}/adelaide_baby_shower_app-${version}.tar.gz"; - hash = "sha256-muvjkcKnLrrQTb8HZ4cH9SD0pab05JSFSgwheqb0AyM="; + hash = "sha256-U00259dlvHSo0c9I/W0kSThyhNKUT8ukG6X+vzj0k9c="; }; # Wheel pulled from forge.ops.eblu.me (tailnet) for the same reason the @@ -53,7 +53,7 @@ let showerWheel = pkgs.fetchurl { name = "adelaide_baby_shower_app-${version}-py3-none-any.whl"; url = "https://forge.ops.eblu.me/api/packages/eblume/pypi/files/adelaide-baby-shower-app/${version}/adelaide_baby_shower_app-${version}-py3-none-any.whl"; - hash = "sha256-dorrwHhZhOn9Qq6Wk3Su24HckgaWtWbkMY7RtAvomv4="; + hash = "sha256-lF79G9SiCuxG9LcyDJkTeTeJL72qTJTDVE196At1Ods="; }; staticAssets = pkgs.runCommand "shower-static-assets-${version}" { } '' @@ -148,7 +148,7 @@ let outputHashAlgo = "sha256"; # Pinned dep closure — reproducible until version bumps. To recompute, # set to pkgs.lib.fakeHash and read the failure. - outputHash = "sha256-HTTmAldIijG03pYZNyO72LBNPCrjmyJQKgW+gU9NplI="; + outputHash = "sha256-B5INpydOP3DmlgHfgpzKf+2mv0y9Wr2YNK7/5kh0hOc="; dontFixup = true; }; diff --git a/docs/changelog.d/shower-v1.1.2.infra.md b/docs/changelog.d/shower-v1.1.2.infra.md new file mode 100644 index 0000000..aa2db0d --- /dev/null +++ b/docs/changelog.d/shower-v1.1.2.infra.md @@ -0,0 +1 @@ +Deploy shower v1.1.2 — bump container build to new app release.
diff --git a/service-versions.yaml b/service-versions.yaml index 63bc5df..02f2979 100644 --- a/service-versions.yaml +++ b/service-versions.yaml @@ -46,8 +46,8 @@ services: - name: shower type: argocd - last-reviewed: 2026-05-11 - current-version: "1.1.0" + last-reviewed: 2026-05-15 + current-version: "1.1.2" upstream-source: https://forge.eblu.me/eblume/adelaide-baby-shower-app notes: | Django app for Adelaide / Heidi / Addie's baby shower. Wheel From 815a0cc6e6d2dc7579633853fd8d06b94afddb26 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Fri, 15 May 2026 06:57:24 -0700 Subject: [PATCH 078/122] =?UTF-8?q?C0:=20shower=20=E2=80=94=20rebuild=20fr?= =?UTF-8?q?om=20main=20SHA=20(post-merge=20retag)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PR #358 was squash-merged so the branch commit b8c7783 baked into the prior image tag isn't reachable from main's history. Rebuild from main HEAD (a33fa47) and retag. Image content is byte-identical (FOD is content-addressed, inputs unchanged); only the SHA in the tag changes so future provenance tracing stays on main. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- argocd/manifests/shower/kustomization.yaml | 2 +- docs/changelog.d/+shower-v1.1.2-rebuild-from-main-sha.misc.md | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 docs/changelog.d/+shower-v1.1.2-rebuild-from-main-sha.misc.md diff --git a/argocd/manifests/shower/kustomization.yaml b/argocd/manifests/shower/kustomization.yaml index 2c4dadb..6d4628c 100644 --- a/argocd/manifests/shower/kustomization.yaml +++ b/argocd/manifests/shower/kustomization.yaml @@ -14,4 +14,4 @@ resources: images: - name: registry.ops.eblu.me/blumeops/shower - newTag: v1.1.2-b8c7783-nix + newTag: v1.1.2-a33fa47-nix diff --git a/docs/changelog.d/+shower-v1.1.2-rebuild-from-main-sha.misc.md b/docs/changelog.d/+shower-v1.1.2-rebuild-from-main-sha.misc.md new file mode 100644 index 0000000..9355a54 --- /dev/null +++ b/docs/changelog.d/+shower-v1.1.2-rebuild-from-main-sha.misc.md @@ -0,0 +1 @@ +Rebuild shower v1.1.2 from main HEAD (a33fa47) and retag — PR #358 was squash-merged so the branch SHA baked into the prior image tag isn't reachable from main. FOD is content-addressed, so image bytes are identical; only provenance changes. From 96dbbb3cbe7d8a9f695c3bc0bf7006367d1181a4 Mon Sep 17 00:00:00 2001 From: Erich Blume <725328+eblume@users.noreply.github.com> Date: Fri, 15 May 2026 12:11:54 -0700 Subject: [PATCH 079/122] C0: add sn2-prelaunch wrapper to clear SN2 stale lockfiles MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit UE5 writes Saved/running.dat as a "session in progress" marker. If the previous session exited uncleanly (SIGKILL, crash), it lingers, and SN2 pops up an invisible 0×0 Error dialog at next launch that the GameThread blocks on forever — visible only as a black screen with a spinning loader. Wrap the Steam command to clear the marker files before each launch. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../+ringtail-sn2-prelaunch.infra.md | 6 ++++++ nixos/ringtail/gaming.nix | 17 +++++++++++++++++ 2 files changed, 23 insertions(+) create mode 100644 docs/changelog.d/+ringtail-sn2-prelaunch.infra.md diff --git a/docs/changelog.d/+ringtail-sn2-prelaunch.infra.md b/docs/changelog.d/+ringtail-sn2-prelaunch.infra.md new file mode 100644 index 0000000..f9c68e2 --- /dev/null +++ b/docs/changelog.d/+ringtail-sn2-prelaunch.infra.md @@ -0,0 +1,6 @@ +Add `sn2-prelaunch` Steam launch wrapper on ringtail that removes +Subnautica 2's stale `Saved/running.dat` and `Saved/beforelobby.dat` +lockfiles before each launch. SN2 pops up an invisible (0×0-sized) +Error dialog when it detects an unclean exit, blocking GameThread +forever; this is observable only as a black screen with a spinning +loader. Use via Steam launch option: `sn2-prelaunch %command%`. diff --git a/nixos/ringtail/gaming.nix b/nixos/ringtail/gaming.nix index c526857..7c00378 100644 --- a/nixos/ringtail/gaming.nix +++ b/nixos/ringtail/gaming.nix @@ -13,6 +13,23 @@ # so disable xalia globally to avoid wedging iscriptevaluator.exe. environment.sessionVariables.PROTON_USE_XALIA = "0"; + # Subnautica 2 pre-launch wrapper. SN2 (UE5) writes Saved/running.dat as a + # "currently running" lockfile. If the prior session exited uncleanly (SIGKILL + # via Steam's Stop button, crash, etc.), the file persists and on next launch + # SN2 pops up an invisible (0x0-sized) Error dialog ("Your game might not have + # exited correctly last time...") that the GameThread blocks on forever — + # observable only as a black screen with a spinning loader. This wrapper + # removes the stale lockfiles before exec'ing the actual game command. 
+ # Use as Steam launch option for Subnautica 2: + # sn2-prelaunch %command% + environment.systemPackages = [ + (pkgs.writeShellScriptBin "sn2-prelaunch" '' + saved="/mnt/games/SteamLibrary/steamapps/compatdata/1962700/pfx/drive_c/users/steamuser/AppData/Local/Subnautica2/Saved" + rm -f "$saved/running.dat" "$saved/beforelobby.dat" + exec "$@" + '') + ]; + # Gamescope — micro-compositor for game fullscreen/resolution management. # Use as Steam launch option: gamescope -W 2560 -H 1440 -f -- %command% programs.gamescope = { From 3645098bf1d64afb46ab562faae1a8aabeee1501 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Fri, 15 May 2026 19:56:08 -0700 Subject: [PATCH 080/122] C0: bump shower to v1.1.3 Wheel/sdist + FOD hashes probed on ringtail. Full nix-build verified end-to-end before commit. Co-Authored-By: Claude Opus 4.7 (1M context) --- containers/shower/default.nix | 8 ++++---- docs/changelog.d/+shower-1.1.3.infra.md | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) create mode 100644 docs/changelog.d/+shower-1.1.3.infra.md diff --git a/containers/shower/default.nix b/containers/shower/default.nix index f7115bc..c5bd41e 100644 --- a/containers/shower/default.nix +++ b/containers/shower/default.nix @@ -25,7 +25,7 @@ { pkgs ? 
import <nixpkgs> { } }: let - version = "1.1.2"; + version = "1.1.3"; python = pkgs.python314; @@ -43,7 +43,7 @@ let showerSdist = pkgs.fetchurl { name = "adelaide_baby_shower_app-${version}.tar.gz"; url = "https://forge.ops.eblu.me/api/packages/eblume/pypi/files/adelaide-baby-shower-app/${version}/adelaide_baby_shower_app-${version}.tar.gz"; - hash = "sha256-U00259dlvHSo0c9I/W0kSThyhNKUT8ukG6X+vzj0k9c="; + hash = "sha256-a3rCwEdOB+rnYXqsWDifyltpyKUgkOj0ikWB+WGQYKE="; }; # Wheel pulled from forge.ops.eblu.me (tailnet) for the same reason the @@ -53,7 +53,7 @@ let showerWheel = pkgs.fetchurl { name = "adelaide_baby_shower_app-${version}-py3-none-any.whl"; url = "https://forge.ops.eblu.me/api/packages/eblume/pypi/files/adelaide-baby-shower-app/${version}/adelaide_baby_shower_app-${version}-py3-none-any.whl"; - hash = "sha256-lF79G9SiCuxG9LcyDJkTeTeJL72qTJTDVE196At1Ods="; + hash = "sha256-a6j91gBigG4IzE2DVTBntnZ46Yrx9b5PgHn+Uro98Tk="; }; staticAssets = pkgs.runCommand "shower-static-assets-${version}" { } '' @@ -148,7 +148,7 @@ let outputHashAlgo = "sha256"; # Pinned dep closure — reproducible until version bumps. To recompute, # set to pkgs.lib.fakeHash and read the failure. - outputHash = "sha256-B5INpydOP3DmlgHfgpzKf+2mv0y9Wr2YNK7/5kh0hOc="; + outputHash = "sha256-1xx2qWAIwherklHIPXo6IOKkKHML1KUrUx6pbkMxffc="; dontFixup = true; }; diff --git a/docs/changelog.d/+shower-1.1.3.infra.md b/docs/changelog.d/+shower-1.1.3.infra.md new file mode 100644 index 0000000..33ee49d --- /dev/null +++ b/docs/changelog.d/+shower-1.1.3.infra.md @@ -0,0 +1 @@ +Bumped shower app to v1.1.3 (wheel/sdist + FOD hashes probed on ringtail).
From e222d47d455d07d18d1cf66d2a8984aa85d32586 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Fri, 15 May 2026 20:09:54 -0700 Subject: [PATCH 081/122] C0: deploy shower v1.1.3 (kustomize newTag bump) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Image v1.1.3-3645098-nix was built directly on ringtail and pushed via skopeo, bypassing the Forgejo runner: indri was severely overloaded (load avg 24.92, minikube VM at 344% CPU) and the workflow-dispatch endpoint timed out. The image content is identical to what the runner would have produced — same default.nix at commit 3645098 (on main), same NIX_PATH (current nixpkgs flake), same skopeo invocation. Tag short-sha matches the commit that defines the recipe so we aren't pinning to a ghost. Co-Authored-By: Claude Opus 4.7 (1M context) --- argocd/manifests/shower/kustomization.yaml | 2 +- docs/changelog.d/+shower-1.1.3-deploy.infra.md | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 docs/changelog.d/+shower-1.1.3-deploy.infra.md diff --git a/argocd/manifests/shower/kustomization.yaml b/argocd/manifests/shower/kustomization.yaml index 6d4628c..1c29224 100644 --- a/argocd/manifests/shower/kustomization.yaml +++ b/argocd/manifests/shower/kustomization.yaml @@ -14,4 +14,4 @@ resources: images: - name: registry.ops.eblu.me/blumeops/shower - newTag: v1.1.2-a33fa47-nix + newTag: v1.1.3-3645098-nix diff --git a/docs/changelog.d/+shower-1.1.3-deploy.infra.md b/docs/changelog.d/+shower-1.1.3-deploy.infra.md new file mode 100644 index 0000000..833fac6 --- /dev/null +++ b/docs/changelog.d/+shower-1.1.3-deploy.infra.md @@ -0,0 +1 @@ +Deployed shower v1.1.3 to ringtail (image built and pushed from ringtail; runner bypassed due to indri overload). 
From 1897eb1c5bf4ef1f6d3dfe3601f875b49b8ba2a4 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Sun, 17 May 2026 08:46:22 -0700 Subject: [PATCH 082/122] C0: move immich blackbox probe to ringtail alloy Immich migrated to ringtail's k3s cluster but the probe still targeted the in-cluster service DNS on indri's minikube, firing ServiceProbeFailure indefinitely. Moved the target into alloy-ringtail's config so the probe runs in the cluster where immich actually lives. Co-Authored-By: Claude Opus 4.7 (1M context) --- argocd/manifests/alloy-k8s/config.alloy | 6 ------ argocd/manifests/alloy-ringtail/config.alloy | 20 +++++++++++++++++++ .../+immich-probe-ringtail.infra.md | 1 + 3 files changed, 21 insertions(+), 6 deletions(-) create mode 100644 docs/changelog.d/+immich-probe-ringtail.infra.md diff --git a/argocd/manifests/alloy-k8s/config.alloy b/argocd/manifests/alloy-k8s/config.alloy index 56a2e13..5a0a8f9 100644 --- a/argocd/manifests/alloy-k8s/config.alloy +++ b/argocd/manifests/alloy-k8s/config.alloy @@ -196,12 +196,6 @@ prometheus.exporter.blackbox "services" { module = "http_2xx" } - target { - name = "immich" - address = "http://immich-server.immich.svc.cluster.local:2283/api/server/ping" - module = "http_2xx" - } - target { name = "navidrome" address = "http://navidrome.navidrome.svc.cluster.local:4533/" diff --git a/argocd/manifests/alloy-ringtail/config.alloy b/argocd/manifests/alloy-ringtail/config.alloy index e92ab0f..e5cc045 100644 --- a/argocd/manifests/alloy-ringtail/config.alloy +++ b/argocd/manifests/alloy-ringtail/config.alloy @@ -45,6 +45,26 @@ prometheus.scrape "kube_state_metrics" { forward_to = [prometheus.remote_write.prometheus.receiver] } +// ============== SERVICE HEALTH PROBES ============== + +// Blackbox-style HTTP probes for in-cluster services on ringtail +prometheus.exporter.blackbox "services" { + config = "{ modules: { http_2xx: { prober: http, timeout: 5s } } }" + + target { + name = "immich" + address = 
"http://immich-server.immich.svc.cluster.local:2283/api/server/ping" + module = "http_2xx" + } +} + +// Scrape blackbox probe results +prometheus.scrape "blackbox" { + targets = prometheus.exporter.blackbox.services.targets + scrape_interval = "30s" + forward_to = [prometheus.remote_write.prometheus.receiver] +} + // Push metrics to indri Prometheus prometheus.remote_write "prometheus" { external_labels = { cluster = "ringtail" } diff --git a/docs/changelog.d/+immich-probe-ringtail.infra.md b/docs/changelog.d/+immich-probe-ringtail.infra.md new file mode 100644 index 0000000..f2d3dee --- /dev/null +++ b/docs/changelog.d/+immich-probe-ringtail.infra.md @@ -0,0 +1 @@ +Moved the Immich blackbox health probe from indri's alloy to ringtail's alloy. After the immich migration to ringtail, the probe still targeted `immich-server.immich.svc.cluster.local` on indri's cluster where the service no longer exists, causing a persistent `ServiceProbeFailure` alert. From 2fae0f71618cb7ba8858714693a127555ace6543 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Tue, 19 May 2026 06:33:26 -0700 Subject: [PATCH 083/122] C0: switch grafana deployment to Recreate strategy Grafana uses an RWO PVC for SQLite + Bleve search index. RollingUpdate spawns the new pod before terminating the old one, so the new pod crashloops on the index lock until rollout timeout. Recreate terminates the old pod first, letting the new pod acquire the lock cleanly. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- argocd/manifests/grafana/deployment.yaml | 4 +++- docs/changelog.d/+grafana-recreate-strategy.infra.md | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) create mode 100644 docs/changelog.d/+grafana-recreate-strategy.infra.md diff --git a/argocd/manifests/grafana/deployment.yaml b/argocd/manifests/grafana/deployment.yaml index 0aad9b3..cbba267 100644 --- a/argocd/manifests/grafana/deployment.yaml +++ b/argocd/manifests/grafana/deployment.yaml @@ -14,7 +14,9 @@ spec: app.kubernetes.io/name: grafana app.kubernetes.io/instance: grafana strategy: - type: RollingUpdate + # RWO PVC for SQLite + Bleve index — RollingUpdate spawns the new pod + # before the old one terminates, and it crashloops on the index lock. + type: Recreate template: metadata: labels: diff --git a/docs/changelog.d/+grafana-recreate-strategy.infra.md b/docs/changelog.d/+grafana-recreate-strategy.infra.md new file mode 100644 index 0000000..3662e10 --- /dev/null +++ b/docs/changelog.d/+grafana-recreate-strategy.infra.md @@ -0,0 +1 @@ +Switched Grafana's deployment strategy from `RollingUpdate` to `Recreate`. With an RWO PVC holding the SQLite database and Bleve search index, `RollingUpdate` reliably crashloops the new pod on the index lock until rollout timeout. `Recreate` terminates the old pod first so the new one acquires the lock cleanly. From ee51bcafb447ff1ef6e76f67f2d0a51fdaffb1c4 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Fri, 22 May 2026 21:08:53 -0700 Subject: [PATCH 084/122] Rip out compensating-controls framework (#359) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary Removes the compensating-controls (CC) framework. Prowler and Kingfisher continue to run weekly and produce reports; the Prowler mutelist YAML files stay in place but no longer carry \`CC: \` prefixes — each entry now just keeps a free-form \`Description\` of why it's muted. 
The CC review cadence proved to be more process overhead than this single-operator homelab needed. ## What changed **Deleted** - \`compensating-controls.yaml\` — the CC registry - \`mise-tasks/review-compensating-controls\` — the staleness-review task - \`docs/how-to/operations/review-compensating-controls.md\` - \`docs/how-to/operations/record-review-evidence.md\` (was aspirational) - \`docs/explanation/compliance-mute-categories.md\` (proposed-future CC/NA/RA work) - 5 orphan \`+review-cc-*\` / \`+compliance-mute-categories\` changelog fragments **Modified** - 6 mutelist YAML files: stripped \`CC: <id>.\` prefix from every \`Description\` / \`statement\` field, kept the free-form text - \`mise-tasks/review-compliance-reports\`: removed CC mentions from docstrings, panel text, and the node-verification table title. Node-verification logic itself is unchanged. - \`docs/reference/operations/security.md\`: removed the "Compensating controls" section - \`docs/how-to/operations/read-compliance-reports.md\`: rewrote step 3 of "Acting on findings" to point at the mutelist YAML directly - \`docs/changelog.d/prowler-iac-mutelist.infra.md\`: rewrote to drop the "two new compensating controls" framing ## What did not change - All Prowler manifests (cronjobs, RBAC, PVs, kustomization) — scans still run on the same schedule - The Kingfisher deployment - The trivy-shim in the Prowler container — that's about Trivy ignorefile plumbing, independent of the CC concept - The mutelist entries themselves — each \`Resources\` list is unchanged; only the prose of \`Description\` was edited - \`CHANGELOG.md\` — historical releases are left as-is ## Test plan - [ ] Wait for human review before deploying — once merged, re-point ArgoCD: \`argocd app set prowler --revision main && argocd app sync prowler\` (no manifest changes besides the ConfigMap, so impact is limited to muted-finding descriptions in next week's report) - [ ] Confirm next weekly Prowler K8s CIS run (Sunday 3am) still completes
and produces a report on sifaka - [ ] Confirm next weekly Prowler IaC run still honors \`trivyignore.yaml\` (the trivy shim is untouched but the ignorefile content was rewritten) - [ ] \`mise run review-compliance-reports\` — verify node-verification block still runs and prints the renamed table title Reviewed-on: https://forge.eblu.me/eblume/blumeops/pulls/359 --- .../manifests/prowler/mutelist/apiserver.yaml | 24 +- .../prowler/mutelist/control-plane.yaml | 6 +- .../prowler/mutelist/core-pod-security.yaml | 33 ++- .../prowler/mutelist/manual-node-checks.yaml | 30 +-- argocd/manifests/prowler/mutelist/rbac.yaml | 15 +- .../prowler/mutelist/trivyignore.yaml | 24 +- compensating-controls.yaml | 210 ---------------- .../+compliance-mute-categories.doc.md | 1 - ...eview-cc-ephemeral-privileged-jobs.misc.md | 1 - ...review-cc-init-container-isolation.misc.md | 1 - .../+review-cc-trusted-ci-only.misc.md | 1 - .../changelog.d/prowler-iac-mutelist.infra.md | 2 +- ...ervability-stack-audit-2026-05-11.infra.md | 1 - .../rip-out-compensating-controls.infra.md | 1 + .../explanation/compliance-mute-categories.md | 99 -------- .../operations/read-compliance-reports.md | 2 +- .../operations/record-review-evidence.md | 50 ---- .../review-compensating-controls.md | 80 ------ docs/reference/operations/security.md | 8 +- mise-tasks/review-compensating-controls | 229 ------------------ mise-tasks/review-compliance-reports | 12 +- 21 files changed, 72 insertions(+), 758 deletions(-) delete mode 100644 compensating-controls.yaml delete mode 100644 docs/changelog.d/+compliance-mute-categories.doc.md delete mode 100644 docs/changelog.d/+review-cc-ephemeral-privileged-jobs.misc.md delete mode 100644 docs/changelog.d/+review-cc-init-container-isolation.misc.md delete mode 100644 docs/changelog.d/+review-cc-trusted-ci-only.misc.md delete mode 100644 docs/changelog.d/review-cc-observability-stack-audit-2026-05-11.infra.md create mode 100644 
docs/changelog.d/rip-out-compensating-controls.infra.md delete mode 100644 docs/explanation/compliance-mute-categories.md delete mode 100644 docs/how-to/operations/record-review-evidence.md delete mode 100644 docs/how-to/operations/review-compensating-controls.md delete mode 100755 mise-tasks/review-compensating-controls diff --git a/argocd/manifests/prowler/mutelist/apiserver.yaml b/argocd/manifests/prowler/mutelist/apiserver.yaml index 5a25d4f..fd077e8 100644 --- a/argocd/manifests/prowler/mutelist/apiserver.yaml +++ b/argocd/manifests/prowler/mutelist/apiserver.yaml @@ -6,48 +6,48 @@ Mutelist: "apiserver_always_pull_images_plugin": Regions: ["*"] Resources: ["^kube-apiserver-minikube$"] - Description: "CC: single-user-cluster, local-registry. Only the operator has cluster access; all images pulled from private zot registry." + Description: "Only the operator has cluster access; all images pulled from private zot registry." "apiserver_audit_log_maxage_set": Regions: ["*"] Resources: ["^kube-apiserver-minikube$"] - Description: "CC: observability-stack-audit. Alloy/Loki provides pod-level audit trail." + Description: "Alloy/Loki provides pod-level audit trail." "apiserver_audit_log_maxbackup_set": Regions: ["*"] Resources: ["^kube-apiserver-minikube$"] - Description: "CC: observability-stack-audit. Alloy/Loki provides pod-level audit trail." + Description: "Alloy/Loki provides pod-level audit trail." "apiserver_audit_log_maxsize_set": Regions: ["*"] Resources: ["^kube-apiserver-minikube$"] - Description: "CC: observability-stack-audit. Alloy/Loki provides pod-level audit trail." + Description: "Alloy/Loki provides pod-level audit trail." "apiserver_audit_log_path_set": Regions: ["*"] Resources: ["^kube-apiserver-minikube$"] - Description: "CC: observability-stack-audit. Alloy/Loki provides pod-level audit trail." + Description: "Alloy/Loki provides pod-level audit trail." 
"apiserver_deny_service_external_ips": Regions: ["*"] Resources: ["^kube-apiserver-minikube$"] - Description: "CC: tailscale-network-isolation. No external IPs routable; cluster only reachable via tailnet." + Description: "No external IPs routable; cluster only reachable via tailnet." "apiserver_disable_profiling": Regions: ["*"] Resources: ["^kube-apiserver-minikube$"] - Description: "CC: tailscale-network-isolation. Profiling endpoint unreachable from public internet." + Description: "Profiling endpoint unreachable from public internet." "apiserver_encryption_provider_config_set": Regions: ["*"] Resources: ["^kube-apiserver-minikube$"] - Description: "CC: tailscale-network-isolation, single-user-cluster. Etcd not network-exposed; only operator has node access." + Description: "Etcd not network-exposed; only operator has node access." "apiserver_kubelet_cert_auth": Regions: ["*"] Resources: ["^kube-apiserver-minikube$"] - Description: "CC: tailscale-network-isolation. Kubelet API not exposed outside the node; minikube auto-generates certificates." + Description: "Kubelet API not exposed outside the node; minikube auto-generates certificates." "apiserver_request_timeout_set": Regions: ["*"] Resources: ["^kube-apiserver-minikube$"] - Description: "CC: tailscale-network-isolation. API server only reachable via tailnet; DoS risk limited to trusted clients." + Description: "API server only reachable via tailnet; DoS risk limited to trusted clients." "apiserver_service_account_lookup_true": Regions: ["*"] Resources: ["^kube-apiserver-minikube$"] - Description: "CC: single-user-cluster. Only operator manages service accounts; no revoked tokens in circulation." + Description: "Only operator manages service accounts; no revoked tokens in circulation." "apiserver_strong_ciphers_only": Regions: ["*"] Resources: ["^kube-apiserver-minikube$"] - Description: "CC: tailscale-network-isolation. API server traffic encrypted by WireGuard at the network layer." 
+ Description: "API server traffic encrypted by WireGuard at the network layer." diff --git a/argocd/manifests/prowler/mutelist/control-plane.yaml b/argocd/manifests/prowler/mutelist/control-plane.yaml index 2056691..d3cc34a 100644 --- a/argocd/manifests/prowler/mutelist/control-plane.yaml +++ b/argocd/manifests/prowler/mutelist/control-plane.yaml @@ -6,12 +6,12 @@ Mutelist: "controllermanager_disable_profiling": Regions: ["*"] Resources: ["^kube-controller-manager-minikube$"] - Description: "CC: tailscale-network-isolation. Profiling endpoint unreachable from public internet." + Description: "Profiling endpoint unreachable from public internet." "scheduler_profiling": Regions: ["*"] Resources: ["^kube-scheduler-minikube$"] - Description: "CC: tailscale-network-isolation. Profiling endpoint unreachable from public internet." + Description: "Profiling endpoint unreachable from public internet." "kubelet_tls_cert_and_key": Regions: ["*"] Resources: ["^kubelet-config$"] - Description: "CC: tailscale-network-isolation, single-user-cluster. Kubelet API not exposed outside node; minikube auto-generates certificates." + Description: "Kubelet API not exposed outside node; minikube auto-generates certificates." diff --git a/argocd/manifests/prowler/mutelist/core-pod-security.yaml b/argocd/manifests/prowler/mutelist/core-pod-security.yaml index c39e0c6..b1e986e 100644 --- a/argocd/manifests/prowler/mutelist/core-pod-security.yaml +++ b/argocd/manifests/prowler/mutelist/core-pod-security.yaml @@ -17,9 +17,8 @@ Mutelist: - "^kindnet-" - "^storage-provisioner$" Description: >- - CC: tailscale-network-isolation. Control-plane and networking - pods require hostNetwork by design. Host network itself is - only reachable via tailnet. + Control-plane and networking pods require hostNetwork by design. + Host network itself is only reachable via tailnet. 
"core_minimize_privileged_containers": Regions: ["*"] Resources: @@ -31,7 +30,6 @@ Mutelist: # Forgejo runner - "^forgejo-runner-" Description: >- - CC: single-user-cluster, operator-managed-pods, trusted-ci-only. kube-proxy: system pod, single-user cluster. ts-*/ingress-*: Tailscale operator-managed. forgejo-runner: DinD limited to trusted private forge repos. @@ -49,25 +47,24 @@ Mutelist: - "^nameserver-" - "^ingress-" Description: >- - CC: single-user-cluster, operator-managed-pods. System pods - managed by minikube and Tailscale operator; seccomp profiles - set by upstream. Single-user cluster limits exploit surface. + System pods managed by minikube and Tailscale operator; + seccomp profiles set by upstream. Single-user cluster limits + exploit surface. "core_minimize_hostPID_containers": Regions: ["*"] Resources: - "^prowler-" Description: >- - CC: ephemeral-privileged-jobs. Prowler CIS scanner requires - hostPID for file permission checks. Runs as CronJob with - 7-day TTL, not a persistent workload. + Prowler CIS scanner requires hostPID for file permission + checks. Runs as CronJob with 7-day TTL, not a persistent + workload. "core_minimize_root_containers_admission": Regions: ["*"] Resources: - "^grafana-" Description: >- - CC: init-container-isolation. Root limited to init-chown-data - container; all runtime containers run as UID 472 with caps - dropped. + Root limited to init-chown-data container; all runtime + containers run as UID 472 with caps dropped. "core_minimize_containers_added_capabilities": Regions: ["*"] Resources: @@ -77,10 +74,9 @@ Mutelist: # Grafana init-chown-data - "^grafana-" Description: >- - CC: single-user-cluster, init-container-isolation. System - pods: capabilities required by function (minikube-managed). - Grafana: CHOWN limited to init phase; runtime containers - drop ALL. + System pods: capabilities required by function + (minikube-managed). Grafana: CHOWN limited to init phase; + runtime containers drop ALL. 
"core_minimize_containers_capabilities_assigned": Regions: ["*"] Resources: @@ -88,5 +84,4 @@ Mutelist: - "^kindnet-" - "^grafana-" Description: >- - CC: single-user-cluster, init-container-isolation. See - core_minimize_containers_added_capabilities. + See core_minimize_containers_added_capabilities. diff --git a/argocd/manifests/prowler/mutelist/manual-node-checks.yaml b/argocd/manifests/prowler/mutelist/manual-node-checks.yaml index 9c8354d..c91a2a6 100644 --- a/argocd/manifests/prowler/mutelist/manual-node-checks.yaml +++ b/argocd/manifests/prowler/mutelist/manual-node-checks.yaml @@ -1,7 +1,7 @@ # Node-level and RBAC checks that Prowler reports as MANUAL because it -# cannot evaluate them from inside a pod. Compensated by automated -# verification in `mise run review-compliance-reports`, which SSHes into -# the minikube node and checks each condition directly every week. +# cannot evaluate them from inside a pod. Verified out-of-band by the +# node-verification block in `mise run review-compliance-reports`, which +# SSHes into the minikube node and checks each condition directly. Mutelist: Accounts: "*": @@ -9,51 +9,51 @@ Mutelist: "etcd_unique_ca": Regions: ["*"] Resources: ["^etcd-minikube$"] - Description: "CC: node-config-automated-verification. Etcd CA fingerprint verified different from cluster CA by review-compliance-reports." + Description: "Etcd CA fingerprint verified different from cluster CA by review-compliance-reports." "kubelet_conf_file_ownership": Regions: ["*"] Resources: ["^kubelet-config$"] - Description: "CC: node-config-automated-verification. File ownership verified root:root by review-compliance-reports." + Description: "File ownership verified root:root by review-compliance-reports." "kubelet_conf_file_permissions": Regions: ["*"] Resources: ["^kubelet-config$"] - Description: "CC: node-config-automated-verification. File permissions verified 600 by review-compliance-reports." 
+ Description: "File permissions verified 600 by review-compliance-reports." "kubelet_config_yaml_ownership": Regions: ["*"] Resources: ["^kubelet-config$"] - Description: "CC: node-config-automated-verification. File ownership verified root:root by review-compliance-reports." + Description: "File ownership verified root:root by review-compliance-reports." "kubelet_config_yaml_permissions": Regions: ["*"] Resources: ["^kubelet-config$"] - Description: "CC: node-config-automated-verification. File permissions verified 644 by review-compliance-reports." + Description: "File permissions verified 644 by review-compliance-reports." "kubelet_service_file_ownership_root": Regions: ["*"] Resources: ["^kubelet-config$"] - Description: "CC: node-config-automated-verification. File ownership verified root:root by review-compliance-reports." + Description: "File ownership verified root:root by review-compliance-reports." "kubelet_service_file_permissions": Regions: ["*"] Resources: ["^kubelet-config$"] - Description: "CC: node-config-automated-verification. File permissions verified 644 by review-compliance-reports." + Description: "File permissions verified 644 by review-compliance-reports." "kubelet_disable_read_only_port": Regions: ["*"] Resources: ["^kubelet-config$"] - Description: "CC: node-config-automated-verification. readOnlyPort absence (defaults to 0) verified by review-compliance-reports." + Description: "readOnlyPort absence (defaults to 0) verified by review-compliance-reports." "kubelet_event_record_qps": Regions: ["*"] Resources: ["^kubelet-config$"] - Description: "CC: node-config-automated-verification. eventRecordQPS absence (defaults to 5) verified by review-compliance-reports." + Description: "eventRecordQPS absence (defaults to 5) verified by review-compliance-reports." "kubelet_manage_iptables": Regions: ["*"] Resources: ["^kubelet-config$"] - Description: "CC: node-config-automated-verification. 
makeIPTablesUtilChains absence (defaults to true) verified by review-compliance-reports." + Description: "makeIPTablesUtilChains absence (defaults to true) verified by review-compliance-reports." "kubelet_strong_ciphers_only": Regions: ["*"] Resources: ["^kubelet-config$"] - Description: "CC: node-config-automated-verification, tailscale-network-isolation. Go default ciphers used; all traffic WireGuard-encrypted via tailnet." + Description: "Go default ciphers used; all traffic WireGuard-encrypted via tailnet." "rbac_cluster_admin_usage": Regions: ["*"] Resources: - "^cluster-admin$" - "^kubeadm:cluster-admins$" - "^minikube-rbac$" - Description: "CC: node-config-automated-verification, single-user-cluster. Only built-in/minikube cluster-admin bindings present; verified by review-compliance-reports." + Description: "Only built-in/minikube cluster-admin bindings present; verified by review-compliance-reports." diff --git a/argocd/manifests/prowler/mutelist/rbac.yaml b/argocd/manifests/prowler/mutelist/rbac.yaml index c9c52e4..324809d 100644 --- a/argocd/manifests/prowler/mutelist/rbac.yaml +++ b/argocd/manifests/prowler/mutelist/rbac.yaml @@ -13,9 +13,8 @@ Mutelist: # ArgoCD - "^argocd-" Description: >- - CC: single-user-cluster, sso-gated-admin-tools. Built-in - K8s roles: only operator can bind them. ArgoCD: requires - broad access but is SSO-gated via Authentik OIDC. + Built-in K8s roles: only operator can bind them. ArgoCD: + requires broad access but is SSO-gated via Authentik OIDC. "rbac_minimize_pod_creation_access": Regions: ["*"] Resources: @@ -26,14 +25,12 @@ Mutelist: # CloudNativePG operator - "^cnpg-manager$" Description: >- - CC: single-user-cluster. Built-in K8s roles and CNPG - operator. Only the operator can assign these roles; no - untrusted users have cluster access. + Built-in K8s roles and CNPG operator. Only the operator can + assign these roles; no untrusted users have cluster access. 
"rbac_minimize_service_account_token_creation": Regions: ["*"] Resources: - "^system:" Description: >- - CC: single-user-cluster. kube-controller-manager requires - token creation for SA management. Only operator manages - service accounts. + kube-controller-manager requires token creation for SA + management. Only operator manages service accounts. diff --git a/argocd/manifests/prowler/mutelist/trivyignore.yaml b/argocd/manifests/prowler/mutelist/trivyignore.yaml index 22c612a..87af966 100644 --- a/argocd/manifests/prowler/mutelist/trivyignore.yaml +++ b/argocd/manifests/prowler/mutelist/trivyignore.yaml @@ -14,26 +14,24 @@ misconfigurations: paths: - "argocd/manifests/external-secrets/rbac.yaml" statement: >- - CC: operator-purpose-bound-rbac. external-secrets-operator's entire - function is to read and synthesize Secret objects; ClusterRole over - secrets is its purpose. Both the controller and cert-controller are + external-secrets-operator's entire function is to read and + synthesize Secret objects; ClusterRole over secrets is its + purpose. Both the controller and cert-controller are upstream-defined. - id: KSV-0041 paths: - "argocd/manifests/kube-state-metrics/rbac.yaml" - "argocd/manifests/kube-state-metrics-ringtail/rbac.yaml" statement: >- - CC: kube-state-metrics-metadata-only. KSM exposes only Secret - metadata (name, namespace, type, labels), never the data field. - list/watch on secrets is required for kube_secret_info / - kube_secret_labels metrics. + KSM exposes only Secret metadata (name, namespace, type, labels), + never the data field. list/watch on secrets is required for + kube_secret_info / kube_secret_labels metrics. - id: KSV-0114 paths: - "argocd/manifests/external-secrets/rbac.yaml" statement: >- - CC: operator-purpose-bound-rbac. cert-controller manages the - external-secrets validating webhook configurations to inject its - own rotating CA bundle. 
RBAC is scoped to two named webhooks - (secretstore-validate, externalsecret-validate) via resourceNames; - KSV-0114 doesn't see the resourceNames restriction so reports the - full ClusterRole. + cert-controller manages the external-secrets validating webhook + configurations to inject its own rotating CA bundle. RBAC is + scoped to two named webhooks (secretstore-validate, + externalsecret-validate) via resourceNames; KSV-0114 doesn't see + the resourceNames restriction so reports the full ClusterRole. diff --git a/compensating-controls.yaml b/compensating-controls.yaml deleted file mode 100644 index 01b3cfd..0000000 --- a/compensating-controls.yaml +++ /dev/null @@ -1,210 +0,0 @@ -# Compensating Controls -# -# Documents controls that mitigate risks from suppressed or accepted security -# findings. Referenced by security tools (Prowler mutelist, Kingfisher config, -# etc.) via "CC: <id>" in finding descriptions or suppression notes. -# -# Used by `mise run review-compensating-controls` to surface stale controls. -# -# Fields: -# id - kebab-case unique identifier, referenced from tool configs -# description - what the control actually does to mitigate risk -# created - date (YYYY-MM-DD) the control was documented -# last-reviewed - date (YYYY-MM-DD) or null -# notes - optional context - -controls: - - id: single-user-cluster - description: >- - Only the cluster operator (eblume) has kubectl access. No untrusted - users can create pods, access cached images, or bind RBAC roles. - created: 2026-03-30 - last-reviewed: 2026-04-01 - notes: >- - Verify by checking kubeconfig distribution and Tailscale ACLs. - If additional users gain cluster access, re-evaluate all findings - muted under this control. - - id: tailscale-network-isolation - description: >- - Cluster is not internet-exposed. All access requires Tailscale - identity with ACL enforcement. Profiling endpoints, debug ports, - and control-plane APIs are unreachable from the public internet.
- created: 2026-03-30 - last-reviewed: 2026-04-06 - notes: >- - Verify with 'tailscale serve status --json' on indri and review - Tailscale ACLs in pulumi/tailscale/. Only tag:flyio-target services - are publicly routable. - - - id: local-registry - description: >- - Operator-built services use a private zot registry - (registry.ops.eblu.me) for supply-chain control. Remaining - images are pulled from public registries without stored - credentials. No shared registry secrets are cached on cluster - nodes. - created: 2026-03-30 - last-reviewed: 2026-04-12 - notes: >- - Verify by checking image prefixes in kustomization.yaml files. - Known external-image categories: (1) upstream apps not yet - mirrored — immich, ollama, frigate, frigate-notify, valkey; - (2) infrastructure components — tailscale operator/proxy, - external-secrets, 1password-connect, forgejo-runner, docker - DinD, nvidia-device-plugin; (3) utility base images — busybox, - alpine (grafana init containers). Track upstream versions in - service-versions.yaml. Goal is to progressively mirror these - into zot. - - - id: sso-gated-admin-tools - description: >- - ArgoCD requires SSO authentication via Authentik OIDC. Wildcard - RBAC roles are mitigated by requiring authenticated identity - before any API access. - created: 2026-03-30 - last-reviewed: 2026-04-14 - notes: >- - Verify Authentik OIDC provider config for ArgoCD and that - anonymous access is disabled. Check ArgoCD --auth-token isn't - leaked. The workflow-bot API key account is scoped to sync/get - only. - - - id: operator-managed-pods - description: >- - Tailscale operator manages proxy pod specs (ts-*, ingress-*, - operator-*, nameserver-*). Pod security settings are set by the - operator, not user manifests. Operator is tracked in - service-versions.yaml and regularly updated. - created: 2026-03-30 - last-reviewed: 2026-04-21 - notes: >- - Verify operator version is current via 'mise run service-review'. 
- Check Tailscale changelog for security fixes. If operator adds - seccomp support, remove these mutes. As of 2026-04-21: still no - default seccomp on operator-generated pods (upstream issue #7359 - open). A ProxyClass + generic device plugin can downgrade proxies - from privileged to NET_ADMIN+NET_RAW and set seccompProfile — - potential future remediation to remove the seccomp mute without - waiting for upstream defaults. - - - id: ephemeral-privileged-jobs - description: >- - Prowler CIS scanner runs as a CronJob with 7-day TTL - auto-deletion, not as a persistent privileged workload. hostPID - exposure is time-bounded to scan duration (~20s). - created: 2026-03-30 - last-reviewed: 2026-04-29 - notes: >- - Verify TTL is set in cronjob.yaml. Check that no persistent - pods run with hostPID on the scanned cluster (indri). The - alloy-tracing DaemonSet on ringtail also uses hostPID but is - out of scope — Prowler only scans indri. Tracked in Todoist: - "prowler scan against ringtail" — once that lands, the - DaemonSet's hostPID+privileged posture will surface as a CIS - finding and need its own CC or remediation. - - - id: trusted-ci-only - description: >- - Forgejo runner only executes workflows from repos on the private - forge (forge.ops.eblu.me). No external or untrusted repos can - trigger privileged CI jobs. - created: 2026-03-30 - last-reviewed: 2026-05-01 - notes: >- - Verification: (1) Runner config (argocd/manifests/forgejo-runner/ - config.yaml) connects only to https://forge.ops.eblu.me/. (2) Forge - app.ini has DISABLE_REGISTRATION=true and ALLOW_ONLY_EXTERNAL_REGISTRATION - =true (ansible/roles/forgejo/defaults/main.yml) — no untrusted users - can sign up or create repos. The runner registers at instance scope - (repo_id=0/owner_id=0 in action_runner table), but the instance itself - is closed, so no per-repo allow-list is needed. Re-evaluate if the - forge ever opens to additional users or if the runner is repointed - to an external forge. 
- - - id: init-container-isolation - description: >- - Root privileges and added capabilities (CHOWN) are limited to - init containers that run once at pod startup. All runtime - containers run as non-root (UID 472) with all capabilities - dropped. - created: 2026-03-30 - last-reviewed: 2026-05-04 - notes: >- - Verify by inspecting grafana deployment.yaml securityContext - for both init and runtime containers. If fsGroup alone can - handle PVC ownership, remove init-chown-data and this control. - Retirement deferred until grafana lands on ringtail's k3s - (see [[indri-k8s-migration]]) — storage backend will change, - and removing init-chown-data right before that migration - trades a real safety net for marginal cleanup. Revisit - post-migration. - - - id: node-config-automated-verification - description: >- - Prowler reports certain node-level checks as MANUAL because it runs - inside a pod and cannot evaluate kubelet file permissions, kubelet - config arguments, etcd CA separation, or cluster-admin RBAC bindings. - The review-compliance-reports script SSHes into the minikube node - weekly and programmatically verifies each condition, failing loudly - if any check deviates from expected values. - created: 2026-04-14 - last-reviewed: 2026-04-14 - notes: >- - Verification runs as part of 'mise run review-compliance-reports'. - If minikube node is unreachable, all checks report as FAIL. If new - MANUAL findings appear in Prowler, add corresponding verification - logic to the script and update the mutelist. - - - id: operator-purpose-bound-rbac - description: >- - Operators whose entire function is to manage a sensitive resource - legitimately need RBAC over that resource. external-secrets-operator - manages Secret objects (its purpose) and the cert-controller mutates - its own ValidatingWebhookConfigurations to inject rotating CA bundles. 
- Risk is bounded by: (1) the operator code being upstream open-source - and reviewed; (2) RBAC scoped to specific named webhooks where - possible; (3) supply chain controls on the operator image (mirrored - to local registry, version tracked in service-versions.yaml). - created: 2026-04-27 - last-reviewed: 2026-04-27 - notes: >- - Verify by checking that the operators in question still match their - stated purpose (i.e. external-secrets is still the only consumer of - these ClusterRoles) and that upstream hasn't published advisories - for credential-handling bugs. Re-evaluate if a non-secrets-managing - ClusterRole appears under this control. - - - id: kube-state-metrics-metadata-only - description: >- - kube-state-metrics holds list/watch on Secrets cluster-wide but only - exposes Secret object *metadata* (name, namespace, type, creation - timestamp, labels) via the kube_secret_info / kube_secret_labels - metrics. Secret data fields are never read into KSM's exposed - metrics by upstream design. Mitigation rests on KSM's metric - schema, the version pin in service-versions.yaml, and the metrics - endpoint being reachable only on the cluster network. - created: 2026-04-27 - last-reviewed: 2026-04-27 - notes: >- - Verify by inspecting the /metrics endpoint output for any series - that include secret data (only *_info and *_labels metrics should - reference secrets, and labels should be limited to user-applied - labels — never the data:). Re-evaluate on KSM version bumps. - - - id: observability-stack-audit - description: >- - Alloy collects pod logs and ships them to Loki, providing an - audit trail for cluster activity. Compensates for missing - apiserver audit logging which neither minikube (indri) nor - k3s (ringtail) configures by default. - created: 2026-03-30 - last-reviewed: 2026-05-11 - notes: >- - Verify Alloy DaemonSet is running on each cluster (alloy-k8s on - minikube, alloy-ringtail on k3s) and Loki is receiving logs. 
- Note this is weaker than native apiserver audit logs — it - captures pod stdout/stderr, not API request-level auditing. - Consider enabling apiserver audit logging on k3s post-migration - (`--audit-log-path` / `--audit-policy-file`) — minikube made it - hard, k3s makes it straightforward. diff --git a/docs/changelog.d/+compliance-mute-categories.doc.md b/docs/changelog.d/+compliance-mute-categories.doc.md deleted file mode 100644 index c776e46..0000000 --- a/docs/changelog.d/+compliance-mute-categories.doc.md +++ /dev/null @@ -1 +0,0 @@ -New explanation article [[compliance-mute-categories]] documenting the gap between current `CC:`-only mute tagging and the three structurally distinct categories (compensating control, not-applicable, risk-accepted) needed for real PCI DSS / SOC2 practice. Captures the current image-scan mutelist gap (`cronjob-image-scan.yaml` doesn't pass `--mutelist-file`) and proposes an order-of-operations for wiring it up alongside the new tag conventions. Triggered by CVE-2026-31789, an OpenSSL 32-bit-only finding that surfaced the need for an NA category. diff --git a/docs/changelog.d/+review-cc-ephemeral-privileged-jobs.misc.md b/docs/changelog.d/+review-cc-ephemeral-privileged-jobs.misc.md deleted file mode 100644 index 14dcdca..0000000 --- a/docs/changelog.d/+review-cc-ephemeral-privileged-jobs.misc.md +++ /dev/null @@ -1 +0,0 @@ -Reviewed compensating control `ephemeral-privileged-jobs`: TTL and hostPID scope verified on indri. Noted that the alloy-tracing DaemonSet on ringtail is out of scope until Prowler scans ringtail (tracked in Todoist). diff --git a/docs/changelog.d/+review-cc-init-container-isolation.misc.md b/docs/changelog.d/+review-cc-init-container-isolation.misc.md deleted file mode 100644 index 295e7f8..0000000 --- a/docs/changelog.d/+review-cc-init-container-isolation.misc.md +++ /dev/null @@ -1 +0,0 @@ -Reviewed compensating control `init-container-isolation` (35 days stale). 
Grafana's running pod matches the manifest and the CC's claim — only `init-chown-data` runs as root with `CHOWN`; runtime containers all run as UID 472 with all caps dropped. Retirement (replacing init-chown-data with `fsGroup` alone) is plausible given the in-tree minikube-hostpath provisioner, but deferred until grafana lands on ringtail's k3s — note added to the CC. diff --git a/docs/changelog.d/+review-cc-trusted-ci-only.misc.md b/docs/changelog.d/+review-cc-trusted-ci-only.misc.md deleted file mode 100644 index 89dc653..0000000 --- a/docs/changelog.d/+review-cc-trusted-ci-only.misc.md +++ /dev/null @@ -1 +0,0 @@ -Reviewed compensating control `trusted-ci-only`: Forgejo runner is registered only to the private forge, which has registration disabled — no untrusted users can create repos or trigger privileged CI. Tightened the notes to reflect that the closed-forge property (not a per-repo allow-list) is what actually mitigates the risk. diff --git a/docs/changelog.d/prowler-iac-mutelist.infra.md b/docs/changelog.d/prowler-iac-mutelist.infra.md index 793c1ec..077cfa8 100644 --- a/docs/changelog.d/prowler-iac-mutelist.infra.md +++ b/docs/changelog.d/prowler-iac-mutelist.infra.md @@ -1 +1 @@ -Address the 6 critical Prowler IaC findings against `argocd/manifests/`. Prowler's IaC provider hardcodes `self._mutelist = None` and delegates filtering to Trivy, but doesn't plumb `--ignorefile` through — so the documented "use Trivy filtering" path is actually broken. Added a shim around `trivy` in the Prowler image that injects `--ignorefile $TRIVY_IGNOREFILE` for `trivy fs` invocations when the env var points at a real file. The IaC cronjob now mounts `mutelist/trivyignore.yaml` (Trivy's per-path schema) and sets the env var. Two new compensating controls — `operator-purpose-bound-rbac` and `kube-state-metrics-metadata-only` — justify muting the `external-secrets` and `kube-state-metrics` Secret-access findings (KSV-0041, KSV-0114). 
Separately, `grafana-clusterrole` is tightened to remove `secrets` access entirely: the dashboard sidecar already only consumes ConfigMap-labeled dashboards, so its `RESOURCE` env var is now `configmap` instead of `both`. +Address the 6 critical Prowler IaC findings against `argocd/manifests/`. Prowler's IaC provider hardcodes `self._mutelist = None` and delegates filtering to Trivy, but doesn't plumb `--ignorefile` through — so the documented "use Trivy filtering" path is actually broken. Added a shim around `trivy` in the Prowler image that injects `--ignorefile $TRIVY_IGNOREFILE` for `trivy fs` invocations when the env var points at a real file. The IaC cronjob now mounts `mutelist/trivyignore.yaml` (Trivy's per-path schema) and sets the env var, muting the `external-secrets` and `kube-state-metrics` Secret-access findings (KSV-0041, KSV-0114). Separately, `grafana-clusterrole` is tightened to remove `secrets` access entirely: the dashboard sidecar already only consumes ConfigMap-labeled dashboards, so its `RESOURCE` env var is now `configmap` instead of `both`. diff --git a/docs/changelog.d/review-cc-observability-stack-audit-2026-05-11.infra.md b/docs/changelog.d/review-cc-observability-stack-audit-2026-05-11.infra.md deleted file mode 100644 index 8100c6a..0000000 --- a/docs/changelog.d/review-cc-observability-stack-audit-2026-05-11.infra.md +++ /dev/null @@ -1 +0,0 @@ -Reviewed compensating control `observability-stack-audit`. Updated description to cover ringtail's k3s as well as indri's minikube; both Alloy DaemonSets and Loki are healthy. 
diff --git a/docs/changelog.d/rip-out-compensating-controls.infra.md b/docs/changelog.d/rip-out-compensating-controls.infra.md new file mode 100644 index 0000000..d41fd1a --- /dev/null +++ b/docs/changelog.d/rip-out-compensating-controls.infra.md @@ -0,0 +1 @@ +Ripped out the compensating-controls (CC) framework: deleted `compensating-controls.yaml`, the `review-compensating-controls` mise task, and the associated how-to / explanation docs. Prowler and Kingfisher continue to run weekly and produce reports; the Prowler mutelist YAML files remain in place but no longer carry `CC: ` prefixes — each entry just keeps a free-form `Description` of why the finding is muted. The CC review cadence proved to be more overhead than this single-operator homelab needed. diff --git a/docs/explanation/compliance-mute-categories.md b/docs/explanation/compliance-mute-categories.md deleted file mode 100644 index 4c5f3a3..0000000 --- a/docs/explanation/compliance-mute-categories.md +++ /dev/null @@ -1,99 +0,0 @@ ---- -title: Compliance Mute Categories -modified: 2026-05-04 -last-reviewed: 2026-05-04 -tags: - - explanation - - security - - compliance ---- - -# Compliance Mute Categories - -> **Note:** This article was drafted by AI and reviewed by Erich. I plan to rewrite all explanatory content in my own words - these serve as placeholders to establish the documentation structure. - -How BlumeOps should categorize muted compliance findings, why a single "compensating control" tag is not enough, and what tooling work is needed to support multiple categories cleanly. - -## Why this matters - -When a compliance scanner ([[prowler]], Trivy via Prowler IaC, Kingfisher) reports a failing finding, there are three structurally different reasons we might suppress it: - -1. **Compensating control (CC)** — the requirement applies and we *do not* meet it directly, but an alternative control mitigates the same risk. -2. 
**Not applicable (NA)** — the requirement's preconditions cannot be satisfied in our environment, so the finding is structurally inert (e.g. a 32-bit-only CVE on 64-bit-only hosts). -3. **Risk accepted (RA)** — the requirement applies, we do not meet it, no compensating control exists, and we have explicitly chosen to accept the residual risk for a bounded period. - -Today every muted finding in BlumeOps uses the `CC: ` convention. That conflates all three categories. In a real PCI DSS or SOC2 environment, auditors treat them very differently: - -- A CC requires documentation of the constraint, the alternative measure, and recurring validation that the measure still works. -- An NA requires documentation of *why* the precondition cannot be met, with periodic verification that the environmental fact still holds. -- An RA requires an explicit decision-maker, an expiry date, and a scheduled re-decision. - -Mixing them under one tag means stale CCs hide stale RAs, and NAs that should be revisited when the environment changes get treated as permanent fixtures. - -## Trigger case: CVE-2026-31789 - -The 2026-05-03 weekly compliance review surfaced [CVE-2026-31789](https://nvd.nist.gov/vuln/detail/CVE-2026-31789), an OpenSSL heap buffer overflow during X.509 certificate processing on **32-bit systems**. Prowler's image scanner flagged 216 findings across 106 BlumeOps images carrying `libssl3` / `libcrypto3` below the fixed versions. - -The CVE is genuine, but its preconditions cannot be satisfied in our environment: indri is Apple Silicon (arm64), ringtail is x86_64, and we run no 32-bit containers. This is the canonical NA case — not a CC, because there is no "alternative measure mitigating the risk." The risk does not exist for us at all. 
- -A CC like `no-32bit-runtimes` would technically work, but conflates the categories: if we ever introduce a 32-bit runtime we would have to remember that this CC was load-bearing for the mute, retire or scope it down, and reopen the muted findings. An NA tag with a short justification makes the precondition explicit and self-documents the conditions under which it must be revisited. - -## Current tooling state - -Three Prowler scans run weekly. Their mute paths today: - -| Scan | Mute mechanism | File(s) | -|------|----------------|---------| -| K8s CIS (Sunday) | Prowler `--mutelist-file`, merged from ConfigMap | `argocd/manifests/prowler/mutelist/*.yaml` | -| IaC (Saturday) | Trivy `--ignorefile` shim (Prowler's `--mutelist-file` is a no-op for IaC) | `argocd/manifests/prowler/mutelist/trivyignore.yaml` | -| Container Images (Saturday) | **None — `cronjob-image-scan.yaml` does not pass `--mutelist-file`** | n/a | - -The image scan has never been wired to a mutelist. The CSV reports do contain a `MUTED` column, but it is always `False` because no mutelist is supplied. All 14k+ image findings flow through to `review-compliance-reports` unfiltered. - -The mute tag convention is consistent across the two configured scans: each entry's `Description:` (or `statement:` for trivyignore) starts with `CC: . `. `mise run review-compensating-controls` greps for those IDs to find every file that depends on each control. There is no NA tag, no RA tag, and no expiry field. - -## Proposed model - -### Tag prefixes - -Extend the description-prefix convention: - -- `CC: . ` — references an entry in `compensating-controls.yaml`. Existing convention, unchanged. -- `NA: . ` — environmental precondition fails. Reason should be specific enough that a reviewer can verify it (e.g. `NA: no 32-bit runtimes`, not `NA: doesn't apply`). -- `RA: ; expires . ` — explicit risk acceptance with a hard expiry. Past the expiry, re-review is mandatory. 
- -Tag choice is exclusive: a given mute is one of CC, NA, or RA. If two reasons apply, pick the strongest — CC > RA > NA. - -### Tooling changes required - -1. **Wire the image scan to a mutelist.** Add `argocd/manifests/prowler/mutelist/image-cves.yaml`, mount-and-merge it the same way `cronjob.yaml` mounts its mutelist parts, and pass `--mutelist-file` to `prowler image`. Verify experimentally that `prowler image` honors the flag — Prowler's behavior across providers is inconsistent, and the IaC provider notably does not. If `prowler image` ignores it, fall back to post-scan filtering inside `review-compliance-reports`. - -2. **Teach `review-compensating-controls` (or a sibling) to surface NA and RA entries.** CCs already get a staleness queue. NAs should appear in a separate queue keyed on the reason text — when an NA reason becomes false (e.g. we do introduce a 32-bit runtime), every NA mute citing that reason must be reopened. RAs should sort by expiry date, with anything past expiry flagged red. - -3. **Expiry parsing.** RA tags carry a hard date. The simplest path is to parse it from the description string at review time. A more durable path is to extend the mutelist YAML schema with a structured `expires:` field and a small wrapper that strips it before passing the file to Prowler. Either works; the structured field is friendlier to editors. - -### Out of scope (for now) - -- Changing the underlying Prowler mutelist YAML schema. Stay within the `Mutelist:` shape Prowler expects. -- Migrating existing `CC:` entries. The current set is genuinely CCs and should stay tagged that way. -- Building an issue-tracker integration. Todoist is the source of truth for "remember to re-review this" until that scales painfully. - -## Order of operations - -When this work is picked up, the suggested sequence is: - -1. **Scope and confirm.** Re-read this article, confirm the model still fits, adjust if not. -2. 
**Wire the image-scan mutelist.** Smallest atomic change; produces immediate value (the CVE-2026-31789 mute can land as the first NA entry). -3. **Add the NA convention.** Update [[read-compliance-reports]] and [[review-compensating-controls]] how-tos to describe the three tag prefixes. The convention can land before tooling supports it — review will just be manual until tooling catches up. -4. **Extend the review tools.** Add NA and RA queues to `review-compensating-controls` (or a new task). At this point, parse expiry from RA descriptions. -5. **Optionally: structured expiry.** If RA entries become common, migrate to a structured `expires:` YAML field with a wrapper that filters it out before Prowler reads the file. - -The first three steps are a coherent C1. Steps 4–5 can be split off if scope creeps. - -## Related - -- [[read-compliance-reports]] — the weekly review process this feeds into -- [[review-compensating-controls]] — current CC review tooling -- [[security-model]] — overall security posture -- [[prowler]] — scanner reference -- [[agent-change-process]] — how to scope and execute the implementation diff --git a/docs/how-to/operations/read-compliance-reports.md b/docs/how-to/operations/read-compliance-reports.md index 75fd3ab..e676ad5 100644 --- a/docs/how-to/operations/read-compliance-reports.md +++ b/docs/how-to/operations/read-compliance-reports.md @@ -80,7 +80,7 @@ Not all failures require action. Common expected failures in our minikube cluste 1. **Triage** — review new failures, distinguish real issues from expected noise 2. **Remediate** — fix what you can (pod security contexts, RBAC tightening) -3. **Mutelist** — suppress expected/accepted failures via Prowler's `--mutelist-file` to reduce noise in future scans +3. **Mutelist** — suppress expected/accepted failures by adding a Resource entry under the matching Check in `argocd/manifests/prowler/mutelist/*.yaml` with a free-form `Description` explaining why 4. 
**Track** — compare reports over time to spot regressions ## Related diff --git a/docs/how-to/operations/record-review-evidence.md b/docs/how-to/operations/record-review-evidence.md deleted file mode 100644 index 9de4e37..0000000 --- a/docs/how-to/operations/record-review-evidence.md +++ /dev/null @@ -1,50 +0,0 @@ ---- -title: Record Review Evidence -modified: 2026-04-01 -last-reviewed: 2026-04-01 -tags: - - how-to - - security - - compliance ---- - -# Record Review Evidence - -How review evidence *would* be captured after a [[review-compensating-controls|compensating control review]], to make the review auditable under a compliance framework. - -blumeops does not currently collect review evidence. This card documents the target process for reference and practice. - -## Why Record Evidence? - -Reviewing a control and updating `last-reviewed` proves the review *happened* but not *what was checked*. Under frameworks like PCI DSS v4.0, a QSA needs to see dated, immutable evidence that the reviewer verified the control and that an appropriate party accepted the residual risk. Compliance platforms like Drata automate this collection, but the underlying artifacts are the same whether you use a platform or a directory of files. - -## What Evidence Would Be Captured - -For each control reviewed, artifacts should answer: - -1. **Who reviewed it** — reviewer name, date -2. **What was verified** — the specific checks performed (e.g., Tailscale ACL policy snapshot, `tailscale status` output, kubectl auth checks) -3. **What was found** — the outcome: control still in effect, circumstances changed, or control invalidated -4. **Residual risk** — what the control does *not* cover (the gap a QSA will ask about) -5. 
**Acceptance** — formal sign-off that the residual risk is accepted by an appropriate party (reviewer + approver, typically a manager or CTO) - -Supporting artifacts would include command output, policy snapshots, screenshots, or API responses — anything that demonstrates the verification was actually performed. - -## PCI DSS Context - -Under PCI DSS v4.0, compensating controls require a **Compensating Control Worksheet (CCW)** that maps each control to the original requirement it substitutes for. The CCW fields are: - -- **Original requirement** — the specific PCI DSS requirement not directly met -- **Constraint** — why direct compliance isn't feasible -- **Compensating control definition** — what is done instead -- **Risk addressed** — how the control mitigates the original threat -- **Residual risk** — what remains unmitigated -- **Validation procedure** — steps to verify (what `notes` captures in `compensating-controls.yaml`) - -Req 12.3.2 mandates review **at least annually** (quarterly is typical for Level 1 Service Providers). In a platform like Drata, these map to Controls with uploaded Evidence and review workflows requiring sign-off from both the reviewer and an approver. - -## Related - -- [[review-compensating-controls]] — The technical review process -- [[security]] — Security posture overview -- [[read-compliance-reports]] — Interpreting Prowler/Kingfisher reports diff --git a/docs/how-to/operations/review-compensating-controls.md b/docs/how-to/operations/review-compensating-controls.md deleted file mode 100644 index 8a32d98..0000000 --- a/docs/how-to/operations/review-compensating-controls.md +++ /dev/null @@ -1,80 +0,0 @@ ---- -title: Review Compensating Controls -modified: 2026-03-30 -last-reviewed: 2026-03-30 -tags: - - how-to - - security - - maintenance ---- - -# Review Compensating Controls - -How to periodically review compensating controls that justify suppressed security findings. 
- -## Review by Staleness - -Show controls sorted by when they were last reviewed (most stale first): - -```bash -mise run review-compensating-controls -``` - -This reads `compensating-controls.yaml` (repo root), sorts by `last-reviewed`, and displays the most stale control with all codebase references. It also searches for every file that references the control ID, so you can see exactly which suppressed findings depend on it. - -To show more entries: - -```bash -mise run review-compensating-controls --limit 20 -``` - -## What is a Compensating Control? - -A compensating control is a security measure that mitigates the risk a finding was designed to detect, when the finding itself cannot be directly remediated. For example: - -- **Finding:** API server does not enable AlwaysPullImages admission plugin -- **Risk:** Untrusted users could run pods using cached images they shouldn't have access to -- **Compensating control:** `single-user-cluster` — only the operator has kubectl access; no untrusted users can create pods - -Controls are documented in `compensating-controls.yaml` and referenced from security tool configurations (Prowler mutelist files, Kingfisher config, etc.) using the format `CC: `. - -A compensating control is only one of three structurally distinct ways to suppress a finding — see [[compliance-mute-categories]] for when to reach for a CC versus a not-applicable (`NA:`) or risk-accepted (`RA:`) tag instead. - -## Review Process - -For each control up for review: - -1. **Understand the risk.** Read each suppressed finding that references this control. What attack or misconfiguration does the original check guard against? - -2. **Verify the control is in effect.** Follow the verification steps in the control's `notes` field. For example, for `tailscale-network-isolation`, check that the cluster is not directly internet-exposed and Tailscale ACLs are enforced. - -3. 
**Assess whether the control actually mitigates the risk.** A compensating control should address the same threat the check was designed to catch, not just be a vaguely related security measure. If it doesn't hold up, either: - - Fix the underlying finding and remove the suppression - - Document a stronger or more specific compensating control - -4. **Check for changed circumstances.** Has the cluster gained new users? Has a service been exposed publicly? Has an operator added native support for the missing feature? Any of these could invalidate the control. - -5. **Update the review date.** Edit `compensating-controls.yaml` and set `last-reviewed` to today's date. Commit alongside any changes. - -## Adding a New Control - -When suppressing a new security finding, either map it to an existing control or add a new one: - -```yaml -- id: my-new-control - description: >- - What this control does and how it mitigates the specific risk. - created: 2026-03-30 - last-reviewed: 2026-03-30 - notes: >- - How to verify this control is still in effect. -``` - -Then reference it in the suppression configuration with `CC: my-new-control`. - -## Related - -- [[record-review-evidence]] — Capturing evidence artifacts for audit (aspirational) -- [[security]] — Security posture overview -- [[read-compliance-reports]] — Accessing and interpreting Prowler reports -- [[review-services]] — Periodic service version review (similar staleness pattern) diff --git a/docs/reference/operations/security.md b/docs/reference/operations/security.md index 18561a5..11c4df9 100644 --- a/docs/reference/operations/security.md +++ b/docs/reference/operations/security.md @@ -46,13 +46,7 @@ Security posture and compliance scanning for BlumeOps infrastructure. All compliance scan reports are stored on `sifaka:/volume1/reports/`. See [[read-compliance-reports]] for access and interpretation. 
-## Compensating controls - -Suppressed findings reference named compensating controls tracked in `compensating-controls.yaml` (repo root). Each control has a review date and verification steps. See [[review-compensating-controls]] for the review process. - -```bash -mise run review-compensating-controls -``` +Suppressed findings are kept in Prowler mutelist YAML under `argocd/manifests/prowler/mutelist/`. Each entry's `Description` field explains why the finding is muted; entries are reviewed ad-hoc rather than on a scheduled cadence. ## Known gaps diff --git a/mise-tasks/review-compensating-controls b/mise-tasks/review-compensating-controls deleted file mode 100755 index e92d302..0000000 --- a/mise-tasks/review-compensating-controls +++ /dev/null @@ -1,229 +0,0 @@ -#!/usr/bin/env -S uv run --script -# /// script -# requires-python = ">=3.12" -# dependencies = ["pyyaml==6.0.3", "rich==15.0.0", "typer==0.25.0"] -# /// -#MISE description="Review the most stale compensating control" -#USAGE flag "--limit " default="10" help="Number of controls to show in the table" -"""Review compensating controls by staleness. - -Reads ``compensating-controls.yaml`` and sorts by ``last-reviewed``. -Shows a staleness table, then displays the most stale control with all -references found in the codebase. 
- -After reviewing, update the control entry: - - last-reviewed: YYYY-MM-DD - -Usage: mise run review-compensating-controls [--limit 10] -""" - -import subprocess -import sys -from datetime import date -from pathlib import Path -from typing import Annotated - -import typer -import yaml -from rich.console import Console -from rich.panel import Panel -from rich.table import Table - -CONTROLS_FILE = Path(__file__).parent.parent / "compensating-controls.yaml" -REPO_ROOT = Path(__file__).parent.parent - - -def load_controls(path: Path) -> list[dict]: - data = yaml.safe_load(path.read_text()) - return data.get("controls", []) - - -def parse_date(raw) -> date | None: - if raw is None: - return None - if isinstance(raw, date): - return raw - try: - return date.fromisoformat(str(raw)) - except ValueError: - return None - - -def find_references(control_id: str) -> list[str]: - """Find all files referencing a control ID using ripgrep.""" - try: - result = subprocess.run( - ["rg", "--no-heading", "-n", control_id, str(REPO_ROOT)], - capture_output=True, - text=True, - timeout=10, - ) - lines = result.stdout.strip().splitlines() - # Exclude the controls file itself and this script - return [ - ln - for ln in lines - if "compensating-controls.yaml" not in ln - and "review-compensating-controls" not in ln - ] - except (FileNotFoundError, subprocess.TimeoutExpired): - return [] - - -def main( - limit: Annotated[ - int, typer.Option(help="Number of controls to show in the table") - ] = 10, -) -> None: - console = Console() - today = date.today() - - if not CONTROLS_FILE.exists(): - console.print( - f"[bold red]Controls file not found:[/bold red] {CONTROLS_FILE}" - ) - raise typer.Exit(code=1) - - controls = load_controls(CONTROLS_FILE) - - # Parse dates and build sortable entries - entries: list[tuple[dict, date | None]] = [] - for ctrl in controls: - reviewed = parse_date(ctrl.get("last-reviewed")) - entries.append((ctrl, reviewed)) - - # Sort: never-reviewed first, then oldest - 
entries.sort(key=lambda e: (e[1] is not None, e[1] or date.min)) - - never_reviewed = sum(1 for _, r in entries if r is None) - - # --- Summary panel --- - console.print() - console.print( - Panel( - f"[bold]{len(entries)}[/bold] compensating controls, " - f"[bold red]{never_reviewed}[/bold red] never reviewed", - title="[bold]Compensating Control Review Queue[/bold]", - border_style="cyan", - ) - ) - console.print() - - # --- Staleness table --- - table = Table(show_header=True, header_style="bold") - table.add_column("#", justify="right") - table.add_column("Control ID") - table.add_column("Last Reviewed", justify="right") - table.add_column("Age (days)", justify="right") - table.add_column("Refs", justify="right") - - for i, (ctrl, reviewed) in enumerate(entries[:limit], 1): - control_id = ctrl["id"] - refs = len(find_references(control_id)) - - if reviewed is None: - table.add_row( - str(i), - f"[red]{control_id}[/red]", - "[red]never[/red]", - "[red]—[/red]", - str(refs), - ) - else: - age = (today - reviewed).days - style = "yellow" if age > 90 else "" - id_str = f"[{style}]{control_id}[/{style}]" if style else control_id - date_str = f"[{style}]{reviewed}[/{style}]" if style else str(reviewed) - age_str = f"[{style}]{age}[/{style}]" if style else str(age) - table.add_row(str(i), id_str, date_str, age_str, str(refs)) - - remaining = len(entries) - limit - if remaining > 0: - table.add_row("", f"[dim]… {remaining} more[/dim]", "", "", "") - - console.print(table) - console.print() - - # --- Most stale control detail --- - if not entries: - console.print("[bold red]No controls found![/bold red]") - raise typer.Exit(code=1) - - top_ctrl, top_reviewed = entries[0] - control_id = top_ctrl["id"] - refs = find_references(control_id) - - detail_lines = [ - f"[bold cyan]{control_id}[/bold cyan]", - f"[dim]Last reviewed: {top_reviewed or 'never'}[/dim]", - "", - f"[bold]Description:[/bold] {top_ctrl.get('description', '').strip()}", - ] - notes = top_ctrl.get("notes", 
"").strip() - if notes: - detail_lines.append(f"[bold]Notes:[/bold] {notes}") - - console.print( - Panel( - "\n".join(detail_lines), - title="[bold]Up For Review[/bold]", - border_style="green", - ) - ) - console.print() - - # --- References --- - if refs: - ref_table = Table( - show_header=True, header_style="bold", title="References in codebase" - ) - ref_table.add_column("File", style="cyan") - ref_table.add_column("Line") - - for ref in refs: - # rg output: file:line:content - parts = ref.split(":", 2) - if len(parts) >= 3: - filepath = parts[0].replace(str(REPO_ROOT) + "/", "") - line_no = parts[1] - content = parts[2].strip() - ref_table.add_row(f"{filepath}:{line_no}", content) - else: - ref_table.add_row(ref, "") - - console.print(ref_table) - else: - console.print( - f"[yellow]No references to '{control_id}' found in the codebase.[/yellow]" - ) - console.print() - - # --- Review checklist --- - checklist = [ - "[bold]Verification:[/bold]\n", - f"• {notes}\n" if notes else "", - "\n[bold]Review each reference:[/bold]\n", - "• For each muted finding referencing this control, confirm:\n", - " 1. The risk the original check guards against\n", - " 2. That this control actually mitigates that risk\n", - " 3. 
That the control is still in effect (not degraded or bypassed)\n", - "\n[bold]After review:[/bold]\n", - f"• Update compensating-controls.yaml: [cyan]last-reviewed: {today}[/cyan]\n", - "• If the control is no longer valid, either:\n", - " - Fix the underlying finding and remove the mute, or\n", - " - Document a new/updated compensating control\n", - "• Commit the change", - ] - - console.print( - Panel( - "".join(checklist), - title="[bold yellow]Review Guidance[/bold yellow]", - border_style="yellow", - ) - ) - - -if __name__ == "__main__": - typer.run(main) diff --git a/mise-tasks/review-compliance-reports b/mise-tasks/review-compliance-reports index bcbe090..a9146c8 100755 --- a/mise-tasks/review-compliance-reports +++ b/mise-tasks/review-compliance-reports @@ -143,7 +143,10 @@ def _kubectl(args: str, timeout: int = 15) -> subprocess.CompletedProcess: def run_node_verification(console: Console) -> None: """Verify node-level conditions that Prowler reports as MANUAL. - Compensating control: node-config-automated-verification + Prowler runs inside a pod and can't evaluate kubelet file permissions, + kubelet config arguments, etcd CA separation, or cluster-admin RBAC + bindings. We SSH into the minikube node and check each condition here, + failing loudly if any deviates from expected values. 
""" checks: list[tuple[str, str, bool]] = [] # (name, detail, passed) @@ -278,7 +281,7 @@ def run_node_verification(console: Console) -> None: table = Table( show_header=True, header_style="bold", - title="Node Verification (CC: node-config-automated-verification)", + title="Node Verification (out-of-band checks for MANUAL findings)", ) table.add_column("Check") table.add_column("Detail") @@ -528,8 +531,8 @@ def summarize_report( Panel( f"[bold yellow]{len(latest['unmuted'])} unmuted failure(s) " f"need triage.[/bold yellow]\n\n" - "For each: remediate or mute " - "(add to mutelist + compensating control).", + "For each: remediate, or add a Resource entry to the " + "matching check in argocd/manifests/prowler/mutelist/.", title=f"{label} Verdict", border_style="yellow", ) @@ -653,7 +656,6 @@ def main( ) # --- Node-level MANUAL check verification --- - # Compensating control: node-config-automated-verification # These checks verify conditions Prowler reports as MANUAL because it # runs inside a pod and cannot evaluate them directly. run_node_verification(console) From d02bf062af2cd3a867cd5c4da17686ae0806fa0b Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Fri, 22 May 2026 21:29:11 -0700 Subject: [PATCH 085/122] C0: review 1password reference card Added vault split (blumeops vs Personal), noted onepassword-connect runs on both indri and ringtail, and lifted op CLI guidance from agent memory into the card. Bumped last-reviewed. 
--- docs/changelog.d/+review-1password-doc.doc.md | 1 + docs/reference/services/1password.md | 37 ++++++++++++------- 2 files changed, 24 insertions(+), 14 deletions(-) create mode 100644 docs/changelog.d/+review-1password-doc.doc.md diff --git a/docs/changelog.d/+review-1password-doc.doc.md b/docs/changelog.d/+review-1password-doc.doc.md new file mode 100644 index 0000000..bba9591 --- /dev/null +++ b/docs/changelog.d/+review-1password-doc.doc.md @@ -0,0 +1 @@ +Reviewed [[1password]] reference card: added the `blumeops` vs `Personal` vault split, noted that `onepassword-connect` runs on both indri and ringtail (not just one cluster), and pulled the `op read` vs `op item get --fields` guidance up from agent memory into the card. diff --git a/docs/reference/services/1password.md b/docs/reference/services/1password.md index 4489194..5ad50da 100644 --- a/docs/reference/services/1password.md +++ b/docs/reference/services/1password.md @@ -1,6 +1,7 @@ --- title: 1Password -modified: 2026-02-10 +modified: 2026-05-22 +last-reviewed: 2026-05-22 tags: - service - secrets @@ -8,15 +9,22 @@ tags: # 1Password -Root credential store for all BlumeOps secrets, synced to Kubernetes via External Secrets Operator. +Root credential store for all BlumeOps secrets. Kubernetes workloads read items via [[external-secrets|External Secrets Operator]]; humans and agents read via the `op` CLI. -## Architecture +## Vaults + +| Vault | Purpose | +|-------|---------| +| `blumeops` | Infrastructure secrets — referenced by ExternalSecret manifests and scripts. | +| `Personal` | Human login credentials keyed by URL for autofill. Not consumed by infrastructure. 
| + +## Kubernetes Integration ``` 1Password Cloud | v -1Password Connect (namespace: 1password) +1Password Connect (namespace: 1password, deployed on both indri and ringtail) | v External Secrets Operator (namespace: external-secrets) @@ -25,15 +33,15 @@ External Secrets Operator (namespace: external-secrets) Native Kubernetes Secrets ``` -## Vault +**ClusterSecretStore:** `onepassword-blumeops` (same name on both clusters). -The `blumeops` vault contains all infrastructure credentials. +Services reference 1Password items via `ExternalSecret` manifests. Both `minikube-indri` and `k3s-ringtail` run their own `onepassword-connect` deployment talking to the same vault. -## Kubernetes Integration +## Direct Access -**ClusterSecretStore:** `onepassword-blumeops` +Prefer `op read "op://vault/item/field"` over `op item get --fields` in scripts and IaC — `op item get --fields` wraps multi-line values in quotes, corrupting them. `op item get` without flags is fine for exploring item metadata. -Services reference 1Password items via `ExternalSecret` manifests. +If an item name contains special characters (e.g. parentheses), use the item ID instead of the name in the `op://` path. 
## Disaster Recovery Backup @@ -41,8 +49,9 @@ The `mise run op-backup` task encrypts a `.1pux` vault export and transfers it t ## Related -- [[argocd]] - Uses secrets for git access -- [[postgresql]] - Database credentials -- [[run-1password-backup]] - Periodic backup procedure -- [[restore-1password-backup]] - Recovery from backup -- [[borgmatic]] - Backup system +- [[external-secrets]] — Kubernetes operator that consumes ClusterSecretStore +- [[argocd]] — Uses secrets for git access +- [[postgresql]] — Database credentials +- [[run-1password-backup]] — Periodic backup procedure +- [[restore-1password-backup]] — Recovery from backup +- [[borgmatic]] — Backup system From 08a1cb164a3f96b408979ecda560a9f7dbf768b4 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Fri, 22 May 2026 21:36:13 -0700 Subject: [PATCH 086/122] C0: fix 1password export filename in backup how-to MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1Password's desktop app names exports as 1PasswordExport--.1pux automatically — you can't choose the name. Procedure now points the task at that glob. --- .../+1password-backup-doc-export-name.doc.md | 1 + docs/how-to/operations/run-1password-backup.md | 12 +++++------- 2 files changed, 6 insertions(+), 7 deletions(-) create mode 100644 docs/changelog.d/+1password-backup-doc-export-name.doc.md diff --git a/docs/changelog.d/+1password-backup-doc-export-name.doc.md b/docs/changelog.d/+1password-backup-doc-export-name.doc.md new file mode 100644 index 0000000..6c4d262 --- /dev/null +++ b/docs/changelog.d/+1password-backup-doc-export-name.doc.md @@ -0,0 +1 @@ +Fixed the export-filename step in [[run-1password-backup]]: 1Password's desktop app names the export `1PasswordExport--.1pux` automatically rather than letting you save to a fixed name, so the procedure now points the task at that glob instead of pretending the default name is `1Password-export.1pux`. 
diff --git a/docs/how-to/operations/run-1password-backup.md b/docs/how-to/operations/run-1password-backup.md index b0807da..0dc9ec9 100644 --- a/docs/how-to/operations/run-1password-backup.md +++ b/docs/how-to/operations/run-1password-backup.md @@ -26,20 +26,18 @@ How to export and encrypt your 1Password vaults for inclusion in [[borgmatic]] b 1. Open the 1Password desktop app 2. **File > Export > All Vaults** 3. Choose **1PUX** format -4. Save to `~/Documents/1Password-export.1pux` +4. Save to `~/Documents/` — 1Password names the file `1PasswordExport--.1pux` automatically; don't bother renaming it, pass the path to the task in the next step ### 2. Run the Backup Task -```fish -mise run op-backup -``` - -Or, if you saved the export to a non-default location: +Pass the exported file's path: ```fish -mise run op-backup ~/path/to/export.1pux +mise run op-backup ~/Documents/1PasswordExport-*.1pux ``` +(If only one export exists in `~/Documents/`, the glob expands cleanly. Otherwise, paste the full path.) + The task will: 1. Prompt for the `.1pux` path if not provided From 57fd88b2698e87b5767d90c1a82151b1db87f446 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Fri, 22 May 2026 21:50:43 -0700 Subject: [PATCH 087/122] C0: fix op item edit syntax in zot key rotation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pbpaste | op item edit ... "field[password]=-" stdin syntax is rejected by op 2.34 as "invalid JSON" — recent op versions treat piped input as a full JSON template, not a single field value. Procedure now uses an inline assignment via a local fish variable. 
--- docs/changelog.d/+zot-ci-rotation-op-syntax.doc.md | 1 + docs/reference/services/zot.md | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) create mode 100644 docs/changelog.d/+zot-ci-rotation-op-syntax.doc.md diff --git a/docs/changelog.d/+zot-ci-rotation-op-syntax.doc.md b/docs/changelog.d/+zot-ci-rotation-op-syntax.doc.md new file mode 100644 index 0000000..ec8834f --- /dev/null +++ b/docs/changelog.d/+zot-ci-rotation-op-syntax.doc.md @@ -0,0 +1 @@ +Fixed the `op item edit` invocation in the [[zot]] API-key rotation procedure: the previous `pbpaste | op item edit ... "field[password]=-"` stdin syntax is rejected by op 2.34 as "invalid JSON" (recent op versions treat piped input as a full JSON template, not a single field value). Procedure now reads the clipboard into a local fish variable and passes it as an inline assignment. diff --git a/docs/reference/services/zot.md b/docs/reference/services/zot.md index d00a200..b01a6ce 100644 --- a/docs/reference/services/zot.md +++ b/docs/reference/services/zot.md @@ -56,8 +56,9 @@ The `zot-ci` API key expires every **90 days**. To rotate: 5. Generate a new API key, copy it to clipboard 6. Update 1Password: ```fish - pbpaste | op item edit "Forgejo Secrets" --vault blumeops "zot-ci-api[password]=-" + set -l NEWKEY (pbpaste); op item edit "Forgejo Secrets" --vault blumeops "zot-ci-api[password]=$NEWKEY"; set -e NEWKEY ``` + The value is briefly visible to other `ps`-readers on this machine (single-user mac, acceptable tradeoff). The older `pbpaste | op item edit ... "field[password]=-"` stdin syntax was rejected by op 2.34 as "invalid JSON" — recent op versions treat piped input as a full JSON template. 7. 
Sync to Forgejo: `mise run provision-indri -- --tags forgejo_actions_secrets` ## Related From 35ae171783ca7ac54bc57fc1cc23e7a171b36782 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Wed, 27 May 2026 07:15:07 -0700 Subject: [PATCH 088/122] C0: fix sync button location in manage-forgejo-mirrors The verify step pointed to the main repo page, but the "Synchronize now" button is in the Mirror settings section of the settings page. Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/changelog.d/+manage-forgejo-mirrors-sync-location.doc.md | 1 + docs/how-to/configuration/manage-forgejo-mirrors.md | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) create mode 100644 docs/changelog.d/+manage-forgejo-mirrors-sync-location.doc.md diff --git a/docs/changelog.d/+manage-forgejo-mirrors-sync-location.doc.md b/docs/changelog.d/+manage-forgejo-mirrors-sync-location.doc.md new file mode 100644 index 0000000..f71fc81 --- /dev/null +++ b/docs/changelog.d/+manage-forgejo-mirrors-sync-location.doc.md @@ -0,0 +1 @@ +Fix manage-forgejo-mirrors verify step — sync button is on the repo settings page ("Synchronize now"), not the main repo page. diff --git a/docs/how-to/configuration/manage-forgejo-mirrors.md b/docs/how-to/configuration/manage-forgejo-mirrors.md index 9c0e113..5d150dc 100644 --- a/docs/how-to/configuration/manage-forgejo-mirrors.md +++ b/docs/how-to/configuration/manage-forgejo-mirrors.md @@ -137,8 +137,8 @@ Return to [GitHub token settings](https://github.com/settings/tokens?type=beta) Trigger a manual sync on one mirror to confirm the new PAT works: -1. Go to any mirror repo on forge (e.g., `mirrors/cloudnative-pg`) -2. Click the sync button (circular arrows icon) next to the mirror status +1. Go to any mirror repo's settings page on forge (e.g., `https://forge.eblu.me/mirrors/cloudnative-pg/settings`) +2. In the "Mirror settings" section, click "Synchronize now" 3. 
Confirm the sync completes without errors ## Related From c09bd5b6129ce688722b305801100ae1199c9036 Mon Sep 17 00:00:00 2001 From: Erich Blume <725328+eblume@users.noreply.github.com> Date: Wed, 27 May 2026 11:54:32 -0700 Subject: [PATCH 089/122] C0: cap systemd-coredump on ringtail to stop game-crash lockups Wine/Proton game segfaults (e.g. Diablo IV) produced multi-GB cores that systemd-coredump spent minutes compressing to disk, pinning the CPU and freezing the desktop. Cap ProcessSizeMax/ExternalSizeMax at 1G (oversized cores logged but skipped) and MaxUse at 2G to bound the store. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../+ringtail-coredump-size-cap.infra.md | 1 + nixos/ringtail/configuration.nix | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) create mode 100644 docs/changelog.d/+ringtail-coredump-size-cap.infra.md diff --git a/docs/changelog.d/+ringtail-coredump-size-cap.infra.md b/docs/changelog.d/+ringtail-coredump-size-cap.infra.md new file mode 100644 index 0000000..824b2df --- /dev/null +++ b/docs/changelog.d/+ringtail-coredump-size-cap.infra.md @@ -0,0 +1 @@ +Cap systemd-coredump on ringtail (ProcessSizeMax/ExternalSizeMax 1G, MaxUse 2G) so multi-GB Wine/Proton game crash dumps no longer thrash the disk and lock up the desktop. diff --git a/nixos/ringtail/configuration.nix b/nixos/ringtail/configuration.nix index e8c634a..f01ce9f 100644 --- a/nixos/ringtail/configuration.nix +++ b/nixos/ringtail/configuration.nix @@ -609,6 +609,22 @@ in AllowSuspendThenHibernate=no ''; + # Cap systemd-coredump. Wine/Proton games (Diablo IV, etc.) segfault + # regularly and dump multi-GB cores; with the stock (effectively unbounded) + # limits, systemd-coredump then spends minutes streaming and compressing the + # dump to disk — e.g. a single D4 crash produced a 4.6G core, read 13.7G and + # wrote 17.4G, pinning the CPU and locking up the desktop for ~3.5 minutes. 
+ # Those cores are useless anyway: Nix .so files carry no build-id, so no + # backtrace can be generated. Capping uncompressed size at 1G makes oversized + # cores get logged-but-skipped (the kernel stops dumping once we stop reading) + # while real service cores (well under 1G) are still captured. MaxUse bounds + # the on-disk store so frequent game crashes can't accumulate (was at 8.6G). + systemd.coredump.extraConfig = '' + ProcessSizeMax=1G + ExternalSizeMax=1G + MaxUse=2G + ''; + # NixOS release system.stateVersion = "25.11"; } From 753fa9cb6317108ab8701e1f58ec1ba7c991d211 Mon Sep 17 00:00:00 2001 From: Erich Blume <725328+eblume@users.noreply.github.com> Date: Wed, 27 May 2026 12:59:29 -0700 Subject: [PATCH 090/122] C0: disable VRR on ringtail DP-1 to stop OMEN panel flicker The OMEN 27i IPS pumps brightness when its refresh swings into the low VRR range during low-framerate content (game cutscenes), producing a ~20Hz flicker that compounds over a session until a reboot. GPU health is clean (no Xid/ECC/thermal); pinning fixed 165Hz eliminates it. Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/changelog.d/+ringtail-vrr-flicker.bugfix.md | 1 + nixos/ringtail/configuration.nix | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) create mode 100644 docs/changelog.d/+ringtail-vrr-flicker.bugfix.md diff --git a/docs/changelog.d/+ringtail-vrr-flicker.bugfix.md b/docs/changelog.d/+ringtail-vrr-flicker.bugfix.md new file mode 100644 index 0000000..cb23344 --- /dev/null +++ b/docs/changelog.d/+ringtail-vrr-flicker.bugfix.md @@ -0,0 +1 @@ +Disabled adaptive sync (VRR) on ringtail's DP-1 output. The OMEN 27i IPS panel pumps brightness when its refresh rate swings into the low VRR range during low-framerate content (e.g. game cutscenes), producing a flicker that worsened over a session until a reboot. Pinning the panel to a fixed 165Hz eliminates it. 
diff --git a/nixos/ringtail/configuration.nix b/nixos/ringtail/configuration.nix index f01ce9f..bc893d5 100644 --- a/nixos/ringtail/configuration.nix +++ b/nixos/ringtail/configuration.nix @@ -337,7 +337,12 @@ in output = { "DP-1" = { mode = "2560x1440@165Hz"; - adaptive_sync = "on"; + # VRR off: the OMEN 27i IPS pumps gamma/brightness when the panel + # refresh swings into its low VRR range (e.g. low-fps game + # cutscenes), producing a ~20Hz flicker that compounds over a long + # session until a reboot. Fixed refresh at 165Hz eliminates it. + # If you want VRR back, cap in-game fps so refresh never dips low. + adaptive_sync = "off"; bg = "~/.config/sway/wallpaper.jpg fill"; }; }; From c00d7db5079e78772e5e7e3780d7594baa009bd4 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 28 May 2026 06:01:57 -0700 Subject: [PATCH 091/122] Recurring maintenance batch (2026-05-27) (#360) Bundle of recurring overdue tasks: - Ringtail flake update - Security & compliance report review - Tooling deps bump (prek, fly, mise, forgejo workflows) - Top stale doc review - Top stale service review (if trivial) Larger items (service version bumps requiring upgrades, non-local container migration) split out as separate PRs. 
Reviewed-on: https://forge.eblu.me/eblume/blumeops/pulls/360 --- .../recurring-maintenance-2026-05-27.doc.md | 1 + .../recurring-maintenance-2026-05-27.infra.md | 4 ++++ docs/reference/infrastructure/indri.md | 9 +++++++-- fly/Dockerfile | 8 ++++---- mise-tasks/branch-cleanup | 2 +- mise-tasks/container-build-and-release | 2 +- mise-tasks/container-list | 2 +- mise-tasks/container-version-check | 2 +- mise-tasks/dns-acme-cleanup | 2 +- mise-tasks/docs-mikado | 2 +- mise-tasks/docs-preview | 2 +- mise-tasks/docs-review | 2 +- mise-tasks/docs-review-stale | 2 +- mise-tasks/mikado-branch-invariant-check | 2 +- mise-tasks/op-backup | 2 +- mise-tasks/pr-comments | 2 +- mise-tasks/prune-ringtail-generations | 2 +- mise-tasks/review-compliance-reports | 2 +- mise-tasks/runner-logs | 2 +- mise-tasks/service-review | 2 +- mise-tasks/spork-create | 2 +- nixos/ringtail/flake.lock | 18 +++++++++--------- prek.toml | 8 ++++---- 23 files changed, 46 insertions(+), 36 deletions(-) create mode 100644 docs/changelog.d/recurring-maintenance-2026-05-27.doc.md create mode 100644 docs/changelog.d/recurring-maintenance-2026-05-27.infra.md diff --git a/docs/changelog.d/recurring-maintenance-2026-05-27.doc.md b/docs/changelog.d/recurring-maintenance-2026-05-27.doc.md new file mode 100644 index 0000000..af30489 --- /dev/null +++ b/docs/changelog.d/recurring-maintenance-2026-05-27.doc.md @@ -0,0 +1 @@ +Reviewed [[indri]] reference card: added `devpi`, `cv`, and `docs` to the native-services list; widened the k8s note to reflect the growing set of apps now on ringtail and the planned indri-minikube decommission; added CPU/RAM specs. 
diff --git a/docs/changelog.d/recurring-maintenance-2026-05-27.infra.md b/docs/changelog.d/recurring-maintenance-2026-05-27.infra.md new file mode 100644 index 0000000..f2d48ad --- /dev/null +++ b/docs/changelog.d/recurring-maintenance-2026-05-27.infra.md @@ -0,0 +1,4 @@ +Recurring maintenance batch: + +- Ringtail flake inputs refreshed (`disko`, `home-manager`, `nixpkgs`). +- Tooling deps bumped: prek hooks (trufflehog v3.95.3, kingfisher v1.101.0, ruff v0.15.14, `ansible-core` 2.21.0); fly proxy base images (nginx 1.30.1-alpine, alloy v1.16.1); `typer==0.26.2` in mise tasks. diff --git a/docs/reference/infrastructure/indri.md b/docs/reference/infrastructure/indri.md index cbb2a0f..67652ca 100644 --- a/docs/reference/infrastructure/indri.md +++ b/docs/reference/infrastructure/indri.md @@ -1,6 +1,7 @@ --- title: Indri -modified: 2026-02-19 +modified: 2026-05-27 +last-reviewed: 2026-05-27 tags: - infrastructure - host @@ -15,6 +16,7 @@ Primary BlumeOps server. Mac Mini M1 (2020). | Property | Value | |----------|-------| | **Model** | Mac mini M1, 2020 (Macmini9,1) | +| **CPU / RAM** | 8 cores / 16 GB | | **Storage** | 2TB internal SSD | | **macOS** | 15.7.3 (Sequoia) | | **Tailscale hostname** | `indri.tail8d86e.ts.net` | @@ -30,9 +32,12 @@ Primary BlumeOps server. Mac Mini M1 (2020). - [[borgmatic]] - Backup system - [[alloy|Alloy]] - Metrics/logs collector - [[caddy]] - Reverse proxy for `*.ops.eblu.me` +- [[devpi]] - PyPI mirror (LaunchAgent) +- [[cv]] - Static CV site, served by Caddy +- [[docs]] - Quartz-built docs site, served by Caddy **Kubernetes (via minikube):** -- [[apps|Most k8s applications]] (Frigate, ntfy migrated to [[ringtail]] k3s) +- [[apps|Most k8s applications]]. A growing set of apps (Authentik, Frigate, ntfy, Immich, Homepage, Shower, Kingfisher, alloy-ringtail) now run on [[ringtail]]'s k3s instead. Long-term plan is to decommission indri's minikube entirely. 
**GUI Applications (manual start required):** - Docker Desktop - Container runtime for minikube diff --git a/fly/Dockerfile b/fly/Dockerfile index eae8c35..d4e7a18 100644 --- a/fly/Dockerfile +++ b/fly/Dockerfile @@ -1,5 +1,5 @@ -# nginx 1.30.0-alpine -FROM nginx@sha256:0272e4604ed93c1792f03695a033a6e8546840f86e0de20a884bb17d2c924883 +# nginx 1.30.1-alpine +FROM nginx@sha256:c819f83c54b0361f5557601bf5eb4943d09360e7a7fdf426afc466570f45874d # Copy tailscale binaries from official image (v1.94.2) COPY --from=docker.io/tailscale/tailscale@sha256:95e528798bebe75f39b10e74e7051cf51188ee615934f232ba7ad06a3390ffa1 \ @@ -13,8 +13,8 @@ RUN mkdir -p /var/run/tailscale /var/lib/tailscale \ && apk add --no-cache fail2ban \ && rm -f /etc/fail2ban/jail.d/alpine-ssh.conf -# Copy Alloy binary from official image (v1.16.0, Ubuntu-based, needs libc6-compat) -COPY --from=docker.io/grafana/alloy@sha256:6e00cf7c5a692ff5f24844529416ed017d76fce922f8199004e73d5eca46b6b8 \ +# Copy Alloy binary from official image (v1.16.1, Ubuntu-based, needs libc6-compat) +COPY --from=docker.io/grafana/alloy@sha256:51aeb9d829239345070619dad3edd6873186f913c84f45b365b74574fcb38ec0 \ /bin/alloy /usr/local/bin/alloy RUN mkdir -p /var/log/nginx /etc/alloy /tmp/alloy-data diff --git a/mise-tasks/branch-cleanup b/mise-tasks/branch-cleanup index 575c9a1..a538880 100755 --- a/mise-tasks/branch-cleanup +++ b/mise-tasks/branch-cleanup @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["httpx==0.28.1", "rich==15.0.0", "typer==0.25.0"] +# dependencies = ["httpx==0.28.1", "rich==15.0.0", "typer==0.26.2"] # /// #MISE description="Delete branches that have been merged into main (local and remote)" #MISE alias="bc" diff --git a/mise-tasks/container-build-and-release b/mise-tasks/container-build-and-release index ba569e7..85e6cb8 100755 --- a/mise-tasks/container-build-and-release +++ b/mise-tasks/container-build-and-release @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run 
--script # /// script # requires-python = ">=3.12" -# dependencies = ["typer==0.25.0", "httpx==0.28.1"] +# dependencies = ["typer==0.26.2", "httpx==0.28.1"] # /// #MISE description="Trigger container build workflows via Forgejo API" #USAGE arg "" help="Container name (directory under containers/)" diff --git a/mise-tasks/container-list b/mise-tasks/container-list index 26639f2..7dad346 100755 --- a/mise-tasks/container-list +++ b/mise-tasks/container-list @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["httpx==0.28.1", "rich==15.0.0", "typer==0.25.0"] +# dependencies = ["httpx==0.28.1", "rich==15.0.0", "typer==0.26.2"] # /// #MISE description="List available containers and their recent tags" #USAGE arg "[name]" help="Optional container name to filter output" diff --git a/mise-tasks/container-version-check b/mise-tasks/container-version-check index 4ebe3b6..06f96ae 100755 --- a/mise-tasks/container-version-check +++ b/mise-tasks/container-version-check @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["pyyaml==6.0.3", "rich==15.0.0", "typer==0.25.0"] +# dependencies = ["pyyaml==6.0.3", "rich==15.0.0", "typer==0.26.2"] # /// #MISE description="Validate container version consistency across container.py, Dockerfiles, nix derivations, and service-versions.yaml" #USAGE flag "--all-files" help="Check all containers, not just changed ones" diff --git a/mise-tasks/dns-acme-cleanup b/mise-tasks/dns-acme-cleanup index 432a6ce..3a53b11 100755 --- a/mise-tasks/dns-acme-cleanup +++ b/mise-tasks/dns-acme-cleanup @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["httpx==0.28.1", "rich==15.0.0", "typer==0.25.0"] +# dependencies = ["httpx==0.28.1", "rich==15.0.0", "typer==0.26.2"] # /// #MISE description="Delete orphaned ACME challenge TXT records in eblu.me" #USAGE flag "--dry-run" help="List orphans 
without deleting" diff --git a/mise-tasks/docs-mikado b/mise-tasks/docs-mikado index eea052f..c632e46 100755 --- a/mise-tasks/docs-mikado +++ b/mise-tasks/docs-mikado @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["httpx==0.28.1", "pyyaml==6.0.3", "rich==15.0.0", "typer==0.25.0"] +# dependencies = ["httpx==0.28.1", "pyyaml==6.0.3", "rich==15.0.0", "typer==0.26.2"] # /// #MISE description="View active Mikado dependency chains for C2 changes" #USAGE arg "[card]" help="Card stem to show chain for" diff --git a/mise-tasks/docs-preview b/mise-tasks/docs-preview index faa79af..9e0bd16 100755 --- a/mise-tasks/docs-preview +++ b/mise-tasks/docs-preview @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["pyyaml==6.0.3", "rich==15.0.0", "typer==0.25.0"] +# dependencies = ["pyyaml==6.0.3", "rich==15.0.0", "typer==0.26.2"] # /// #MISE description="Build docs with Dagger and serve locally, opening to a specific card" #USAGE arg "" help="Card path relative to docs/, e.g. 
how-to/knowledgebase/review-documentation" diff --git a/mise-tasks/docs-review b/mise-tasks/docs-review index d07904d..12e301f 100755 --- a/mise-tasks/docs-review +++ b/mise-tasks/docs-review @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["pyyaml==6.0.3", "rich==15.0.0", "typer==0.25.0"] +# dependencies = ["pyyaml==6.0.3", "rich==15.0.0", "typer==0.26.2"] # /// #MISE description="Review the most stale documentation card by last-reviewed date" #USAGE flag "--limit " default="15" help="Number of docs to show in the table" diff --git a/mise-tasks/docs-review-stale b/mise-tasks/docs-review-stale index 4449213..0c5490e 100755 --- a/mise-tasks/docs-review-stale +++ b/mise-tasks/docs-review-stale @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["rich==15.0.0", "typer==0.25.0"] +# dependencies = ["rich==15.0.0", "typer==0.26.2"] # /// #MISE description="Report docs by git-last-modified date, highlighting stale ones" #USAGE flag "--threshold " default="180" help="Days before a doc is considered stale" diff --git a/mise-tasks/mikado-branch-invariant-check b/mise-tasks/mikado-branch-invariant-check index 1f0fbcf..3135bf2 100755 --- a/mise-tasks/mikado-branch-invariant-check +++ b/mise-tasks/mikado-branch-invariant-check @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["rich==15.0.0", "typer==0.25.0"] +# dependencies = ["rich==15.0.0", "typer==0.26.2"] # /// #MISE description="Validate Mikado Branch Invariant on mikado/* branches" #USAGE arg "[commit_msg_file]" help="Commit message file (passed by commit-msg hook)" diff --git a/mise-tasks/op-backup b/mise-tasks/op-backup index 37a97a6..7db033b 100755 --- a/mise-tasks/op-backup +++ b/mise-tasks/op-backup @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["rich==15.0.0", 
"typer==0.25.0"] +# dependencies = ["rich==15.0.0", "typer==0.26.2"] # /// #MISE description="Encrypt a 1Password .1pux export and send to indri for borgmatic" #USAGE arg "[export_path]" help="Path to .1pux export file (prompted if omitted)" diff --git a/mise-tasks/pr-comments b/mise-tasks/pr-comments index 7205617..39d7c9a 100755 --- a/mise-tasks/pr-comments +++ b/mise-tasks/pr-comments @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["httpx==0.28.1", "rich==15.0.0", "typer==0.25.0"] +# dependencies = ["httpx==0.28.1", "rich==15.0.0", "typer==0.26.2"] # /// #MISE description="List unresolved comments on a PR" #USAGE arg "" help="Pull request number" diff --git a/mise-tasks/prune-ringtail-generations b/mise-tasks/prune-ringtail-generations index 2b8e3f9..2ad8dc8 100755 --- a/mise-tasks/prune-ringtail-generations +++ b/mise-tasks/prune-ringtail-generations @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["rich==15.0.0", "typer==0.25.0"] +# dependencies = ["rich==15.0.0", "typer==0.26.2"] # /// #MISE description="Prune old NixOS generations on ringtail, preserving rollback safety" #MISE alias="prg" diff --git a/mise-tasks/review-compliance-reports b/mise-tasks/review-compliance-reports index a9146c8..24d2afc 100755 --- a/mise-tasks/review-compliance-reports +++ b/mise-tasks/review-compliance-reports @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["rich==15.0.0", "typer==0.25.0", "pyyaml==6.0.3"] +# dependencies = ["rich==15.0.0", "typer==0.26.2", "pyyaml==6.0.3"] # /// #MISE description="Summarize the latest Prowler and Kingfisher compliance reports from sifaka" #USAGE flag "--full" help="Show all unmuted failures, not just new ones" diff --git a/mise-tasks/runner-logs b/mise-tasks/runner-logs index 9c988ee..3c5e8e3 100755 --- a/mise-tasks/runner-logs +++ b/mise-tasks/runner-logs @@ 
-1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["httpx==0.28.1", "rich==15.0.0", "typer==0.25.0"] +# dependencies = ["httpx==0.28.1", "rich==15.0.0", "typer==0.26.2"] # /// #MISE description="List recent Forgejo Actions runs or fetch logs for a specific job" #USAGE arg "[run_number]" help="Run number to show jobs for (omit to list recent runs)" diff --git a/mise-tasks/service-review b/mise-tasks/service-review index 2d50e0b..f83b104 100755 --- a/mise-tasks/service-review +++ b/mise-tasks/service-review @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["pyyaml==6.0.3", "rich==15.0.0", "typer==0.25.0"] +# dependencies = ["pyyaml==6.0.3", "rich==15.0.0", "typer==0.26.2"] # /// #MISE description="Review the most stale service for version freshness" #USAGE flag "--limit " default="15" help="Number of services to show in the table" diff --git a/mise-tasks/spork-create b/mise-tasks/spork-create index 92f4e5c..3f18563 100755 --- a/mise-tasks/spork-create +++ b/mise-tasks/spork-create @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["httpx==0.28.1", "rich==15.0.0", "typer==0.25.0"] +# dependencies = ["httpx==0.28.1", "rich==15.0.0", "typer==0.26.2"] # /// #MISE description="Create a spork (floating-branch soft-fork) of a mirrored upstream project" #USAGE arg "" help="Repository name in the mirrors/ org on forge (e.g. 
kingfisher)" diff --git a/nixos/ringtail/flake.lock b/nixos/ringtail/flake.lock index 0f53d0e..0f0da7e 100644 --- a/nixos/ringtail/flake.lock +++ b/nixos/ringtail/flake.lock @@ -7,11 +7,11 @@ ] }, "locked": { - "lastModified": 1777713215, - "narHash": "sha256-8GzXDOXckDWwST8TY5DbwYFjdvQLlP7K9CLSVx6iTTo=", + "lastModified": 1779699611, + "narHash": "sha256-EcCaSTKnmg2o4wLKaN1aqQFomwyhO7ik0bX9COdyCas=", "owner": "nix-community", "repo": "disko", - "rev": "63b4e7e6cf75307c1d26ac3762b886b5b0247267", + "rev": "5ba0c9555c28685e57fa54c7a25e42c7efdbfc8d", "type": "github" }, "original": { @@ -27,11 +27,11 @@ ] }, "locked": { - "lastModified": 1778401693, - "narHash": "sha256-OVHdCqXXUF5UdGkH+FF2ZL06OLZjj2kvP2dIUmzVWoo=", + "lastModified": 1779506708, + "narHash": "sha256-QOD/CNm196nCJRheux/URi4/HE66fthdOMqCJoPP1Y0=", "owner": "nix-community", "repo": "home-manager", - "rev": "389b83002efc26f1145e89a6a8e6edc5a6435948", + "rev": "3ee51fbdac8c8bdfe1e7e1fcaba6520a563f394f", "type": "github" }, "original": { @@ -43,11 +43,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1778430510, - "narHash": "sha256-Ti+ZBvW6yrWWAg2szExVTwCd4qOJ3KlVr1tFHfyfi8Q=", + "lastModified": 1779467186, + "narHash": "sha256-nOesoDCiXcUftqbRBMz9tt4blI5PvljMWbm3kuCA+0s=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "8fd9daa3db09ced9700431c5b7ad0e8ba199b575", + "rev": "b77b3de8775677f84492abe84635f87b0e153f0f", "type": "github" }, "original": { diff --git a/prek.toml b/prek.toml index add7799..2c66b82 100644 --- a/prek.toml +++ b/prek.toml @@ -28,7 +28,7 @@ hooks = [{ id = "check-yaml", args = ["--unsafe"] }] # Secret detection (running both tools in parallel to compare coverage) [[repos]] repo = "https://github.com/trufflesecurity/trufflehog" -rev = "17456f8c7d042d8c82c9a8ca9e937231f9f42e26" # v3.95.2 +rev = "37b77001d0174ebec2fcca2bd83ff83a6d45a3ab" # v3.95.3 hooks = [ { id = "trufflehog", entry = "trufflehog git file://. 
--since-commit HEAD --no-verification --fail", stages = [ "pre-commit", @@ -38,7 +38,7 @@ hooks = [ [[repos]] repo = "https://github.com/mongodb/kingfisher" -rev = "9ddec4ab8b53653d4941e6b3fd4ff602ce91d81b" # v1.97.0 +rev = "6f560103cc6ea082ef4b80a9098e3f3111afb8bc" # v1.101.0 hooks = [ { id = "kingfisher", args = [ "scan", @@ -69,12 +69,12 @@ name = "ansible-lint" entry = "env ANSIBLE_ROLES_PATH=ansible/roles ansible-lint" language = "python" files = "^ansible/" -additional_dependencies = ["ansible-lint==26.4.0", "ansible-core==2.20.5"] +additional_dependencies = ["ansible-lint==26.4.0", "ansible-core==2.21.0"] # Python - ruff for linting and formatting [[repos]] repo = "https://github.com/astral-sh/ruff-pre-commit" -rev = "6fec9b7edb08fd9989088709d864a7826dc74e80" # v0.15.12 +rev = "0c7b6c989466a93942def1f84baf36ddfcd60c83" # v0.15.14 hooks = [{ id = "ruff", args = ["--fix"] }, { id = "ruff-format" }] # Python - ty type checker From 4e25180b0ae3ff212b7fc4d57d136f215a92c310 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 28 May 2026 07:13:13 -0700 Subject: [PATCH 092/122] C0: clone blumeops via tailnet on ringtail provision MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Switch ringtail.yml from forge.eblu.me (Fly proxy, WAN) to forge.ops.eblu.me (Caddy on indri, tailnet). Ringtail is always on the tailnet — the WAN round-trip was overhead and made provision-ringtail fail any time Fly was slow or down. 
--- ansible/playbooks/ringtail.yml | 2 +- docs/changelog.d/+ringtail-clone-via-tailnet.infra.md | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 docs/changelog.d/+ringtail-clone-via-tailnet.infra.md diff --git a/ansible/playbooks/ringtail.yml b/ansible/playbooks/ringtail.yml index ee5604b..b05d67a 100644 --- a/ansible/playbooks/ringtail.yml +++ b/ansible/playbooks/ringtail.yml @@ -57,7 +57,7 @@ tasks: - name: Ensure blumeops repo is present ansible.builtin.git: - repo: "https://forge.eblu.me/eblume/blumeops.git" + repo: "https://forge.ops.eblu.me/eblume/blumeops.git" dest: /etc/blumeops version: "{{ ringtail_commit | default('main') }}" force: true diff --git a/docs/changelog.d/+ringtail-clone-via-tailnet.infra.md b/docs/changelog.d/+ringtail-clone-via-tailnet.infra.md new file mode 100644 index 0000000..d664163 --- /dev/null +++ b/docs/changelog.d/+ringtail-clone-via-tailnet.infra.md @@ -0,0 +1 @@ +Switch the ringtail provisioning playbook's blumeops clone URL from `forge.eblu.me` (public, via Fly proxy) to `forge.ops.eblu.me` (tailnet, direct via Caddy on indri). Ringtail is always on the tailnet, so the WAN round-trip is pure overhead — it also made `provision-ringtail` brittle whenever the Fly proxy was slow or down. From f6febb1f772e858a82d69e7baade4f526e550f97 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 28 May 2026 07:59:22 -0700 Subject: [PATCH 093/122] C0: switch fly proxy deploy strategy to immediate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bluegreen kept timing out — the new green machine couldn't reach "started" within Fly's 5-minute deploy budget. The cold-start sequence (tailscaled → tailscale up → wait-for-MagicDNS → nginx startup) eats most of that, leaving no headroom for healthcheck propagation. For a single-machine proxy, bluegreen offers little benefit anyway: no warm second instance, so trading 5-10s of downtime for predictable completion is the right call. 
--- docs/changelog.d/+fly-deploy-immediate-strategy.infra.md | 1 + fly/fly.toml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 docs/changelog.d/+fly-deploy-immediate-strategy.infra.md diff --git a/docs/changelog.d/+fly-deploy-immediate-strategy.infra.md b/docs/changelog.d/+fly-deploy-immediate-strategy.infra.md new file mode 100644 index 0000000..205bd6a --- /dev/null +++ b/docs/changelog.d/+fly-deploy-immediate-strategy.infra.md @@ -0,0 +1 @@ +Switch the Fly proxy deploy strategy from `bluegreen` to `immediate` in `fly/fly.toml`. With a single proxy machine, bluegreen offers little benefit — the green machine routinely failed to reach "started" inside Fly's default 5-minute deploy timeout (the cold-start sequence of `tailscaled` → `tailscale up` → wait-for-MagicDNS → nginx startup eats most of the budget), and the failed deploys would roll back. `immediate` replaces the machine in place with a brief downtime (~5–10s) but actually completes. diff --git a/fly/fly.toml b/fly/fly.toml index 11aac9c..6ccf29d 100644 --- a/fly/fly.toml +++ b/fly/fly.toml @@ -7,7 +7,7 @@ primary_region = "sjc" memory = "512mb" [deploy] -strategy = "bluegreen" +strategy = "immediate" [http_service] internal_port = 8080 From 4d1f4af25b9d2a55c1b0731e3a6b83259fc33dfa Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 28 May 2026 09:59:46 -0700 Subject: [PATCH 094/122] =?UTF-8?q?Upgrade=20unpoller=20v2.34.0=20?= =?UTF-8?q?=E2=86=92=20v3.2.0,=20migrate=20to=20container.py=20(#361)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary - Service Review pickup: unpoller (last reviewed 73 days ago). - Upgrades unpoller from v2.34.0 to v3.2.0 (major version bump). - Migrates the container build from a Dockerfile to a native Dagger pipeline (`containers/unpoller/container.py`) following the navidrome / miniflux pattern. - Refreshes `service-versions.yaml` (last-reviewed, current-version). 
## Breaking changes (upstream) - **v3.0.0** — UniFi network API shifts (later 10.x). Some metric / event / log names and labels may have changed. Worth a follow-up sweep of the unpoller Grafana dashboard for missing series. - **v3.2.0** — defaults to a 60s background poll feeding cached Prometheus scrapes (was on-demand poll per scrape). To restore previous behavior, set `interval = 0` in `up.conf`. Leaving the new default in this PR — every-15s scrapes will simply serve from cache, which is fine for our use. ## Build - Image: `registry.ops.eblu.me/blumeops/unpoller:v3.2.0-1b27242` - Built by build-container workflow run #559 from this branch. ## Test plan - [ ] `argocd app set unpoller --revision unpoller-v3 && argocd app sync unpoller` - [ ] Pod comes Ready - [ ] Verify metrics exported (`Site/Client/UAP/USG/USW` counts in logs, `unpoller_*` series in Prometheus) - [ ] Spot-check unpoller Grafana dashboard for missing series after the v3 API shift - [ ] After merge: `argocd app set unpoller --revision main && argocd app sync unpoller` 🤖 Generated with [Claude Code](https://claude.com/claude-code) Reviewed-on: https://forge.eblu.me/eblume/blumeops/pulls/361 --- argocd/manifests/unpoller/kustomization.yaml | 2 +- containers/unpoller/Dockerfile | 43 ---------------- containers/unpoller/container.py | 53 ++++++++++++++++++++ docs/changelog.d/unpoller-v3.infra.md | 1 + service-versions.yaml | 4 +- 5 files changed, 57 insertions(+), 46 deletions(-) delete mode 100644 containers/unpoller/Dockerfile create mode 100644 containers/unpoller/container.py create mode 100644 docs/changelog.d/unpoller-v3.infra.md diff --git a/argocd/manifests/unpoller/kustomization.yaml b/argocd/manifests/unpoller/kustomization.yaml index 5b7a9e2..d2c4e28 100644 --- a/argocd/manifests/unpoller/kustomization.yaml +++ b/argocd/manifests/unpoller/kustomization.yaml @@ -10,7 +10,7 @@ resources: images: - name: registry.ops.eblu.me/blumeops/unpoller - newTag: v2.34.0-613f05d + newTag: v3.2.0-1b27242 
configMapGenerator: - name: unpoller-config diff --git a/containers/unpoller/Dockerfile b/containers/unpoller/Dockerfile deleted file mode 100644 index 241b375..0000000 --- a/containers/unpoller/Dockerfile +++ /dev/null @@ -1,43 +0,0 @@ -# UnPoller — UniFi metrics exporter for Prometheus -# Two-stage build: Go compilation, then minimal Alpine runtime - -ARG CONTAINER_APP_VERSION=v2.34.0 - -FROM golang:alpine3.22 AS build - -ARG CONTAINER_APP_VERSION -RUN apk add --no-cache git - -RUN git clone --depth 1 --branch ${CONTAINER_APP_VERSION} \ - https://forge.ops.eblu.me/mirrors/unpoller.git /app - -WORKDIR /app - -ENV CGO_ENABLED=0 - -RUN go build -ldflags="-s -w \ - -X main.version=${CONTAINER_APP_VERSION} \ - -X main.builtBy=blumeops \ - -X golift.io/version.Version=${CONTAINER_APP_VERSION} \ - -X golift.io/version.Branch=HEAD \ - -X golift.io/version.BuildUser=blumeops \ - -X golift.io/version.Revision=blumeops-build" \ - -o /bin/unpoller . - -FROM alpine:3.22 - -ARG CONTAINER_APP_VERSION -LABEL org.opencontainers.image.title="UnPoller" -LABEL org.opencontainers.image.description="UniFi metrics exporter for Prometheus" -LABEL org.opencontainers.image.version="${CONTAINER_APP_VERSION}" -LABEL org.opencontainers.image.source="https://forge.eblu.me/eblume/blumeops" -LABEL org.opencontainers.image.vendor="blumeops" - -RUN apk add --no-cache ca-certificates tzdata - -COPY --from=build /bin/unpoller /usr/bin/unpoller - -EXPOSE 9130 -USER 65534:65534 -ENTRYPOINT ["/usr/bin/unpoller"] -CMD ["--config", "/etc/unpoller/up.conf"] diff --git a/containers/unpoller/container.py b/containers/unpoller/container.py new file mode 100644 index 0000000..bfc75ba --- /dev/null +++ b/containers/unpoller/container.py @@ -0,0 +1,53 @@ +"""UnPoller — UniFi metrics exporter for Prometheus. + +Two-stage build: Go backend, Alpine runtime. +Source cloned from forge mirror. 
+""" + +import dagger + +from blumeops.containers import ( + alpine_runtime, + clone_from_forge, + go_build, + oci_labels, +) + +VERSION = "v3.2.0" + + +async def build(src: dagger.Directory) -> dagger.Container: + source = clone_from_forge("unpoller", VERSION) + + backend = go_build( + source, + "/unpoller", + ldflags=( + f"-s -w " + f"-X main.version={VERSION} " + f"-X main.builtBy=blumeops " + f"-X golift.io/version.Version={VERSION} " + f"-X golift.io/version.Branch=HEAD " + f"-X golift.io/version.BuildUser=blumeops " + f"-X golift.io/version.Revision=blumeops-build" + ), + ) + + runtime = alpine_runtime( + extra_apk=["ca-certificates", "tzdata"], + create_user=False, + ) + runtime = oci_labels( + runtime, + title="UnPoller", + description="UniFi metrics exporter for Prometheus", + version=VERSION, + ) + return ( + runtime.with_file("/usr/bin/unpoller", backend.file("/unpoller")) + .with_exposed_port(9130) + .with_user("65534") + .with_default_args( + args=["/usr/bin/unpoller", "--config", "/etc/unpoller/up.conf"] + ) + ) diff --git a/docs/changelog.d/unpoller-v3.infra.md b/docs/changelog.d/unpoller-v3.infra.md new file mode 100644 index 0000000..fa6eaf9 --- /dev/null +++ b/docs/changelog.d/unpoller-v3.infra.md @@ -0,0 +1 @@ +Upgrade unpoller v2.34.0 → v3.2.0 and migrate container build from Dockerfile to native Dagger (container.py). v3.0.0 carries breaking UniFi API changes; v3.2.0 introduces a 60s background poll (cached scrapes) by default — set `interval = 0` in `up.conf` to restore on-demand polling. 
diff --git a/service-versions.yaml b/service-versions.yaml index 02f2979..63b0f15 100644 --- a/service-versions.yaml +++ b/service-versions.yaml @@ -345,8 +345,8 @@ services: - name: unpoller type: argocd - last-reviewed: 2026-03-16 - current-version: "v2.34.0" + last-reviewed: 2026-05-28 + current-version: "v3.2.0" upstream-source: https://github.com/unpoller/unpoller/releases notes: UniFi metrics exporter for Prometheus From e703d25efe2b2da12793a6c459bce95ecdc48435 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 28 May 2026 10:10:21 -0700 Subject: [PATCH 095/122] C0: rebuild unpoller container from squashed main commit Image was previously tagged with the unpoller-v3 branch SHA (1b27242), which doesn't exist in main's history after squash-merge. Rebuilt from the squashed commit so the tag references a reachable commit. Co-Authored-By: Claude Opus 4.7 (1M context) --- argocd/manifests/unpoller/kustomization.yaml | 2 +- docs/changelog.d/+unpoller-rebuild-on-main.infra.md | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 docs/changelog.d/+unpoller-rebuild-on-main.infra.md diff --git a/argocd/manifests/unpoller/kustomization.yaml b/argocd/manifests/unpoller/kustomization.yaml index d2c4e28..bf776bb 100644 --- a/argocd/manifests/unpoller/kustomization.yaml +++ b/argocd/manifests/unpoller/kustomization.yaml @@ -10,7 +10,7 @@ resources: images: - name: registry.ops.eblu.me/blumeops/unpoller - newTag: v3.2.0-1b27242 + newTag: v3.2.0-4d1f4af configMapGenerator: - name: unpoller-config diff --git a/docs/changelog.d/+unpoller-rebuild-on-main.infra.md b/docs/changelog.d/+unpoller-rebuild-on-main.infra.md new file mode 100644 index 0000000..60ae8fa --- /dev/null +++ b/docs/changelog.d/+unpoller-rebuild-on-main.infra.md @@ -0,0 +1 @@ +Rebuild unpoller container from squashed main commit so the image SHA tag matches a commit in main's history (was tagged with the pre-squash branch SHA). 
From 1ce381cb6e15ca1226feee1d6a0fa2c449f929b7 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 28 May 2026 14:36:33 -0700 Subject: [PATCH 096/122] C0: surface missing-log failures in runner-logs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `mise run runner-logs <run_number> -j <job_index>` previously silently succeeded with no output when forgejo had no log for the task. Two layered causes: 1. zstdcat exits 0 even when the file is missing (writes "can't stat … -- ignored" to stderr). 2. ssh to indri runs fish, which silently drops the remote exit code so the subprocess returncode is always 0. Probe `test -f` over SSH and parse a stdout marker (EXISTS / MISSING) to detect the missing-log case, then report it explicitly with the indri path and a hint about action_task.log_in_storage = 0 so the operator knows where to look next. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../+runner-logs-missing-log.misc.md | 1 + mise-tasks/runner-logs | 25 ++++++++++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 docs/changelog.d/+runner-logs-missing-log.misc.md diff --git a/docs/changelog.d/+runner-logs-missing-log.misc.md b/docs/changelog.d/+runner-logs-missing-log.misc.md new file mode 100644 index 0000000..c06704a --- /dev/null +++ b/docs/changelog.d/+runner-logs-missing-log.misc.md @@ -0,0 +1 @@ +`mise run runner-logs <run_number> -j <job_index>` now reports a clear error when the log file doesn't exist on indri (e.g. a runner crash that left `action_task.log_in_storage = 0`). Previously it printed only the header and exited 0, because `zstdcat` exits 0 with a "can't stat … -- ignored" stderr message and ssh+fish on indri swallows the remote exit code.
diff --git a/mise-tasks/runner-logs b/mise-tasks/runner-logs index 3c5e8e3..0d3028b 100755 --- a/mise-tasks/runner-logs +++ b/mise-tasks/runner-logs @@ -229,12 +229,35 @@ def fetch_log(run_number: int, job_index: int, repo: str, token: str) -> None: hex_prefix = f"{task_id & 0xff:02x}" log_path = f"~/forgejo/data/actions_log/{repo}/{hex_prefix}/{task_id}.log.zst" + # indri's login shell (fish) silently swallows SSH exit codes, so we can't + # rely on returncode. zstdcat itself also exits 0 with a "can't stat ... + # -- ignored" stderr message when the file is missing. Detect missing logs + # by running `test -f` over SSH and parsing the marker line from stdout. + probe = subprocess.run( + ["ssh", "indri", f"test -f {log_path} && echo EXISTS || echo MISSING"], + capture_output=True, + text=True, + ) + marker = probe.stdout.strip().splitlines()[-1] if probe.stdout.strip() else "" + if marker != "EXISTS": + typer.echo( + f"Error: log not found for run #{run_number} job {job_index} (task {task_id})", + err=True, + ) + typer.echo(f"Path: indri:{log_path}", err=True) + typer.echo( + "The runner may have crashed before uploading its log buffer " + "(action_task.log_in_storage = 0).", + err=True, + ) + raise typer.Exit(1) + result = subprocess.run( ["ssh", "indri", f"zstdcat {log_path}"], capture_output=True, text=True, ) - if result.returncode != 0: + if result.returncode != 0 or not result.stdout: typer.echo( f"Error: could not read log for run #{run_number} job {job_index} (task {task_id})", err=True, From ecded3007368e094baebeed10fbf2a3fe49aed90 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 28 May 2026 14:51:09 -0700 Subject: [PATCH 097/122] Make valkey local on ringtail (nix amd64) + bump to 8.1.7 (#362) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary Weekly "make one non-local container local" pickup: immich-ringtail still pulled `docker.io/valkey/valkey:8.1.6` because the existing 
`containers/valkey/container.py` build was arm64-only. - Adds `containers/valkey/default.nix` — nix-built amd64 valkey image, packaged by the ringtail nix-container-builder runner using `pkgs.dockerTools.buildLayeredImage`. Mirrors the existing `containers/authentik-redis/default.nix` pattern. - `containers/valkey/container.py` keeps building the Alpine arm64 image for paperless on indri. Bumped both builds to upstream valkey 8.1.7 (Alpine 3.22 now ships `8.1.7-r0`; nixpkgs has 8.1.7). - Splits `VERSION` (upstream app) from `ALPINE_PIN` (apk pin) in `container.py` so both build files can declare the same upstream version and pass `container-version-check`. - Updates `service-versions.yaml`: current-version 8.1.7, refreshed last-reviewed, upstream-source now points at the canonical valkey-io releases page. - Switches kustomizations: - `immich-ringtail/kustomization.yaml`: `docker.io/valkey/valkey:8.1.6` → `registry.ops.eblu.me/blumeops/valkey:v8.1.7-02859c5-nix`, comment updated. - `paperless/kustomization.yaml`: `v8.1.6-r0-fabca04` → `v8.1.7-02859c5`. 
## Build build-container run #563 — both jobs succeeded after a transient runner crash on the first dispatch (#562 build-nix), which surfaced two separate bugs that landed in a separate C0 on main: - `runner-logs` silently returned 0 with no output when the log file didn't exist on indri - `ssh indri` swallowing remote exit codes (fish login shell), which the wrapper now works around via a stdout marker ## Test plan - [ ] `argocd app set immich-ringtail --revision valkey-nix && argocd app sync immich-ringtail` - [ ] `argocd app set paperless --revision valkey-nix && argocd app sync paperless` - [ ] Both valkey pods come Ready and start serving on :6379 - [ ] Immich app + paperless can read/write their respective cache - [ ] After merge: rebuild from squashed main commit + update kustomization tags (squash-tag follow-up) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Reviewed-on: https://forge.eblu.me/eblume/blumeops/pulls/362 --- .../immich-ringtail/kustomization.yaml | 9 +++--- argocd/manifests/paperless/kustomization.yaml | 2 +- containers/valkey/container.py | 15 +++++----- containers/valkey/default.nix | 30 +++++++++++++++++++ docs/changelog.d/valkey-nix.infra.md | 1 + service-versions.yaml | 15 +++++----- 6 files changed, 53 insertions(+), 19 deletions(-) create mode 100644 containers/valkey/default.nix create mode 100644 docs/changelog.d/valkey-nix.infra.md diff --git a/argocd/manifests/immich-ringtail/kustomization.yaml b/argocd/manifests/immich-ringtail/kustomization.yaml index c1f639e..7a97fef 100644 --- a/argocd/manifests/immich-ringtail/kustomization.yaml +++ b/argocd/manifests/immich-ringtail/kustomization.yaml @@ -21,8 +21,9 @@ images: - name: ghcr.io/immich-app/immich-machine-learning # CUDA variant of the same release — ringtail has an RTX 4080 newTag: v2.6.3-cuda - # Using upstream multi-arch valkey image directly; the - # registry.ops.eblu.me/blumeops/valkey mirror is arm64-only (built - # on indri) and would crashloop on ringtail. 
+ # amd64 valkey built via nix on the ringtail nix-container-builder + # (see containers/valkey/default.nix). The Alpine container.py build + # is arm64-only and serves paperless on indri. - name: docker.io/valkey/valkey - newTag: "8.1.6" + newName: registry.ops.eblu.me/blumeops/valkey + newTag: v8.1.7-02859c5-nix diff --git a/argocd/manifests/paperless/kustomization.yaml b/argocd/manifests/paperless/kustomization.yaml index 9c6a086..575dfb4 100644 --- a/argocd/manifests/paperless/kustomization.yaml +++ b/argocd/manifests/paperless/kustomization.yaml @@ -16,4 +16,4 @@ images: newTag: v2.20.13-07f52e9 - name: docker.io/library/redis newName: registry.ops.eblu.me/blumeops/valkey - newTag: v8.1.6-r0-fabca04 + newTag: v8.1.7-02859c5 diff --git a/containers/valkey/container.py b/containers/valkey/container.py index 5d150e7..34e8524 100644 --- a/containers/valkey/container.py +++ b/containers/valkey/container.py @@ -1,8 +1,8 @@ -"""Valkey — native Dagger build. +"""Valkey — native Dagger build (arm64, indri). Alpine 3.22 base with the `valkey` apk package (8.1.x — Redis-compatible). -Mirrors `docker.io/valkey/valkey:8.1-alpine`, used by paperless and immich -as a cache/queue sidecar. +Used by paperless (sidecar) on indri. immich on ringtail uses the +nix-built amd64 variant from `default.nix` in this directory. """ import dagger @@ -10,9 +10,10 @@ from dagger import dag from blumeops.containers import oci_labels -# Alpine 3.22 ships valkey 8.1.6-r0. Alpine 3.23 jumps to 9.0 — hold on 3.22 -# to keep this a 1:1 swap for the upstream `valkey:8.1-alpine` image. -VERSION = "8.1.6-r0" +# Alpine 3.22 currently ships valkey 8.1.7-r0. Alpine 3.23 jumps to 9.0 — +# hold on 3.22 to keep this aligned with the 8.1 line. 
+VERSION = "8.1.7" +ALPINE_PIN = "8.1.7-r0" ALPINE_BASE = "alpine:3.22" @@ -21,7 +22,7 @@ async def build(src: dagger.Directory) -> dagger.Container: ctr = ( dag.container() .from_(ALPINE_BASE) - .with_exec(["apk", "add", "--no-cache", f"valkey={VERSION}"]) + .with_exec(["apk", "add", "--no-cache", f"valkey={ALPINE_PIN}"]) .with_exec(["mkdir", "-p", "/data"]) .with_exec(["chown", "valkey:valkey", "/data"]) .with_workdir("/data") diff --git a/containers/valkey/default.nix b/containers/valkey/default.nix new file mode 100644 index 0000000..9cb1713 --- /dev/null +++ b/containers/valkey/default.nix @@ -0,0 +1,30 @@ +# Nix-built Valkey for ringtail (amd64) +# Companion to container.py (Alpine 3.22, arm64 on indri). +# Used by immich-ringtail which needs an amd64 image; paperless on indri +# continues to use the Alpine container.py build. +# +# The version assertion ensures nix-build fails if a flake.lock update +# changes the Valkey version — forcing an explicit version acknowledgment +# here and in service-versions.yaml (enforced by container-version-check). +{ pkgs ? import <nixpkgs> { } }: + +let + version = "8.1.7"; +in + +assert pkgs.valkey.version == version; + +pkgs.dockerTools.buildLayeredImage { + name = "blumeops/valkey"; + contents = [ + pkgs.valkey + ]; + + config = { + Entrypoint = [ "${pkgs.valkey}/bin/valkey-server" ]; + Cmd = [ "--bind" "0.0.0.0" "--protected-mode" "no" "--dir" "/data" ]; + ExposedPorts = { + "6379/tcp" = { }; + }; + }; +} diff --git a/docs/changelog.d/valkey-nix.infra.md b/docs/changelog.d/valkey-nix.infra.md new file mode 100644 index 0000000..e41eb63 --- /dev/null +++ b/docs/changelog.d/valkey-nix.infra.md @@ -0,0 +1 @@ +Add nix-built amd64 valkey for ringtail (`containers/valkey/default.nix`) so immich-ringtail can stop pulling the upstream multi-arch `docker.io/valkey/valkey` image. Existing `container.py` continues to build Alpine arm64 for paperless on indri. Both bump to valkey 8.1.7 (Alpine 3.22 8.1.7-r0 / nixpkgs 8.1.7).
diff --git a/service-versions.yaml b/service-versions.yaml index 63b0f15..5440f01 100644 --- a/service-versions.yaml +++ b/service-versions.yaml @@ -146,14 +146,15 @@ services: - name: valkey type: argocd - last-reviewed: 2026-05-01 - current-version: "8.1.6-r0" - upstream-source: https://pkgs.alpinelinux.org/package/v3.22/community/aarch64/valkey + last-reviewed: 2026-05-28 + current-version: "8.1.7" + upstream-source: https://github.com/valkey-io/valkey/releases notes: >- - Shared Alpine-built valkey image, used as a sidecar/cache by paperless - (sidecar) and immich (separate Deployment). Mirrors the upstream - docker.io/valkey/valkey:8.1-alpine. Pinned to Alpine 3.22 for valkey 8.1.x; - Alpine 3.23 jumps to 9.0. Distinct from authentik-redis (nix-built Redis + Dual-build valkey image: container.py builds Alpine 3.22 + apk valkey + (arm64, indri) for paperless; default.nix builds via nixpkgs (amd64, + ringtail) for immich-ringtail. Both track upstream valkey 8.1.x; Alpine + 3.22 currently ships 8.1.7-r0 and nixpkgs valkey is 8.1.7. Alpine 3.23 + jumps to 9.0. Distinct from authentik-redis (nix-built Redis 8.x) which has its own entry. - name: external-secrets From f588638331567d921e189cbff25db5425ccebaef Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 28 May 2026 14:53:21 -0700 Subject: [PATCH 098/122] C0: rebuild valkey from squashed main commit Image tags from PR #362 (v8.1.7-02859c5{,-nix}) referenced a branch SHA that no longer exists on main after squash-merge. Rebuilt both the dagger arm64 and nix amd64 variants from the squashed commit (ecded30) and updated paperless + immich-ringtail to the new tags. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- argocd/manifests/immich-ringtail/kustomization.yaml | 2 +- argocd/manifests/paperless/kustomization.yaml | 2 +- docs/changelog.d/+valkey-rebuild-on-main.infra.md | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) create mode 100644 docs/changelog.d/+valkey-rebuild-on-main.infra.md diff --git a/argocd/manifests/immich-ringtail/kustomization.yaml b/argocd/manifests/immich-ringtail/kustomization.yaml index 7a97fef..2fa131c 100644 --- a/argocd/manifests/immich-ringtail/kustomization.yaml +++ b/argocd/manifests/immich-ringtail/kustomization.yaml @@ -26,4 +26,4 @@ images: # is arm64-only and serves paperless on indri. - name: docker.io/valkey/valkey newName: registry.ops.eblu.me/blumeops/valkey - newTag: v8.1.7-02859c5-nix + newTag: v8.1.7-ecded30-nix diff --git a/argocd/manifests/paperless/kustomization.yaml b/argocd/manifests/paperless/kustomization.yaml index 575dfb4..3cd0d74 100644 --- a/argocd/manifests/paperless/kustomization.yaml +++ b/argocd/manifests/paperless/kustomization.yaml @@ -16,4 +16,4 @@ images: newTag: v2.20.13-07f52e9 - name: docker.io/library/redis newName: registry.ops.eblu.me/blumeops/valkey - newTag: v8.1.7-02859c5 + newTag: v8.1.7-ecded30 diff --git a/docs/changelog.d/+valkey-rebuild-on-main.infra.md b/docs/changelog.d/+valkey-rebuild-on-main.infra.md new file mode 100644 index 0000000..c743e61 --- /dev/null +++ b/docs/changelog.d/+valkey-rebuild-on-main.infra.md @@ -0,0 +1 @@ +Rebuild valkey container from squashed main commit (both arm64 dagger and amd64 nix variants), and update paperless + immich-ringtail kustomizations to the main-SHA tags `v8.1.7-ecded30` and `v8.1.7-ecded30-nix`. 
From e0064de83d0d15a1f34f16146542a62817dca3ef Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Mon, 1 Jun 2026 15:52:09 -0700 Subject: [PATCH 099/122] C0: update ringtail flake inputs (nixpkgs, disko) Co-Authored-By: Claude Opus 4.8 (1M context) --- .../+ringtail-flake-update-2026-06-01.infra.md | 4 ++++ nixos/ringtail/flake.lock | 12 ++++++------ 2 files changed, 10 insertions(+), 6 deletions(-) create mode 100644 docs/changelog.d/+ringtail-flake-update-2026-06-01.infra.md diff --git a/docs/changelog.d/+ringtail-flake-update-2026-06-01.infra.md b/docs/changelog.d/+ringtail-flake-update-2026-06-01.infra.md new file mode 100644 index 0000000..dd488b6 --- /dev/null +++ b/docs/changelog.d/+ringtail-flake-update-2026-06-01.infra.md @@ -0,0 +1,4 @@ +Update the ringtail NixOS flake lockfile (`nixos/ringtail/flake.lock`): bump +`nixpkgs` (b77b3de → 25f5383) and `disko` (5ba0c95 → 115e521) to latest. +`nixpkgs-services` was intentionally left pinned (skipped by the +`flake-update` pipeline). Routine recurring maintenance per [[manage-lockfile]]. 
diff --git a/nixos/ringtail/flake.lock b/nixos/ringtail/flake.lock index 0f0da7e..bb60501 100644 --- a/nixos/ringtail/flake.lock +++ b/nixos/ringtail/flake.lock @@ -7,11 +7,11 @@ ] }, "locked": { - "lastModified": 1779699611, - "narHash": "sha256-EcCaSTKnmg2o4wLKaN1aqQFomwyhO7ik0bX9COdyCas=", + "lastModified": 1780290312, + "narHash": "sha256-eTAlX0CwgB84Ts3GaBd944A3DRXVMzgA0EqroZBISUo=", "owner": "nix-community", "repo": "disko", - "rev": "5ba0c9555c28685e57fa54c7a25e42c7efdbfc8d", + "rev": "115e5211780054d8a890b41f0b7734cafad54dfe", "type": "github" }, "original": { @@ -43,11 +43,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1779467186, - "narHash": "sha256-nOesoDCiXcUftqbRBMz9tt4blI5PvljMWbm3kuCA+0s=", + "lastModified": 1779796641, + "narHash": "sha256-ZsIrKmhp4vbBXoXXmR/tBXA/UCsAQiJL9vsgZEduhVY=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "b77b3de8775677f84492abe84635f87b0e153f0f", + "rev": "25f538306313eae3927264466c70d7001dcea1df", "type": "github" }, "original": { From a36a18aaa6714e187834edc09eb2fc565d0f5fbb Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Mon, 1 Jun 2026 20:52:20 -0700 Subject: [PATCH 100/122] C0: black-hole /mirrors/* at Fly edge + name-and-shame scrapers A $29.60 Fly bill traced to ~1.25 TB/30d egress on forge.eblu.me (99.95% of all proxy egress), ~71% of it AI scrapers (Meta meta-externalagent, OpenAI GPTBot, Amazonbot, Bytespider) crawling the public mirror repos' infinite git-history URL space and timing out Forgejo. robots.txt already disallowed /mirrors/ but those agents ignore it, so enforce at the edge: return 403 (^~ to beat the regex asset locations), served as a roll-of-dishonour page with an X-Naughty-Scrapers header. Mirrors stay reachable on the tailnet via forge.ops.eblu.me. Tier 2 (UA denylist + Anubis) and the Cloudflare rejection are documented in docs/explanation/ai-scraper-mitigation.md. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../+ai-scraper-mitigation-doc.doc.md | 1 + .../+forge-mirrors-blackhole.infra.md | 1 + docs/explanation/ai-scraper-mitigation.md | 201 ++++++++++++++++++ docs/tutorials/expose-service-publicly.md | 7 + fly/Dockerfile | 1 + fly/naughty.html | 64 ++++++ fly/nginx.conf | 27 +++ 7 files changed, 302 insertions(+) create mode 100644 docs/changelog.d/+ai-scraper-mitigation-doc.doc.md create mode 100644 docs/changelog.d/+forge-mirrors-blackhole.infra.md create mode 100644 docs/explanation/ai-scraper-mitigation.md create mode 100644 fly/naughty.html diff --git a/docs/changelog.d/+ai-scraper-mitigation-doc.doc.md b/docs/changelog.d/+ai-scraper-mitigation-doc.doc.md new file mode 100644 index 0000000..246fedb --- /dev/null +++ b/docs/changelog.d/+ai-scraper-mitigation-doc.doc.md @@ -0,0 +1 @@ +Add `docs/explanation/ai-scraper-mitigation.md` — the egress-cost / AI-crawler threat model for the public Fly proxy, the tiered mitigation plan (Tier 1: mirror black-hole, shipped; Tier 2: user-agent denylist + Anubis; Tier 3: Cloudflare, rejected on principle), and the data behind it. diff --git a/docs/changelog.d/+forge-mirrors-blackhole.infra.md b/docs/changelog.d/+forge-mirrors-blackhole.infra.md new file mode 100644 index 0000000..29a5e6a --- /dev/null +++ b/docs/changelog.d/+forge-mirrors-blackhole.infra.md @@ -0,0 +1 @@ +Black-hole the `/mirrors/*` repositories at the Fly proxy edge (`return 403` → `forge.ops.eblu.me`). A surprise $29.60 Fly bill traced to ~1.24 TB/30d of egress on `forge.eblu.me`, 99.95% of all proxy egress — of which ~71% was AI scrapers (Meta `meta-externalagent`, OpenAI `GPTBot`, Amazonbot) crawling the near-infinite git-history URL space of the public mirror repos and timing out Forgejo in the process. Mirrors exist for supply-chain control and are consumed over the tailnet, so their public web UI had no legitimate audience. `robots.txt` already disallowed `/mirrors/`, but the offending agents ignore it. 
Tier-2 mitigations (user-agent denylist, Anubis proof-of-work gateway) are documented in `docs/explanation/ai-scraper-mitigation.md`. diff --git a/docs/explanation/ai-scraper-mitigation.md b/docs/explanation/ai-scraper-mitigation.md new file mode 100644 index 0000000..fe4ba3d --- /dev/null +++ b/docs/explanation/ai-scraper-mitigation.md @@ -0,0 +1,201 @@ +--- +title: AI Scraper Mitigation +modified: 2026-06-01 +last-reviewed: 2026-06-01 +tags: + - explanation + - fly-io + - forgejo + - security + - networking +--- + +# AI Scraper Mitigation on the Public Proxy + +> **Note:** This article was drafted by AI and reviewed by Erich. I plan to rewrite all explanatory content in my own words — these serve as placeholders to establish the documentation structure. + +How BlumeOps keeps AI crawlers from running up the [[expose-service-publicly|Fly.io proxy]] egress bill and DoS-ing [[forgejo|Forgejo]] on [[indri]]. + +## The incident + +A $29.60 Fly.io invoice arrived, nearly all of it a single line: + +``` +Bandwidth: Egress (iad) — 958,524,714,138 bytes — $19.17 +``` + +The `iad` (Ashburn) region is a red herring: the proxy machine runs in `sjc`, +but Fly bills egress at the edge PoP nearest the *client*, so `iad` just means +"the traffic went to clients on the US East Coast." + +Tracing it through the nginx access logs (shipped to Loki via [[alloy|Alloy]]): + +| Signal | Value | +|--------|-------| +| Total proxy egress (30d) | ~1.25 TB | +| Share that was `forge.eblu.me` | **99.95%** | +| Share of forge egress that was `/mirrors/*` | **~71%** | +| Share that was declared AI bots | **~85%+** | +| Top offenders | Meta `meta-externalagent` (66% of bytes), OpenAI `GPTBot` (16%), Amazonbot, Bytespider | +| Forgejo `5xx` (upstream timeouts) | tens of thousands/day, spiking to 112k | + +The crawlers were walking [[forgejo|Forgejo]]'s git-history browse endpoints — +`src/commit/`, `commits/`, `blame/`, `raw/commit/`, plus `.patch`/`.diff` +and `?page=N` pagination. 
That URL space is effectively **infinite**: every +file × every commit × every page, multiplied across every mirrored repo. A +crawler that follows links never finishes, and every page is a cache `MISS` +that both tunnels to indri *and* bills as egress. + +Two distinct harms, not one: + +1. **Cost** — ~1.25 TB/mo of egress on a free-tier-ish proxy. +2. **Availability** — the crawl alone generates ~400–530k requests/day, + enough to time out Forgejo regardless of how much RAM [[indri]] has. Moving + egress elsewhere would *not* fix this; the crawl has to be throttled at the + source. + +`robots.txt` already `Disallow`s `/mirrors/`, `/user/`, and archive/download +paths — but **`meta-externalagent` and `GPTBot` ignore it.** For these agents, +`robots.txt` is a dead letter, which is why edge enforcement is required. + +## The tiered plan + +### Tier 1 — Black-hole `/mirrors/*` (shipped) + +The mirror repositories (`tailscale`, `prometheus`, `mealie`, `paperless-ngx`, +…) are mirrors of *already-public upstreams*, kept for supply-chain control +(see [[spork-strategy]] and the container/mirror story in [[why-gitops]]). They +are consumed by CI, gilbert, and other tailnet clients over +`forge.ops.eblu.me`. Their web UI on the public internet served **no +legitimate audience** — only scrapers. So the proxy now returns `403` for +anything under `/mirrors/`, pointing humans at the tailnet host: + +```nginx +location ^~ /mirrors/ { + error_page 403 /naughty.html; return 403; +} +``` + +The `^~` modifier matters: without it, the regex `location` blocks for static +assets (`*.css`, `*.js`, release downloads) would match first and leak content +under `/mirrors/`. `^~` tells nginx to stop at the prefix match and skip the +regex round. + +This is config, not bot-fighting — we simply stopped serving an infinite +tarpit to the world. 
It removes ~71% of forge egress and a large share of the +upstream timeouts, with zero impact on any human or tailnet consumer. It +mirrors the existing tailnet-only blocks for `/api/packages/` and `/swagger`. + +The `403` is also a small act of public shaming. Blocked requests are served a +"roll of dishonour" page (`fly/naughty.html`, status kept at `403` via +`error_page 403 /naughty.html`) that names the offending operators and their +share of the stolen bytes, and every response carries an `X-Naughty-Scrapers` +header: + +``` +X-Naughty-Scrapers: OpenAI/GPTBot, Meta/meta-externalagent, Amazonbot, ByteDance/Bytespider — robots.txt ignorers +``` + +Petty? A little. But it costs nothing, documents *why* the block exists for the +next person who hits it, and the page is a few KB versus the megabytes of git +HTML the crawlers were taking. + +**Trade-off accepted:** mirror release-artifact downloads over WAN now also +`403`. Legitimate consumers already pull these over the tailnet, and the public +exposure was the same crawl liability, so this is intentional. + +### Tier 2 — Defend the repos that *stay* public (planned) + +`/eblume/*` is intentionally public (a public profile is a feature). But the +same git-history endpoints are still a tarpit there, just lower-volume. Two +layers, in increasing order of effort and effectiveness: + +#### 2a. User-agent denylist (cheap, evadable) + +Block the declared AI crawlers at the edge regardless of path: + +```nginx +# Illustrative — not yet deployed. +map $http_user_agent $is_ai_bot { + default 0; + "~*meta-externalagent" 1; + "~*GPTBot" 1; + "~*ClaudeBot" 1; + "~*Amazonbot" 1; + "~*Bytespider" 1; + "~*SemrushBot" 1; +} +# in the forge.eblu.me server block: +if ($is_ai_bot) { return 403; } +``` + +This catches ~85% of *current* traffic for a few lines of config. It is +trivially evadable — a scraper need only spoof a browser UA — so it is a +speed-bump, not a wall. 
Keep `robots.txt` too: well-behaved crawlers +(Googlebot, Bingbot) do honor it, and it documents intent. + +#### 2b. Anubis proof-of-work gateway (the real wall) + +[Anubis](https://github.com/TecharoHQ/anubis) is a Go reverse proxy that +weighs each request with a browser-based proof-of-work challenge before passing +it upstream. It was written for *exactly this scenario* — its author built it +after Amazon's scraper took down their Git server — and is widely deployed in +front of Forgejo/Gitea (Codeberg, the UN, etc.). Headless scrapers that can't +run the challenge JS never reach the application; humans clear it once and +proceed. + +Why it fits BlumeOps better than the alternatives: + +- **It attacks cost *and* availability at once.** Bots receive a few-KB + challenge page instead of MB of git HTML (egress collapses) and never reach + Forgejo (timeouts collapse). No other single lever does both. +- **It stays in-house.** No third party terminates our TLS or sees our + traffic. + +Placement options: + +| Where | Pros | Cons | +|-------|------|------| +| On [[indri]], between [[caddy|Caddy]] and Forgejo | Protects every path and every entry (WAN *and* tailnet); one config | Adds a hop and a service to the indri critical path; the challenge page still tunnels back through Fly for WAN clients (small egress) | +| On the Fly proxy machine, in front of nginx | Challenge served at the edge — bots never even tunnel to indri | Fly VM is small (512 MB); another moving part in the boot sequence alongside `tailscaled`/nginx/`fail2ban`/Alloy | + +Leaning toward Caddy-side on indri for simplicity and uniform coverage, but +this is the open design question for Tier 2. Anubis is MIT-licensed and the +author has signalled a future move to an `equi-x`-based challenge, so pin a +version and track upstream. 
+ +### Tier 3 — Move egress off Fly entirely (rejected) + +A Cloudflare Tunnel (`cloudflared` on indri → Cloudflare +edge) would make this a non-problem on the cost axis: Cloudflare does not meter +proxied bandwidth, and it bundles free AI-bot mitigation (Bot Fight Mode, the +"block AI scrapers" toggle, Managed Challenge, AI Labyrinth). One move would +zero the egress bill and add bot defense. + +**We are not doing this, on principle.** Cloudflare is a solid platform and a +defensible engineering choice — but it already sits in front of an enormous +fraction of the modern web, and routing BlumeOps through it would add one more +site to the pile of the internet that one company can see and gate. BlumeOps +deliberately keeps its own backbone +([[expose-service-publicly|Fly + Tailscale + Caddy]], DNS at [[gandi|Gandi]] — see the "no Cloudflare dependency" line in +that doc). This is a values decision, not a technical one: we would rather pay +a few dollars and run our own mitigation than centralize on Cloudflare. + +It is also worth noting that **Tier 3 would not, by itself, fix the upstream +timeouts** — free egress just means we'd stop *caring* that bots crawl, while +they continued to hammer Forgejo. Crawl mitigation (Tier 1 + Tier 2) is +required regardless of where egress is billed. 
+ +## Summary + +| Tier | Lever | Cost | Availability | Status | +|------|-------|------|--------------|--------| +| 1 | Black-hole `/mirrors/*` at edge | −~71% | big drop | **shipped** | +| 2a | UA denylist on remaining repos | −most of the rest | further drop | planned | +| 2b | Anubis PoW gateway | −near-total | near-total | planned | +| 3 | Cloudflare Tunnel | −total | needs 2b anyway | **rejected (principle)** | + +The guiding insight: the cheapest, lowest-risk mitigation is to **not serve an +infinite-URL surface that has no human audience.** Everything past Tier 1 is +about defending the surface we *do* want public, in-house, without ceding +control of our traffic to a third party. diff --git a/docs/tutorials/expose-service-publicly.md b/docs/tutorials/expose-service-publicly.md index 886cad4..65af611 100644 --- a/docs/tutorials/expose-service-publicly.md +++ b/docs/tutorials/expose-service-publicly.md @@ -376,6 +376,13 @@ Mitigations for dynamic services: - fail2ban on indri (see below) can block IPs showing abuse patterns - The break-glass shutoff remains the last resort +The most acute version of this in practice has been **AI scrapers**, which +ignore `robots.txt` and crawl dynamic services (notably [[forgejo|Forgejo]]'s +infinite git-history URL space) into both a surprise egress bill and an +effective L7 DoS. See [[ai-scraper-mitigation]] for the incident, the tiered +defense (mirror black-hole, user-agent denylist, Anubis proof-of-work), and +why a Cloudflare Tunnel is *not* the chosen answer here. 
+ If a publicly exposed dynamic service attracts targeted attacks or the home network bandwidth is impacted, consider migrating to Cloudflare Tunnel for enterprise-grade DDoS protection (requires DNS migration; diff --git a/fly/Dockerfile b/fly/Dockerfile index d4e7a18..406c849 100644 --- a/fly/Dockerfile +++ b/fly/Dockerfile @@ -25,6 +25,7 @@ COPY fail2ban/action.d/nginx-deny.conf /etc/fail2ban/action.d/nginx-deny.conf COPY nginx.conf /etc/nginx/nginx.conf COPY error.html /usr/share/nginx/html/error.html +COPY naughty.html /usr/share/nginx/html/naughty.html COPY alloy.river /etc/alloy/config.alloy COPY start.sh /start.sh RUN chmod +x /start.sh diff --git a/fly/naughty.html b/fly/naughty.html new file mode 100644 index 0000000..d899171 --- /dev/null +++ b/fly/naughty.html @@ -0,0 +1,64 @@ + + + + + + + 403 · Roll of Dishonour + + + +
+

🪤 403 — you walked into the scraper trap

+

These are mirror repositories. They are tailnet-only.

+ +

+ This path used to serve the web UI for mirrors of public upstream + projects. It exists for supply-chain control, not for crawling. A + robots.txt politely disallowed /mirrors/. + A pack of AI scrapers ignored it, walked the infinite git-history URL + space, and ran up ~1.25 TB of egress and a real + money bill in a single month — while timing out the server for everyone + else. +

+ +

So /mirrors/ is closed at the edge now. Roll of dishonour, + by share of the bytes they stole:

+ + + + + + + + + +
OperatorUser-Agent
Metameta-externalagent
OpenAIGPTBot
AmazonAmazonbot
ByteDanceBytespider
+ +

+ If you are a human who actually wanted these mirrors, they are reachable + from the tailnet at forge.ops.eblu.me. If you are a crawler: + read the robots.txt next time. We left you a header, too. +

+ +
GNU Terry Pratchett
+
+ + diff --git a/fly/nginx.conf b/fly/nginx.conf index 570e6c9..ec35774 100644 --- a/fly/nginx.conf +++ b/fly/nginx.conf @@ -215,6 +215,33 @@ http { return 403 "API documentation is only available at forge.ops.eblu.me (tailnet).\n"; } + # Black-hole the mirror repositories on WAN. These are mirrors of + # already-public upstreams (tailscale, prometheus, mealie, …) kept + # for supply-chain control; CI, gilbert, and tailnet clients consume + # them via forge.ops.eblu.me. Their web UI served no public purpose + # but AI scrapers, which crawled the near-infinite git-history URL + # space (src/commit, commits, blame, raw) and drove ~70% of Fly + # egress (1.24 TB/30d → a surprise bill) plus enough upstream load to + # time out Forgejo. robots.txt already Disallows /mirrors/, but + # meta-externalagent and GPTBot ignore it — so enforce at the edge. + # `^~` makes this win over the regex locations below (e.g. *.css), so + # static assets under /mirrors/ can't leak through. We also name and + # shame: blocked requests get a "roll of dishonour" page (403 status + # preserved) and an X-Naughty-Scrapers header. See + # docs/explanation/ai-scraper-mitigation.md. + location ^~ /mirrors/ { + error_page 403 /naughty.html; + return 403; + } + + # Roll of dishonour — served on the /mirrors/ 403, status kept at 403. + location = /naughty.html { + internal; + root /usr/share/nginx/html; + add_header X-Naughty-Scrapers "OpenAI/GPTBot, Meta/meta-externalagent, Amazonbot, ByteDance/Bytespider — robots.txt ignorers" always; + add_header X-Clacks-Overhead "GNU Terry Pratchett" always; + } + # Redirect archive endpoints to tailnet — archive requests generate full # git bundles on demand. Unauthenticated crawlers hitting unique commit # SHAs cause unbounded CPU and disk usage (DoS vector). 
Legitimate users From 40bd92982015582cb7aa2680c6dc8412706498fb Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Mon, 1 Jun 2026 20:55:05 -0700 Subject: [PATCH 101/122] C0: remove visible GNU Terry Pratchett from naughty.html body MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GNU lives in the overhead — the X-Clacks-Overhead header — never on the visible page. Keep the header, drop the footer. Co-Authored-By: Claude Opus 4.8 (1M context) --- fly/naughty.html | 3 --- 1 file changed, 3 deletions(-) diff --git a/fly/naughty.html b/fly/naughty.html index d899171..b6eada8 100644 --- a/fly/naughty.html +++ b/fly/naughty.html @@ -21,7 +21,6 @@ td.share { color: #f2c14e; text-align: right; font-variant-numeric: tabular-nums; } .name { color: #e8867a; } a { color: #7fb3d5; } - footer { margin-top: 2rem; color: #5c574f; font-size: .85rem; } @@ -57,8 +56,6 @@ from the tailnet at forge.ops.eblu.me. If you are a crawler: read the robots.txt next time. We left you a header, too.

- -
GNU Terry Pratchett
From fcac8e5a7290bac54b25f82895c8120ef81367ff Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Wed, 3 Jun 2026 10:34:00 -0700 Subject: [PATCH 102/122] =?UTF-8?q?Wave=201=20indri=E2=86=92ringtail=20mig?= =?UTF-8?q?ration:=20paperless,=20teslamate,=20mealie=20(#363)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Migrate paperless, teslamate, and mealie off the OOM-saturated minikube-indri node onto ringtail k3s, shedding ~1.1 GiB of resident load. Second chain in the indri-k8s decommission after immich. **Containers ported to Nix (default.nix), build-verified on ringtail:** - paperless → wraps nixpkgs paperless-ngx 2.20.15 (pinned unstable); runs as web/worker/beat/consumer - mealie → wraps nixpkgs mealie 3.16.0 (forward 4-minor bump, breaking-change reviewed); single gunicorn, SQLite - teslamate → from-scratch beamPackages mixRelease (not in nixpkgs); erlang_27+elixir_1_18, npm assets, ex_cldr locales pre-fetched **Data:** cold downtime-tolerant cutover. paperless+teslamate postgres dump/restore from quiesced source into a new ringtail blumeops-pg CNPG cluster; mealie SQLite PVC copied. Source DBs untouched until verified (rollback = repoint). **Also:** ringtail blumeops-pg cluster + ExternalSecrets scaffold; fixes pre-existing shower version-check drift. Runbook: docs/how-to/ringtail/migrate-wave1-ringtail.md. Deploy-from-branch + cutover happens before merge; container images rebuilt from main after merge. 
Reviewed-on: https://forge.eblu.me/eblume/blumeops/pulls/363 --- argocd/apps/mealie-ringtail.yaml | 26 +++ argocd/apps/paperless-ringtail.yaml | 28 +++ argocd/apps/teslamate-ringtail.yaml | 28 +++ .../databases-ringtail/blumeops-pg.yaml | 97 +++++++++ .../external-secret-borgmatic.yaml | 30 +++ .../external-secret-eblume.yaml | 30 +++ .../external-secret-paperless.yaml | 28 +++ .../external-secret-teslamate.yaml | 30 +++ .../databases-ringtail/kustomization.yaml | 6 + .../manifests/mealie-ringtail/deployment.yaml | 102 +++++++++ .../mealie-ringtail/external-secret.yaml | 23 ++ .../ingress-tailscale.yaml | 0 .../mealie-ringtail/kustomization.yaml | 15 ++ argocd/manifests/mealie-ringtail/pvc.yaml | 14 ++ argocd/manifests/mealie-ringtail/service.yaml | 13 ++ argocd/manifests/mealie/deployment.yaml | 4 +- argocd/manifests/mealie/kustomization.yaml | 2 +- .../paperless-ringtail/deployment.yaml | 201 ++++++++++++++++++ .../paperless-ringtail/external-secret.yaml | 31 +++ .../ingress-tailscale.yaml | 0 .../paperless-ringtail/kustomization.yaml | 21 ++ .../manifests/paperless-ringtail/pv-nfs.yaml | 22 ++ argocd/manifests/paperless-ringtail/pvc.yaml | 15 ++ .../manifests/paperless-ringtail/service.yaml | 13 ++ argocd/manifests/paperless/deployment.yaml | 5 +- argocd/manifests/paperless/kustomization.yaml | 2 +- .../teslamate-ringtail/deployment.yaml | 72 +++++++ .../external-secret-db.yaml | 25 +++ .../external-secret-encryption-key.yaml | 27 +++ .../ingress-tailscale.yaml | 0 .../teslamate-ringtail/kustomization.yaml | 15 ++ .../manifests/teslamate-ringtail/service.yaml | 12 ++ argocd/manifests/teslamate/deployment.yaml | 5 +- argocd/manifests/teslamate/kustomization.yaml | 2 +- containers/mealie/Dockerfile | 145 ------------- containers/mealie/default.nix | 65 ++++++ containers/paperless/Dockerfile | 156 -------------- containers/paperless/default.nix | 77 +++++++ containers/teslamate/container.py | 104 --------- containers/teslamate/default.nix | 122 +++++++++++ 
containers/teslamate/entrypoint.sh | 23 -- .../migrate-wave1-ringtail.infra.md | 13 ++ .../immich/migrate-immich-to-ringtail.md | 2 + .../how-to/ringtail/migrate-wave1-ringtail.md | 176 +++++++++++++++ service-versions.yaml | 40 +++- 45 files changed, 1422 insertions(+), 445 deletions(-) create mode 100644 argocd/apps/mealie-ringtail.yaml create mode 100644 argocd/apps/paperless-ringtail.yaml create mode 100644 argocd/apps/teslamate-ringtail.yaml create mode 100644 argocd/manifests/databases-ringtail/blumeops-pg.yaml create mode 100644 argocd/manifests/databases-ringtail/external-secret-borgmatic.yaml create mode 100644 argocd/manifests/databases-ringtail/external-secret-eblume.yaml create mode 100644 argocd/manifests/databases-ringtail/external-secret-paperless.yaml create mode 100644 argocd/manifests/databases-ringtail/external-secret-teslamate.yaml create mode 100644 argocd/manifests/mealie-ringtail/deployment.yaml create mode 100644 argocd/manifests/mealie-ringtail/external-secret.yaml rename argocd/manifests/{mealie => mealie-ringtail}/ingress-tailscale.yaml (100%) create mode 100644 argocd/manifests/mealie-ringtail/kustomization.yaml create mode 100644 argocd/manifests/mealie-ringtail/pvc.yaml create mode 100644 argocd/manifests/mealie-ringtail/service.yaml create mode 100644 argocd/manifests/paperless-ringtail/deployment.yaml create mode 100644 argocd/manifests/paperless-ringtail/external-secret.yaml rename argocd/manifests/{paperless => paperless-ringtail}/ingress-tailscale.yaml (100%) create mode 100644 argocd/manifests/paperless-ringtail/kustomization.yaml create mode 100644 argocd/manifests/paperless-ringtail/pv-nfs.yaml create mode 100644 argocd/manifests/paperless-ringtail/pvc.yaml create mode 100644 argocd/manifests/paperless-ringtail/service.yaml create mode 100644 argocd/manifests/teslamate-ringtail/deployment.yaml create mode 100644 argocd/manifests/teslamate-ringtail/external-secret-db.yaml create mode 100644 
argocd/manifests/teslamate-ringtail/external-secret-encryption-key.yaml rename argocd/manifests/{teslamate => teslamate-ringtail}/ingress-tailscale.yaml (100%) create mode 100644 argocd/manifests/teslamate-ringtail/kustomization.yaml create mode 100644 argocd/manifests/teslamate-ringtail/service.yaml delete mode 100644 containers/mealie/Dockerfile create mode 100644 containers/mealie/default.nix delete mode 100644 containers/paperless/Dockerfile create mode 100644 containers/paperless/default.nix delete mode 100644 containers/teslamate/container.py create mode 100644 containers/teslamate/default.nix delete mode 100644 containers/teslamate/entrypoint.sh create mode 100644 docs/changelog.d/migrate-wave1-ringtail.infra.md create mode 100644 docs/how-to/ringtail/migrate-wave1-ringtail.md diff --git a/argocd/apps/mealie-ringtail.yaml b/argocd/apps/mealie-ringtail.yaml new file mode 100644 index 0000000..2f014a9 --- /dev/null +++ b/argocd/apps/mealie-ringtail.yaml @@ -0,0 +1,26 @@ +# Mealie on ringtail k3s. +# +# Wave-1 indri-k8s decommission. Staging deployment; the minikube `mealie` +# app stays in parallel until cutover (copy SQLite PVC, drop the minikube +# tailscale ingress, flip Caddy). See [[migrate-wave1-ringtail]]. 
+# +# Prerequisites: +# - external-secrets-ringtail (onepassword-blumeops ClusterSecretStore) +# - mealie-data PVC contents copied from minikube at cutover +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: mealie-ringtail + namespace: argocd +spec: + project: default + source: + repoURL: ssh://forgejo@forge.ops.eblu.me:2222/eblume/blumeops.git + targetRevision: main + path: argocd/manifests/mealie-ringtail + destination: + server: https://ringtail.tail8d86e.ts.net:6443 + namespace: mealie + syncPolicy: + syncOptions: + - CreateNamespace=true diff --git a/argocd/apps/paperless-ringtail.yaml b/argocd/apps/paperless-ringtail.yaml new file mode 100644 index 0000000..bec98e9 --- /dev/null +++ b/argocd/apps/paperless-ringtail.yaml @@ -0,0 +1,28 @@ +# Paperless-ngx on ringtail k3s. +# +# Wave-1 indri-k8s decommission. Staging deployment; the minikube +# `paperless` app stays in parallel until cutover (drop the minikube +# tailscale ingress to free the name, then flip Caddy). See +# [[migrate-wave1-ringtail]]. +# +# Prerequisites: +# - databases-ringtail blumeops-pg (paperless database + role) +# - external-secrets-ringtail (onepassword-blumeops ClusterSecretStore) +# - sifaka NFS rule granting ringtail access to /volume1/paperless +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: paperless-ringtail + namespace: argocd +spec: + project: default + source: + repoURL: ssh://forgejo@forge.ops.eblu.me:2222/eblume/blumeops.git + targetRevision: main + path: argocd/manifests/paperless-ringtail + destination: + server: https://ringtail.tail8d86e.ts.net:6443 + namespace: paperless + syncPolicy: + syncOptions: + - CreateNamespace=true diff --git a/argocd/apps/teslamate-ringtail.yaml b/argocd/apps/teslamate-ringtail.yaml new file mode 100644 index 0000000..b7b3491 --- /dev/null +++ b/argocd/apps/teslamate-ringtail.yaml @@ -0,0 +1,28 @@ +# TeslaMate on ringtail k3s. +# +# Wave-1 indri-k8s decommission. 
Staging deployment; the minikube +# `teslamate` app stays in parallel until cutover (migrate the teslamate +# database, drop the minikube tailscale ingress, flip Caddy). See +# [[migrate-wave1-ringtail]]. +# +# Prerequisites: +# - databases-ringtail blumeops-pg (teslamate database + role; cube + +# earthdistance extensions created by superuser at cutover) +# - external-secrets-ringtail (onepassword-blumeops ClusterSecretStore) +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: teslamate-ringtail + namespace: argocd +spec: + project: default + source: + repoURL: ssh://forgejo@forge.ops.eblu.me:2222/eblume/blumeops.git + targetRevision: main + path: argocd/manifests/teslamate-ringtail + destination: + server: https://ringtail.tail8d86e.ts.net:6443 + namespace: teslamate + syncPolicy: + syncOptions: + - CreateNamespace=true diff --git a/argocd/manifests/databases-ringtail/blumeops-pg.yaml b/argocd/manifests/databases-ringtail/blumeops-pg.yaml new file mode 100644 index 0000000..3a37249 --- /dev/null +++ b/argocd/manifests/databases-ringtail/blumeops-pg.yaml @@ -0,0 +1,97 @@ +# PostgreSQL Cluster for blumeops services on ringtail k3s. +# +# Wave-1 indri-k8s decommission target (see [[migrate-wave1-ringtail]]). +# Holds the paperless and teslamate databases migrated off the minikube +# blumeops-pg via cold pg_dump/pg_restore at cutover. miniflux + authentik +# stay where they are for now (later waves), so this cluster only carries +# the wave-1 roles. +# +# Apps reach this in-cluster at blumeops-pg-rw.databases.svc.cluster.local +# — the same name they used on minikube, so teslamate's DATABASE_HOST is +# unchanged. +# +# Database creation is deferred to cutover, mirroring the minikube cluster +# (where only the bootstrap database is declared and the rest were created +# out-of-band): +# - paperless: the bootstrap database below (restored into at cutover). 
+# - teslamate: created at its cutover by the eblume superuser, because the +# dump's `earthdistance` extension is untrusted and CREATE EXTENSION +# needs superuser. (cube + earthdistance ownership then transferred to +# the teslamate role so it can ALTER EXTENSION UPDATE.) +apiVersion: postgresql.cnpg.io/v1 +kind: Cluster +metadata: + name: blumeops-pg + namespace: databases +spec: + instances: 1 + imageName: ghcr.io/cloudnative-pg/postgresql:18.3 + + storage: + size: 10Gi + storageClass: local-path + + bootstrap: + initdb: + database: paperless + owner: paperless + + managed: + roles: + # eblume superuser for admin + privileged restore steps (extensions) + - name: eblume + login: true + superuser: true + createdb: true + createrole: true + connectionLimit: -1 + ensure: present + inherit: true + passwordSecret: + name: blumeops-pg-eblume + # borgmatic read-only user for backups + - name: borgmatic + login: true + connectionLimit: -1 + ensure: present + inherit: true + inRoles: + - pg_read_all_data + passwordSecret: + name: blumeops-pg-borgmatic + # paperless user (also the bootstrap database owner above; the + # managed role sets its password from the 1Password-backed secret) + - name: paperless + login: true + connectionLimit: -1 + ensure: present + inherit: true + passwordSecret: + name: blumeops-pg-paperless + # teslamate user. Extension ownership (cube, earthdistance) is + # transferred to this role at cutover so it can ALTER EXTENSION UPDATE. + - name: teslamate + login: true + connectionLimit: -1 + ensure: present + inherit: true + passwordSecret: + name: blumeops-pg-teslamate + + resources: + requests: + memory: "256Mi" + cpu: "100m" + limits: + memory: "1Gi" + cpu: "500m" + + postgresql: + parameters: + max_connections: "50" + shared_buffers: "128MB" + password_encryption: "scram-sha-256" + pg_hba: + # Password auth from anywhere; network security is via Tailscale. 
+ - host all all 0.0.0.0/0 scram-sha-256 + - host all all ::/0 scram-sha-256 diff --git a/argocd/manifests/databases-ringtail/external-secret-borgmatic.yaml b/argocd/manifests/databases-ringtail/external-secret-borgmatic.yaml new file mode 100644 index 0000000..ee600e3 --- /dev/null +++ b/argocd/manifests/databases-ringtail/external-secret-borgmatic.yaml @@ -0,0 +1,30 @@ +# ExternalSecret for borgmatic backup user password +# +# Replaces the manual op inject workflow from secret-borgmatic.yaml.tpl +# +# 1Password item: "borgmatic" in blumeops vault +# Field: "db-password" +# +apiVersion: external-secrets.io/v1 +kind: ExternalSecret +metadata: + name: blumeops-pg-borgmatic + namespace: databases +spec: + refreshInterval: 1h + secretStoreRef: + kind: ClusterSecretStore + name: onepassword-blumeops + target: + name: blumeops-pg-borgmatic + creationPolicy: Owner + template: + type: kubernetes.io/basic-auth + data: + username: borgmatic + password: "{{ .password }}" + data: + - secretKey: password + remoteRef: + key: borgmatic + property: db-password diff --git a/argocd/manifests/databases-ringtail/external-secret-eblume.yaml b/argocd/manifests/databases-ringtail/external-secret-eblume.yaml new file mode 100644 index 0000000..a324c7d --- /dev/null +++ b/argocd/manifests/databases-ringtail/external-secret-eblume.yaml @@ -0,0 +1,30 @@ +# ExternalSecret for eblume superuser password +# +# Replaces the manual op inject workflow from secret-eblume.yaml.tpl +# +# 1Password item: "postgres" in blumeops vault +# Field: "password" +# +apiVersion: external-secrets.io/v1 +kind: ExternalSecret +metadata: + name: blumeops-pg-eblume + namespace: databases +spec: + refreshInterval: 1h + secretStoreRef: + kind: ClusterSecretStore + name: onepassword-blumeops + target: + name: blumeops-pg-eblume + creationPolicy: Owner + template: + type: kubernetes.io/basic-auth + data: + username: eblume + password: "{{ .password }}" + data: + - secretKey: password + remoteRef: + key: postgres + 
property: password diff --git a/argocd/manifests/databases-ringtail/external-secret-paperless.yaml b/argocd/manifests/databases-ringtail/external-secret-paperless.yaml new file mode 100644 index 0000000..e5742be --- /dev/null +++ b/argocd/manifests/databases-ringtail/external-secret-paperless.yaml @@ -0,0 +1,28 @@ +# ExternalSecret for Paperless database user password +# +# 1Password item: "Paperless (blumeops)" in blumeops vault +# Field: "postgresql-password" +# +apiVersion: external-secrets.io/v1 +kind: ExternalSecret +metadata: + name: blumeops-pg-paperless + namespace: databases +spec: + refreshInterval: 1h + secretStoreRef: + kind: ClusterSecretStore + name: onepassword-blumeops + target: + name: blumeops-pg-paperless + creationPolicy: Owner + template: + type: kubernetes.io/basic-auth + data: + username: paperless + password: "{{ .password }}" + data: + - secretKey: password + remoteRef: + key: Paperless (blumeops) + property: postgresql-password diff --git a/argocd/manifests/databases-ringtail/external-secret-teslamate.yaml b/argocd/manifests/databases-ringtail/external-secret-teslamate.yaml new file mode 100644 index 0000000..0c52e0b --- /dev/null +++ b/argocd/manifests/databases-ringtail/external-secret-teslamate.yaml @@ -0,0 +1,30 @@ +# ExternalSecret for TeslaMate database user password +# +# Replaces the manual op inject workflow from secret-teslamate.yaml.tpl +# +# 1Password item: "TeslaMate" in blumeops vault +# Field: "db_password" +# +apiVersion: external-secrets.io/v1 +kind: ExternalSecret +metadata: + name: blumeops-pg-teslamate + namespace: databases +spec: + refreshInterval: 1h + secretStoreRef: + kind: ClusterSecretStore + name: onepassword-blumeops + target: + name: blumeops-pg-teslamate + creationPolicy: Owner + template: + type: kubernetes.io/basic-auth + data: + username: teslamate + password: "{{ .password }}" + data: + - secretKey: password + remoteRef: + key: TeslaMate + property: db_password diff --git 
a/argocd/manifests/databases-ringtail/kustomization.yaml b/argocd/manifests/databases-ringtail/kustomization.yaml index 971e2d4..2bc2af3 100644 --- a/argocd/manifests/databases-ringtail/kustomization.yaml +++ b/argocd/manifests/databases-ringtail/kustomization.yaml @@ -7,3 +7,9 @@ resources: - immich-pg.yaml - external-secret-immich-borgmatic.yaml - service-immich-pg-tailscale.yaml + # wave-1 indri-k8s decommission: blumeops-pg (paperless + teslamate) + - blumeops-pg.yaml + - external-secret-eblume.yaml + - external-secret-borgmatic.yaml + - external-secret-paperless.yaml + - external-secret-teslamate.yaml diff --git a/argocd/manifests/mealie-ringtail/deployment.yaml b/argocd/manifests/mealie-ringtail/deployment.yaml new file mode 100644 index 0000000..10d06ab --- /dev/null +++ b/argocd/manifests/mealie-ringtail/deployment.yaml @@ -0,0 +1,102 @@ +# Mealie on ringtail k3s — Nix image. +# +# Single gunicorn process (the Nix image's default `mealie-run` entrypoint +# runs init_db then gunicorn), serving the prebuilt frontend. DB is SQLite +# on the mealie-data PVC; its contents are copied from the minikube PVC at +# cutover. See [[migrate-wave1-ringtail]]. 
+apiVersion: apps/v1 +kind: Deployment +metadata: + name: mealie + namespace: mealie +spec: + replicas: 1 + strategy: + type: Recreate + selector: + matchLabels: + app: mealie + template: + metadata: + labels: + app: mealie + spec: + securityContext: + seccompProfile: + type: RuntimeDefault + containers: + - name: mealie + image: registry.ops.eblu.me/blumeops/mealie:kustomized + ports: + - containerPort: 9000 + env: + - name: BASE_URL + value: "https://meals.ops.eblu.me" + - name: ALLOW_SIGNUP + value: "false" + - name: TZ + value: "America/Los_Angeles" + - name: MAX_WORKERS + value: "1" + - name: WEB_CONCURRENCY + value: "1" + # OIDC — Authentik (client secret supplied via mealie-secrets) + - name: OIDC_AUTH_ENABLED + value: "true" + - name: OIDC_CONFIGURATION_URL + value: "https://authentik.ops.eblu.me/application/o/mealie/.well-known/openid-configuration" + - name: OIDC_CLIENT_ID + value: "mealie" + - name: OIDC_CLIENT_SECRET + valueFrom: + secretKeyRef: + name: mealie-secrets + key: oidc-client-secret + - name: OIDC_AUTO_REDIRECT + value: "false" + - name: OIDC_PROVIDER_NAME + value: "Authentik" + - name: OIDC_ADMIN_GROUP + value: "admins" + - name: OIDC_SIGNUP_ENABLED + value: "true" + - name: OIDC_USER_CLAIM + value: "email" + # OpenAI — recipe parsing, image OCR, ingredient extraction + - name: OPENAI_API_KEY + valueFrom: + secretKeyRef: + name: mealie-secrets + key: openai-api-key + - name: OPENAI_MODEL + value: "gpt-4o" + - name: OPENAI_REQUEST_TIMEOUT + value: "120" + - name: OPENAI_WORKERS + value: "1" + volumeMounts: + - name: data + mountPath: /app/data + resources: + requests: + memory: "128Mi" + cpu: "50m" + limits: + memory: "1000Mi" + cpu: "500m" + livenessProbe: + httpGet: + path: /api/app/about + port: 9000 + initialDelaySeconds: 30 + periodSeconds: 30 + readinessProbe: + httpGet: + path: /api/app/about + port: 9000 + initialDelaySeconds: 10 + periodSeconds: 10 + volumes: + - name: data + persistentVolumeClaim: + claimName: mealie-data diff --git 
a/argocd/manifests/mealie-ringtail/external-secret.yaml b/argocd/manifests/mealie-ringtail/external-secret.yaml new file mode 100644 index 0000000..99c2793 --- /dev/null +++ b/argocd/manifests/mealie-ringtail/external-secret.yaml @@ -0,0 +1,23 @@ +--- +apiVersion: external-secrets.io/v1 +kind: ExternalSecret +metadata: + name: mealie-secrets + namespace: mealie +spec: + refreshInterval: 1h + secretStoreRef: + kind: ClusterSecretStore + name: onepassword-blumeops + target: + name: mealie-secrets + creationPolicy: Owner + data: + - secretKey: oidc-client-secret + remoteRef: + key: "Authentik (blumeops)" + property: mealie-client-secret + - secretKey: openai-api-key + remoteRef: + key: "openai (blumeops)" + property: credential diff --git a/argocd/manifests/mealie/ingress-tailscale.yaml b/argocd/manifests/mealie-ringtail/ingress-tailscale.yaml similarity index 100% rename from argocd/manifests/mealie/ingress-tailscale.yaml rename to argocd/manifests/mealie-ringtail/ingress-tailscale.yaml diff --git a/argocd/manifests/mealie-ringtail/kustomization.yaml b/argocd/manifests/mealie-ringtail/kustomization.yaml new file mode 100644 index 0000000..8428042 --- /dev/null +++ b/argocd/manifests/mealie-ringtail/kustomization.yaml @@ -0,0 +1,15 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: mealie + +resources: + - deployment.yaml + - service.yaml + - pvc.yaml + - ingress-tailscale.yaml + - external-secret.yaml + +images: + - name: registry.ops.eblu.me/blumeops/mealie + newTag: v3.16.0-1d4cbbf-nix diff --git a/argocd/manifests/mealie-ringtail/pvc.yaml b/argocd/manifests/mealie-ringtail/pvc.yaml new file mode 100644 index 0000000..89c38ef --- /dev/null +++ b/argocd/manifests/mealie-ringtail/pvc.yaml @@ -0,0 +1,14 @@ +# SQLite data volume for Mealie on ringtail. Contents copied from the +# minikube mealie-data PVC at cutover (recipes, meal plans, uploaded media). 
+apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: mealie-data + namespace: mealie +spec: + accessModes: + - ReadWriteOnce + storageClassName: local-path + resources: + requests: + storage: 2Gi diff --git a/argocd/manifests/mealie-ringtail/service.yaml b/argocd/manifests/mealie-ringtail/service.yaml new file mode 100644 index 0000000..4162b96 --- /dev/null +++ b/argocd/manifests/mealie-ringtail/service.yaml @@ -0,0 +1,13 @@ +apiVersion: v1 +kind: Service +metadata: + name: mealie + namespace: mealie +spec: + selector: + app: mealie + ports: + - name: http + port: 9000 + targetPort: 9000 + protocol: TCP diff --git a/argocd/manifests/mealie/deployment.yaml b/argocd/manifests/mealie/deployment.yaml index bdcf91e..7cdd275 100644 --- a/argocd/manifests/mealie/deployment.yaml +++ b/argocd/manifests/mealie/deployment.yaml @@ -4,7 +4,9 @@ metadata: name: mealie namespace: mealie spec: - replicas: 1 + # Migrated to ringtail (mealie-ringtail). Scaled to 0; SQLite PVC retained + # for rollback until the decommission PR. See [[migrate-wave1-ringtail]]. + replicas: 0 selector: matchLabels: app: mealie diff --git a/argocd/manifests/mealie/kustomization.yaml b/argocd/manifests/mealie/kustomization.yaml index fb0713b..02563f4 100644 --- a/argocd/manifests/mealie/kustomization.yaml +++ b/argocd/manifests/mealie/kustomization.yaml @@ -7,7 +7,7 @@ resources: - deployment.yaml - service.yaml - pvc.yaml - - ingress-tailscale.yaml + # ingress removed: name 'meals' handed off to mealie-ringtail at cutover - external-secret.yaml images: diff --git a/argocd/manifests/paperless-ringtail/deployment.yaml b/argocd/manifests/paperless-ringtail/deployment.yaml new file mode 100644 index 0000000..de4f456 --- /dev/null +++ b/argocd/manifests/paperless-ringtail/deployment.yaml @@ -0,0 +1,201 @@ +# Paperless-ngx on ringtail k3s — Nix image, multi-process. +# +# The upstream s6 image ran web + worker + scheduler + consumer (and DB +# migrations) in one container. 
The Nix image (containers/paperless/ +# default.nix) ships the binaries but no supervisor, so we run those as +# four containers in one pod, sharing the local data/consume dirs +# (emptyDir) and the NFS media volume; redis is colocated so +# PAPERLESS_REDIS=localhost works for all. A migrate initContainer runs +# DB migrations once before the app containers start. +# +# DB points in-cluster at the ringtail blumeops-pg (was pg.ops.eblu.me on +# indri). PAPERLESS_{DATA_DIR,MEDIA_ROOT,CONSUMPTION_DIR} are set +# explicitly because the Nix package does not default to the upstream +# /usr/src/paperless paths. +apiVersion: apps/v1 +kind: Deployment +metadata: + name: paperless + namespace: paperless +spec: + replicas: 1 + strategy: + type: Recreate + selector: + matchLabels: + app: paperless + template: + metadata: + labels: + app: paperless + spec: + securityContext: + seccompProfile: + type: RuntimeDefault + initContainers: + # redis as a native sidecar (restartPolicy: Always): starts before + # the migrate init and stays running for the app containers, so all + # of them reach PAPERLESS_REDIS=localhost:6379. 
+ - name: redis + image: docker.io/library/redis:kustomized + restartPolicy: Always + ports: + - containerPort: 6379 + volumeMounts: + - name: redis-data + mountPath: /data + resources: + requests: + memory: "32Mi" + cpu: "10m" + limits: + memory: "128Mi" + - name: migrate + image: registry.ops.eblu.me/blumeops/paperless:kustomized + command: ["paperless-ngx", "migrate", "--no-input"] + env: &paperless-env + - name: PAPERLESS_URL + value: "https://paperless.ops.eblu.me" + - name: PAPERLESS_REDIS + value: "redis://localhost:6379" + - name: PAPERLESS_DBHOST + value: "blumeops-pg-rw.databases.svc.cluster.local" + - name: PAPERLESS_DBPORT + value: "5432" + - name: PAPERLESS_DBNAME + value: "paperless" + - name: PAPERLESS_DBUSER + value: "paperless" + - name: PAPERLESS_DBPASS + valueFrom: + secretKeyRef: + name: paperless-secrets + key: db-password + # Explicit port to override the k8s-injected PAPERLESS_PORT + # (service named 'paperless' would set PAPERLESS_PORT=tcp://...) + - name: PAPERLESS_PORT + value: "8000" + - name: PAPERLESS_DATA_DIR + value: "/usr/src/paperless/data" + - name: PAPERLESS_MEDIA_ROOT + value: "/usr/src/paperless/media" + - name: PAPERLESS_CONSUMPTION_DIR + value: "/usr/src/paperless/consume" + - name: PAPERLESS_SECRET_KEY + valueFrom: + secretKeyRef: + name: paperless-secrets + key: secret-key + - name: PAPERLESS_TIME_ZONE + value: "America/Los_Angeles" + - name: PAPERLESS_OCR_LANGUAGE + value: "eng" + - name: PAPERLESS_TASK_WORKERS + value: "1" + - name: PAPERLESS_ADMIN_USER + value: "eblume" + - name: PAPERLESS_ADMIN_PASSWORD + valueFrom: + secretKeyRef: + name: paperless-secrets + key: admin-password + - name: PAPERLESS_ADMIN_MAIL + value: "blume.erich@gmail.com" + - name: PAPERLESS_APPS + value: "allauth.socialaccount.providers.openid_connect" + - name: PAPERLESS_SOCIALACCOUNT_PROVIDERS + valueFrom: + secretKeyRef: + name: paperless-secrets + key: socialaccount-providers + - name: PAPERLESS_SOCIALACCOUNT_ALLOW_SIGNUPS + value: "true" + - 
name: PAPERLESS_SOCIAL_AUTO_SIGNUP + value: "true" + - name: PAPERLESS_ACCOUNT_ALLOW_SIGNUPS + value: "false" + - name: PAPERLESS_REDIRECT_LOGIN_TO_SSO + value: "false" + volumeMounts: &paperless-mounts + - name: data + mountPath: /usr/src/paperless/data + - name: media + mountPath: /usr/src/paperless/media + - name: consume + mountPath: /usr/src/paperless/consume + containers: + - name: web + image: registry.ops.eblu.me/blumeops/paperless:kustomized + ports: + - containerPort: 8000 + name: http + env: *paperless-env + volumeMounts: *paperless-mounts + resources: + requests: + memory: "256Mi" + cpu: "100m" + limits: + memory: "1Gi" + cpu: "1000m" + livenessProbe: + httpGet: + path: / + port: 8000 + initialDelaySeconds: 60 + periodSeconds: 30 + readinessProbe: + httpGet: + path: / + port: 8000 + initialDelaySeconds: 30 + periodSeconds: 10 + + - name: worker + image: registry.ops.eblu.me/blumeops/paperless:kustomized + command: ["celery", "--app", "paperless", "worker", "--loglevel", "INFO"] + env: *paperless-env + volumeMounts: *paperless-mounts + resources: + requests: + memory: "256Mi" + cpu: "100m" + limits: + memory: "1Gi" + cpu: "1000m" + + - name: beat + image: registry.ops.eblu.me/blumeops/paperless:kustomized + command: ["celery", "--app", "paperless", "beat", "--loglevel", "INFO"] + env: *paperless-env + volumeMounts: *paperless-mounts + resources: + requests: + memory: "64Mi" + cpu: "20m" + limits: + memory: "256Mi" + + - name: consumer + image: registry.ops.eblu.me/blumeops/paperless:kustomized + command: ["paperless-ngx", "document_consumer"] + env: *paperless-env + volumeMounts: *paperless-mounts + resources: + requests: + memory: "128Mi" + cpu: "50m" + limits: + memory: "512Mi" + + volumes: + - name: data + emptyDir: {} + - name: media + persistentVolumeClaim: + claimName: paperless-media + - name: consume + emptyDir: {} + - name: redis-data + emptyDir: + sizeLimit: 1Gi diff --git a/argocd/manifests/paperless-ringtail/external-secret.yaml 
b/argocd/manifests/paperless-ringtail/external-secret.yaml new file mode 100644 index 0000000..750b7c5 --- /dev/null +++ b/argocd/manifests/paperless-ringtail/external-secret.yaml @@ -0,0 +1,31 @@ +--- +apiVersion: external-secrets.io/v1 +kind: ExternalSecret +metadata: + name: paperless-secrets + namespace: paperless +spec: + refreshInterval: 1h + secretStoreRef: + kind: ClusterSecretStore + name: onepassword-blumeops + target: + name: paperless-secrets + creationPolicy: Owner + data: + - secretKey: db-password + remoteRef: + key: "Paperless (blumeops)" + property: postgresql-password + - secretKey: secret-key + remoteRef: + key: "Paperless (blumeops)" + property: secret-key + - secretKey: admin-password + remoteRef: + key: "Paperless (blumeops)" + property: admin-password + - secretKey: socialaccount-providers + remoteRef: + key: "Paperless (blumeops)" + property: socialaccount-providers diff --git a/argocd/manifests/paperless/ingress-tailscale.yaml b/argocd/manifests/paperless-ringtail/ingress-tailscale.yaml similarity index 100% rename from argocd/manifests/paperless/ingress-tailscale.yaml rename to argocd/manifests/paperless-ringtail/ingress-tailscale.yaml diff --git a/argocd/manifests/paperless-ringtail/kustomization.yaml b/argocd/manifests/paperless-ringtail/kustomization.yaml new file mode 100644 index 0000000..0a691e0 --- /dev/null +++ b/argocd/manifests/paperless-ringtail/kustomization.yaml @@ -0,0 +1,21 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: paperless + +resources: + - deployment.yaml + - service.yaml + - pv-nfs.yaml + - pvc.yaml + - ingress-tailscale.yaml + - external-secret.yaml + +images: + - name: registry.ops.eblu.me/blumeops/paperless + newTag: v2.20.15-1d4cbbf-nix + # amd64 valkey built via nix (the v8.1.7-ecded30 tag without -nix is the + # arm64 Alpine build for indri and fails on ringtail with exec format error) + - name: docker.io/library/redis + newName: registry.ops.eblu.me/blumeops/valkey + 
newTag: v8.1.7-ecded30-nix diff --git a/argocd/manifests/paperless-ringtail/pv-nfs.yaml b/argocd/manifests/paperless-ringtail/pv-nfs.yaml new file mode 100644 index 0000000..2990d1a --- /dev/null +++ b/argocd/manifests/paperless-ringtail/pv-nfs.yaml @@ -0,0 +1,22 @@ +# NFS PersistentVolume for the Paperless document library, mounted from +# ringtail. Same sifaka export (/volume1/paperless) as the minikube PV, +# but a distinct PV name so both clusters can declare it during the +# parallel-run before cutover. +# +# Prerequisite: sifaka must have an NFS rule granting ringtail Read/Write +# (Squash=No mapping) on the paperless share — the same step done for +# immich. See [[sifaka-nfs-from-ringtail]]. +apiVersion: v1 +kind: PersistentVolume +metadata: + name: paperless-media-nfs-pv-ringtail +spec: + capacity: + storage: 500Gi + accessModes: + - ReadWriteMany + persistentVolumeReclaimPolicy: Retain + storageClassName: "" + nfs: + server: sifaka + path: /volume1/paperless diff --git a/argocd/manifests/paperless-ringtail/pvc.yaml b/argocd/manifests/paperless-ringtail/pvc.yaml new file mode 100644 index 0000000..8b44660 --- /dev/null +++ b/argocd/manifests/paperless-ringtail/pvc.yaml @@ -0,0 +1,15 @@ +# PersistentVolumeClaim for the Paperless document library on ringtail. +# Binds the NFS PV for sifaka:/volume1/paperless. 
+apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: paperless-media + namespace: paperless +spec: + accessModes: + - ReadWriteMany + storageClassName: "" + volumeName: paperless-media-nfs-pv-ringtail + resources: + requests: + storage: 500Gi diff --git a/argocd/manifests/paperless-ringtail/service.yaml b/argocd/manifests/paperless-ringtail/service.yaml new file mode 100644 index 0000000..cff2972 --- /dev/null +++ b/argocd/manifests/paperless-ringtail/service.yaml @@ -0,0 +1,13 @@ +apiVersion: v1 +kind: Service +metadata: + name: paperless + namespace: paperless +spec: + selector: + app: paperless + ports: + - name: http + port: 8000 + targetPort: 8000 + protocol: TCP diff --git a/argocd/manifests/paperless/deployment.yaml b/argocd/manifests/paperless/deployment.yaml index cc2c013..1730486 100644 --- a/argocd/manifests/paperless/deployment.yaml +++ b/argocd/manifests/paperless/deployment.yaml @@ -4,7 +4,10 @@ metadata: name: paperless namespace: paperless spec: - replicas: 1 + # Migrated to ringtail (paperless-ringtail). Scaled to 0 to prevent + # double-writing the now-ringtail-owned database; manifest retained for + # rollback until the decommission PR. See [[migrate-wave1-ringtail]]. 
+ replicas: 0 selector: matchLabels: app: paperless diff --git a/argocd/manifests/paperless/kustomization.yaml b/argocd/manifests/paperless/kustomization.yaml index 3cd0d74..a92a769 100644 --- a/argocd/manifests/paperless/kustomization.yaml +++ b/argocd/manifests/paperless/kustomization.yaml @@ -8,7 +8,7 @@ resources: - service.yaml - pv-nfs.yaml - pvc.yaml - - ingress-tailscale.yaml + # ingress removed: name 'paperless' handed off to paperless-ringtail at cutover - external-secret.yaml images: diff --git a/argocd/manifests/teslamate-ringtail/deployment.yaml b/argocd/manifests/teslamate-ringtail/deployment.yaml new file mode 100644 index 0000000..cf8cc73 --- /dev/null +++ b/argocd/manifests/teslamate-ringtail/deployment.yaml @@ -0,0 +1,72 @@ +# TeslaMate on ringtail k3s — Nix image. +# +# The Nix image's Entrypoint waits for postgres, runs migrations +# (TeslaMate.Release.migrate), then starts the release — so no command +# override is needed. Stateless; all data lives in the teslamate database +# on the ringtail blumeops-pg (DATABASE_HOST already an in-cluster name, +# unchanged from minikube). See [[migrate-wave1-ringtail]]. 
+apiVersion: apps/v1 +kind: Deployment +metadata: + name: teslamate + namespace: teslamate +spec: + replicas: 1 + selector: + matchLabels: + app: teslamate + template: + metadata: + labels: + app: teslamate + spec: + securityContext: + seccompProfile: + type: RuntimeDefault + containers: + - name: teslamate + image: registry.ops.eblu.me/blumeops/teslamate:kustomized + ports: + - containerPort: 4000 + env: + - name: DATABASE_USER + value: "teslamate" + - name: DATABASE_PASS + valueFrom: + secretKeyRef: + name: teslamate-db + key: password + - name: DATABASE_NAME + value: "teslamate" + - name: DATABASE_HOST + value: "blumeops-pg-rw.databases.svc.cluster.local" + - name: ENCRYPTION_KEY + valueFrom: + secretKeyRef: + name: teslamate-encryption + key: key + - name: DISABLE_MQTT + value: "true" + - name: CHECK_ORIGIN + value: "false" + - name: TZ + value: "America/Los_Angeles" + resources: + requests: + memory: "128Mi" + cpu: "100m" + limits: + memory: "512Mi" + cpu: "500m" + livenessProbe: + httpGet: + path: / + port: 4000 + initialDelaySeconds: 30 + periodSeconds: 30 + readinessProbe: + httpGet: + path: / + port: 4000 + initialDelaySeconds: 10 + periodSeconds: 10 diff --git a/argocd/manifests/teslamate-ringtail/external-secret-db.yaml b/argocd/manifests/teslamate-ringtail/external-secret-db.yaml new file mode 100644 index 0000000..11eeec6 --- /dev/null +++ b/argocd/manifests/teslamate-ringtail/external-secret-db.yaml @@ -0,0 +1,25 @@ +# ExternalSecret for TeslaMate database password +# +# Replaces the manual op inject workflow from secret-db.yaml.tpl +# +# 1Password item: "TeslaMate" in blumeops vault +# Field: "db_password" +# +apiVersion: external-secrets.io/v1 +kind: ExternalSecret +metadata: + name: teslamate-db + namespace: teslamate +spec: + refreshInterval: 1h + secretStoreRef: + kind: ClusterSecretStore + name: onepassword-blumeops + target: + name: teslamate-db + creationPolicy: Owner + data: + - secretKey: password + remoteRef: + key: TeslaMate + property: 
db_password diff --git a/argocd/manifests/teslamate-ringtail/external-secret-encryption-key.yaml b/argocd/manifests/teslamate-ringtail/external-secret-encryption-key.yaml new file mode 100644 index 0000000..96938bf --- /dev/null +++ b/argocd/manifests/teslamate-ringtail/external-secret-encryption-key.yaml @@ -0,0 +1,27 @@ +# ExternalSecret for TeslaMate encryption key +# +# Replaces the manual op inject workflow from secret-encryption-key.yaml.tpl +# +# 1Password item: "TeslaMate" in blumeops vault +# Field: "api_enc_key" +# +# This key encrypts Tesla API tokens at rest in the database. +# +apiVersion: external-secrets.io/v1 +kind: ExternalSecret +metadata: + name: teslamate-encryption + namespace: teslamate +spec: + refreshInterval: 1h + secretStoreRef: + kind: ClusterSecretStore + name: onepassword-blumeops + target: + name: teslamate-encryption + creationPolicy: Owner + data: + - secretKey: key + remoteRef: + key: TeslaMate + property: api_enc_key diff --git a/argocd/manifests/teslamate/ingress-tailscale.yaml b/argocd/manifests/teslamate-ringtail/ingress-tailscale.yaml similarity index 100% rename from argocd/manifests/teslamate/ingress-tailscale.yaml rename to argocd/manifests/teslamate-ringtail/ingress-tailscale.yaml diff --git a/argocd/manifests/teslamate-ringtail/kustomization.yaml b/argocd/manifests/teslamate-ringtail/kustomization.yaml new file mode 100644 index 0000000..f31fe09 --- /dev/null +++ b/argocd/manifests/teslamate-ringtail/kustomization.yaml @@ -0,0 +1,15 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: teslamate + +resources: + - deployment.yaml + - service.yaml + - ingress-tailscale.yaml + - external-secret-db.yaml + - external-secret-encryption-key.yaml + +images: + - name: registry.ops.eblu.me/blumeops/teslamate + newTag: v3.0.0-191be1b-nix diff --git a/argocd/manifests/teslamate-ringtail/service.yaml b/argocd/manifests/teslamate-ringtail/service.yaml new file mode 100644 index 0000000..b04f45e --- /dev/null 
+++ b/argocd/manifests/teslamate-ringtail/service.yaml @@ -0,0 +1,12 @@ +apiVersion: v1 +kind: Service +metadata: + name: teslamate + namespace: teslamate +spec: + selector: + app: teslamate + ports: + - port: 4000 + targetPort: 4000 + type: ClusterIP diff --git a/argocd/manifests/teslamate/deployment.yaml b/argocd/manifests/teslamate/deployment.yaml index 42859a7..cf7f9bb 100644 --- a/argocd/manifests/teslamate/deployment.yaml +++ b/argocd/manifests/teslamate/deployment.yaml @@ -4,7 +4,10 @@ metadata: name: teslamate namespace: teslamate spec: - replicas: 1 + # Migrated to ringtail (teslamate-ringtail). Scaled to 0 to prevent + # double-writing the now-ringtail-owned database; manifest retained for + # rollback until the decommission PR. See [[migrate-wave1-ringtail]]. + replicas: 0 selector: matchLabels: app: teslamate diff --git a/argocd/manifests/teslamate/kustomization.yaml b/argocd/manifests/teslamate/kustomization.yaml index a00586f..be9d39d 100644 --- a/argocd/manifests/teslamate/kustomization.yaml +++ b/argocd/manifests/teslamate/kustomization.yaml @@ -6,7 +6,7 @@ namespace: teslamate resources: - deployment.yaml - service.yaml - - ingress-tailscale.yaml + # ingress removed: name 'tesla' handed off to teslamate-ringtail at cutover - external-secret-db.yaml - external-secret-encryption-key.yaml diff --git a/containers/mealie/Dockerfile b/containers/mealie/Dockerfile deleted file mode 100644 index 8df38bf..0000000 --- a/containers/mealie/Dockerfile +++ /dev/null @@ -1,145 +0,0 @@ -# Mealie — self-hosted recipe manager -# Built from source via forge mirror of mealie-recipes/mealie -# Based on upstream docker/Dockerfile (multi-stage: Node frontend + Python backend) - -ARG CONTAINER_APP_VERSION=v3.12.0 - -############################################### -# Frontend Build -############################################### -FROM node:24-slim AS frontend-builder - -ARG CONTAINER_APP_VERSION -RUN apt-get update && apt-get install --no-install-recommends -y git 
ca-certificates && rm -rf /var/lib/apt/lists/* - -RUN git clone --depth 1 --branch ${CONTAINER_APP_VERSION} \ - https://forge.ops.eblu.me/mirrors/mealie.git /src - -WORKDIR /src/frontend - -RUN yarn install \ - --prefer-offline \ - --frozen-lockfile \ - --non-interactive \ - --production=false \ - --network-timeout 1000000 - -RUN yarn generate - -############################################### -# Python Base -############################################### -FROM python:3.12-slim AS python-base - -ENV MEALIE_HOME="/app" -ENV PYTHONUNBUFFERED=1 \ - PYTHONDONTWRITEBYTECODE=1 \ - PIP_NO_CACHE_DIR=off \ - PIP_DISABLE_PIP_VERSION_CHECK=on \ - PIP_DEFAULT_TIMEOUT=100 \ - VENV_PATH="/opt/mealie" - -ENV PATH="$VENV_PATH/bin:$PATH" - -RUN useradd -u 911 -U -d $MEALIE_HOME -s /bin/bash abc \ - && usermod -G users abc \ - && mkdir $MEALIE_HOME - -############################################### -# Backend Package Build -############################################### -FROM python-base AS backend-builder - -ARG CONTAINER_APP_VERSION -RUN apt-get update \ - && apt-get install --no-install-recommends -y curl git ca-certificates \ - && rm -rf /var/lib/apt/lists/* - -RUN pip install uv - -RUN git clone --depth 1 --branch ${CONTAINER_APP_VERSION} \ - https://forge.ops.eblu.me/mirrors/mealie.git /src - -WORKDIR /src - -COPY --from=frontend-builder /src/frontend/dist ./mealie/frontend - -RUN uv build --out-dir dist - -RUN uv export --no-editable --no-emit-project --extra pgsql --format requirements-txt --output-file dist/requirements.txt \ - && MEALIE_VERSION=$(python -c "import tomllib; print(tomllib.load(open('pyproject.toml', 'rb'))['project']['version'])") \ - && echo "mealie[pgsql]==${MEALIE_VERSION} \\" >> dist/requirements.txt \ - && pip hash dist/mealie-${MEALIE_VERSION}-py3-none-any.whl | tail -n1 | tr -d '\n' >> dist/requirements.txt \ - && echo " \\" >> dist/requirements.txt \ - && pip hash dist/mealie-${MEALIE_VERSION}.tar.gz | tail -n1 >> dist/requirements.txt - 
-############################################### -# Python Venv Build -############################################### -FROM python-base AS venv-builder - -RUN apt-get update \ - && apt-get install --no-install-recommends -y \ - build-essential \ - libpq-dev \ - libwebp-dev \ - ffmpeg \ - libsasl2-dev libldap2-dev libssl-dev \ - gnupg gnupg2 gnupg1 \ - && rm -rf /var/lib/apt/lists/* - -RUN python3 -m venv --upgrade-deps $VENV_PATH - -COPY --from=backend-builder /src/dist /dist - -RUN . $VENV_PATH/bin/activate \ - && pip install --require-hashes -r /dist/requirements.txt --find-links /dist - -############################################### -# Production Image -############################################### -FROM python-base AS production - -ENV PRODUCTION=true -ENV TESTING=false - -RUN apt-get update \ - && apt-get install --no-install-recommends -y \ - curl \ - ffmpeg \ - gosu \ - iproute2 \ - libldap-common \ - libldap2 \ - && rm -rf /var/lib/apt/lists/* - -RUN mkdir -p /run/secrets - -COPY --from=venv-builder $VENV_PATH $VENV_PATH - -ENV NLTK_DATA="/nltk_data/" -RUN mkdir -p $NLTK_DATA -RUN python -m nltk.downloader -d $NLTK_DATA averaged_perceptron_tagger_eng - -VOLUME ["$MEALIE_HOME/data/"] -ENV APP_PORT=9000 - -EXPOSE ${APP_PORT} - -COPY --from=backend-builder /src/docker/healthcheck.sh $MEALIE_HOME/healthcheck.sh -RUN chmod +x $MEALIE_HOME/healthcheck.sh -HEALTHCHECK CMD $MEALIE_HOME/healthcheck.sh - -ENV HOST=0.0.0.0 - -COPY --from=backend-builder /src/docker/entry.sh $MEALIE_HOME/run.sh -RUN chmod +x $MEALIE_HOME/run.sh - -ARG CONTAINER_APP_VERSION -LABEL org.opencontainers.image.title="Mealie" -LABEL org.opencontainers.image.description="Self-hosted recipe manager" -LABEL org.opencontainers.image.version="${CONTAINER_APP_VERSION}" -LABEL org.opencontainers.image.source="https://forge.eblu.me/eblume/blumeops" -LABEL org.opencontainers.image.vendor="blumeops" - -ENTRYPOINT ["/app/run.sh"] diff --git a/containers/mealie/default.nix 
b/containers/mealie/default.nix new file mode 100644 index 0000000..fdb1430 --- /dev/null +++ b/containers/mealie/default.nix @@ -0,0 +1,65 @@ +# Nix-built Mealie for ringtail (amd64). +# +# Replaces the from-source Dockerfile build (Node frontend + Python venv) +# with nixpkgs' mealie, which ships a single `mealie` gunicorn entrypoint +# serving the prebuilt frontend + backend — so this is a clean single- +# process wrap (unlike paperless, which is multi-process). +# +# Mealie stores its DB as SQLite under DATA_DIR (the mealie-data PVC at +# /app/data); there is no postgres. The run wrapper mirrors the nixpkgs +# mealie NixOS module: run `libexec/init_db` (Alembic migrations) first, +# then exec gunicorn. +# +# Self-pins nixos-unstable: stable nixpkgs lags at 3.9.2, unstable carries +# 3.16.0. This is a forward 4-minor bump from the v3.12.0 Dockerfile build +# (the deferred upgrade) — mealie auto-migrates the SQLite DB forward on +# startup via init_db; the source PVC is retained for rollback. The version +# assertion makes nix-build fail if a pin bump changes the version. +let + nixpkgs = fetchTarball { + url = "https://github.com/NixOS/nixpkgs/archive/331800de5053fcebacf6813adb5db9c9dca22a0c.tar.gz"; + sha256 = "1p54fm6dkbq62kpi55cr4wyx7b1nsajpsnjgs64cmp073fwi15f7"; + }; + pkgs = import nixpkgs { system = "x86_64-linux"; }; + + version = "3.16.0"; + + app = pkgs.mealie; + + # Mirror the NixOS module's mealie service: init_db (Alembic) then + # gunicorn bound to the app port. DATA_DIR/env come from the image + + # k8s manifest. 
+ mealie-run = pkgs.writeShellScriptBin "mealie-run" '' + set -e + ${app}/libexec/init_db + exec ${pkgs.lib.getExe app} -b 0.0.0.0:9000 + ''; +in + +assert app.version == version; + +pkgs.dockerTools.buildLayeredImage { + name = "blumeops/mealie"; + + contents = [ + app + mealie-run + pkgs.bashInteractive + pkgs.coreutils + pkgs.cacert + pkgs.tzdata + ]; + + config = { + Cmd = [ "${mealie-run}/bin/mealie-run" ]; + Env = [ + "DATA_DIR=/app/data" + "SSL_CERT_FILE=${pkgs.cacert}/etc/ssl/certs/ca-bundle.crt" + "PYTHONUNBUFFERED=1" + "PRODUCTION=true" + ]; + ExposedPorts = { + "9000/tcp" = { }; + }; + }; +} diff --git a/containers/paperless/Dockerfile b/containers/paperless/Dockerfile deleted file mode 100644 index a7b4e65..0000000 --- a/containers/paperless/Dockerfile +++ /dev/null @@ -1,156 +0,0 @@ -# syntax=docker/dockerfile:1 -# Paperless-ngx — self-hosted document management -# Built from source via forge mirror of paperless-ngx/paperless-ngx -# Closely follows upstream Dockerfile structure with git clone instead of COPY - -ARG CONTAINER_APP_VERSION=v2.20.13 - -############################################### -# Stage 1: Clone source (reused by later stages) -############################################### -FROM docker.io/library/alpine:3.22 AS source - -ARG CONTAINER_APP_VERSION -RUN apk add --no-cache git -RUN git clone --depth 1 --branch ${CONTAINER_APP_VERSION} \ - https://forge.ops.eblu.me/mirrors/paperless-ngx.git /src - -############################################### -# Stage 2: Compile frontend -############################################### -FROM --platform=$BUILDPLATFORM docker.io/node:20-trixie-slim AS compile-frontend - -COPY --from=source /src/src-ui /src/src-ui -WORKDIR /src/src-ui - -RUN set -eux \ - && npm update -g pnpm \ - && npm install -g corepack@latest \ - && corepack enable \ - && pnpm install - -RUN set -eux \ - && ./node_modules/.bin/ng build --configuration production - -############################################### -# Stage 3: 
s6-overlay base -############################################### -FROM ghcr.io/astral-sh/uv:0.9.15-python3.12-trixie-slim AS s6-overlay-base - -WORKDIR /usr/src/s6 - -ENV S6_BEHAVIOUR_IF_STAGE2_FAILS=2 \ - S6_CMD_WAIT_FOR_SERVICES_MAXTIME=0 \ - S6_VERBOSITY=1 \ - PATH=/command:$PATH - -ARG TARGETARCH -ARG TARGETVARIANT -ARG S6_OVERLAY_VERSION=3.2.1.0 - -RUN set -eux \ - && apt-get update \ - && apt-get install --yes --quiet --no-install-recommends curl xz-utils \ - && S6_ARCH="" \ - && if [ "${TARGETARCH}${TARGETVARIANT}" = "amd64" ]; then S6_ARCH="x86_64"; \ - elif [ "${TARGETARCH}${TARGETVARIANT}" = "arm64" ]; then S6_ARCH="aarch64"; fi \ - && if [ -z "${S6_ARCH}" ]; then echo "Error: Cannot determine arch"; exit 1; fi \ - && curl --fail --silent --show-error --location --remote-name-all --parallel \ - "https://github.com/just-containers/s6-overlay/releases/download/v${S6_OVERLAY_VERSION}/s6-overlay-noarch.tar.xz" \ - "https://github.com/just-containers/s6-overlay/releases/download/v${S6_OVERLAY_VERSION}/s6-overlay-noarch.tar.xz.sha256" \ - "https://github.com/just-containers/s6-overlay/releases/download/v${S6_OVERLAY_VERSION}/s6-overlay-${S6_ARCH}.tar.xz" \ - "https://github.com/just-containers/s6-overlay/releases/download/v${S6_OVERLAY_VERSION}/s6-overlay-${S6_ARCH}.tar.xz.sha256" \ - && sha256sum --check ./*.sha256 \ - && tar --directory / -Jxpf s6-overlay-noarch.tar.xz \ - && tar --directory / -Jxpf s6-overlay-${S6_ARCH}.tar.xz \ - && rm ./*.tar.xz ./*.sha256 \ - && apt-get --yes purge curl xz-utils \ - && apt-get --yes autoremove --purge \ - && rm -rf /var/lib/apt/lists/* - -# Copy rootfs (s6 service definitions, init scripts) -COPY --from=source /src/docker/rootfs / - -############################################### -# Stage 4: Main application -############################################### -FROM s6-overlay-base AS main-app - -ARG CONTAINER_APP_VERSION -ARG DEBIAN_FRONTEND=noninteractive -ARG TARGETARCH -ARG JBIG2ENC_VERSION=0.30 - -ENV 
PYTHONDONTWRITEBYTECODE=1 \ - PYTHONUNBUFFERED=1 \ - PYTHONWARNINGS="ignore:::django.http.response:517" \ - PNGX_CONTAINERIZED=1 \ - UV_LINK_MODE=copy \ - UV_CACHE_DIR=/cache/uv/ - -# Runtime packages -RUN set -eux \ - && apt-get update \ - && apt-get install --yes --quiet --no-install-recommends \ - curl gosu tzdata fonts-liberation gettext ghostscript gnupg \ - icc-profiles-free imagemagick postgresql-client \ - tesseract-ocr tesseract-ocr-eng tesseract-ocr-deu tesseract-ocr-fra \ - tesseract-ocr-ita tesseract-ocr-spa unpaper pngquant jbig2dec \ - libxml2 libxslt1.1 qpdf file libmagic1 media-types zlib1g \ - libzbar0 poppler-utils \ - && curl --fail --silent --show-error --location --remote-name-all \ - "https://github.com/paperless-ngx/builder/releases/download/jbig2enc-trixie-v${JBIG2ENC_VERSION}/jbig2enc_${JBIG2ENC_VERSION}-1_${TARGETARCH}.deb" \ - && dpkg --install ./jbig2enc_${JBIG2ENC_VERSION}-1_${TARGETARCH}.deb \ - && cp /etc/ImageMagick-6/paperless-policy.xml /etc/ImageMagick-6/policy.xml \ - && rm --force *.deb \ - && rm -rf /var/lib/apt/lists/* - -WORKDIR /usr/src/paperless/src/ - -# Python dependencies -COPY --from=source /src/pyproject.toml /src/uv.lock /usr/src/paperless/src/ - -RUN --mount=type=cache,target=${UV_CACHE_DIR},id=python-cache \ - set -eux \ - && apt-get update \ - && apt-get install --yes --quiet --no-install-recommends \ - build-essential default-libmysqlclient-dev pkg-config \ - && uv export --quiet --no-dev --all-extras --format requirements-txt --output-file requirements.txt \ - && uv pip install --system --no-python-downloads --python-preference system --requirements requirements.txt \ - && python3 -W ignore::RuntimeWarning -m nltk.downloader -d "/usr/share/nltk_data" snowball_data \ - && python3 -W ignore::RuntimeWarning -m nltk.downloader -d "/usr/share/nltk_data" stopwords \ - && python3 -W ignore::RuntimeWarning -m nltk.downloader -d "/usr/share/nltk_data" punkt_tab \ - && apt-get --yes purge build-essential 
default-libmysqlclient-dev pkg-config \ - && apt-get --yes autoremove --purge \ - && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* - -# Copy backend source -COPY --from=source /src/src ./ - -# Copy compiled frontend -COPY --from=compile-frontend /src/src/documents/static/frontend/ ./documents/static/frontend/ - -# Create user and finalize -RUN set -eux \ - && addgroup --gid 1000 paperless \ - && useradd --uid 1000 --gid paperless --home-dir /usr/src/paperless paperless \ - && mkdir -p /usr/src/paperless/data /usr/src/paperless/media \ - /usr/src/paperless/consume /usr/src/paperless/export \ - && chown -R paperless:paperless /usr/src/paperless \ - && s6-setuidgid paperless python3 manage.py collectstatic --clear --no-input --link \ - && s6-setuidgid paperless python3 manage.py compilemessages - -VOLUME ["/usr/src/paperless/data", "/usr/src/paperless/media", \ - "/usr/src/paperless/consume", "/usr/src/paperless/export"] - -ENTRYPOINT ["/init"] -EXPOSE 8000 - -HEALTHCHECK --interval=30s --timeout=10s --retries=5 \ - CMD [ "curl", "-fs", "-S", "-L", "--max-time", "2", "http://localhost:8000" ] - -LABEL org.opencontainers.image.title="Paperless-ngx" -LABEL org.opencontainers.image.description="Self-hosted document management system" -LABEL org.opencontainers.image.version="${CONTAINER_APP_VERSION}" -LABEL org.opencontainers.image.source="https://forge.eblu.me/eblume/blumeops" -LABEL org.opencontainers.image.vendor="blumeops" diff --git a/containers/paperless/default.nix b/containers/paperless/default.nix new file mode 100644 index 0000000..734d909 --- /dev/null +++ b/containers/paperless/default.nix @@ -0,0 +1,77 @@ +# Nix-built Paperless-ngx for ringtail (amd64). +# +# Replaces the from-source Dockerfile build (s6-overlay) with nixpkgs' +# paperless-ngx, which already bundles the full OCR/imaging closure +# (tesseract, ghostscript, imagemagick, qpdf, poppler, jbig2enc) and the +# NLTK data via wrappers — so the image stays lean. 
+# +# Unlike the upstream s6 image, this image does NOT run all processes +# itself. Paperless is multi-process; on ringtail it runs as four +# containers sharing this one image, each with a different command: +# web -> paperless-web (granian, the wrapper below) +# worker -> celery --app paperless worker +# beat -> celery --app paperless beat +# consumer -> paperless-ngx document_consumer +# plus a redis/valkey sidecar. The PYTHONPATH/granian invocation mirrors +# the nixpkgs paperless NixOS module's paperless-web service exactly. +# +# Self-pins nixos-unstable: stable nixpkgs lags at 2.19.6, while unstable +# carries 2.20.15 — a same-minor forward patch bump from the previous +# Dockerfile build (v2.20.13). The version assertion makes nix-build fail +# if a pin bump changes the version, forcing an explicit acknowledgment +# here and in service-versions.yaml (enforced by container-version-check). +let + nixpkgs = fetchTarball { + url = "https://github.com/NixOS/nixpkgs/archive/331800de5053fcebacf6813adb5db9c9dca22a0c.tar.gz"; + sha256 = "1p54fm6dkbq62kpi55cr4wyx7b1nsajpsnjgs64cmp073fwi15f7"; + }; + pkgs = import nixpkgs { system = "x86_64-linux"; }; + + version = "2.20.15"; + + app = pkgs.paperless-ngx; + + # Mirror the NixOS module's paperless-web service: granian serving the + # ASGI app with the package's propagated deps + src on PYTHONPATH. 
+ pythonPath = + "${app.python.pkgs.makePythonPath app.propagatedBuildInputs}:${app}/lib/paperless-ngx/src"; + + paperless-web = pkgs.writeShellScriptBin "paperless-web" '' + export PYTHONPATH="${pythonPath}" + export PAPERLESS_NLTK_DIR="${app.nltkDataDir}" + exec ${app.python.pkgs.granian}/bin/granian \ + --interface asginl --ws \ + --host 0.0.0.0 --port 8000 \ + "paperless.asgi:application" + ''; +in + +assert app.version == version; + +pkgs.dockerTools.buildLayeredImage { + name = "blumeops/paperless"; + + contents = [ + app + paperless-web + pkgs.bashInteractive + pkgs.coreutils + pkgs.cacert + pkgs.tzdata + ]; + + config = { + # Default command is the web server; worker/beat/consumer containers + # override `command` in their k8s manifests. + Cmd = [ "${paperless-web}/bin/paperless-web" ]; + Env = [ + "PAPERLESS_NLTK_DIR=${app.nltkDataDir}" + "SSL_CERT_FILE=${pkgs.cacert}/etc/ssl/certs/ca-bundle.crt" + "PYTHONUNBUFFERED=1" + "PNGX_CONTAINERIZED=1" + ]; + ExposedPorts = { + "8000/tcp" = { }; + }; + }; +} diff --git a/containers/teslamate/container.py b/containers/teslamate/container.py deleted file mode 100644 index 519d77d..0000000 --- a/containers/teslamate/container.py +++ /dev/null @@ -1,104 +0,0 @@ -"""TeslaMate — Tesla data logger. - -Two-stage build: Elixir+Node (builder), Debian slim (runtime). -Source cloned from forge mirror. 
-""" - -import dagger -from dagger import dag - -from blumeops.containers import clone_from_forge, oci_labels - -VERSION = "v3.0.0" - - -async def build(src: dagger.Directory) -> dagger.Container: - source = clone_from_forge("teslamate", VERSION) - - # Stage 1: Build Elixir release with Node.js assets - builder = ( - dag.container() - .from_("elixir:1.19.5-otp-26") - .with_exec( - [ - "bash", - "-c", - "apt-get update" - " && apt-get install -y ca-certificates curl gnupg git zstd brotli" - " && mkdir -p /etc/apt/keyrings" - " && curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key" - " | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg" - ' && echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg]' - ' https://deb.nodesource.com/node_22.x nodistro main"' - " > /etc/apt/sources.list.d/nodesource.list" - " && apt-get update" - " && apt-get install -y nodejs" - " && apt-get clean" - " && rm -rf /var/lib/apt/lists/*", - ] - ) - .with_exec(["mix", "local.rebar", "--force"]) - .with_exec(["mix", "local.hex", "--force"]) - .with_directory("/opt/app", source) - .with_workdir("/opt/app") - .with_env_variable("MIX_ENV", "prod") - .with_exec(["mix", "deps.get", "--only", "prod"]) - .with_exec(["mix", "deps.compile"]) - .with_exec( - [ - "npm", - "ci", - "--prefix", - "./assets", - "--progress=false", - "--no-audit", - "--loglevel=error", - ] - ) - .with_exec(["mix", "assets.deploy"]) - .with_exec(["mix", "compile"]) - .with_exec( - ["bash", "-c", "SKIP_LOCALE_DOWNLOAD=true mix release --path /opt/built"] - ) - ) - - # Stage 2: Debian slim runtime - entrypoint = src.file("containers/teslamate/entrypoint.sh") - - runtime = ( - dag.container() - .from_("debian:trixie-slim") - .with_exec( - [ - "bash", - "-c", - "apt-get update && apt-get install -y --no-install-recommends" - " libodbc2 libsctp1 libssl3t64 libstdc++6" - " netcat-openbsd tini tzdata" - " && apt-get clean" - " && rm -rf /var/lib/apt/lists/*" - " && groupadd --gid 10001 --system nonroot" - " && 
useradd --uid 10000 --system --gid nonroot" - " --home-dir /home/nonroot --shell /sbin/nologin nonroot", - ] - ) - ) - runtime = oci_labels( - runtime, - title="TeslaMate", - description="Tesla data logger and visualization", - version=VERSION, - ) - return ( - runtime.with_env_variable("LANG", "C.UTF-8") - .with_env_variable("SRTM_CACHE", "/opt/app/.srtm_cache") - .with_env_variable("HOME", "/opt/app") - .with_workdir("/opt/app") - .with_directory("/opt/app", builder.directory("/opt/built"), owner="nonroot") - .with_exec(["mkdir", "-p", "/opt/app/.srtm_cache"]) - .with_file("/entrypoint.sh", entrypoint, permissions=0o555, owner="nonroot") - .with_user("nonroot") - .with_exposed_port(4000) - .with_entrypoint(["tini", "--", "/bin/dash", "/entrypoint.sh"]) - .with_default_args(args=["bin/teslamate", "start"]) - ) diff --git a/containers/teslamate/default.nix b/containers/teslamate/default.nix new file mode 100644 index 0000000..e126561 --- /dev/null +++ b/containers/teslamate/default.nix @@ -0,0 +1,122 @@ +# Nix-built TeslaMate for ringtail (amd64). +# +# Replaces the Dagger container.py (Elixir+Node builder -> Debian slim). +# TeslaMate is NOT in nixpkgs, so this is a from-scratch beamPackages +# mixRelease: an Elixir/Phoenix release with npm-built assets. +# +# Pinned to the same nixos-unstable rev as paperless/mealie for a +# consistent toolchain. The BEAM combo is pinned to erlang_27 + elixir_1_18 +# (teslamate requires elixir ~> 1.17; upstream's image uses OTP 26, so we +# stay off the default OTP 28 which elixir 1.18 does not target). +# +# Source comes from the forge mirror (supply-chain control), pinned by the +# v3.0.0 tag's commit so builtins.fetchGit needs no hash. 
+let
+  nixpkgs = fetchTarball {
+    url = "https://github.com/NixOS/nixpkgs/archive/331800de5053fcebacf6813adb5db9c9dca22a0c.tar.gz";
+    sha256 = "1p54fm6dkbq62kpi55cr4wyx7b1nsajpsnjgs64cmp073fwi15f7";
+  };
+  pkgs = import nixpkgs { system = "x86_64-linux"; };
+  lib = pkgs.lib;
+
+  version = "3.0.0";
+
+  beamPackages = pkgs.beam.packages.erlang_27;
+  elixir = beamPackages.elixir_1_18;
+
+  src = builtins.fetchGit {
+    url = "https://forge.ops.eblu.me/mirrors/teslamate.git";
+    ref = "refs/tags/v${version}";
+    rev = "3281154d42330786a182c1bbe094ecda0b1c5578";
+  };
+
+  # ex_cldr downloads locale JSON from GitHub at compile time, which the
+  # build sandbox blocks. teslamate's cldr.ex reads the data dir from the
+  # LOCALES env var; point it at the pre-fetched elixir-cldr data so no
+  # download is attempted (with SKIP_LOCALE_DOWNLOAD=true disabling the
+  # forced refresh). CLDR data version matches the compile-time errors.
+  cldrData = pkgs.fetchFromGitHub {
+    owner = "elixir-cldr";
+    repo = "cldr";
+    rev = "v2.46.0";
+    sha256 = "1iwzk9dc754l72vpf8vsisdjncnjx26pz509552b6vnm49xbxyji";
+  };
+
+  teslamate = beamPackages.mixRelease {
+    pname = "teslamate";
+    inherit version src elixir;
+
+    # Keep the build-generated Erlang cookie in the release. mixRelease
+    # strips it by default (expecting RELEASE_COOKIE at runtime), but the
+    # start script reads releases/COOKIE. teslamate is single-node (no
+    # distributed Erlang exposed), so a baked-in cookie is fine.
+    removeCookie = false;
+
+    mixFodDeps = beamPackages.fetchMixDeps {
+      pname = "mix-deps-teslamate";
+      inherit src version elixir;
+      hash = "sha256-DDrREiM1BIMgD2qFPTK8QyjOYlnfE3XlnaH/jk7G2go=";
+    };
+
+    # Frontend assets. esbuild + sass are devDeps and the esbuild platform
+    # binary is an optional dep, so npm ci must include both. We run npm ci
+    # here (not a separate derivation) because assets/package.json has
+    # file:../deps/phoenix references that only resolve once mixFodDeps has
+    # populated deps/. npmConfigHook wires up the offline cache from npmDeps;
+    # then `node scripts/build.js` (custom esbuild) + `mix phx.digest`.
+    nativeBuildInputs = [ pkgs.nodejs pkgs.npmHooks.npmConfigHook ];
+    npmDeps = pkgs.fetchNpmDeps {
+      name = "teslamate-npm-deps";
+      src = src + "/assets";
+      hash = "sha256-XyiaUkT/c4rZnNxmxhVLb+vEXnc64A1hjOrnR5fhaEk=";
+    };
+    npmRoot = "assets";
+
+    preBuild = ''
+      export SKIP_LOCALE_DOWNLOAD=true
+      export LOCALES=${cldrData}/priv/cldr
+      ( cd assets && npm ci --include=dev --include=optional && node scripts/build.js )
+      mix phx.digest --no-deps-check
+    '';
+  };
+in
+
+pkgs.dockerTools.buildLayeredImage {
+  name = "blumeops/teslamate";
+
+  contents = [
+    teslamate
+    pkgs.bashInteractive
+    pkgs.coreutils
+    pkgs.dash
+    pkgs.netcat-openbsd
+    pkgs.cacert
+    pkgs.tzdata
+  ];
+
+  config = {
+    # Mirror entrypoint.sh: wait for postgres, run migrations, then start. `-e` restores its `set -e`, so a failed migration aborts the container instead of starting the app anyway.
+    Entrypoint = [
+      "${pkgs.dash}/bin/dash"
+      "-ec"
+      ''
+        : "''${DATABASE_HOST:=127.0.0.1}"
+        : "''${DATABASE_PORT:=5432}"
+        while ! ${pkgs.netcat-openbsd}/bin/nc -z "$DATABASE_HOST" "$DATABASE_PORT" 2>/dev/null; do
+          echo "waiting for postgres at $DATABASE_HOST:$DATABASE_PORT"; sleep 1
+        done
+        ${teslamate}/bin/teslamate eval "TeslaMate.Release.migrate"
+        exec ${teslamate}/bin/teslamate start
+      ''
+    ];
+    Env = [
+      "HOME=/opt/app"
+      "SRTM_CACHE=/opt/app/.srtm_cache"
+      "LANG=C.UTF-8"
+      "SSL_CERT_FILE=${pkgs.cacert}/etc/ssl/certs/ca-bundle.crt"
+    ];
+    ExposedPorts = {
+      "4000/tcp" = { };
+    };
+  };
+}
diff --git a/containers/teslamate/entrypoint.sh b/containers/teslamate/entrypoint.sh
deleted file mode 100644
index f66117e..0000000
--- a/containers/teslamate/entrypoint.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/usr/bin/env dash
-set -e
-
-: "${DATABASE_HOST:="127.0.0.1"}"
-: "${DATABASE_PORT:=5432}"
-: "${ULIMIT_MAX_NOFILE:=65536}"
-
-# prevent memory bloat in some misconfigured versions of Docker/containerd
-# where the nofiles limit is very large. 0 means don't set it.
-if test "${ULIMIT_MAX_NOFILE}" != 0 && test "$(ulimit -n)" -gt "${ULIMIT_MAX_NOFILE}"; then - ulimit -n "${ULIMIT_MAX_NOFILE}" -fi - -# wait until Postgres is ready -while ! nc -z "${DATABASE_HOST}" "${DATABASE_PORT}" 2>/dev/null; do - echo waiting for postgres at "${DATABASE_HOST}":"${DATABASE_PORT}" - sleep 1s -done - -# apply migrations -bin/teslamate eval "TeslaMate.Release.migrate" - -exec "$@" diff --git a/docs/changelog.d/migrate-wave1-ringtail.infra.md b/docs/changelog.d/migrate-wave1-ringtail.infra.md new file mode 100644 index 0000000..c44263a --- /dev/null +++ b/docs/changelog.d/migrate-wave1-ringtail.infra.md @@ -0,0 +1,13 @@ +Move paperless, teslamate, and mealie off `minikube-indri` onto +`k3s-ringtail`, shedding ~1.1 GiB of resident load from the +OOM-thrashing 8 GiB minikube node (the kernel OOM killer had been +killing `kube-apiserver`/`dockerd`/argocd, flapping every +minikube-hosted service at once). paperless + teslamate databases +move into a fresh CNPG `blumeops-pg` cluster on ringtail via a cold +`pg_dump`/`pg_restore` from the quiesced source — row counts verified +equal before any routing flip; source DBs dropped only after the +ringtail side serves traffic. mealie's SQLite PVC is copied as-is. +paperless media stays on sifaka NFS. Downtime-tolerant cold cutover +(no streaming replication); rollback is repoint-and-scale-up with the +source untouched. Second chain in the indri-k8s decommission after +[[migrate-immich-to-ringtail]]. diff --git a/docs/how-to/immich/migrate-immich-to-ringtail.md b/docs/how-to/immich/migrate-immich-to-ringtail.md index cd23384..e654b62 100644 --- a/docs/how-to/immich/migrate-immich-to-ringtail.md +++ b/docs/how-to/immich/migrate-immich-to-ringtail.md @@ -122,6 +122,8 @@ file). 
## Related +- [[migrate-wave1-ringtail]] — the next chain in the indri-k8s + decommission: paperless, teslamate, and mealie - [[shower-on-ringtail]] — a previous migration to ringtail (simpler: no upstream cluster, SQLite, no GPU) - [[connect-to-postgres]] — getting a psql session against CNPG diff --git a/docs/how-to/ringtail/migrate-wave1-ringtail.md b/docs/how-to/ringtail/migrate-wave1-ringtail.md new file mode 100644 index 0000000..ffb8cdc --- /dev/null +++ b/docs/how-to/ringtail/migrate-wave1-ringtail.md @@ -0,0 +1,176 @@ +--- +title: Migrate Wave 1 (paperless, teslamate, mealie) to Ringtail +modified: 2026-06-03 +last-reviewed: 2026-06-03 +tags: + - how-to + - operations + - ringtail + - migration +--- + +# Migrate Wave 1 to Ringtail + +Move paperless, teslamate, and mealie off `minikube-indri` and onto +`k3s-ringtail`. This is the load-shedding response to minikube going +OOM: the kernel OOM killer was thrashing the 8 GiB node — killing +`kube-apiserver`, `dockerd`, and the argocd application-controller — +which made every minikube-hosted service probe-flap at once. These +three app pods are ~1.1 GiB resident combined and are the heaviest +non-observability tenants left on minikube. Following +[[migrate-immich-to-ringtail]], the first chain in the indri-k8s +decommission. + +## End state + +- `paperless`, `teslamate`, and `mealie` run on ringtail k3s in their + own namespaces, off minikube entirely. +- A CNPG `blumeops-pg` Cluster runs in a `databases` namespace on + ringtail (PostgreSQL, owned by ringtail's `cnpg-system` operator), + holding the `paperless` and `teslamate` databases. Apps reach it + in-cluster via `blumeops-pg-rw.databases.svc.cluster.local`. +- mealie keeps its SQLite database; its 2 GiB `mealie-data` PVC is + copied to a ringtail PVC. +- paperless media still lives on [[sifaka]] via NFS (RWX, 500 GiB), + mounted from ringtail pods. teslamate has no file state. 
+- Routing: `paperless.ops.eblu.me`, `teslamate.ops.eblu.me`, and + `mealie.ops.eblu.me` (Caddy on indri) proxy to Tailscale + ProxyGroup ingresses on ringtail. Service names are unchanged. +- The minikube manifests and the `paperless`/`teslamate`/`mealie` + databases inside indri's `blumeops-pg` are removed only after + cutover is verified. + +## Non-goals + +- Migrating the rest of `blumeops-pg` (e.g. miniflux) — that is a + later wave. This chain moves only the paperless + teslamate + databases out; the source cluster on indri stays up for the others. +- Version bumps or config changes. Lift-and-shift only. +- Public (Fly) exposure changes. These stay tailnet-only. +- The observability stack (prometheus/loki/tempo/grafana) — deferred; + it carries 50 GiB of local TSDB and is the riskiest move. + +## Critical constraint: no data loss + +**Downtime is acceptable — data loss is not.** We can take each +service fully offline for its cutover, which removes the entire +class of streaming-replication and double-writer hazards. The cold +dump is taken from a *quiesced* source, so it is internally +consistent. + +Data surfaces: + +1. **paperless postgres** — document metadata, tags, correspondents, + the search index state. The document *files* are on NFS and never + move, but losing the DB means files-without-index. This is the + surface to protect most carefully. +2. **teslamate postgres** — drive/charge history. Re-derivable only + from Tesla's API for a limited window; treat as unrecoverable. +3. **mealie SQLite** — recipes, meal plans. On the `mealie-data` PVC. + +The source databases on indri are **never dropped until the ringtail +side is verified and serving**. Rollback is "repoint and scale back +up," not "restore from backup." [[borgmatic]] remains the backstop. 
+ +## Why a fresh CNPG cluster (not cross-cluster pg) + +indri's `blumeops-pg` is already exposed tailnet-wide at +`pg.ops.eblu.me` (Caddy L4), so we *could* leave the DBs on indri and +just move the app pods. We are not, because: + +- The goal is to retire minikube — keeping pg there blocks it and + leaves a cross-host runtime dependency (ringtail apps SPOF on + indri's pg over the tailnet). +- CNPG is the same operator on both clusters; a Cluster CR on ringtail + is mechanically equivalent to the one on minikube. +- Naming the ringtail cluster `blumeops-pg` in `databases` lets apps + use the same in-cluster DNS they would on indri. + +## Cold-cutover procedure (per service) + +Do these one service at a time. paperless first (heaviest, highest +data-sensitivity), then teslamate, then mealie. + +### 0. Prerequisites (once, before any service) + +- Confirm ringtail's `cnpg-system` operator and `databases` namespace + are healthy (immich-pg already runs there). +- Confirm ringtail pods can reach indri's `pg.ops.eblu.me:5432` (used + only to pull the dump) and the sifaka NFS export for paperless + media. See [[sifaka-nfs-from-ringtail]]. +- Define the ringtail `blumeops-pg` CNPG Cluster manifest (model on + `databases-ringtail/immich-pg.yaml`) and its ExternalSecrets for + the per-app roles. Sync it; let it come up empty and healthy. + +### 1. Quiesce the source + +```fish +kubectl --context=minikube-indri -n <service> scale deploy/<service> --replicas=0 +# confirm 0 running, DB now has no writers +``` + +### 2. Dump from indri, restore to ringtail (postgres apps) + +```fish +# dump the single app DB from the quiesced source +kubectl --context=minikube-indri -n databases exec blumeops-pg-1 -- \ + pg_dump -Fc -d <service> > /tmp/<service>.dump + +# restore into the ringtail cluster +kubectl --context=k3s-ringtail -n databases exec -i blumeops-pg-1 -- \ + pg_restore --no-owner --role=<service> -d <service> < /tmp/<service>.dump +``` + +For **mealie** (SQLite) instead: copy the `mealie-data` PVC contents +to the ringtail PVC (e.g. 
a one-shot rsync pod mounting both, or +`kubectl cp` via a helper pod). Verify the `.db` file size and that +mealie boots read-only against it. + +### 3. Verify the restore (before any routing flips) + +- Row counts match source for the key tables, scripted: + - paperless: `documents_document`, `documents_tag`, + `documents_correspondent`, `auth_user`. + - teslamate: `cars`, `drives`, `charging_processes`, `positions`. +- `pg_dump --schema-only --no-owner` diff between source and dest is + empty modulo CNPG-managed roles. +- Boot the app against the ringtail DB on its tailnet name *before* + Caddy is flipped, and smoke-test (paperless: documents list + + search; teslamate: dashboard loads recent drives; mealie: recipes + list). + +### 4. Release the service name + +```fish +# delete the minikube tailscale ingress so ringtail can claim the name +kubectl --context=minikube-indri -n <service> delete ingress <service>-tailscale +``` + +### 5. Bring up on ringtail + +- Apply the ringtail manifests (new ArgoCD app `<service>-ringtail`, + `destination.server` = `https://ringtail.tail8d86e.ts.net:6443`). + App points at `blumeops-pg-rw.databases.svc.cluster.local`. +- Sync; wait for healthy + the ProxyGroup ingress to get its name. + +### 6. Flip routing + +- Repoint the Caddy `<service>.ops.eblu.me` upstream at the ringtail + ProxyGroup ingress (provision-indri, caddy role). +- `mise run services-check` — confirm the service flips from FIRING + to OK and no neighbours regressed. + +### 7. Decommission the source (only after verification) + +- Remove the minikube manifests for the app. +- Drop the app DB from indri's `blumeops-pg` (paperless/teslamate) + **last**, once the ringtail side has served real traffic. + +## Rollback + +If a cutover fails verification at any step before §7: + +- Re-create the minikube tailscale ingress (if §4 ran). +- Scale the minikube app back to `1`. +- Repoint Caddy back to the minikube ingress. +- The source DB was never modified or dropped. Document the failure. 
diff --git a/service-versions.yaml b/service-versions.yaml index 5440f01..699f89c 100644 --- a/service-versions.yaml +++ b/service-versions.yaml @@ -47,7 +47,7 @@ services: - name: shower type: argocd last-reviewed: 2026-05-15 - current-version: "1.1.2" + current-version: "1.1.3" upstream-source: https://forge.eblu.me/eblume/adelaide-baby-shower-app notes: | Django app for Adelaide / Heidi / Addie's baby shower. Wheel @@ -222,9 +222,17 @@ services: - name: teslamate type: argocd - last-reviewed: 2026-04-14 + last-reviewed: "2026-06-03" current-version: "v3.0.0" upstream-source: https://github.com/teslamate-org/teslamate/releases + notes: >- + Tesla data logger. Container ported from Dagger (container.py) to Nix + (containers/teslamate/default.nix) — a from-scratch beamPackages + mixRelease (Elixir/Phoenix release with npm-built assets), since + teslamate is not in nixpkgs. Pins erlang_27 + elixir_1_18 from the + shared nixos-unstable rev; assets via in-release npm ci + esbuild; + ex_cldr locale data pre-fetched (LOCALES env) to avoid sandbox + downloads. Version unchanged (v3.0.0). Build verified on ringtail. - name: transmission type: argocd @@ -328,21 +336,31 @@ services: - name: mealie type: argocd - last-reviewed: 2026-05-11 - current-version: "v3.12.0" + last-reviewed: "2026-06-03" + current-version: "v3.16.0" upstream-source: https://github.com/mealie-recipes/mealie/releases notes: >- - Recipe manager; built from source via forge mirror. - Upstream is at v3.17.0 as of 2026-05-11 (5 minor versions ahead). - Container/manifest still pinned to v3.12.0 — upgrade deferred to a - separate task (build new image, review changelog for breaking changes). + Recipe manager. Container ported from Dockerfile to Nix + (containers/mealie/default.nix wraps nixpkgs mealie from a pinned + nixos-unstable; single gunicorn process, SQLite on the mealie-data + PVC). Bumped v3.12.0 -> v3.16.0 as part of the port (the deferred + upgrade). 
Breaking-change review v3.13-v3.16: no schema breaking + changes, SQLite auto-migrates forward via init_db; notable items are + minor (OIDC missing-claims log -> DEBUG, NLP parser uses user-defined + units, Nuxt 3->4 frontend, new Announcements feature, path-traversal + patches). Source PVC retained for rollback. Build verified on ringtail. - name: paperless type: argocd - last-reviewed: "2026-04-08" - current-version: "v2.20.13" + last-reviewed: "2026-06-03" + current-version: "v2.20.15" upstream-source: https://github.com/paperless-ngx/paperless-ngx/releases - notes: Document management; built from source via forge mirror + notes: >- + Document management. Container ported from Dockerfile to Nix + (containers/paperless/default.nix wraps nixpkgs paperless-ngx from a + pinned nixos-unstable). Runs as web/worker/beat/consumer containers on + ringtail (multi-process; no s6). Bumped v2.20.13 -> v2.20.15 (the + unstable package version, same-minor patch) as part of the port. - name: unpoller type: argocd From 92b54e7ba9a41b461a423cfdd5a53278a7e4ac40 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Wed, 3 Jun 2026 10:36:15 -0700 Subject: [PATCH 103/122] C0: ringtail wave-1 images rebuilt from main (fcac8e5-nix tags) Post-merge rebuild of paperless/mealie/teslamate Nix images at the main merge commit, replacing the feature-branch -nix tags. Image content is identical; only the commit-sha suffix changes. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- argocd/manifests/mealie-ringtail/kustomization.yaml | 2 +- argocd/manifests/paperless-ringtail/kustomization.yaml | 2 +- argocd/manifests/teslamate-ringtail/kustomization.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/argocd/manifests/mealie-ringtail/kustomization.yaml b/argocd/manifests/mealie-ringtail/kustomization.yaml index 8428042..2b6a7ef 100644 --- a/argocd/manifests/mealie-ringtail/kustomization.yaml +++ b/argocd/manifests/mealie-ringtail/kustomization.yaml @@ -12,4 +12,4 @@ resources: images: - name: registry.ops.eblu.me/blumeops/mealie - newTag: v3.16.0-1d4cbbf-nix + newTag: v3.16.0-fcac8e5-nix diff --git a/argocd/manifests/paperless-ringtail/kustomization.yaml b/argocd/manifests/paperless-ringtail/kustomization.yaml index 0a691e0..41665b8 100644 --- a/argocd/manifests/paperless-ringtail/kustomization.yaml +++ b/argocd/manifests/paperless-ringtail/kustomization.yaml @@ -13,7 +13,7 @@ resources: images: - name: registry.ops.eblu.me/blumeops/paperless - newTag: v2.20.15-1d4cbbf-nix + newTag: v2.20.15-fcac8e5-nix # amd64 valkey built via nix (the v8.1.7-ecded30 tag without -nix is the # arm64 Alpine build for indri and fails on ringtail with exec format error) - name: docker.io/library/redis diff --git a/argocd/manifests/teslamate-ringtail/kustomization.yaml b/argocd/manifests/teslamate-ringtail/kustomization.yaml index f31fe09..acb623e 100644 --- a/argocd/manifests/teslamate-ringtail/kustomization.yaml +++ b/argocd/manifests/teslamate-ringtail/kustomization.yaml @@ -12,4 +12,4 @@ resources: images: - name: registry.ops.eblu.me/blumeops/teslamate - newTag: v3.0.0-191be1b-nix + newTag: v3.0.0-fcac8e5-nix From e0057b46e4c7266fc4c01db7a88af69ae65ff655 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Wed, 3 Jun 2026 12:25:30 -0700 Subject: [PATCH 104/122] Wire ringtail blumeops-pg into backups + Grafana (#364) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit Prereq for the wave-1 decommission. The cutover moved paperless+teslamate (postgres) and mealie (SQLite) to ringtail, but borgmatic and the Grafana TeslaMate datasource still pointed at the minikube copies — the migrated live data was unbacked since cutover, and dropping the minikube DBs would break the TeslaMate dashboards. - Tailscale Service `blumeops-pg-ringtail` + Caddy L4 route `pg.ops.eblu.me:5434` - borgmatic: teslamate + paperless postgres → :5434; mealie SQLite → ssh:eblume@ringtail - Grafana TeslaMate datasource → pg.ops.eblu.me:5434 Deploy: sync databases-ringtail (tailscale svc) + grafana from branch; provision-indri --tags caddy,borgmatic; verify a backup run + dashboards. Unblocks the decommission PR. Reviewed-on: https://forge.eblu.me/eblume/blumeops/pulls/364 --- ansible/roles/borgmatic/defaults/main.yml | 16 +++++++------ ansible/roles/borgmatic/tasks/main.yml | 2 ++ .../borgmatic/templates/k8s-sqlite-dump.sh.j2 | 4 +++- ansible/roles/caddy/defaults/main.yml | 2 ++ .../databases-ringtail/kustomization.yaml | 1 + .../service-blumeops-pg-tailscale.yaml | 24 +++++++++++++++++++ argocd/manifests/grafana/datasources.yaml | 4 +++- .../mealie-ringtail/kustomization.yaml | 2 +- containers/mealie/default.nix | 4 ++++ ...ckup-grafana-ringtail-blumeops-pg.infra.md | 8 +++++++ 10 files changed, 57 insertions(+), 10 deletions(-) create mode 100644 argocd/manifests/databases-ringtail/service-blumeops-pg-tailscale.yaml create mode 100644 docs/changelog.d/backup-grafana-ringtail-blumeops-pg.infra.md diff --git a/ansible/roles/borgmatic/defaults/main.yml b/ansible/roles/borgmatic/defaults/main.yml index 3a89a09..a743161 100644 --- a/ansible/roles/borgmatic/defaults/main.yml +++ b/ansible/roles/borgmatic/defaults/main.yml @@ -56,8 +56,9 @@ borgmatic_k8s_sqlite_dumps: namespace: mealie label_selector: app=mealie db_path: /app/data/mealie.db - # local kubectl, --context=minikube (indri's only configured ctx) - target: local:minikube + 
# migrated to ringtail (wave-1); ssh to ringtail and run k3s kubectl + # there, same as shower below. + target: ssh:eblume@ringtail - name: shower namespace: shower label_selector: app=shower @@ -102,17 +103,18 @@ borgmatic_postgresql_databases: hostname: pg.ops.eblu.me port: 5432 username: borgmatic - - name: teslamate - hostname: pg.ops.eblu.me - port: 5432 - username: borgmatic - name: authentik hostname: pg.ops.eblu.me port: 5432 username: borgmatic + # migrated to ringtail blumeops-pg (wave-1); port 5434 = Caddy L4 route + - name: teslamate + hostname: pg.ops.eblu.me + port: 5434 + username: borgmatic - name: paperless hostname: pg.ops.eblu.me - port: 5432 + port: 5434 username: borgmatic # immich-pg cluster (VectorChord) via Caddy L4 on port 5433 - name: immich diff --git a/ansible/roles/borgmatic/tasks/main.yml b/ansible/roles/borgmatic/tasks/main.yml index 4ac242c..36d3bb6 100644 --- a/ansible/roles/borgmatic/tasks/main.yml +++ b/ansible/roles/borgmatic/tasks/main.yml @@ -19,8 +19,10 @@ ansible.builtin.copy: content: | # Managed by ansible (borgmatic role) - k8s PostgreSQL backup credentials + # 5432 = minikube blumeops-pg, 5433 = immich-pg, 5434 = ringtail blumeops-pg pg.ops.eblu.me:5432:*:borgmatic:{{ borgmatic_db_password }} pg.ops.eblu.me:5433:*:borgmatic:{{ borgmatic_db_password }} + pg.ops.eblu.me:5434:*:borgmatic:{{ borgmatic_db_password }} dest: ~/.pgpass mode: '0600' no_log: true diff --git a/ansible/roles/borgmatic/templates/k8s-sqlite-dump.sh.j2 b/ansible/roles/borgmatic/templates/k8s-sqlite-dump.sh.j2 index 323e717..9cc24da 100644 --- a/ansible/roles/borgmatic/templates/k8s-sqlite-dump.sh.j2 +++ b/ansible/roles/borgmatic/templates/k8s-sqlite-dump.sh.j2 @@ -28,7 +28,9 @@ db_path=${4:?missing db path} name=${5:?missing name} dump_target=${6:?missing dump target} -pod_tmp="/tmp/${name}-backup.db" +# Stage the backup next to the source DB (a guaranteed-writable volume); +# minimal nix images (e.g. mealie) have no /tmp. 
+pod_tmp="$(dirname "$db_path")/.borgmatic-backup-${name}.db" python_backup='import sqlite3; sqlite3.connect("'"$db_path"'").backup(sqlite3.connect("'"$pod_tmp"'"))' diff --git a/ansible/roles/caddy/defaults/main.yml b/ansible/roles/caddy/defaults/main.yml index da6f3f9..363d09e 100644 --- a/ansible/roles/caddy/defaults/main.yml +++ b/ansible/roles/caddy/defaults/main.yml @@ -117,6 +117,8 @@ caddy_tcp_services: backend: "pg.tail8d86e.ts.net:5432" # PostgreSQL (blumeops-pg) - port: 5433 backend: "immich-pg.tail8d86e.ts.net:5432" # PostgreSQL (immich-pg) + - port: 5434 + backend: "blumeops-pg-ringtail.tail8d86e.ts.net:5432" # PostgreSQL (blumeops-pg on ringtail) - port: "{{ sifaka_node_exporter_port }}" backend: "sifaka:{{ sifaka_node_exporter_port }}" # Sifaka node_exporter - port: "{{ sifaka_smartctl_exporter_port }}" diff --git a/argocd/manifests/databases-ringtail/kustomization.yaml b/argocd/manifests/databases-ringtail/kustomization.yaml index 2bc2af3..143345c 100644 --- a/argocd/manifests/databases-ringtail/kustomization.yaml +++ b/argocd/manifests/databases-ringtail/kustomization.yaml @@ -9,6 +9,7 @@ resources: - service-immich-pg-tailscale.yaml # wave-1 indri-k8s decommission: blumeops-pg (paperless + teslamate) - blumeops-pg.yaml + - service-blumeops-pg-tailscale.yaml - external-secret-eblume.yaml - external-secret-borgmatic.yaml - external-secret-paperless.yaml diff --git a/argocd/manifests/databases-ringtail/service-blumeops-pg-tailscale.yaml b/argocd/manifests/databases-ringtail/service-blumeops-pg-tailscale.yaml new file mode 100644 index 0000000..f7ca5ef --- /dev/null +++ b/argocd/manifests/databases-ringtail/service-blumeops-pg-tailscale.yaml @@ -0,0 +1,24 @@ +# Tailscale LoadBalancer for the ringtail blumeops-pg cluster. +# Canonical hostname: blumeops-pg-ringtail.tail8d86e.ts.net (distinct from +# the minikube blumeops-pg, which still owns pg.tail8d86e.ts.net until the +# wave-1 decommission). 
Borgmatic on indri and the Grafana TeslaMate +# datasource reach it via the Caddy L4 route pg.ops.eblu.me:5434. +apiVersion: v1 +kind: Service +metadata: + name: blumeops-pg-tailscale + namespace: databases + annotations: + tailscale.com/hostname: "blumeops-pg-ringtail" + tailscale.com/proxy-class: "default" +spec: + type: LoadBalancer + loadBalancerClass: tailscale + selector: + cnpg.io/cluster: blumeops-pg + role: primary + ports: + - name: postgresql + port: 5432 + targetPort: 5432 + protocol: TCP diff --git a/argocd/manifests/grafana/datasources.yaml b/argocd/manifests/grafana/datasources.yaml index 5a3d0f3..64ed2bf 100644 --- a/argocd/manifests/grafana/datasources.yaml +++ b/argocd/manifests/grafana/datasources.yaml @@ -63,5 +63,7 @@ datasources: password: $TESLAMATE_DB_PASSWORD type: postgres uid: TeslaMate - url: blumeops-pg-rw.databases.svc.cluster.local:5432 + # teslamate DB migrated to ringtail blumeops-pg (wave-1); reached via the + # Caddy L4 route on indri (pg.ops.eblu.me:5434 -> blumeops-pg-ringtail). + url: pg.ops.eblu.me:5434 user: teslamate diff --git a/argocd/manifests/mealie-ringtail/kustomization.yaml b/argocd/manifests/mealie-ringtail/kustomization.yaml index 2b6a7ef..7679032 100644 --- a/argocd/manifests/mealie-ringtail/kustomization.yaml +++ b/argocd/manifests/mealie-ringtail/kustomization.yaml @@ -12,4 +12,4 @@ resources: images: - name: registry.ops.eblu.me/blumeops/mealie - newTag: v3.16.0-fcac8e5-nix + newTag: v3.16.0-22cfd86-nix diff --git a/containers/mealie/default.nix b/containers/mealie/default.nix index fdb1430..e55efe3 100644 --- a/containers/mealie/default.nix +++ b/containers/mealie/default.nix @@ -48,6 +48,10 @@ pkgs.dockerTools.buildLayeredImage { pkgs.coreutils pkgs.cacert pkgs.tzdata + # python3 (stdlib sqlite3) for the borgmatic k8s-sqlite-dump helper, + # which runs `python3 -c "...sqlite3...backup..."` inside the pod. + # Same nixpkgs python mealie is built against, so ~no added closure. 
+ pkgs.python3 ]; config = { diff --git a/docs/changelog.d/backup-grafana-ringtail-blumeops-pg.infra.md b/docs/changelog.d/backup-grafana-ringtail-blumeops-pg.infra.md new file mode 100644 index 0000000..33b041f --- /dev/null +++ b/docs/changelog.d/backup-grafana-ringtail-blumeops-pg.infra.md @@ -0,0 +1,8 @@ +Wire the ringtail `blumeops-pg` cluster (which holds the wave-1-migrated +paperless + teslamate databases) into backups and Grafana. Adds a Tailscale +LoadBalancer Service (`blumeops-pg-ringtail.tail8d86e.ts.net`) and a Caddy L4 +route (`pg.ops.eblu.me:5434`), then repoints borgmatic's `teslamate` + +`paperless` postgres dumps and the `mealie` SQLite dump at ringtail, and the +Grafana TeslaMate datasource at the ringtail DB. Closes the backup gap that +opened at cutover (the migrated live data was still being backed up from the +now-frozen minikube copies) and unblocks the wave-1 decommission. From 44798a6429adea3822041755af5ddd22ac149b98 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Wed, 3 Jun 2026 12:26:55 -0700 Subject: [PATCH 105/122] C0: mealie-ringtail image rebuilt from main (e0057b4-nix) Co-Authored-By: Claude Opus 4.8 (1M context) --- argocd/manifests/mealie-ringtail/kustomization.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/argocd/manifests/mealie-ringtail/kustomization.yaml b/argocd/manifests/mealie-ringtail/kustomization.yaml index 7679032..ad65785 100644 --- a/argocd/manifests/mealie-ringtail/kustomization.yaml +++ b/argocd/manifests/mealie-ringtail/kustomization.yaml @@ -12,4 +12,4 @@ resources: images: - name: registry.ops.eblu.me/blumeops/mealie - newTag: v3.16.0-22cfd86-nix + newTag: v3.16.0-e0057b4-nix From 46f00021781e835fddc80de06588fb4ae87d5f5f Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Wed, 3 Jun 2026 12:36:06 -0700 Subject: [PATCH 106/122] Decommission wave-1 minikube services (paperless, teslamate, mealie) (#365) Final step of the wave-1 indri-k8s migration. 
paperless, teslamate, mealie run on ringtail with data migrated, verified, and backed up (local + BorgBase offsite via PR #364). - Remove minikube paperless/teslamate/mealie manifest dirs + ArgoCD app defs (prunes the parked Deployments/Services + redundant minikube mealie/paperless PVCs) - Drop paperless/teslamate roles + ExternalSecrets from the minikube blumeops-pg cluster - miniflux + authentik stay on minikube (later waves) Finalization after merge: sync apps + databases to prune, then DROP DATABASE paperless/teslamate on indri's blumeops-pg (fresh safety dump taken first). Reviewed-on: https://forge.eblu.me/eblume/blumeops/pulls/365 --- argocd/apps/mealie.yaml | 17 --- argocd/apps/paperless.yaml | 17 --- argocd/apps/teslamate.yaml | 32 ----- argocd/manifests/databases/blumeops-pg.yaml | 23 +-- .../databases/external-secret-paperless.yaml | 28 ---- .../databases/external-secret-teslamate.yaml | 30 ---- argocd/manifests/databases/kustomization.yaml | 2 - argocd/manifests/mealie/deployment.yaml | 96 ------------- argocd/manifests/mealie/external-secret.yaml | 23 --- argocd/manifests/mealie/kustomization.yaml | 15 -- argocd/manifests/mealie/pvc.yaml | 13 -- argocd/manifests/mealie/service.yaml | 13 -- argocd/manifests/paperless/deployment.yaml | 133 ------------------ .../manifests/paperless/external-secret.yaml | 31 ---- argocd/manifests/paperless/kustomization.yaml | 19 --- argocd/manifests/paperless/pv-nfs.yaml | 22 --- argocd/manifests/paperless/pvc.yaml | 15 -- argocd/manifests/paperless/service.yaml | 13 -- argocd/manifests/teslamate/README.md | 69 --------- argocd/manifests/teslamate/deployment.yaml | 68 --------- .../teslamate/external-secret-db.yaml | 25 ---- .../external-secret-encryption-key.yaml | 27 ---- argocd/manifests/teslamate/kustomization.yaml | 15 -- argocd/manifests/teslamate/service.yaml | 12 -- .../decommission-wave1-minikube.infra.md | 8 ++ 25 files changed, 11 insertions(+), 755 deletions(-) delete mode 100644 argocd/apps/mealie.yaml 
delete mode 100644 argocd/apps/paperless.yaml delete mode 100644 argocd/apps/teslamate.yaml delete mode 100644 argocd/manifests/databases/external-secret-paperless.yaml delete mode 100644 argocd/manifests/databases/external-secret-teslamate.yaml delete mode 100644 argocd/manifests/mealie/deployment.yaml delete mode 100644 argocd/manifests/mealie/external-secret.yaml delete mode 100644 argocd/manifests/mealie/kustomization.yaml delete mode 100644 argocd/manifests/mealie/pvc.yaml delete mode 100644 argocd/manifests/mealie/service.yaml delete mode 100644 argocd/manifests/paperless/deployment.yaml delete mode 100644 argocd/manifests/paperless/external-secret.yaml delete mode 100644 argocd/manifests/paperless/kustomization.yaml delete mode 100644 argocd/manifests/paperless/pv-nfs.yaml delete mode 100644 argocd/manifests/paperless/pvc.yaml delete mode 100644 argocd/manifests/paperless/service.yaml delete mode 100644 argocd/manifests/teslamate/README.md delete mode 100644 argocd/manifests/teslamate/deployment.yaml delete mode 100644 argocd/manifests/teslamate/external-secret-db.yaml delete mode 100644 argocd/manifests/teslamate/external-secret-encryption-key.yaml delete mode 100644 argocd/manifests/teslamate/kustomization.yaml delete mode 100644 argocd/manifests/teslamate/service.yaml create mode 100644 docs/changelog.d/decommission-wave1-minikube.infra.md diff --git a/argocd/apps/mealie.yaml b/argocd/apps/mealie.yaml deleted file mode 100644 index af33469..0000000 --- a/argocd/apps/mealie.yaml +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: argoproj.io/v1alpha1 -kind: Application -metadata: - name: mealie - namespace: argocd -spec: - project: default - source: - repoURL: ssh://forgejo@forge.ops.eblu.me:2222/eblume/blumeops.git - targetRevision: main - path: argocd/manifests/mealie - destination: - server: https://kubernetes.default.svc - namespace: mealie - syncPolicy: - syncOptions: - - CreateNamespace=true diff --git a/argocd/apps/paperless.yaml 
b/argocd/apps/paperless.yaml deleted file mode 100644 index 88437eb..0000000 --- a/argocd/apps/paperless.yaml +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: argoproj.io/v1alpha1 -kind: Application -metadata: - name: paperless - namespace: argocd -spec: - project: default - source: - repoURL: ssh://forgejo@forge.ops.eblu.me:2222/eblume/blumeops.git - targetRevision: main - path: argocd/manifests/paperless - destination: - server: https://kubernetes.default.svc - namespace: paperless - syncPolicy: - syncOptions: - - CreateNamespace=true diff --git a/argocd/apps/teslamate.yaml b/argocd/apps/teslamate.yaml deleted file mode 100644 index 60247da..0000000 --- a/argocd/apps/teslamate.yaml +++ /dev/null @@ -1,32 +0,0 @@ -# TeslaMate Tesla Data Logger -# Requires: CloudNativePG PostgreSQL cluster and manual secret setup -# -# Before syncing, create the namespace and secrets: -# kubectl create namespace teslamate -# op inject -i argocd/manifests/databases/secret-teslamate.yaml.tpl | kubectl apply -f - -# op inject -i argocd/manifests/teslamate/secret-encryption-key.yaml.tpl | kubectl apply -f - -# op inject -i argocd/manifests/teslamate/secret-db.yaml.tpl | kubectl apply -f - -# -# Then create the database: -# PGPASSWORD=$(op read "op://blumeops/postgres/password") \ -# psql -h pg.ops.eblu.me -U eblume -c "CREATE DATABASE teslamate OWNER teslamate;" -# -# After syncing, access the TeslaMate UI at https://tesla.tail8d86e.ts.net to complete -# Tesla API authentication via OAuth flow. 
-apiVersion: argoproj.io/v1alpha1 -kind: Application -metadata: - name: teslamate - namespace: argocd -spec: - project: default - source: - repoURL: ssh://forgejo@forge.ops.eblu.me:2222/eblume/blumeops.git - targetRevision: main - path: argocd/manifests/teslamate - destination: - server: https://kubernetes.default.svc - namespace: teslamate - syncPolicy: - syncOptions: - - CreateNamespace=true diff --git a/argocd/manifests/databases/blumeops-pg.yaml b/argocd/manifests/databases/blumeops-pg.yaml index 58c771a..37aef23 100644 --- a/argocd/manifests/databases/blumeops-pg.yaml +++ b/argocd/manifests/databases/blumeops-pg.yaml @@ -44,18 +44,9 @@ spec: - pg_read_all_data passwordSecret: name: blumeops-pg-borgmatic - # teslamate user for TeslaMate Tesla data logger - # Superuser removed. Extension ownership (cube, earthdistance) - # transferred manually so teslamate can ALTER EXTENSION UPDATE. - # earthdistance is untrusted — DROP+CREATE needs temporary - # superuser escalation during upgrades. - - name: teslamate - login: true - connectionLimit: -1 - ensure: present - inherit: true - passwordSecret: - name: blumeops-pg-teslamate + # teslamate + paperless roles removed: migrated to ringtail blumeops-pg + # (wave-1 decommission). Their databases were dropped from this cluster + # after the cutover was verified and backed up. 
# authentik user for Authentik identity provider (runs on ringtail) - name: authentik login: true @@ -65,14 +56,6 @@ spec: createdb: true passwordSecret: name: blumeops-pg-authentik - # paperless user for Paperless-ngx document management - - name: paperless - login: true - connectionLimit: -1 - ensure: present - inherit: true - passwordSecret: - name: blumeops-pg-paperless # Resource limits for minikube environment resources: diff --git a/argocd/manifests/databases/external-secret-paperless.yaml b/argocd/manifests/databases/external-secret-paperless.yaml deleted file mode 100644 index e5742be..0000000 --- a/argocd/manifests/databases/external-secret-paperless.yaml +++ /dev/null @@ -1,28 +0,0 @@ -# ExternalSecret for Paperless database user password -# -# 1Password item: "Paperless (blumeops)" in blumeops vault -# Field: "postgresql-password" -# -apiVersion: external-secrets.io/v1 -kind: ExternalSecret -metadata: - name: blumeops-pg-paperless - namespace: databases -spec: - refreshInterval: 1h - secretStoreRef: - kind: ClusterSecretStore - name: onepassword-blumeops - target: - name: blumeops-pg-paperless - creationPolicy: Owner - template: - type: kubernetes.io/basic-auth - data: - username: paperless - password: "{{ .password }}" - data: - - secretKey: password - remoteRef: - key: Paperless (blumeops) - property: postgresql-password diff --git a/argocd/manifests/databases/external-secret-teslamate.yaml b/argocd/manifests/databases/external-secret-teslamate.yaml deleted file mode 100644 index 0c52e0b..0000000 --- a/argocd/manifests/databases/external-secret-teslamate.yaml +++ /dev/null @@ -1,30 +0,0 @@ -# ExternalSecret for TeslaMate database user password -# -# Replaces the manual op inject workflow from secret-teslamate.yaml.tpl -# -# 1Password item: "TeslaMate" in blumeops vault -# Field: "db_password" -# -apiVersion: external-secrets.io/v1 -kind: ExternalSecret -metadata: - name: blumeops-pg-teslamate - namespace: databases -spec: - refreshInterval: 1h - 
secretStoreRef: - kind: ClusterSecretStore - name: onepassword-blumeops - target: - name: blumeops-pg-teslamate - creationPolicy: Owner - template: - type: kubernetes.io/basic-auth - data: - username: teslamate - password: "{{ .password }}" - data: - - secretKey: password - remoteRef: - key: TeslaMate - property: db_password diff --git a/argocd/manifests/databases/kustomization.yaml b/argocd/manifests/databases/kustomization.yaml index 692285a..0393757 100644 --- a/argocd/manifests/databases/kustomization.yaml +++ b/argocd/manifests/databases/kustomization.yaml @@ -9,6 +9,4 @@ resources: - service-metrics-tailscale.yaml - external-secret-eblume.yaml - external-secret-borgmatic.yaml - - external-secret-teslamate.yaml - external-secret-authentik.yaml - - external-secret-paperless.yaml diff --git a/argocd/manifests/mealie/deployment.yaml b/argocd/manifests/mealie/deployment.yaml deleted file mode 100644 index 7cdd275..0000000 --- a/argocd/manifests/mealie/deployment.yaml +++ /dev/null @@ -1,96 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: mealie - namespace: mealie -spec: - # Migrated to ringtail (mealie-ringtail). Scaled to 0; SQLite PVC retained - # for rollback until the decommission PR. See [[migrate-wave1-ringtail]]. 
- replicas: 0 - selector: - matchLabels: - app: mealie - template: - metadata: - labels: - app: mealie - spec: - securityContext: - seccompProfile: - type: RuntimeDefault - containers: - - name: mealie - image: registry.ops.eblu.me/blumeops/mealie:kustomized - ports: - - containerPort: 9000 - env: - - name: BASE_URL - value: "https://meals.ops.eblu.me" - - name: ALLOW_SIGNUP - value: "false" - - name: TZ - value: "America/Los_Angeles" - - name: MAX_WORKERS - value: "1" - - name: WEB_CONCURRENCY - value: "1" - # OIDC — Authentik (public client, PKCE) - - name: OIDC_AUTH_ENABLED - value: "true" - - name: OIDC_CONFIGURATION_URL - value: "https://authentik.ops.eblu.me/application/o/mealie/.well-known/openid-configuration" - - name: OIDC_CLIENT_ID - value: "mealie" - - name: OIDC_CLIENT_SECRET - valueFrom: - secretKeyRef: - name: mealie-secrets - key: oidc-client-secret - - name: OIDC_AUTO_REDIRECT - value: "false" - - name: OIDC_PROVIDER_NAME - value: "Authentik" - - name: OIDC_ADMIN_GROUP - value: "admins" - - name: OIDC_SIGNUP_ENABLED - value: "true" - - name: OIDC_USER_CLAIM - value: "email" - # OpenAI — recipe parsing, image OCR, ingredient extraction - - name: OPENAI_API_KEY - valueFrom: - secretKeyRef: - name: mealie-secrets - key: openai-api-key - - name: OPENAI_MODEL - value: "gpt-4o" - - name: OPENAI_REQUEST_TIMEOUT - value: "120" - - name: OPENAI_WORKERS - value: "1" - volumeMounts: - - name: data - mountPath: /app/data - resources: - requests: - memory: "128Mi" - cpu: "50m" - limits: - memory: "1000Mi" - cpu: "500m" - livenessProbe: - httpGet: - path: /api/app/about - port: 9000 - initialDelaySeconds: 30 - periodSeconds: 30 - readinessProbe: - httpGet: - path: /api/app/about - port: 9000 - initialDelaySeconds: 10 - periodSeconds: 10 - volumes: - - name: data - persistentVolumeClaim: - claimName: mealie-data diff --git a/argocd/manifests/mealie/external-secret.yaml b/argocd/manifests/mealie/external-secret.yaml deleted file mode 100644 index 99c2793..0000000 
--- a/argocd/manifests/mealie/external-secret.yaml +++ /dev/null @@ -1,23 +0,0 @@ ---- -apiVersion: external-secrets.io/v1 -kind: ExternalSecret -metadata: - name: mealie-secrets - namespace: mealie -spec: - refreshInterval: 1h - secretStoreRef: - kind: ClusterSecretStore - name: onepassword-blumeops - target: - name: mealie-secrets - creationPolicy: Owner - data: - - secretKey: oidc-client-secret - remoteRef: - key: "Authentik (blumeops)" - property: mealie-client-secret - - secretKey: openai-api-key - remoteRef: - key: "openai (blumeops)" - property: credential diff --git a/argocd/manifests/mealie/kustomization.yaml b/argocd/manifests/mealie/kustomization.yaml deleted file mode 100644 index 02563f4..0000000 --- a/argocd/manifests/mealie/kustomization.yaml +++ /dev/null @@ -1,15 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization - -namespace: mealie - -resources: - - deployment.yaml - - service.yaml - - pvc.yaml - # ingress removed: name 'meals' handed off to mealie-ringtail at cutover - - external-secret.yaml - -images: - - name: registry.ops.eblu.me/blumeops/mealie - newTag: v3.12.0-613f05d diff --git a/argocd/manifests/mealie/pvc.yaml b/argocd/manifests/mealie/pvc.yaml deleted file mode 100644 index f473e07..0000000 --- a/argocd/manifests/mealie/pvc.yaml +++ /dev/null @@ -1,13 +0,0 @@ ---- -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: mealie-data - namespace: mealie -spec: - accessModes: - - ReadWriteOnce - storageClassName: standard - resources: - requests: - storage: 2Gi diff --git a/argocd/manifests/mealie/service.yaml b/argocd/manifests/mealie/service.yaml deleted file mode 100644 index 4162b96..0000000 --- a/argocd/manifests/mealie/service.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: mealie - namespace: mealie -spec: - selector: - app: mealie - ports: - - name: http - port: 9000 - targetPort: 9000 - protocol: TCP diff --git a/argocd/manifests/paperless/deployment.yaml 
b/argocd/manifests/paperless/deployment.yaml deleted file mode 100644 index 1730486..0000000 --- a/argocd/manifests/paperless/deployment.yaml +++ /dev/null @@ -1,133 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: paperless - namespace: paperless -spec: - # Migrated to ringtail (paperless-ringtail). Scaled to 0 to prevent - # double-writing the now-ringtail-owned database; manifest retained for - # rollback until the decommission PR. See [[migrate-wave1-ringtail]]. - replicas: 0 - selector: - matchLabels: - app: paperless - template: - metadata: - labels: - app: paperless - spec: - securityContext: - seccompProfile: - type: RuntimeDefault - containers: - - name: paperless - image: registry.ops.eblu.me/blumeops/paperless:kustomized - ports: - - containerPort: 8000 - name: http - env: - - name: PAPERLESS_URL - value: "https://paperless.ops.eblu.me" - - name: PAPERLESS_REDIS - value: "redis://localhost:6379" - - name: PAPERLESS_DBHOST - value: "pg.ops.eblu.me" - - name: PAPERLESS_DBPORT - value: "5432" - - name: PAPERLESS_DBNAME - value: "paperless" - # Explicit port to override k8s-injected PAPERLESS_PORT env var - # (k8s sets PAPERLESS_PORT=tcp://... 
for a service named 'paperless') - - name: PAPERLESS_PORT - value: "8000" - - name: PAPERLESS_DBUSER - value: "paperless" - - name: PAPERLESS_DBPASS - valueFrom: - secretKeyRef: - name: paperless-secrets - key: db-password - - name: PAPERLESS_SECRET_KEY - valueFrom: - secretKeyRef: - name: paperless-secrets - key: secret-key - - name: PAPERLESS_TIME_ZONE - value: "America/Los_Angeles" - - name: PAPERLESS_OCR_LANGUAGE - value: "eng" - - name: PAPERLESS_TASK_WORKERS - value: "1" - # Admin account (created on first startup) - - name: PAPERLESS_ADMIN_USER - value: "eblume" - - name: PAPERLESS_ADMIN_PASSWORD - valueFrom: - secretKeyRef: - name: paperless-secrets - key: admin-password - - name: PAPERLESS_ADMIN_MAIL - value: "blume.erich@gmail.com" - # OIDC via Authentik - # Full JSON blob pulled from 1Password (includes client secret) - - name: PAPERLESS_APPS - value: "allauth.socialaccount.providers.openid_connect" - - name: PAPERLESS_SOCIALACCOUNT_PROVIDERS - valueFrom: - secretKeyRef: - name: paperless-secrets - key: socialaccount-providers - - name: PAPERLESS_SOCIALACCOUNT_ALLOW_SIGNUPS - value: "true" - - name: PAPERLESS_SOCIAL_AUTO_SIGNUP - value: "true" - - name: PAPERLESS_ACCOUNT_ALLOW_SIGNUPS - value: "false" - - name: PAPERLESS_REDIRECT_LOGIN_TO_SSO - value: "false" - volumeMounts: - - name: data - mountPath: /usr/src/paperless/data - - name: media - mountPath: /usr/src/paperless/media - - name: consume - mountPath: /usr/src/paperless/consume - resources: - requests: - memory: "256Mi" - cpu: "100m" - limits: - memory: "2Gi" - cpu: "1000m" - livenessProbe: - httpGet: - path: / - port: 8000 - initialDelaySeconds: 60 - periodSeconds: 30 - readinessProbe: - httpGet: - path: / - port: 8000 - initialDelaySeconds: 30 - periodSeconds: 10 - - - name: redis - image: docker.io/library/redis:kustomized - ports: - - containerPort: 6379 - resources: - requests: - memory: "32Mi" - cpu: "10m" - limits: - memory: "128Mi" - - volumes: - - name: data - emptyDir: {} - - name: 
media - persistentVolumeClaim: - claimName: paperless-media - - name: consume - emptyDir: {} diff --git a/argocd/manifests/paperless/external-secret.yaml b/argocd/manifests/paperless/external-secret.yaml deleted file mode 100644 index 750b7c5..0000000 --- a/argocd/manifests/paperless/external-secret.yaml +++ /dev/null @@ -1,31 +0,0 @@ ---- -apiVersion: external-secrets.io/v1 -kind: ExternalSecret -metadata: - name: paperless-secrets - namespace: paperless -spec: - refreshInterval: 1h - secretStoreRef: - kind: ClusterSecretStore - name: onepassword-blumeops - target: - name: paperless-secrets - creationPolicy: Owner - data: - - secretKey: db-password - remoteRef: - key: "Paperless (blumeops)" - property: postgresql-password - - secretKey: secret-key - remoteRef: - key: "Paperless (blumeops)" - property: secret-key - - secretKey: admin-password - remoteRef: - key: "Paperless (blumeops)" - property: admin-password - - secretKey: socialaccount-providers - remoteRef: - key: "Paperless (blumeops)" - property: socialaccount-providers diff --git a/argocd/manifests/paperless/kustomization.yaml b/argocd/manifests/paperless/kustomization.yaml deleted file mode 100644 index a92a769..0000000 --- a/argocd/manifests/paperless/kustomization.yaml +++ /dev/null @@ -1,19 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization - -namespace: paperless - -resources: - - deployment.yaml - - service.yaml - - pv-nfs.yaml - - pvc.yaml - # ingress removed: name 'paperless' handed off to paperless-ringtail at cutover - - external-secret.yaml - -images: - - name: registry.ops.eblu.me/blumeops/paperless - newTag: v2.20.13-07f52e9 - - name: docker.io/library/redis - newName: registry.ops.eblu.me/blumeops/valkey - newTag: v8.1.7-ecded30 diff --git a/argocd/manifests/paperless/pv-nfs.yaml b/argocd/manifests/paperless/pv-nfs.yaml deleted file mode 100644 index 8ee7526..0000000 --- a/argocd/manifests/paperless/pv-nfs.yaml +++ /dev/null @@ -1,22 +0,0 @@ -# NFS PersistentVolume for 
Paperless document library -# Requires: NFS share on sifaka at /volume1/paperless with NFS permissions for indri -# -# To create on Synology: -# 1. Control Panel > Shared Folder > Create -# 2. Name: paperless, Location: Volume 1 -# 3. Control Panel > File Services > NFS > NFS Rules -# 4. Add rule for "paperless" share: Hostname=indri, Privilege=Read/Write, Squash=No mapping -apiVersion: v1 -kind: PersistentVolume -metadata: - name: paperless-media-nfs-pv -spec: - capacity: - storage: 500Gi - accessModes: - - ReadWriteMany - persistentVolumeReclaimPolicy: Retain - storageClassName: "" - nfs: - server: sifaka - path: /volume1/paperless diff --git a/argocd/manifests/paperless/pvc.yaml b/argocd/manifests/paperless/pvc.yaml deleted file mode 100644 index 4365c9f..0000000 --- a/argocd/manifests/paperless/pvc.yaml +++ /dev/null @@ -1,15 +0,0 @@ -# PersistentVolumeClaim for Paperless document library -# Binds to the NFS PV for sifaka:/volume1/paperless -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: paperless-media - namespace: paperless -spec: - accessModes: - - ReadWriteMany - storageClassName: "" - volumeName: paperless-media-nfs-pv - resources: - requests: - storage: 500Gi diff --git a/argocd/manifests/paperless/service.yaml b/argocd/manifests/paperless/service.yaml deleted file mode 100644 index cff2972..0000000 --- a/argocd/manifests/paperless/service.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: paperless - namespace: paperless -spec: - selector: - app: paperless - ports: - - name: http - port: 8000 - targetPort: 8000 - protocol: TCP diff --git a/argocd/manifests/teslamate/README.md b/argocd/manifests/teslamate/README.md deleted file mode 100644 index 7e1f9fc..0000000 --- a/argocd/manifests/teslamate/README.md +++ /dev/null @@ -1,69 +0,0 @@ -# TeslaMate - -TeslaMate is a self-hosted Tesla data logger that collects and visualizes vehicle data. - -## Prerequisites - -### 1. 
Create 1Password Secrets - -Create two items in the blumeops 1Password vault: - -1. **TeslaMate DB Password** - - Generate a secure password for the teslamate PostgreSQL user - - Add a field named `password` with the generated value - -2. **TeslaMate Encryption Key** - - Generate with: `openssl rand -base64 32` - - Add a field named `key` with the generated value - - This encrypts Tesla API tokens at rest in the database - -### 2. Apply Kubernetes Secrets - -```bash -# Create namespace -kubectl create namespace teslamate - -# Apply database user secret (for CNPG) -op inject -i argocd/manifests/databases/secret-teslamate.yaml.tpl | kubectl apply -f - - -# Apply teslamate secrets -op inject -i argocd/manifests/teslamate/secret-encryption-key.yaml.tpl | kubectl apply -f - -op inject -i argocd/manifests/teslamate/secret-db.yaml.tpl | kubectl apply -f - -``` - -### 3. Create Database - -After the teslamate user exists in PostgreSQL (sync blumeops-pg first): - -```bash -PGPASSWORD=$(op read "op://blumeops/postgres/password") \ - psql -h pg.ops.eblu.me -U eblume -c "CREATE DATABASE teslamate OWNER teslamate;" -``` - -## Deployment - -```bash -# Sync ArgoCD apps -argocd app sync apps -argocd app sync blumeops-pg teslamate grafana grafana-config -``` - -## Tesla API Setup - -1. Access TeslaMate UI at https://tesla.tail8d86e.ts.net -2. Click "Sign in with Tesla" -3. Complete OAuth flow in browser -4. Tokens are encrypted and stored in database -5. Verify vehicle appears and data collection starts - -## Grafana Dashboards - -TeslaMate dashboards are available in Grafana at https://grafana.tail8d86e.ts.net - -They use the "TeslaMate" PostgreSQL datasource (not Prometheus). 
- -## Notes - -- MQTT is disabled (can be enabled later for Home Assistant integration) -- Timezone is set to America/Los_Angeles -- Encryption key protects Tesla API tokens at rest diff --git a/argocd/manifests/teslamate/deployment.yaml b/argocd/manifests/teslamate/deployment.yaml deleted file mode 100644 index cf7f9bb..0000000 --- a/argocd/manifests/teslamate/deployment.yaml +++ /dev/null @@ -1,68 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: teslamate - namespace: teslamate -spec: - # Migrated to ringtail (teslamate-ringtail). Scaled to 0 to prevent - # double-writing the now-ringtail-owned database; manifest retained for - # rollback until the decommission PR. See [[migrate-wave1-ringtail]]. - replicas: 0 - selector: - matchLabels: - app: teslamate - template: - metadata: - labels: - app: teslamate - spec: - securityContext: - seccompProfile: - type: RuntimeDefault - containers: - - name: teslamate - image: registry.ops.eblu.me/blumeops/teslamate:kustomized - ports: - - containerPort: 4000 - env: - - name: DATABASE_USER - value: "teslamate" - - name: DATABASE_PASS - valueFrom: - secretKeyRef: - name: teslamate-db - key: password - - name: DATABASE_NAME - value: "teslamate" - - name: DATABASE_HOST - value: "blumeops-pg-rw.databases.svc.cluster.local" - - name: ENCRYPTION_KEY - valueFrom: - secretKeyRef: - name: teslamate-encryption - key: key - - name: DISABLE_MQTT - value: "true" - - name: CHECK_ORIGIN - value: "false" - - name: TZ - value: "America/Los_Angeles" - resources: - requests: - memory: "128Mi" - cpu: "100m" - limits: - memory: "512Mi" - cpu: "500m" - livenessProbe: - httpGet: - path: / - port: 4000 - initialDelaySeconds: 30 - periodSeconds: 30 - readinessProbe: - httpGet: - path: / - port: 4000 - initialDelaySeconds: 10 - periodSeconds: 10 diff --git a/argocd/manifests/teslamate/external-secret-db.yaml b/argocd/manifests/teslamate/external-secret-db.yaml deleted file mode 100644 index 11eeec6..0000000 --- 
a/argocd/manifests/teslamate/external-secret-db.yaml +++ /dev/null @@ -1,25 +0,0 @@ -# ExternalSecret for TeslaMate database password -# -# Replaces the manual op inject workflow from secret-db.yaml.tpl -# -# 1Password item: "TeslaMate" in blumeops vault -# Field: "db_password" -# -apiVersion: external-secrets.io/v1 -kind: ExternalSecret -metadata: - name: teslamate-db - namespace: teslamate -spec: - refreshInterval: 1h - secretStoreRef: - kind: ClusterSecretStore - name: onepassword-blumeops - target: - name: teslamate-db - creationPolicy: Owner - data: - - secretKey: password - remoteRef: - key: TeslaMate - property: db_password diff --git a/argocd/manifests/teslamate/external-secret-encryption-key.yaml b/argocd/manifests/teslamate/external-secret-encryption-key.yaml deleted file mode 100644 index 96938bf..0000000 --- a/argocd/manifests/teslamate/external-secret-encryption-key.yaml +++ /dev/null @@ -1,27 +0,0 @@ -# ExternalSecret for TeslaMate encryption key -# -# Replaces the manual op inject workflow from secret-encryption-key.yaml.tpl -# -# 1Password item: "TeslaMate" in blumeops vault -# Field: "api_enc_key" -# -# This key encrypts Tesla API tokens at rest in the database. 
-# -apiVersion: external-secrets.io/v1 -kind: ExternalSecret -metadata: - name: teslamate-encryption - namespace: teslamate -spec: - refreshInterval: 1h - secretStoreRef: - kind: ClusterSecretStore - name: onepassword-blumeops - target: - name: teslamate-encryption - creationPolicy: Owner - data: - - secretKey: key - remoteRef: - key: TeslaMate - property: api_enc_key diff --git a/argocd/manifests/teslamate/kustomization.yaml b/argocd/manifests/teslamate/kustomization.yaml deleted file mode 100644 index be9d39d..0000000 --- a/argocd/manifests/teslamate/kustomization.yaml +++ /dev/null @@ -1,15 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization - -namespace: teslamate - -resources: - - deployment.yaml - - service.yaml - # ingress removed: name 'tesla' handed off to teslamate-ringtail at cutover - - external-secret-db.yaml - - external-secret-encryption-key.yaml - -images: - - name: registry.ops.eblu.me/blumeops/teslamate - newTag: v3.0.0-08c698e diff --git a/argocd/manifests/teslamate/service.yaml b/argocd/manifests/teslamate/service.yaml deleted file mode 100644 index b04f45e..0000000 --- a/argocd/manifests/teslamate/service.yaml +++ /dev/null @@ -1,12 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: teslamate - namespace: teslamate -spec: - selector: - app: teslamate - ports: - - port: 4000 - targetPort: 4000 - type: ClusterIP diff --git a/docs/changelog.d/decommission-wave1-minikube.infra.md b/docs/changelog.d/decommission-wave1-minikube.infra.md new file mode 100644 index 0000000..63b3ab5 --- /dev/null +++ b/docs/changelog.d/decommission-wave1-minikube.infra.md @@ -0,0 +1,8 @@ +Decommission the wave-1 services on minikube-indri now that paperless, +teslamate, and mealie run on ringtail with their data backed up. 
Removes the +minikube `paperless`/`teslamate`/`mealie` manifest dirs + ArgoCD app +definitions (pruning the parked Deployments, Services, and the redundant +minikube mealie/paperless PVCs), and drops the `paperless`/`teslamate` roles +from the minikube `blumeops-pg` cluster. The `paperless` and `teslamate` +databases are dropped from indri's blumeops-pg as the finalization step. +miniflux + authentik remain on the minikube cluster (later waves). From eaa899cfc65fd5d704c88e39771bc293765b181d Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Wed, 3 Jun 2026 13:02:05 -0700 Subject: [PATCH 107/122] C0: wave-1 decommission follow-ups (argocd admin RBAC, teslamate probe) - argocd: grant local break-glass admin the admin role (g, admin, role:admin); previously only the Authentik admins group had access, locking out admin once its token expired (policy.default is unset). - alloy-k8s: repoint the teslamate blackbox probe from the deleted minikube service to https://tesla.ops.eblu.me/ (Caddy over Tailscale), like immich. Co-Authored-By: Claude Opus 4.8 (1M context) --- argocd/manifests/alloy-k8s/config.alloy | 3 ++- argocd/manifests/argocd/argocd-rbac-cm-patch.yaml | 4 ++++ docs/changelog.d/+wave1-decommission-followups.infra.md | 8 ++++++++ 3 files changed, 14 insertions(+), 1 deletion(-) create mode 100644 docs/changelog.d/+wave1-decommission-followups.infra.md diff --git a/argocd/manifests/alloy-k8s/config.alloy b/argocd/manifests/alloy-k8s/config.alloy index 5a0a8f9..2940b0b 100644 --- a/argocd/manifests/alloy-k8s/config.alloy +++ b/argocd/manifests/alloy-k8s/config.alloy @@ -191,8 +191,9 @@ prometheus.exporter.blackbox "services" { } target { + // Migrated to ringtail (wave-1); probe through Caddy over Tailscale. 
name = "teslamate" - address = "http://teslamate.teslamate.svc.cluster.local:4000/" + address = "https://tesla.ops.eblu.me/" module = "http_2xx" } diff --git a/argocd/manifests/argocd/argocd-rbac-cm-patch.yaml b/argocd/manifests/argocd/argocd-rbac-cm-patch.yaml index c2ea095..4914587 100644 --- a/argocd/manifests/argocd/argocd-rbac-cm-patch.yaml +++ b/argocd/manifests/argocd/argocd-rbac-cm-patch.yaml @@ -2,6 +2,9 @@ # # - workflow-bot: minimal CI/CD permissions (sync, get) # - admins: Authentik admins group mapped to ArgoCD admin role +# - admin: local break-glass account — keeps ArgoCD admin rights for when +# Authentik SSO is unavailable (without this it has no permissions, since +# policy.default is unset) # apiVersion: v1 kind: ConfigMap @@ -14,3 +17,4 @@ data: p, role:workflow-bot, applications, get, *, allow g, workflow-bot, role:workflow-bot g, admins, role:admin + g, admin, role:admin diff --git a/docs/changelog.d/+wave1-decommission-followups.infra.md b/docs/changelog.d/+wave1-decommission-followups.infra.md new file mode 100644 index 0000000..7b54d52 --- /dev/null +++ b/docs/changelog.d/+wave1-decommission-followups.infra.md @@ -0,0 +1,8 @@ +Fix three follow-ups from the wave-1 decommission: grant the local +break-glass `admin` account ArgoCD admin rights (`g, admin, role:admin` — +previously only the Authentik `admins` group had access, so admin was +locked out whenever its token expired), and repoint the alloy blackbox +probe for teslamate from the deleted minikube service to +`https://tesla.ops.eblu.me/` (through Caddy over Tailscale). The orphaned +paperless/teslamate roles + ExternalSecrets left on the minikube +blumeops-pg are also cleaned up. 
From 308c8e3dad287b2de98891681db4c254ef1c181a Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Wed, 3 Jun 2026 15:31:59 -0700 Subject: [PATCH 108/122] C0: drop duplicate Homepage static entries for ringtail-migrated services Mealie, Paperless, Immich, TeslaMate are now autodiscovered from their ringtail Ingress gethomepage.dev annotations; the static services.yaml entries (from when they were on minikube, which homepage-on-ringtail can't autodiscover) were duplicating them. Co-Authored-By: Claude Opus 4.8 (1M context) --- argocd/manifests/homepage/services.yaml | 16 ---------------- .../changelog.d/+homepage-dedup-migrated.misc.md | 5 +++++ 2 files changed, 5 insertions(+), 16 deletions(-) create mode 100644 docs/changelog.d/+homepage-dedup-migrated.misc.md diff --git a/argocd/manifests/homepage/services.yaml b/argocd/manifests/homepage/services.yaml index d552ff2..cc1adf4 100644 --- a/argocd/manifests/homepage/services.yaml +++ b/argocd/manifests/homepage/services.yaml @@ -71,10 +71,6 @@ enableBlocks: true enableNowPlaying: false fields: ["movies", "series", "episodes"] - - Mealie: - href: https://meals.ops.eblu.me - icon: mealie.png - description: Recipe manager - DJ: href: https://dj.ops.eblu.me icon: navidrome.png @@ -85,15 +81,7 @@ user: "{{HOMEPAGE_VAR_NAVIDROME_USER}}" token: "{{HOMEPAGE_VAR_NAVIDROME_TOKEN}}" salt: "{{HOMEPAGE_VAR_NAVIDROME_SALT}}" - - Paperless: - href: https://paperless.ops.eblu.me - icon: paperless-ngx.png - description: Document management - Content: - - Immich: - href: https://photos.ops.eblu.me - icon: immich.png - description: Photo management - Kiwix: href: https://kiwix.ops.eblu.me icon: kiwix.png @@ -138,10 +126,6 @@ href: https://docs.eblu.me icon: mdi-book-open-page-variant description: BlumeOps Documentation - - TeslaMate: - href: https://tesla.ops.eblu.me - icon: teslamate.png - description: Tesla data logger - Transmission: href: https://torrent.ops.eblu.me icon: transmission.png diff --git 
a/docs/changelog.d/+homepage-dedup-migrated.misc.md b/docs/changelog.d/+homepage-dedup-migrated.misc.md new file mode 100644 index 0000000..9efc5ba --- /dev/null +++ b/docs/changelog.d/+homepage-dedup-migrated.misc.md @@ -0,0 +1,5 @@ +Remove the duplicate Homepage tiles for Mealie, Paperless, Immich, and +TeslaMate. Homepage runs on ringtail and autodiscovers ringtail Ingresses via +`gethomepage.dev/*` annotations; once these services migrated to ringtail they +were discovered automatically, making their leftover static `services.yaml` +entries (needed only while they lived on minikube) redundant. From 214871458478a6b9aaa6dcc1b5aabab1336e8c7c Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Wed, 3 Jun 2026 21:32:10 -0700 Subject: [PATCH 109/122] C0: retire Todoist blumeops-tasks; point task discovery at heph Replace the Todoist-backed blumeops-tasks mise task with `heph list --project Blumeops --json` (hephaestus, now at v1 prototype on gilbert). Update task-discovery, rotation-reminder, and zk references across docs; note the zk zettelkasten is migrating into heph docs. Co-Authored-By: Claude Opus 4.8 (1M context) --- AGENTS.md | 12 +- .../+blumeops-tasks-due-recurrence.feature.md | 1 - .../+retire-todoist-for-heph.infra.md | 1 + .../configuration/rotate-fly-deploy-token.md | 2 +- docs/how-to/configuration/rotate-gandi-pat.md | 2 +- docs/reference/services/borgmatic.md | 2 +- docs/reference/storage/backups.md | 2 +- docs/reference/tools/mise-tasks.md | 1 - docs/tutorials/ai-assistance-guide.md | 3 +- mise-tasks/blumeops-tasks | 216 ------------------ 10 files changed, 16 insertions(+), 226 deletions(-) delete mode 100644 docs/changelog.d/+blumeops-tasks-due-recurrence.feature.md create mode 100644 docs/changelog.d/+retire-todoist-for-heph.infra.md delete mode 100755 mise-tasks/blumeops-tasks diff --git a/AGENTS.md b/AGENTS.md index 9e7350d..c64af40 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -65,7 +65,7 @@ See [[agent-change-process]] for the full methodology. 
./pulumi/ # Pulumi IaC (tailnet ACLs, dns, cloud) ~/.config/{nvim,fish} # user's shell config, managed by chezmoi ~/code/personal/ # user's projects -~/code/personal/zk # user's Obsidian-sync managed zettelkasten. Potential source for reference data. +~/code/personal/zk # user's zettelkasten (Obsidian-sync). Reference-data source; migrating into heph docs (hephaestus). ~/code/3rd/ # mirrored external projects ~/code/work # FORBIDDEN ``` @@ -147,10 +147,16 @@ Create a new spork: `mise run spork-create ` ## Task Discovery +BlumeOps tasks live in [hephaestus](https://github.com/eblume/hephaestus) (`heph`), +the user's self-hosted context/task system. Fetch them with the CLI: + ```fish -mise run blumeops-tasks # fetch from Todoist, sorted by priority +heph list --project Blumeops --json # outstanding Blumeops tasks as JSON ``` -Most tasks are stored in `./mise-tasks/`. For scripts with any logic or + +(This replaced the retired `blumeops-tasks` mise task, which read from Todoist.) + +Most operational scripts are stored in `./mise-tasks/`. For scripts with any logic or complexity, use uv run --script 's with explicit dependencies. Complex workflows with artifacts should become dagger pipelines. Mise tasks are for development processes and operations - tools for the user or the agent. diff --git a/docs/changelog.d/+blumeops-tasks-due-recurrence.feature.md b/docs/changelog.d/+blumeops-tasks-due-recurrence.feature.md deleted file mode 100644 index 83072dd..0000000 --- a/docs/changelog.d/+blumeops-tasks-due-recurrence.feature.md +++ /dev/null @@ -1 +0,0 @@ -`blumeops-tasks` now annotates each task with a human-readable due offset (`5d overdue` / `due in 2d` / `due today`) and a `↻ ` marker for recurring tasks, and sorts by overdue-ness (most overdue first, no-due-date last) with priority as tiebreaker. 
diff --git a/docs/changelog.d/+retire-todoist-for-heph.infra.md b/docs/changelog.d/+retire-todoist-for-heph.infra.md new file mode 100644 index 0000000..f6284d0 --- /dev/null +++ b/docs/changelog.d/+retire-todoist-for-heph.infra.md @@ -0,0 +1 @@ +Retired the `blumeops-tasks` mise task (Todoist API) in favor of `heph list --project Blumeops --json` from the self-hosted [hephaestus](https://github.com/eblume/hephaestus) system. Updated docs to point task discovery and rotation reminders at heph, and noted that the `~/code/personal/zk` zettelkasten is migrating into heph docs. diff --git a/docs/how-to/configuration/rotate-fly-deploy-token.md b/docs/how-to/configuration/rotate-fly-deploy-token.md index 5863f54..9abe5f0 100644 --- a/docs/how-to/configuration/rotate-fly-deploy-token.md +++ b/docs/how-to/configuration/rotate-fly-deploy-token.md @@ -14,7 +14,7 @@ How to rotate the Fly.io API token used to deploy [[flyio-proxy]]. The token liv ## When to rotate -- Every 75 days (Todoist recurring task) +- Every 75 days (heph recurring task) - After any compromise / accidental disclosure - If `fly deploy` starts returning auth errors diff --git a/docs/how-to/configuration/rotate-gandi-pat.md b/docs/how-to/configuration/rotate-gandi-pat.md index 94a0b4e..5ce6f81 100644 --- a/docs/how-to/configuration/rotate-gandi-pat.md +++ b/docs/how-to/configuration/rotate-gandi-pat.md @@ -14,7 +14,7 @@ How to rotate the Gandi Personal Access Token. **One PAT** is shared by [[caddy] ## When to rotate -- Every 60 days (Todoist recurring task) +- Every 60 days (heph recurring task) - After any compromise / accidental disclosure - Whenever Gandi starts rejecting the PAT (see [Debugging](#debugging)) diff --git a/docs/reference/services/borgmatic.md b/docs/reference/services/borgmatic.md index fea4551..37f1a60 100644 --- a/docs/reference/services/borgmatic.md +++ b/docs/reference/services/borgmatic.md @@ -25,7 +25,7 @@ Daily backup system using Borg backup, running on indri. 
## What Gets Backed Up **Directories:** -- `~/code/personal/zk` - Zettelkasten +- `~/code/personal/zk` - Zettelkasten (migrating into heph docs; see [hephaestus](https://github.com/eblume/hephaestus)) - `/opt/homebrew/var/forgejo` - Git forge data - `~/.config/borgmatic` - Borgmatic config - `~/Documents` - Personal documents diff --git a/docs/reference/storage/backups.md b/docs/reference/storage/backups.md index 14dbcea..2dfbae4 100644 --- a/docs/reference/storage/backups.md +++ b/docs/reference/storage/backups.md @@ -22,7 +22,7 @@ Daily automated backups from [[indri]] to [[sifaka|Sifaka]] NAS. | Path | Description | Priority | |------|-------------|----------| -| `~/code/personal/zk` | Zettelkasten notes | Critical | +| `~/code/personal/zk` | Zettelkasten notes (migrating into heph docs) | Critical | | `/opt/homebrew/var/forgejo` | Git repositories | Critical | | `~/.config/borgmatic` | Backup config | High | | `~/Documents` | Personal documents (includes [[1password]] encrypted export) | High | diff --git a/docs/reference/tools/mise-tasks.md b/docs/reference/tools/mise-tasks.md index 4ec3438..b614cb1 100644 --- a/docs/reference/tools/mise-tasks.md +++ b/docs/reference/tools/mise-tasks.md @@ -69,7 +69,6 @@ Run `mise tasks --sort name` for the live list with descriptions. |------|-------------| | `services-check` | Check all services are online and responding | | `service-review` | Review the most stale service for version freshness | -| `blumeops-tasks` | List tasks from Todoist sorted by priority | | `op-backup` | Encrypt 1Password export and send to indri for borgmatic | ## Infrastructure Setup diff --git a/docs/tutorials/ai-assistance-guide.md b/docs/tutorials/ai-assistance-guide.md index 3ee1ffa..4f0c595 100644 --- a/docs/tutorials/ai-assistance-guide.md +++ b/docs/tutorials/ai-assistance-guide.md @@ -98,7 +98,6 @@ BlumeOps operations are driven by mise tasks. 
Run `mise tasks` to list all avail | `provision-indri` | Deploy changes to [[indri]]-hosted services via Ansible | | `services-check` | After deployments - verify all services are healthy | | `pr-comments` | Check unresolved PR comments during review | -| `blumeops-tasks` | Find pending tasks from Todoist | | `container-list` | View available container images and tags | | `container-build-and-release` | Trigger container build workflows | | `dns-preview` | Preview DNS changes before applying | @@ -111,6 +110,8 @@ BlumeOps operations are driven by mise tasks. Run `mise tasks` to list all avail | `docs-review` | Review the most stale doc by last-reviewed date | | `runner-logs` | View Forgejo workflow logs (indri or ringtail runner) | +For task discovery, BlumeOps tasks live in [hephaestus](https://github.com/eblume/hephaestus) (`heph`), not Todoist. List outstanding work with `heph list --project Blumeops --json`. + For ArgoCD operations, use the `argocd` CLI directly: - `argocd app diff ` - Preview changes - `argocd app sync ` - Deploy changes diff --git a/mise-tasks/blumeops-tasks b/mise-tasks/blumeops-tasks deleted file mode 100755 index 035aa3b..0000000 --- a/mise-tasks/blumeops-tasks +++ /dev/null @@ -1,216 +0,0 @@ -#!/usr/bin/env -S uv run --script -# /// script -# requires-python = ">=3.12" -# dependencies = ["httpx==0.28.1", "rich==15.0.0"] -# /// -#MISE description="List Blumeops tasks from Todoist sorted by priority" -"""Fetch and display Blumeops tasks from Todoist, sorted by priority. - -This script is specific to Erich Blume's personal development workflow and -is not intended for general use. It requires: - - - A 1Password CLI (`op`) configured with access to the author's vault - - A Todoist account with a project named "Blumeops" - -The script fetches tasks and displays them sorted by a custom priority order: -p1 (urgent), p2 (high), p4 (normal/default), p3 (backlog). 
The p3-last ordering -reflects a deliberate choice to treat p3 as "backlog" rather than moderate -priority. - -Usage: mise run blumeops-tasks -""" - -import subprocess -import sys -from datetime import date - -import httpx -from rich.console import Console -from rich.markup import escape -from rich.text import Text - -TODOIST_API_BASE = "https://api.todoist.com/api/v1" -PROJECT_NAME = "Blumeops" - -# Priority mapping: Todoist API uses 1=normal(p4), 2=moderate(p3), 3=high(p2), 4=urgent(p1) -# User wants order: p1, p2, p4, p3 (p3 is backlog, goes last) -PRIORITY_LABELS = {4: "p1", 3: "p2", 1: "p4", 2: "p3"} -PRIORITY_SORT_ORDER = {4: 1, 3: 2, 1: 3, 2: 4} # Lower = earlier - - -def get_todoist_token() -> str: - """Retrieve Todoist API token from 1Password.""" - result = subprocess.run( - ["op", "read", "op://vg6xf6vvfmoh5hqjjhlhbeoaie/c53h3xnmswhvexa5mntoyvhgpm/credential"], - capture_output=True, - text=True, - ) - if result.returncode != 0: - raise RuntimeError(f"Failed to get Todoist token from 1Password: {result.stderr}") - return result.stdout.strip() - - -def get_project_id(client: httpx.Client, project_name: str) -> str: - """Find project ID by name.""" - cursor = None - while True: - params = {} - if cursor: - params["cursor"] = cursor - response = client.get(f"{TODOIST_API_BASE}/projects", params=params) - response.raise_for_status() - data = response.json() - for project in data.get("results", data if isinstance(data, list) else []): - if project["name"] == project_name: - return project["id"] - cursor = data.get("next_cursor") if isinstance(data, dict) else None - if not cursor: - break - - raise RuntimeError(f"Project '{project_name}' not found in Todoist") - - -def get_tasks(client: httpx.Client, project_id: str) -> list[dict]: - """Get all tasks for a project.""" - tasks = [] - cursor = None - while True: - params = {"project_id": project_id} - if cursor: - params["cursor"] = cursor - response = client.get(f"{TODOIST_API_BASE}/tasks", params=params) - 
response.raise_for_status() - data = response.json() - tasks.extend(data.get("results", data if isinstance(data, list) else [])) - cursor = data.get("next_cursor") if isinstance(data, dict) else None - if not cursor: - break - return tasks - - -def is_due(task: dict) -> bool: - """Check if a task should be displayed based on its due date. - - Tasks without a due date are always shown. Tasks with a due date - are only shown when the date is today or in the past. - """ - due = task.get("due") - if due is None: - return True - due_date = date.fromisoformat(due["date"][:10]) - return due_date <= date.today() - - -def days_until_due(task: dict) -> int | None: - """Return signed days offset from today, or None if no due date. - - Negative = days remaining before due (e.g. -2 = due in 2 days). - Positive = days past due (overdue). Zero = due today. - """ - due = task.get("due") - if due is None: - return None - due_date = date.fromisoformat(due["date"][:10]) - return (date.today() - due_date).days - - -def recurrence_string(task: dict) -> str | None: - """Return the Todoist natural-language recurrence string, or None. - - Todoist's REST API doesn't expose RFC 5545 RRULE; the natural-language - `due.string` (e.g. "every monday", "every 2 weeks") is the terse form. - """ - due = task.get("due") - if due is None or not due.get("is_recurring"): - return None - return due.get("string") - - -def sort_tasks(tasks: list[dict]) -> list[dict]: - """Sort by overdue-ness, then priority. - - Most overdue first (largest +N); tasks with no due date come last. - Within a given day, tiebreaker is the custom priority order p1, p2, p4, p3. 
- """ - - def key(task: dict) -> tuple[int, int, int]: - days = days_until_due(task) - no_due = 1 if days is None else 0 - days_key = -(days if days is not None else 0) # descending - return (no_due, days_key, PRIORITY_SORT_ORDER.get(task["priority"], 5)) - - return sorted(tasks, key=key) - - -def main() -> int: - console = Console() - - # Get API token - try: - token = get_todoist_token() - except RuntimeError as e: - console.print(f"[red]Error:[/red] {e}") - return 1 - - # Create HTTP client with auth header - with httpx.Client(headers={"Authorization": f"Bearer {token}"}) as client: - # Find project - try: - project_id = get_project_id(client, PROJECT_NAME) - except RuntimeError as e: - console.print(f"[red]Error:[/red] {e}") - return 1 - - # Get, filter, and sort tasks - tasks = get_tasks(client, project_id) - tasks = [t for t in tasks if is_due(t)] - sorted_tasks = sort_tasks(tasks) - - if not sorted_tasks: - console.print("No tasks found in Blumeops project") - return 0 - - # Display tasks - console.print(f"[bold]Blumeops Tasks[/bold] ({len(sorted_tasks)} tasks)") - console.print("=" * 40) - console.print() - - for task in sorted_tasks: - priority = task["priority"] - label = PRIORITY_LABELS.get(priority, "p?") - content = task["content"] - description = task.get("description", "") - - # Header line with priority and content - header = Text() - header.append(f"[{label}]", style="bold") - header.append(f" {content}") - - meta = [] - days = days_until_due(task) - if days is not None: - if days == 0: - meta.append("due today") - elif days > 0: - meta.append(f"{days}d overdue") - else: - meta.append(f"due in {-days}d") - recurrence = recurrence_string(task) - if recurrence: - meta.append(f"↻ {recurrence}") - if meta: - header.append(f" ({', '.join(meta)})", style="dim") - console.print(header) - - # Description indented (escape rich markup to preserve brackets) - if description: - for line in description.split("\n"): - console.print(f" {escape(line)}", 
style="dim") - - console.print() - - return 0 - - -if __name__ == "__main__": - sys.exit(main()) From 29e0f012cd43d7185ed37a0a037695c6b52abc03 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Wed, 3 Jun 2026 21:39:41 -0700 Subject: [PATCH 110/122] C0: pin Quartz docs build to v4.5.2 (v5.0.0 broke build) The Dagger build_docs pipeline cloned Quartz from the default branch unpinned. Quartz v5.0.0 restructured its config layout (.quartz/plugins, ../quartz imports), breaking the docs build against our existing quartz.config.ts / quartz.layout.ts. Pin the clone to the last v4 release (v4.5.2) to restore known-good behavior. Co-Authored-By: Claude Opus 4.8 (1M context) --- docs/changelog.d/+pin-quartz-v4.bugfix.md | 1 + src/blumeops/main.py | 4 ++++ 2 files changed, 5 insertions(+) create mode 100644 docs/changelog.d/+pin-quartz-v4.bugfix.md diff --git a/docs/changelog.d/+pin-quartz-v4.bugfix.md b/docs/changelog.d/+pin-quartz-v4.bugfix.md new file mode 100644 index 0000000..e073bbb --- /dev/null +++ b/docs/changelog.d/+pin-quartz-v4.bugfix.md @@ -0,0 +1 @@ +Pin the Quartz docs build to v4.5.2. The Dagger `build_docs` pipeline cloned Quartz from the default branch unpinned; Quartz v5.0.0 restructured its config layout (`.quartz/plugins`, `../quartz` imports) and broke the docs build against our existing `quartz.config.ts`/`quartz.layout.ts`. diff --git a/src/blumeops/main.py b/src/blumeops/main.py index 94b932b..9bbd12f 100644 --- a/src/blumeops/main.py +++ b/src/blumeops/main.py @@ -80,6 +80,10 @@ class Blumeops: "git", "clone", "--depth=1", + # Pin to last v4 release. v5.0.0 restructured config + # layout (.quartz/plugins, ../quartz imports) and breaks + # our quartz.config.ts/quartz.layout.ts. See changelog. 
+ "--branch=v4.5.2", "https://github.com/jackyzha0/quartz.git", "/tmp/quartz", ] From 8f72f04d5cf5c507d0a9e8163d07d666975b53b7 Mon Sep 17 00:00:00 2001 From: Forgejo Actions Date: Wed, 3 Jun 2026 21:52:22 -0700 Subject: [PATCH 111/122] Update docs release to v1.17.0 - Built changelog from towncrier fragments [skip ci] --- CHANGELOG.md | 253 ++++++++++++++++++ ansible/roles/docs/defaults/main.yml | 3 +- .../+1password-backup-doc-export-name.doc.md | 1 - .../+agent-file-neutralization.ai.md | 1 - .../+ai-scraper-mitigation-doc.doc.md | 1 - .../+alloy-main-sha-rebuild.infra.md | 5 - .../+alloy-native-macos-v1.16.0.infra.md | 6 - .../+argocd-resource-limits.infra.md | 1 - .../+claude-md-import-agents.ai.md | 1 - ...ontainer-build-suggest-runner-logs.misc.md | 1 - .../+fix-forge-static-assets.bugfix.md | 1 - .../+fly-deploy-immediate-strategy.infra.md | 1 - .../+forge-mirrors-blackhole.infra.md | 1 - .../+frigate-notify-local.infra.md | 1 - .../+grafana-recreate-strategy.infra.md | 1 - .../+homepage-config-perms-fix.bugfix.md | 5 - .../+homepage-dedup-migrated.misc.md | 5 - .../+immich-probe-ringtail.infra.md | 1 - ...anage-forgejo-mirrors-sync-location.doc.md | 1 - docs/changelog.d/+pin-quartz-v4.bugfix.md | 1 - .../+prowler-rebuild-on-main.infra.md | 1 - .../+remove-devpi-container-build.misc.md | 1 - .../+retire-todoist-for-heph.infra.md | 1 - docs/changelog.d/+review-1password-doc.doc.md | 1 - .../+review-compliance-image-iac.feature.md | 1 - .../+review-contributing-doc.doc.md | 1 - docs/changelog.d/+review-index-doc.doc.md | 1 - docs/changelog.d/+review-navidrome-doc.doc.md | 1 - docs/changelog.d/+review-ollama-doc.doc.md | 1 - .../+ringtail-clone-via-tailnet.infra.md | 1 - .../+ringtail-coredump-size-cap.infra.md | 1 - ...+ringtail-flake-update-2026-06-01.infra.md | 4 - docs/changelog.d/+ringtail-proton-ge.infra.md | 4 - .../+ringtail-sn2-prelaunch.infra.md | 6 - .../+ringtail-sway-fuzzel.bugfix.md | 3 - .../+ringtail-vrr-flicker.bugfix.md | 1 - 
...ate-fly-deploy-token-shell-examples.doc.md | 1 - docs/changelog.d/+runner-logs-auth.feature.md | 1 - .../+runner-logs-missing-log.misc.md | 1 - .../changelog.d/+shower-1.1.1-deploy.infra.md | 1 - .../+shower-1.1.1-fod-pin.infra.md | 1 - docs/changelog.d/+shower-1.1.1.infra.md | 1 - .../changelog.d/+shower-1.1.3-deploy.infra.md | 1 - docs/changelog.d/+shower-1.1.3.infra.md | 1 - .../+shower-main-sha-rebuild.infra.md | 5 - .../+shower-rebuild-from-main-sha.misc.md | 6 - ...hower-v1.1.2-rebuild-from-main-sha.misc.md | 1 - .../+tailscale-main-sha-rebuild.infra.md | 1 - .../+transmission-doc-review.doc.md | 1 - .../+unpoller-rebuild-on-main.infra.md | 1 - .../+valkey-main-tag-bump.infra.md | 1 - .../+valkey-rebuild-on-main.infra.md | 1 - .../+wave1-decommission-followups.infra.md | 8 - .../+zot-ci-rotation-op-syntax.doc.md | 1 - docs/changelog.d/+zot-v2.1.16.infra.md | 1 - docs/changelog.d/alloy-v1.16.0.infra.md | 5 - ...ckup-grafana-ringtail-blumeops-pg.infra.md | 8 - ...cleanup-cv-docs-minikube-artifacts.misc.md | 1 - ...dagger-0-20-6-runner-image-alpine.infra.md | 1 - .../decommission-wave1-minikube.infra.md | 8 - .../doc-review-replicating-blumeops.doc.md | 1 - .../fix-borgmatic-shower-via-ssh.bugfix.md | 14 - ...o-runner-v12-8-server-connections.infra.md | 1 - .../changelog.d/homepage-to-ringtail.infra.md | 8 - .../migrate-cv-docs-to-indri.infra.md | 1 - .../migrate-devpi-to-indri.infra.md | 1 - .../migrate-immich-to-ringtail.infra.md | 13 - .../migrate-wave1-ringtail.infra.md | 13 - .../mirror-tailscale-container.infra.md | 1 - .../changelog.d/prowler-iac-mutelist.infra.md | 1 - .../recurring-maintenance-2026-05-27.doc.md | 1 - .../recurring-maintenance-2026-05-27.infra.md | 4 - .../review-ringtail-flake-2026-05-11.infra.md | 1 - docs/changelog.d/ringtail-static-ip.infra.md | 1 - .../rip-out-compensating-controls.infra.md | 1 - .../service-review-mealie-2026-05-11.infra.md | 1 - docs/changelog.d/shower-app-deploy.bugfix.md | 13 - 
docs/changelog.d/shower-app-deploy.feature.md | 4 - docs/changelog.d/shower-app-deploy.infra.md | 9 - docs/changelog.d/shower-v1.1.0.feature.md | 15 -- docs/changelog.d/shower-v1.1.2.infra.md | 1 - docs/changelog.d/unpoller-v3.infra.md | 1 - .../update-tooling-deps-2026-04.doc.md | 1 - .../update-tooling-deps-2026-04.infra.md | 1 - docs/changelog.d/valkey-mirror.infra.md | 1 - docs/changelog.d/valkey-nix.infra.md | 1 - 86 files changed, 254 insertions(+), 234 deletions(-) delete mode 100644 docs/changelog.d/+1password-backup-doc-export-name.doc.md delete mode 100644 docs/changelog.d/+agent-file-neutralization.ai.md delete mode 100644 docs/changelog.d/+ai-scraper-mitigation-doc.doc.md delete mode 100644 docs/changelog.d/+alloy-main-sha-rebuild.infra.md delete mode 100644 docs/changelog.d/+alloy-native-macos-v1.16.0.infra.md delete mode 100644 docs/changelog.d/+argocd-resource-limits.infra.md delete mode 100644 docs/changelog.d/+claude-md-import-agents.ai.md delete mode 100644 docs/changelog.d/+container-build-suggest-runner-logs.misc.md delete mode 100644 docs/changelog.d/+fix-forge-static-assets.bugfix.md delete mode 100644 docs/changelog.d/+fly-deploy-immediate-strategy.infra.md delete mode 100644 docs/changelog.d/+forge-mirrors-blackhole.infra.md delete mode 100644 docs/changelog.d/+frigate-notify-local.infra.md delete mode 100644 docs/changelog.d/+grafana-recreate-strategy.infra.md delete mode 100644 docs/changelog.d/+homepage-config-perms-fix.bugfix.md delete mode 100644 docs/changelog.d/+homepage-dedup-migrated.misc.md delete mode 100644 docs/changelog.d/+immich-probe-ringtail.infra.md delete mode 100644 docs/changelog.d/+manage-forgejo-mirrors-sync-location.doc.md delete mode 100644 docs/changelog.d/+pin-quartz-v4.bugfix.md delete mode 100644 docs/changelog.d/+prowler-rebuild-on-main.infra.md delete mode 100644 docs/changelog.d/+remove-devpi-container-build.misc.md delete mode 100644 docs/changelog.d/+retire-todoist-for-heph.infra.md delete mode 100644 
docs/changelog.d/+review-1password-doc.doc.md delete mode 100644 docs/changelog.d/+review-compliance-image-iac.feature.md delete mode 100644 docs/changelog.d/+review-contributing-doc.doc.md delete mode 100644 docs/changelog.d/+review-index-doc.doc.md delete mode 100644 docs/changelog.d/+review-navidrome-doc.doc.md delete mode 100644 docs/changelog.d/+review-ollama-doc.doc.md delete mode 100644 docs/changelog.d/+ringtail-clone-via-tailnet.infra.md delete mode 100644 docs/changelog.d/+ringtail-coredump-size-cap.infra.md delete mode 100644 docs/changelog.d/+ringtail-flake-update-2026-06-01.infra.md delete mode 100644 docs/changelog.d/+ringtail-proton-ge.infra.md delete mode 100644 docs/changelog.d/+ringtail-sn2-prelaunch.infra.md delete mode 100644 docs/changelog.d/+ringtail-sway-fuzzel.bugfix.md delete mode 100644 docs/changelog.d/+ringtail-vrr-flicker.bugfix.md delete mode 100644 docs/changelog.d/+rotate-fly-deploy-token-shell-examples.doc.md delete mode 100644 docs/changelog.d/+runner-logs-auth.feature.md delete mode 100644 docs/changelog.d/+runner-logs-missing-log.misc.md delete mode 100644 docs/changelog.d/+shower-1.1.1-deploy.infra.md delete mode 100644 docs/changelog.d/+shower-1.1.1-fod-pin.infra.md delete mode 100644 docs/changelog.d/+shower-1.1.1.infra.md delete mode 100644 docs/changelog.d/+shower-1.1.3-deploy.infra.md delete mode 100644 docs/changelog.d/+shower-1.1.3.infra.md delete mode 100644 docs/changelog.d/+shower-main-sha-rebuild.infra.md delete mode 100644 docs/changelog.d/+shower-rebuild-from-main-sha.misc.md delete mode 100644 docs/changelog.d/+shower-v1.1.2-rebuild-from-main-sha.misc.md delete mode 100644 docs/changelog.d/+tailscale-main-sha-rebuild.infra.md delete mode 100644 docs/changelog.d/+transmission-doc-review.doc.md delete mode 100644 docs/changelog.d/+unpoller-rebuild-on-main.infra.md delete mode 100644 docs/changelog.d/+valkey-main-tag-bump.infra.md delete mode 100644 docs/changelog.d/+valkey-rebuild-on-main.infra.md delete mode 100644 
docs/changelog.d/+wave1-decommission-followups.infra.md delete mode 100644 docs/changelog.d/+zot-ci-rotation-op-syntax.doc.md delete mode 100644 docs/changelog.d/+zot-v2.1.16.infra.md delete mode 100644 docs/changelog.d/alloy-v1.16.0.infra.md delete mode 100644 docs/changelog.d/backup-grafana-ringtail-blumeops-pg.infra.md delete mode 100644 docs/changelog.d/cleanup-cv-docs-minikube-artifacts.misc.md delete mode 100644 docs/changelog.d/dagger-0-20-6-runner-image-alpine.infra.md delete mode 100644 docs/changelog.d/decommission-wave1-minikube.infra.md delete mode 100644 docs/changelog.d/doc-review-replicating-blumeops.doc.md delete mode 100644 docs/changelog.d/fix-borgmatic-shower-via-ssh.bugfix.md delete mode 100644 docs/changelog.d/forgejo-runner-v12-8-server-connections.infra.md delete mode 100644 docs/changelog.d/homepage-to-ringtail.infra.md delete mode 100644 docs/changelog.d/migrate-cv-docs-to-indri.infra.md delete mode 100644 docs/changelog.d/migrate-devpi-to-indri.infra.md delete mode 100644 docs/changelog.d/migrate-immich-to-ringtail.infra.md delete mode 100644 docs/changelog.d/migrate-wave1-ringtail.infra.md delete mode 100644 docs/changelog.d/mirror-tailscale-container.infra.md delete mode 100644 docs/changelog.d/prowler-iac-mutelist.infra.md delete mode 100644 docs/changelog.d/recurring-maintenance-2026-05-27.doc.md delete mode 100644 docs/changelog.d/recurring-maintenance-2026-05-27.infra.md delete mode 100644 docs/changelog.d/review-ringtail-flake-2026-05-11.infra.md delete mode 100644 docs/changelog.d/ringtail-static-ip.infra.md delete mode 100644 docs/changelog.d/rip-out-compensating-controls.infra.md delete mode 100644 docs/changelog.d/service-review-mealie-2026-05-11.infra.md delete mode 100644 docs/changelog.d/shower-app-deploy.bugfix.md delete mode 100644 docs/changelog.d/shower-app-deploy.feature.md delete mode 100644 docs/changelog.d/shower-app-deploy.infra.md delete mode 100644 docs/changelog.d/shower-v1.1.0.feature.md delete mode 100644 
docs/changelog.d/shower-v1.1.2.infra.md delete mode 100644 docs/changelog.d/unpoller-v3.infra.md delete mode 100644 docs/changelog.d/update-tooling-deps-2026-04.doc.md delete mode 100644 docs/changelog.d/update-tooling-deps-2026-04.infra.md delete mode 100644 docs/changelog.d/valkey-mirror.infra.md delete mode 100644 docs/changelog.d/valkey-nix.infra.md diff --git a/CHANGELOG.md b/CHANGELOG.md index 7ae5f8e..0499154 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,259 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). +## [v1.17.0] - 2026-06-03 + +### Features + +- Deploy the Adelaide / Heidi / Addie baby shower app — guest splash, raffle + picker, and prize assignment console — on ringtail k3s with `shower.eblu.me` + as the public entry and `shower.ops.eblu.me` as the tailnet admin host. App + source: [`adelaide-baby-shower-app`](https://forge.eblu.me/eblume/adelaide-baby-shower-app). +- Deploy adelaide-baby-shower-app v1.1.0 to ringtail k3s. Replaces the + boolean lock with a four-phase `ShowerState` (`pre_event` → `party` → + `prizes_locked` → `event_locked`), adds an append-only "guest memories" + panel where guests can leave photos and comments for the baby, and + polishes the admin and QR views. Three Django migrations + (`0009_shower_phase`, `0010_guest_memories`, `0011_book_description`) + run automatically in the entrypoint against the SQLite PV. No config + or env-var changes. + + Container build also gains a Forgejo-PyPI workaround: Forgejo's simple + index returns absolute file URLs hardcoded to the public ROOT_URL + (`forge.eblu.me`), which the Fly edge 403s on `/api/packages/*`. The + wheel and sdist are now both pulled via direct `fetchurl` against + `forge.ops.eblu.me` (tailnet-only) and the wheel is handed to pip as + a local path. +- `review-compliance-reports` now also fetches and summarizes the weekly Prowler container-image and IaC scans (previously only the K8s CIS in-cluster scan was processed). 
For each scan it shows status counts, severity breakdown, week-over-week delta, and — for the high-volume image/IaC scans — top-N tables grouped by check ID and resource instead of per-finding listings. +- runner-logs now authenticates with a Forgejo API token and auto-detects the repo from the git remote. Job logs are fetched via SSH to indri (reading Forgejo's on-disk zstd log files) instead of the web endpoint, which doesn't support token auth for private repos. + +### Bug Fixes + +- Fix nightly borgmatic backups failing for 2 days. The shower SQLite + dump hook referenced `kubectl --context=k3s-ringtail`, but indri's + kubeconfig deliberately doesn't carry the ringtail credentials. The + `before_backup` hook's failure aborted the entire run, taking out + *both* the local sifaka repo and the BorgBase offsite. Replaced + the inline-shell dump with a `~/bin/borgmatic-k8s-sqlite-dump` + helper deployed by the ansible role. Each dump entry now declares a + `target` of either `local:` (mealie — kubectl uses indri's + kubeconfig) or `ssh:` (shower — ssh into ringtail and + run `k3s kubectl` there, no indri-side kubeconfig needed; k3s.yaml + on ringtail is mode 644 so no sudo required). Bytes stream back via + `kubectl exec ... -- cat` rather than `kubectl cp`, since `kubectl + cp` requires `tar` inside the pod and nix-built images like shower + don't bundle it. +- Shower app container now bakes the wheel + Python deps into the image + at build time via `buildPythonPackage` instead of pip-installing on + first boot. Boots are deterministic and don't depend on forge PyPI + being reachable from the pod. The `wheelHash` in + `containers/shower/default.nix` is the sha256 sourced from the + [forge PyPI simple index](https://forge.eblu.me/api/packages/eblume/pypi/simple/adelaide-baby-shower-app/); + bumping the version means bumping that hash too. 
+ + Borgmatic now covers the shower app: SQLite is dumped from the live + pod via `kubectl exec` (mirroring the existing mealie entry, with + `context: k3s-ringtail`), and the prize-photo media share is picked up + through `/Volumes/shower` (sifaka SMB mount on indri, same pattern as + `/Volumes/photos`). +- Disabled adaptive sync (VRR) on ringtail's DP-1 output. The OMEN 27i IPS panel pumps brightness when its refresh rate swings into the low VRR range during low-framerate content (e.g. game cutscenes), producing a flicker that worsened over a session until a reboot. Pinning the panel to a fixed 165 Hz eliminates it. +- Fixed forge.eblu.me static assets (CSS, JS, images, fonts) not loading — the proxy's static asset cache block was missing the `Host` header, so Caddy couldn't route the requests. +- Fixed homepage container EACCES on cold start: the nix-built image now chowns + `/app/config` to uid 1000 at build time via `fakeRootCommands`, matching the + behavior of the old Dockerfile. Without this, homepage couldn't seed missing + skeleton configs (proxmox.yaml etc.) or create `/app/config/logs`, crashing on + its first uncached request. Caught during the ringtail cutover. +- Fixed sway keybindings on ringtail — the home-manager `keybindings` block was replacing the module's defaults entirely, leaving only explicit overrides (no workspace switching, focus, move, splits, resize mode, etc.). Switched to `lib.mkOptionDefault` with `lib.mkForce` on the conflicting custom binds (`Mod+Return`, `Mod+d`, `Mod+space`, `Mod+l`) so defaults merge back in. Also added `Mod+F1` to show a filterable fuzzel list of current keybindings. + + Fixed fuzzel config errors on launch — `border-radius` and `border-width` were under `[main]`, but fuzzel expects them as `radius`/`width` under a `[border]` section. +- Pin the Quartz docs build to v4.5.2. 
The Dagger `build_docs` pipeline cloned Quartz from the default branch unpinned; Quartz v5.0.0 restructured its config layout (`.quartz/plugins`, `../quartz` imports) and broke the docs build against our existing `quartz.config.ts`/`quartz.layout.ts`. + +### Infrastructure + +- Wire the ringtail `blumeops-pg` cluster (which holds the wave-1-migrated + paperless + teslamate databases) into backups and Grafana. Adds a Tailscale + LoadBalancer Service (`blumeops-pg-ringtail.tail8d86e.ts.net`) and a Caddy L4 + route (`pg.ops.eblu.me:5434`), then repoints borgmatic's `teslamate` + + `paperless` postgres dumps and the `mealie` SQLite dump at ringtail, and the + Grafana TeslaMate datasource at the ringtail DB. Closes the backup gap that + opened at cutover (the migrated live data was still being backed up from the + now-frozen minikube copies) and unblocks the wave-1 decommission. +- Migrated homepage dashboard from minikube (indri/arm64) to k3s (ringtail/amd64). + The container is now built via nix (`containers/homepage/default.nix`), adapted + from nixpkgs `homepage-dashboard` with the upstream Next.js cache patches and + wrapped with `dockerTools.buildLayeredImage`. Autodiscovery shifts: services on + minikube (ArgoCD, Immich, Kiwix, Mealie, Miniflux, Grafana, Prometheus, + Navidrome, Paperless, TeslaMate, Transmission) become explicit static entries + in `services.yaml`; ringtail services (Authentik, Frigate/NVR, Ntfy, Ollama) + auto-populate via Ingress annotations. +- Migrated CV (`cv.eblu.me`) and Docs (`docs.eblu.me`) from minikube Deployments to indri-native ansible roles. Caddy now serves the extracted release tarballs directly via a new `kind: static` service-block in the Caddy template — no daemon, no container — replacing the prior nginx-in-a-pod layer. Removes a network hop on every request and shrinks minikube's footprint. See [[cv-on-indri]] and [[docs-on-indri]]. Part of the broader minikube wind-down. 
+- Migrated devpi (PyPI mirror at `pypi.ops.eblu.me`) from a minikube StatefulSet to a launchd-managed service on indri. devpi-server now runs in a uv-managed venv with pinned `devpi-server` and `devpi-web` versions, listens on `127.0.0.1:3141`, and is fronted by Caddy. The minikube StatefulSet was crash-looping under memory pressure (and breaking the Python toolchain everywhere); the new layout removes a layer of dependency on cluster health for critical-path tooling. See [[devpi-on-indri]]. +- Move the entire Immich stack — server, machine-learning, valkey, + and the PostgreSQL+VectorChord cluster — off `minikube-indri` and + onto `k3s-ringtail`. Postgres data migrated zero-loss via CNPG + `pg_basebackup` (replica catch-up then promote); row counts on + `asset`, `user`, `album`, `smart_search`, `activity`, `asset_face` + verified equal between source and replica before cutover. The ML + pod now uses ringtail's RTX 4080 via the nvidia-device-plugin + (time-slicing bumped 2 → 4 to share with frigate + ollama). Caddy + routing at `photos.ops.eblu.me` is unchanged (still + `photos.tail8d86e.ts.net`, the device just lives on ringtail now). + Borgmatic backups continue against the same `immich-pg` tailnet + hostname. First concrete chain in the broader indri-k8s + decommission effort. +- Add local nix container build for `tailscale` (`containers/tailscale/default.nix`) so ringtail's tailscale-operator ProxyClass proxy pods pull from the forge mirror instead of `docker.io/tailscale/tailscale`. Pinned at v1.94.2 to match `service-versions.yaml`. Indri's tailscale-operator continues to use upstream during the k8s-to-ringtail migration. +- Address the 6 critical Prowler IaC findings against `argocd/manifests/`. Prowler's IaC provider hardcodes `self._mutelist = None` and delegates filtering to Trivy, but doesn't plumb `--ignorefile` through — so the documented "use Trivy filtering" path is actually broken. 
Added a shim around `trivy` in the Prowler image that injects `--ignorefile $TRIVY_IGNOREFILE` for `trivy fs` invocations when the env var points at a real file. The IaC cronjob now mounts `mutelist/trivyignore.yaml` (Trivy's per-path schema) and sets the env var, muting the `external-secrets` and `kube-state-metrics` Secret-access findings (KSV-0041, KSV-0114). Separately, `grafana-clusterrole` is tightened to remove `secrets` access entirely: the dashboard sidecar already only consumes ConfigMap-labeled dashboards, so its `RESOURCE` env var is now `configmap` instead of `both`. +- Pin ringtail's wired IP to `192.168.1.21` via NixOS scripted networking; NetworkManager no longer manages `enp5s0`. Removes DHCP lease renewal as a failure mode after a silent lease teardown took ringtail offline. Also explicitly enables `net.ipv4.ip_forward` (previously set implicitly by scripted-DHCP) so k3s pod networking and Tailscale routing continue to work with static networking. +- Ripped out the compensating-controls (CC) framework: deleted `compensating-controls.yaml`, the `review-compensating-controls` mise task, and the associated how-to / explanation docs. Prowler and Kingfisher continue to run weekly and produce reports; the Prowler mutelist YAML files remain in place but no longer carry `CC: ` prefixes — each entry just keeps a free-form `Description` of why the finding is muted. The CC review cadence proved to be more overhead than this single-operator homelab needed. +- Wire shower app for public exposure: fly nginx `shower.eblu.me` server + block as a guest-only surface — splash page, `/prizes//`, static + assets, media. Everything authenticated (`/admin/`, `/host/`, + `/accounts/`) returns 403 with a "tailnet only" pointer. Staff hit + `shower.ops.eblu.me` for the operator console + admin; the app's + v1.0.1 `DJANGO_PUBLIC_URL_BASE` setting makes QR codes generated on + the tailnet point back at the WAN host for guests. 
Plus a Caddy route + on indri, Pulumi Gandi CNAME, and a Grafana APM dashboard tracking + request rate, error rate, latency, bandwidth, and access logs. +- Mirror Valkey 8.1 locally as `registry.ops.eblu.me/blumeops/valkey`. Replaces direct pulls of `docker.io/valkey/valkey:8.1-alpine` for paperless and immich sidecars. Built via native Dagger pipeline on Alpine 3.22. Stateless swap — no data migration. Authentik's nix-built Redis remains separate. +- Add nix-built amd64 valkey for ringtail (`containers/valkey/default.nix`) so immich-ringtail can stop pulling the upstream multi-arch `docker.io/valkey/valkey` image. Existing `container.py` continues to build Alpine arm64 for paperless on indri. Both bump to valkey 8.1.7 (Alpine 3.22 8.1.7-r0 / nixpkgs 8.1.7). +- Upgrade Grafana Alloy v1.14.0 → v1.16.0 across all four service deployments + (alloy-k8s, alloy-ringtail, alloy-tracing-ringtail on k8s; alloy native on + indri). Pulls in stable database observability (v1.15) and the OTel Collector + v0.147.0 bump. Container build also migrated from Dockerfile to native Dagger + `container.py` per the build-container-image migration playbook. +- Upgraded Dagger from v0.20.1 to v0.20.6 (engine, CLI pin, and SDK regen) and migrated `runner-job-image` from a Debian-based Dockerfile to a native Dagger `container.py` on Alpine 3.23, reusing the shared `alpine_runtime` helper. +- Decommission the wave-1 services on minikube-indri now that paperless, + teslamate, and mealie run on ringtail with their data backed up. Removes the + minikube `paperless`/`teslamate`/`mealie` manifest dirs + ArgoCD app + definitions (pruning the parked Deployments, Services, and the redundant + minikube mealie/paperless PVCs), and drops the `paperless`/`teslamate` roles + from the minikube `blumeops-pg` cluster. The `paperless` and `teslamate` + databases are dropped from indri's blumeops-pg as the finalization step. + miniflux + authentik remain on the minikube cluster (later waves). 
+- Upgraded the k8s Forgejo runner to the v12.8 line, switched it from first-boot registration to declarative `server.connections` credentials from 1Password, and consolidated the supporting runner how-to documentation. +- Move paperless, teslamate, and mealie off `minikube-indri` onto + `k3s-ringtail`, shedding ~1.1 GiB of resident load from the + OOM-thrashing 8 GiB minikube node (the kernel OOM killer had been + killing `kube-apiserver`/`dockerd`/argocd, flapping every + minikube-hosted service at once). paperless + teslamate databases + move into a fresh CNPG `blumeops-pg` cluster on ringtail via a cold + `pg_dump`/`pg_restore` from the quiesced source — row counts verified + equal before any routing flip; source DBs dropped only after the + ringtail side serves traffic. mealie's SQLite PVC is copied as-is. + paperless media stays on sifaka NFS. Downtime-tolerant cold cutover + (no streaming replication); rollback is repoint-and-scale-up with the + source untouched. Second chain in the indri-k8s decommission after + [[migrate-immich-to-ringtail]]. +- Recurring maintenance batch: + + - Ringtail flake inputs refreshed (`disko`, `home-manager`, `nixpkgs`). + - Tooling deps bumped: prek hooks (trufflehog v3.95.3, kingfisher v1.101.0, ruff v0.15.14, `ansible-core` 2.21.0); fly proxy base images (nginx 1.30.1-alpine, alloy v1.16.1); `typer==0.26.2` in mise tasks. +- Updated `nixos/ringtail/flake.lock` (weekly cadence): `disko`, `home-manager`, and `nixpkgs` inputs refreshed. `nixpkgs-services` skipped per overlay convention. +- Reviewed `mealie` service version freshness; upstream is 5 minor versions ahead (v3.17.0 vs deployed v3.12.0). Marked reviewed; upgrade deferred. +- Deploy shower v1.1.2 — bump container build to new app release. +- Upgrade unpoller v2.34.0 → v3.2.0 and migrate container build from Dockerfile to native Dagger (container.py). 
v3.0.0 carries breaking UniFi API changes; v3.2.0 introduces a 60s background poll (cached scrapes) by default — set `interval = 0` in `up.conf` to restore on-demand polling. +- Monthly tooling dependency refresh: prek hooks (trufflehog, kingfisher, ruff, shfmt, prettier, actionlint, ansible-lint), fly proxy base images (nginx 1.30.0, tailscale v1.94.2, alloy v1.16.0), normalize pyyaml lower bound in mise-tasks. +- Add GE-Proton (`pkgs.proton-ge-bin`) to `programs.steam.extraCompatPackages` + on ringtail. Subnautica 2 hangs at Mercuna plugin init under Proton + Experimental + DXVK D3D12; GE-Proton is available as a Steam per-game + compatibility option to work around it. +- Add `sn2-prelaunch` Steam launch wrapper on ringtail that removes + Subnautica 2's stale `Saved/running.dat` and `Saved/beforelobby.dat` + lockfiles before each launch. SN2 pops up an invisible (0×0-sized) + Error dialog when it detects an unclean exit, blocking GameThread + forever; this is observable only as a black screen with a spinning + loader. Use via Steam launch option: `sn2-prelaunch %command%`. +- Add local nix container build for `frigate-notify` (`containers/frigate-notify/default.nix`) so the Frigate→ntfy bridge is rebuilt on ringtail from the forge mirror instead of pulled from `ghcr.io/0x2142/frigate-notify`. +- Add resource limits to all ArgoCD pods to prevent unbounded resource consumption during node-wide pressure events. +- Black-hole the `/mirrors/*` repositories at the Fly proxy edge (`return 403` → `forge.ops.eblu.me`). A surprise $29.60 Fly bill traced to ~1.24 TB/30d of egress on `forge.eblu.me`, 99.95% of all proxy egress — of which ~71% was AI scrapers (Meta `meta-externalagent`, OpenAI `GPTBot`, Amazonbot) crawling the near-infinite git-history URL space of the public mirror repos and timing out Forgejo in the process. Mirrors exist for supply-chain control and are consumed over the tailnet, so their public web UI had no legitimate audience. 
`robots.txt` already disallowed `/mirrors/`, but the offending agents ignore it. Tier-2 mitigations (user-agent denylist, Anubis proof-of-work gateway) are documented in `docs/explanation/ai-scraper-mitigation.md`. +- Bump paperless and immich kustomizations to the main-SHA-built valkey tag (`v8.1.6-r0-fabca04`). Routine post-merge follow-up to keep production manifests pointing at images built from a commit on main. +- Bump shower container to v1.1.1 (probe FOD hash). +- Bumped shower app to v1.1.3 (wheel/sdist + FOD hashes probed on ringtail). +- Cap systemd-coredump on ringtail (ProcessSizeMax/ExternalSizeMax 1G, MaxUse 2G) so multi-GB Wine/Proton game crash dumps no longer thrash the disk and lock up the desktop. +- Deploy shower v1.1.1 to ringtail (kustomize newTag bump). +- Deployed shower v1.1.3 to ringtail (image built and pushed from ringtail; runner bypassed due to indri overload). +- Fix three follow-ups from the wave-1 decommission: grant the local + break-glass `admin` account ArgoCD admin rights (`g, admin, role:admin` — + previously only the Authentik `admins` group had access, so admin was + locked out whenever its token expired), and repoint the alloy blackbox + probe for teslamate from the deleted minikube service to + `https://tesla.ops.eblu.me/` (through Caddy over Tailscale). The orphaned + paperless/teslamate roles + ExternalSecrets left on the minikube + blumeops-pg are also cleaned up. +- Moved the Immich blackbox health probe from indri's alloy to ringtail's alloy. After the immich migration to ringtail, the probe still targeted `immich-server.immich.svc.cluster.local` on indri's cluster where the service no longer exists, causing a persistent `ServiceProbeFailure` alert. +- Pin shower v1.1.1 FOD outputHash (probed locally on ringtail). +- Rebuild Prowler container against main HEAD (v5.23.0-495e45d) after merging the IaC mutelist Dockerfile changes. 
+- Rebuild and retag alloy v1.16.0 container images from the main-branch SHA + following the squash-merge of #345, per the build-container-image + squash-merge convention. Both images (`registry.ops.eblu.me/blumeops/alloy`) + now reference `9564435` rather than the branch SHA `26a3ab5`, restoring + source traceability after branch cleanup. +- Rebuild shower from the post-merge commit on main so the container's + SHA tag points at a commit that will still exist after the 30-day + branch-cleanup window. Functionally identical to the branch-tag image + already deployed, just preserves source traceability per + [[build-container-image#Squash-merge and container tags]]. +- Rebuild unpoller container from squashed main commit so the image SHA tag matches a commit in main's history (was tagged with the pre-squash branch SHA). +- Rebuild valkey container from squashed main commit (both arm64 dagger and amd64 nix variants), and update paperless + immich-ringtail kustomizations to the main-SHA tags `v8.1.7-ecded30` and `v8.1.7-ecded30-nix`. +- Retired the `blumeops-tasks` mise task (Todoist API) in favor of `heph list --project Blumeops --json` from the self-hosted [hephaestus](https://github.com/eblume/hephaestus) system. Updated docs to point task discovery and rotation reminders at heph, and noted that the `~/code/personal/zk` zettelkasten is migrating into heph docs. +- Switch the Fly proxy deploy strategy from `bluegreen` to `immediate` in `fly/fly.toml`. With a single proxy machine, bluegreen offers little benefit — the green machine routinely failed to reach "started" inside Fly's default 5-minute deploy timeout (the cold-start sequence of `tailscaled` → `tailscale up` → wait-for-MagicDNS → nginx startup eats most of the budget), and the failed deploys would roll back. `immediate` replaces the machine in place with a brief downtime (~5–10s) but actually completes. 
+- Switch the ringtail provisioning playbook's blumeops clone URL from `forge.eblu.me` (public, via Fly proxy) to `forge.ops.eblu.me` (tailnet, direct via Caddy on indri). Ringtail is always on the tailnet, so the WAN round-trip is pure overhead — it also made `provision-ringtail` brittle whenever the Fly proxy was slow or down. +- Switched Grafana's deployment strategy from `RollingUpdate` to `Recreate`. With an RWO PVC holding the SQLite database and Bleve search index, `RollingUpdate` reliably crashloops the new pod on the index lock until rollout timeout. `Recreate` terminates the old pod first so the new one acquires the lock cleanly. +- Update `tailscale-operator-ringtail` ProxyClass to reference the `0108b68` main-SHA build of the tailscale container. Routine post-merge cleanup so the deployed image traces to a commit that survives PR branch cleanup. +- Update the ringtail NixOS flake lockfile (`nixos/ringtail/flake.lock`): bump + `nixpkgs` (b77b3de → 25f5383) and `disko` (5ba0c95 → 115e521) to latest. + `nixpkgs-services` was intentionally left pinned (skipped by the + `flake-update` pipeline). Routine recurring maintenance per [[manage-lockfile]]. +- Upgrade native macOS Alloy on indri to v1.16.0. Built on gilbert with Go + 1.26.2 + CGO (required for the macOS native DNS resolver, which Tailscale + MagicDNS depends on), scp'd to `~/.local/bin/alloy` on indri, codesigned, + and the LaunchAgent reloaded. Completes the v1.16.0 fleet upgrade started + in #345 — all four Alloy services (alloy-k8s, alloy-ringtail, + alloy-tracing-ringtail, alloy ansible) now run v1.16.0. +- Upgraded zot on indri from v2.1.15 to v2.1.16 (security fixes: TLS verification on metrics client, CORS Allow-Credentials suppression on wildcard origins, manifest/API-key body size limits). + +### Documentation + +- Reviewed `replicating-blumeops` tutorial: fixed "BluemeOps" typos (also in `contributing.md`) and added `last-reviewed` frontmatter. 
+- Reviewed [[indri]] reference card: added `devpi`, `cv`, and `docs` to the native-services list; widened the k8s note to reflect the growing set of apps now on ringtail and the planned indri-minikube decommission; added CPU/RAM specs. +- New how-to: rotate-fly-deploy-token. Documents the 75-day rotation cadence, why we use `org`-scoped tokens (silences the cosmetic metrics-token warning on `fly status` with marginal blast-radius cost given the single-app personal org), and the procedure for rotation + Forgejo Actions secret sync. +- Add `docs/explanation/ai-scraper-mitigation.md` — the egress-cost / AI-crawler threat model for the public Fly proxy, the tiered mitigation plan (Tier 1: mirror black-hole, shipped; Tier 2: user-agent denylist + Anubis; Tier 3: Cloudflare, rejected on principle), and the data behind it. +- Fix manage-forgejo-mirrors verify step — sync button is on the repo settings page ("Synchronize now"), not the main repo page. +- Fixed the `op item edit` invocation in the [[zot]] API-key rotation procedure: the previous `pbpaste | op item edit ... "field[password]=-"` stdin syntax is rejected by op 2.34 as "invalid JSON" (recent op versions treat piped input as a full JSON template, not a single field value). Procedure now reads the clipboard into a local fish variable and passes it as an inline assignment. +- Fixed the export-filename step in [[run-1password-backup]]: 1Password's desktop app names the export `1PasswordExport--.1pux` automatically rather than letting you save to a fixed name, so the procedure now points the task at that glob instead of pretending the default name is `1Password-export.1pux`. +- Refresh the contributing tutorial: add `last-reviewed`, include the `.ai.md` changelog fragment type, and clarify that `prek` is pinned via `mise`. 
+- Review and refresh the Navidrome reference card: add `last-reviewed`, correct the scanner env var name, document the current image/version, and record routing and runtime details from the manifests. +- Review and refresh the Ollama reference card: add `last-reviewed`, bump the documented image tag to 0.20.4, and add the two `qwen3.5` models now declared in `models.txt`. +- Reviewed [[1password]] reference card: added the `blumeops` vs `Personal` vault split, noted that `onepassword-connect` runs on both indri and ringtail (not just one cluster), and pulled the `op read` vs `op item get --fields` guidance up from agent memory into the card. +- Reviewed `index.md`; added ringtail to the infrastructure overview and stamped `last-reviewed`. +- Reviewed transmission card: corrected storage layout (`/config/` is emptyDir, watch dir disabled) and noted the Prometheus exporter sidecar. +- rotate-fly-deploy-token: combine mint+store into one command with both fish and bash forms; document the `op item edit` "Password item requires ps value" validator gotcha and the placeholder-password workaround. + +### AI Assistance + +- Adopt `AGENTS.md` as the canonical agent instruction file, keep `CLAUDE.md` as a compatibility shim, and update docs to reference the neutral file and the correct agent-change-process path. +- CLAUDE.md now imports AGENTS.md via `@AGENTS.md` instead of telling agents to go read it. Claude Code only auto-loads CLAUDE.md, so the prose shim was easy to skip; the import inlines AGENTS.md into the session prompt unconditionally. + +### Miscellaneous + +- Removed the dead minikube manifests, container builds, and tooling shims left behind after the cv + docs migration to indri-native (#342). Deletes `argocd/{apps,manifests}/{cv,docs}/`, `containers/{cv,quartz}/`, and the `quartz`→`docs` mapping in `mise-tasks/container-version-check`. Bumps `docs.current-version` to `v1.16.0` (the blumeops release tag) now that the legacy nginx-base version pin is gone. 
+- Rebuild shower v1.1.0 container from main HEAD (`3c7967e`) and bump the + kustomization tag to `v1.1.0-3c7967e-nix`. The PR was squash-merged, so + the branch commit `444ff91` baked into the prior tag isn't reachable + from main's history. The new tag points at a commit that exists on + main; image content is byte-identical because the FOD output is content + addressed and the inputs didn't change. +- Rebuild shower v1.1.2 from main HEAD (a33fa47) and retag — PR #358 was squash-merged so the branch SHA baked into the prior image tag isn't reachable from main. FOD is content-addressed, so image bytes are identical; only provenance changes. +- Remove the duplicate Homepage tiles for Mealie, Paperless, Immich, and + TeslaMate. Homepage runs on ringtail and autodiscovers ringtail Ingresses via + `gethomepage.dev/*` annotations; once these services migrated to ringtail they + were discovered automatically, making their leftover static `services.yaml` + entries (needed only while they lived on minikube) redundant. +- Removed the now-unused `containers/devpi/` Dagger build artifact. Devpi runs natively on indri via uv venv; the container image is no longer referenced anywhere. Doc examples in `docs/reference/tools/dagger.md` updated to use `miniflux` as the example container name. +- `container-build-and-release` now prints the specific `mise run runner-logs ` command after dispatching, polling the Forgejo API to resolve the run number for the commit it just triggered. +- `mise run runner-logs -j ` now reports a clear error when the log file doesn't exist on indri (e.g. a runner crash that left `action_task.log_in_storage = 0`). Previously it printed only the header and exited 0, because `zstdcat` exits 0 with a "can't stat … -- ignored" stderr message and ssh+fish on indri swallows the remote exit code. 
+ + ## [v1.16.0] - 2026-04-18 ### Infrastructure diff --git a/ansible/roles/docs/defaults/main.yml b/ansible/roles/docs/defaults/main.yml index f09221b..a5a1a8a 100644 --- a/ansible/roles/docs/defaults/main.yml +++ b/ansible/roles/docs/defaults/main.yml @@ -3,9 +3,8 @@ # Caddy serves docs_content_dir directly via the static-kind service block, # with Quartz-style try_files (path → path/ → path.html → 404). -docs_version: "v1.16.0" +docs_version: "v1.17.0" docs_release_url: "https://forge.eblu.me/eblume/blumeops/releases/download/{{ docs_version }}/docs-{{ docs_version }}.tar.gz" - docs_home: /Users/erichblume/blumeops/docs docs_content_dir: "{{ docs_home }}/content" docs_version_sentinel: "{{ docs_home }}/.installed-version" diff --git a/docs/changelog.d/+1password-backup-doc-export-name.doc.md b/docs/changelog.d/+1password-backup-doc-export-name.doc.md deleted file mode 100644 index 6c4d262..0000000 --- a/docs/changelog.d/+1password-backup-doc-export-name.doc.md +++ /dev/null @@ -1 +0,0 @@ -Fixed the export-filename step in [[run-1password-backup]]: 1Password's desktop app names the export `1PasswordExport--.1pux` automatically rather than letting you save to a fixed name, so the procedure now points the task at that glob instead of pretending the default name is `1Password-export.1pux`. diff --git a/docs/changelog.d/+agent-file-neutralization.ai.md b/docs/changelog.d/+agent-file-neutralization.ai.md deleted file mode 100644 index da16fba..0000000 --- a/docs/changelog.d/+agent-file-neutralization.ai.md +++ /dev/null @@ -1 +0,0 @@ -Adopt `AGENTS.md` as the canonical agent instruction file, keep `CLAUDE.md` as a compatibility shim, and update docs to reference the neutral file and the correct agent-change-process path. 
diff --git a/docs/changelog.d/+ai-scraper-mitigation-doc.doc.md b/docs/changelog.d/+ai-scraper-mitigation-doc.doc.md deleted file mode 100644 index 246fedb..0000000 --- a/docs/changelog.d/+ai-scraper-mitigation-doc.doc.md +++ /dev/null @@ -1 +0,0 @@ -Add `docs/explanation/ai-scraper-mitigation.md` — the egress-cost / AI-crawler threat model for the public Fly proxy, the tiered mitigation plan (Tier 1: mirror black-hole, shipped; Tier 2: user-agent denylist + Anubis; Tier 3: Cloudflare, rejected on principle), and the data behind it. diff --git a/docs/changelog.d/+alloy-main-sha-rebuild.infra.md b/docs/changelog.d/+alloy-main-sha-rebuild.infra.md deleted file mode 100644 index 42a7b37..0000000 --- a/docs/changelog.d/+alloy-main-sha-rebuild.infra.md +++ /dev/null @@ -1,5 +0,0 @@ -Rebuild and retag alloy v1.16.0 container images from the main-branch SHA -following the squash-merge of #345, per the build-container-image -squash-merge convention. Both images (`registry.ops.eblu.me/blumeops/alloy`) -now reference `9564435` rather than the branch SHA `26a3ab5`, restoring -source traceability after branch cleanup. diff --git a/docs/changelog.d/+alloy-native-macos-v1.16.0.infra.md b/docs/changelog.d/+alloy-native-macos-v1.16.0.infra.md deleted file mode 100644 index 471990f..0000000 --- a/docs/changelog.d/+alloy-native-macos-v1.16.0.infra.md +++ /dev/null @@ -1,6 +0,0 @@ -Upgrade native macOS Alloy on indri to v1.16.0. Built on gilbert with Go -1.26.2 + CGO (required for the macOS native DNS resolver, which Tailscale -MagicDNS depends on), scp'd to `~/.local/bin/alloy` on indri, codesigned, -and the LaunchAgent reloaded. Completes the v1.16.0 fleet upgrade started -in #345 — all four Alloy services (alloy-k8s, alloy-ringtail, -alloy-tracing-ringtail, alloy ansible) now run v1.16.0. 
diff --git a/docs/changelog.d/+argocd-resource-limits.infra.md b/docs/changelog.d/+argocd-resource-limits.infra.md deleted file mode 100644 index ba24a5a..0000000 --- a/docs/changelog.d/+argocd-resource-limits.infra.md +++ /dev/null @@ -1 +0,0 @@ -Add resource limits to all ArgoCD pods to prevent unbounded resource consumption during node-wide pressure events. diff --git a/docs/changelog.d/+claude-md-import-agents.ai.md b/docs/changelog.d/+claude-md-import-agents.ai.md deleted file mode 100644 index f63231e..0000000 --- a/docs/changelog.d/+claude-md-import-agents.ai.md +++ /dev/null @@ -1 +0,0 @@ -CLAUDE.md now imports AGENTS.md via `@AGENTS.md` instead of telling agents to go read it. Claude Code only auto-loads CLAUDE.md, so the prose shim was easy to skip; the import inlines AGENTS.md into the session prompt unconditionally. diff --git a/docs/changelog.d/+container-build-suggest-runner-logs.misc.md b/docs/changelog.d/+container-build-suggest-runner-logs.misc.md deleted file mode 100644 index d10ea51..0000000 --- a/docs/changelog.d/+container-build-suggest-runner-logs.misc.md +++ /dev/null @@ -1 +0,0 @@ -`container-build-and-release` now prints the specific `mise run runner-logs ` command after dispatching, polling the Forgejo API to resolve the run number for the commit it just triggered. diff --git a/docs/changelog.d/+fix-forge-static-assets.bugfix.md b/docs/changelog.d/+fix-forge-static-assets.bugfix.md deleted file mode 100644 index de0517e..0000000 --- a/docs/changelog.d/+fix-forge-static-assets.bugfix.md +++ /dev/null @@ -1 +0,0 @@ -Fixed forge.eblu.me static assets (CSS, JS, images, fonts) not loading — the proxy's static asset cache block was missing the `Host` header, so Caddy couldn't route the requests. 
diff --git a/docs/changelog.d/+fly-deploy-immediate-strategy.infra.md b/docs/changelog.d/+fly-deploy-immediate-strategy.infra.md deleted file mode 100644 index 205bd6a..0000000 --- a/docs/changelog.d/+fly-deploy-immediate-strategy.infra.md +++ /dev/null @@ -1 +0,0 @@ -Switch the Fly proxy deploy strategy from `bluegreen` to `immediate` in `fly/fly.toml`. With a single proxy machine, bluegreen offers little benefit — the green machine routinely failed to reach "started" inside Fly's default 5-minute deploy timeout (the cold-start sequence of `tailscaled` → `tailscale up` → wait-for-MagicDNS → nginx startup eats most of the budget), and the failed deploys would roll back. `immediate` replaces the machine in place with a brief downtime (~5–10s) but actually completes. diff --git a/docs/changelog.d/+forge-mirrors-blackhole.infra.md b/docs/changelog.d/+forge-mirrors-blackhole.infra.md deleted file mode 100644 index 29a5e6a..0000000 --- a/docs/changelog.d/+forge-mirrors-blackhole.infra.md +++ /dev/null @@ -1 +0,0 @@ -Black-hole the `/mirrors/*` repositories at the Fly proxy edge (`return 403` → `forge.ops.eblu.me`). A surprise $29.60 Fly bill traced to ~1.24 TB/30d of egress on `forge.eblu.me`, 99.95% of all proxy egress — of which ~71% was AI scrapers (Meta `meta-externalagent`, OpenAI `GPTBot`, Amazonbot) crawling the near-infinite git-history URL space of the public mirror repos and timing out Forgejo in the process. Mirrors exist for supply-chain control and are consumed over the tailnet, so their public web UI had no legitimate audience. `robots.txt` already disallowed `/mirrors/`, but the offending agents ignore it. Tier-2 mitigations (user-agent denylist, Anubis proof-of-work gateway) are documented in `docs/explanation/ai-scraper-mitigation.md`. 
diff --git a/docs/changelog.d/+frigate-notify-local.infra.md b/docs/changelog.d/+frigate-notify-local.infra.md deleted file mode 100644 index 120f915..0000000 --- a/docs/changelog.d/+frigate-notify-local.infra.md +++ /dev/null @@ -1 +0,0 @@ -Add local nix container build for `frigate-notify` (`containers/frigate-notify/default.nix`) so the Frigate→ntfy bridge is rebuilt on ringtail from the forge mirror instead of pulled from `ghcr.io/0x2142/frigate-notify`. diff --git a/docs/changelog.d/+grafana-recreate-strategy.infra.md b/docs/changelog.d/+grafana-recreate-strategy.infra.md deleted file mode 100644 index 3662e10..0000000 --- a/docs/changelog.d/+grafana-recreate-strategy.infra.md +++ /dev/null @@ -1 +0,0 @@ -Switched Grafana's deployment strategy from `RollingUpdate` to `Recreate`. With an RWO PVC holding the SQLite database and Bleve search index, `RollingUpdate` reliably crashloops the new pod on the index lock until rollout timeout. `Recreate` terminates the old pod first so the new one acquires the lock cleanly. diff --git a/docs/changelog.d/+homepage-config-perms-fix.bugfix.md b/docs/changelog.d/+homepage-config-perms-fix.bugfix.md deleted file mode 100644 index 20e1135..0000000 --- a/docs/changelog.d/+homepage-config-perms-fix.bugfix.md +++ /dev/null @@ -1,5 +0,0 @@ -Fixed homepage container EACCES on cold start: the nix-built image now chowns -`/app/config` to uid 1000 at build time via `fakeRootCommands`, matching the -behavior of the old Dockerfile. Without this, homepage couldn't seed missing -skeleton configs (proxmox.yaml etc.) or create `/app/config/logs`, crashing on -its first uncached request. Caught during the ringtail cutover. 
diff --git a/docs/changelog.d/+homepage-dedup-migrated.misc.md b/docs/changelog.d/+homepage-dedup-migrated.misc.md deleted file mode 100644 index 9efc5ba..0000000 --- a/docs/changelog.d/+homepage-dedup-migrated.misc.md +++ /dev/null @@ -1,5 +0,0 @@ -Remove the duplicate Homepage tiles for Mealie, Paperless, Immich, and -TeslaMate. Homepage runs on ringtail and autodiscovers ringtail Ingresses via -`gethomepage.dev/*` annotations; once these services migrated to ringtail they -were discovered automatically, making their leftover static `services.yaml` -entries (needed only while they lived on minikube) redundant. diff --git a/docs/changelog.d/+immich-probe-ringtail.infra.md b/docs/changelog.d/+immich-probe-ringtail.infra.md deleted file mode 100644 index f2d3dee..0000000 --- a/docs/changelog.d/+immich-probe-ringtail.infra.md +++ /dev/null @@ -1 +0,0 @@ -Moved the Immich blackbox health probe from indri's alloy to ringtail's alloy. After the immich migration to ringtail, the probe still targeted `immich-server.immich.svc.cluster.local` on indri's cluster where the service no longer exists, causing a persistent `ServiceProbeFailure` alert. diff --git a/docs/changelog.d/+manage-forgejo-mirrors-sync-location.doc.md b/docs/changelog.d/+manage-forgejo-mirrors-sync-location.doc.md deleted file mode 100644 index f71fc81..0000000 --- a/docs/changelog.d/+manage-forgejo-mirrors-sync-location.doc.md +++ /dev/null @@ -1 +0,0 @@ -Fix manage-forgejo-mirrors verify step — sync button is on the repo settings page ("Synchronize now"), not the main repo page. diff --git a/docs/changelog.d/+pin-quartz-v4.bugfix.md b/docs/changelog.d/+pin-quartz-v4.bugfix.md deleted file mode 100644 index e073bbb..0000000 --- a/docs/changelog.d/+pin-quartz-v4.bugfix.md +++ /dev/null @@ -1 +0,0 @@ -Pin the Quartz docs build to v4.5.2. 
The Dagger `build_docs` pipeline cloned Quartz from the default branch unpinned; Quartz v5.0.0 restructured its config layout (`.quartz/plugins`, `../quartz` imports) and broke the docs build against our existing `quartz.config.ts`/`quartz.layout.ts`. diff --git a/docs/changelog.d/+prowler-rebuild-on-main.infra.md b/docs/changelog.d/+prowler-rebuild-on-main.infra.md deleted file mode 100644 index 107b687..0000000 --- a/docs/changelog.d/+prowler-rebuild-on-main.infra.md +++ /dev/null @@ -1 +0,0 @@ -Rebuild Prowler container against main HEAD (v5.23.0-495e45d) after merging the IaC mutelist Dockerfile changes. diff --git a/docs/changelog.d/+remove-devpi-container-build.misc.md b/docs/changelog.d/+remove-devpi-container-build.misc.md deleted file mode 100644 index 8ebec54..0000000 --- a/docs/changelog.d/+remove-devpi-container-build.misc.md +++ /dev/null @@ -1 +0,0 @@ -Removed the now-unused `containers/devpi/` Dagger build artifact. Devpi runs natively on indri via uv venv; the container image is no longer referenced anywhere. Doc examples in `docs/reference/tools/dagger.md` updated to use `miniflux` as the example container name. diff --git a/docs/changelog.d/+retire-todoist-for-heph.infra.md b/docs/changelog.d/+retire-todoist-for-heph.infra.md deleted file mode 100644 index f6284d0..0000000 --- a/docs/changelog.d/+retire-todoist-for-heph.infra.md +++ /dev/null @@ -1 +0,0 @@ -Retired the `blumeops-tasks` mise task (Todoist API) in favor of `heph list --project Blumeops --json` from the self-hosted [hephaestus](https://github.com/eblume/hephaestus) system. Updated docs to point task discovery and rotation reminders at heph, and noted that the `~/code/personal/zk` zettelkasten is migrating into heph docs. 
diff --git a/docs/changelog.d/+review-1password-doc.doc.md b/docs/changelog.d/+review-1password-doc.doc.md deleted file mode 100644 index bba9591..0000000 --- a/docs/changelog.d/+review-1password-doc.doc.md +++ /dev/null @@ -1 +0,0 @@ -Reviewed [[1password]] reference card: added the `blumeops` vs `Personal` vault split, noted that `onepassword-connect` runs on both indri and ringtail (not just one cluster), and pulled the `op read` vs `op item get --fields` guidance up from agent memory into the card. diff --git a/docs/changelog.d/+review-compliance-image-iac.feature.md b/docs/changelog.d/+review-compliance-image-iac.feature.md deleted file mode 100644 index 1125359..0000000 --- a/docs/changelog.d/+review-compliance-image-iac.feature.md +++ /dev/null @@ -1 +0,0 @@ -`review-compliance-reports` now also fetches and summarizes the weekly Prowler container-image and IaC scans (previously only the K8s CIS in-cluster scan was processed). For each scan it shows status counts, severity breakdown, week-over-week delta, and — for the high-volume image/IaC scans — top-N tables grouped by check ID and resource instead of per-finding listings. diff --git a/docs/changelog.d/+review-contributing-doc.doc.md b/docs/changelog.d/+review-contributing-doc.doc.md deleted file mode 100644 index c394a01..0000000 --- a/docs/changelog.d/+review-contributing-doc.doc.md +++ /dev/null @@ -1 +0,0 @@ -Refresh the contributing tutorial: add `last-reviewed`, include the `.ai.md` changelog fragment type, and clarify that `prek` is pinned via `mise`. diff --git a/docs/changelog.d/+review-index-doc.doc.md b/docs/changelog.d/+review-index-doc.doc.md deleted file mode 100644 index 7016a7a..0000000 --- a/docs/changelog.d/+review-index-doc.doc.md +++ /dev/null @@ -1 +0,0 @@ -Reviewed `index.md`; added ringtail to the infrastructure overview and stamped `last-reviewed`. 
diff --git a/docs/changelog.d/+review-navidrome-doc.doc.md b/docs/changelog.d/+review-navidrome-doc.doc.md deleted file mode 100644 index fbe5e79..0000000 --- a/docs/changelog.d/+review-navidrome-doc.doc.md +++ /dev/null @@ -1 +0,0 @@ -Review and refresh the Navidrome reference card: add `last-reviewed`, correct the scanner env var name, document the current image/version, and record routing and runtime details from the manifests. diff --git a/docs/changelog.d/+review-ollama-doc.doc.md b/docs/changelog.d/+review-ollama-doc.doc.md deleted file mode 100644 index 05ef23e..0000000 --- a/docs/changelog.d/+review-ollama-doc.doc.md +++ /dev/null @@ -1 +0,0 @@ -Review and refresh the Ollama reference card: add `last-reviewed`, bump the documented image tag to 0.20.4, and add the two `qwen3.5` models now declared in `models.txt`. diff --git a/docs/changelog.d/+ringtail-clone-via-tailnet.infra.md b/docs/changelog.d/+ringtail-clone-via-tailnet.infra.md deleted file mode 100644 index d664163..0000000 --- a/docs/changelog.d/+ringtail-clone-via-tailnet.infra.md +++ /dev/null @@ -1 +0,0 @@ -Switch the ringtail provisioning playbook's blumeops clone URL from `forge.eblu.me` (public, via Fly proxy) to `forge.ops.eblu.me` (tailnet, direct via Caddy on indri). Ringtail is always on the tailnet, so the WAN round-trip is pure overhead — it also made `provision-ringtail` brittle whenever the Fly proxy was slow or down. diff --git a/docs/changelog.d/+ringtail-coredump-size-cap.infra.md b/docs/changelog.d/+ringtail-coredump-size-cap.infra.md deleted file mode 100644 index 824b2df..0000000 --- a/docs/changelog.d/+ringtail-coredump-size-cap.infra.md +++ /dev/null @@ -1 +0,0 @@ -Cap systemd-coredump on ringtail (ProcessSizeMax/ExternalSizeMax 1G, MaxUse 2G) so multi-GB Wine/Proton game crash dumps no longer thrash the disk and lock up the desktop. 
diff --git a/docs/changelog.d/+ringtail-flake-update-2026-06-01.infra.md b/docs/changelog.d/+ringtail-flake-update-2026-06-01.infra.md deleted file mode 100644 index dd488b6..0000000 --- a/docs/changelog.d/+ringtail-flake-update-2026-06-01.infra.md +++ /dev/null @@ -1,4 +0,0 @@ -Update the ringtail NixOS flake lockfile (`nixos/ringtail/flake.lock`): bump -`nixpkgs` (b77b3de → 25f5383) and `disko` (5ba0c95 → 115e521) to latest. -`nixpkgs-services` was intentionally left pinned (skipped by the -`flake-update` pipeline). Routine recurring maintenance per [[manage-lockfile]]. diff --git a/docs/changelog.d/+ringtail-proton-ge.infra.md b/docs/changelog.d/+ringtail-proton-ge.infra.md deleted file mode 100644 index 0d8bc04..0000000 --- a/docs/changelog.d/+ringtail-proton-ge.infra.md +++ /dev/null @@ -1,4 +0,0 @@ -Add GE-Proton (`pkgs.proton-ge-bin`) to `programs.steam.extraCompatPackages` -on ringtail. Subnautica 2 hangs at Mercuna plugin init under Proton -Experimental + DXVK D3D12; GE-Proton is available as a Steam per-game -compatibility option to work around it. diff --git a/docs/changelog.d/+ringtail-sn2-prelaunch.infra.md b/docs/changelog.d/+ringtail-sn2-prelaunch.infra.md deleted file mode 100644 index f9c68e2..0000000 --- a/docs/changelog.d/+ringtail-sn2-prelaunch.infra.md +++ /dev/null @@ -1,6 +0,0 @@ -Add `sn2-prelaunch` Steam launch wrapper on ringtail that removes -Subnautica 2's stale `Saved/running.dat` and `Saved/beforelobby.dat` -lockfiles before each launch. SN2 pops up an invisible (0×0-sized) -Error dialog when it detects an unclean exit, blocking GameThread -forever; this is observable only as a black screen with a spinning -loader. Use via Steam launch option: `sn2-prelaunch %command%`. 
diff --git a/docs/changelog.d/+ringtail-sway-fuzzel.bugfix.md b/docs/changelog.d/+ringtail-sway-fuzzel.bugfix.md deleted file mode 100644 index 6801040..0000000 --- a/docs/changelog.d/+ringtail-sway-fuzzel.bugfix.md +++ /dev/null @@ -1,3 +0,0 @@ -Fixed sway keybindings on ringtail — the home-manager `keybindings` block was replacing the module's defaults entirely, leaving only explicit overrides (no workspace switching, focus, move, splits, resize mode, etc). Switched to `lib.mkOptionDefault` with `lib.mkForce` on the conflicting custom binds (`Mod+Return`, `Mod+d`, `Mod+space`, `Mod+l`) so defaults merge back in. Also added `Mod+F1` to show a filterable fuzzel list of current keybindings. - -Fixed fuzzel config errors on launch — `border-radius` and `border-width` were under `[main]`, but fuzzel expects them as `radius`/`width` under a `[border]` section. diff --git a/docs/changelog.d/+ringtail-vrr-flicker.bugfix.md b/docs/changelog.d/+ringtail-vrr-flicker.bugfix.md deleted file mode 100644 index cb23344..0000000 --- a/docs/changelog.d/+ringtail-vrr-flicker.bugfix.md +++ /dev/null @@ -1 +0,0 @@ -Disabled adaptive sync (VRR) on ringtail's DP-1 output. The OMEN 27i IPS panel pumps brightness when its refresh rate swings into the low VRR range during low-framerate content (e.g. game cutscenes), producing a flicker that worsened over a session until a reboot. Pinning the panel to a fixed 165Hz eliminates it. diff --git a/docs/changelog.d/+rotate-fly-deploy-token-shell-examples.doc.md b/docs/changelog.d/+rotate-fly-deploy-token-shell-examples.doc.md deleted file mode 100644 index 24ffcb9..0000000 --- a/docs/changelog.d/+rotate-fly-deploy-token-shell-examples.doc.md +++ /dev/null @@ -1 +0,0 @@ -rotate-fly-deploy-token: combine mint+store into one command with both fish and bash forms; document the `op item edit` "Password item requires ps value" validator gotcha and the placeholder-password workaround. 
diff --git a/docs/changelog.d/+runner-logs-auth.feature.md b/docs/changelog.d/+runner-logs-auth.feature.md deleted file mode 100644 index 9ee6fa1..0000000 --- a/docs/changelog.d/+runner-logs-auth.feature.md +++ /dev/null @@ -1 +0,0 @@ -runner-logs now authenticates with Forgejo API token and auto-detects the repo from git remote. Job logs are fetched via SSH to indri (reading Forgejo's on-disk zstd log files) instead of the web endpoint, which doesn't support token auth for private repos. diff --git a/docs/changelog.d/+runner-logs-missing-log.misc.md b/docs/changelog.d/+runner-logs-missing-log.misc.md deleted file mode 100644 index c06704a..0000000 --- a/docs/changelog.d/+runner-logs-missing-log.misc.md +++ /dev/null @@ -1 +0,0 @@ -`mise run runner-logs -j ` now reports a clear error when the log file doesn't exist on indri (e.g. a runner crash that left `action_task.log_in_storage = 0`). Previously it printed only the header and exited 0, because `zstdcat` exits 0 with a "can't stat … -- ignored" stderr message and ssh+fish on indri swallows the remote exit code. diff --git a/docs/changelog.d/+shower-1.1.1-deploy.infra.md b/docs/changelog.d/+shower-1.1.1-deploy.infra.md deleted file mode 100644 index 61244ac..0000000 --- a/docs/changelog.d/+shower-1.1.1-deploy.infra.md +++ /dev/null @@ -1 +0,0 @@ -Deploy shower v1.1.1 to ringtail (kustomize newTag bump). diff --git a/docs/changelog.d/+shower-1.1.1-fod-pin.infra.md b/docs/changelog.d/+shower-1.1.1-fod-pin.infra.md deleted file mode 100644 index a19b578..0000000 --- a/docs/changelog.d/+shower-1.1.1-fod-pin.infra.md +++ /dev/null @@ -1 +0,0 @@ -Pin shower v1.1.1 FOD outputHash (probed locally on ringtail). diff --git a/docs/changelog.d/+shower-1.1.1.infra.md b/docs/changelog.d/+shower-1.1.1.infra.md deleted file mode 100644 index eb9476c..0000000 --- a/docs/changelog.d/+shower-1.1.1.infra.md +++ /dev/null @@ -1 +0,0 @@ -Bump shower container to v1.1.1 (probe FOD hash). 
diff --git a/docs/changelog.d/+shower-1.1.3-deploy.infra.md b/docs/changelog.d/+shower-1.1.3-deploy.infra.md deleted file mode 100644 index 833fac6..0000000 --- a/docs/changelog.d/+shower-1.1.3-deploy.infra.md +++ /dev/null @@ -1 +0,0 @@ -Deployed shower v1.1.3 to ringtail (image built and pushed from ringtail; runner bypassed due to indri overload). diff --git a/docs/changelog.d/+shower-1.1.3.infra.md b/docs/changelog.d/+shower-1.1.3.infra.md deleted file mode 100644 index 33ee49d..0000000 --- a/docs/changelog.d/+shower-1.1.3.infra.md +++ /dev/null @@ -1 +0,0 @@ -Bumped shower app to v1.1.3 (wheel/sdist + FOD hashes probed on ringtail). diff --git a/docs/changelog.d/+shower-main-sha-rebuild.infra.md b/docs/changelog.d/+shower-main-sha-rebuild.infra.md deleted file mode 100644 index f1751b5..0000000 --- a/docs/changelog.d/+shower-main-sha-rebuild.infra.md +++ /dev/null @@ -1,5 +0,0 @@ -Rebuild shower from the post-merge commit on main so the container's -SHA tag points at a commit that will still exist after the 30-day -branch-cleanup window. Functionally identical to the branch-tag image -already deployed, just preserves source traceability per -[[build-container-image#Squash-merge and container tags]]. diff --git a/docs/changelog.d/+shower-rebuild-from-main-sha.misc.md b/docs/changelog.d/+shower-rebuild-from-main-sha.misc.md deleted file mode 100644 index a9495cd..0000000 --- a/docs/changelog.d/+shower-rebuild-from-main-sha.misc.md +++ /dev/null @@ -1,6 +0,0 @@ -Rebuild shower v1.1.0 container from main HEAD (`3c7967e`) and bump the -kustomization tag to `v1.1.0-3c7967e-nix`. The PR was squash-merged, so -the branch commit `444ff91` baked into the prior tag isn't reachable -from main's history. The new tag points at a commit that exists on -main; image content is byte-identical because the FOD output is content -addressed and the inputs didn't change. 
diff --git a/docs/changelog.d/+shower-v1.1.2-rebuild-from-main-sha.misc.md b/docs/changelog.d/+shower-v1.1.2-rebuild-from-main-sha.misc.md deleted file mode 100644 index 9355a54..0000000 --- a/docs/changelog.d/+shower-v1.1.2-rebuild-from-main-sha.misc.md +++ /dev/null @@ -1 +0,0 @@ -Rebuild shower v1.1.2 from main HEAD (a33fa47) and retag — PR #358 was squash-merged so the branch SHA baked into the prior image tag isn't reachable from main. FOD is content-addressed, so image bytes are identical; only provenance changes. diff --git a/docs/changelog.d/+tailscale-main-sha-rebuild.infra.md b/docs/changelog.d/+tailscale-main-sha-rebuild.infra.md deleted file mode 100644 index 24bb81c..0000000 --- a/docs/changelog.d/+tailscale-main-sha-rebuild.infra.md +++ /dev/null @@ -1 +0,0 @@ -Update `tailscale-operator-ringtail` ProxyClass to reference the `0108b68` main-SHA build of the tailscale container. Routine post-merge cleanup so the deployed image traces to a commit that survives PR branch cleanup. diff --git a/docs/changelog.d/+transmission-doc-review.doc.md b/docs/changelog.d/+transmission-doc-review.doc.md deleted file mode 100644 index 418504f..0000000 --- a/docs/changelog.d/+transmission-doc-review.doc.md +++ /dev/null @@ -1 +0,0 @@ -Reviewed transmission card: corrected storage layout (`/config/` is emptyDir, watch dir disabled) and noted the Prometheus exporter sidecar. diff --git a/docs/changelog.d/+unpoller-rebuild-on-main.infra.md b/docs/changelog.d/+unpoller-rebuild-on-main.infra.md deleted file mode 100644 index 60ae8fa..0000000 --- a/docs/changelog.d/+unpoller-rebuild-on-main.infra.md +++ /dev/null @@ -1 +0,0 @@ -Rebuild unpoller container from squashed main commit so the image SHA tag matches a commit in main's history (was tagged with the pre-squash branch SHA). 
diff --git a/docs/changelog.d/+valkey-main-tag-bump.infra.md b/docs/changelog.d/+valkey-main-tag-bump.infra.md deleted file mode 100644 index cd19f60..0000000 --- a/docs/changelog.d/+valkey-main-tag-bump.infra.md +++ /dev/null @@ -1 +0,0 @@ -Bump paperless and immich kustomizations to the main-SHA-built valkey tag (`v8.1.6-r0-fabca04`). Routine post-merge follow-up to keep production manifests pointing at images built from a commit on main. diff --git a/docs/changelog.d/+valkey-rebuild-on-main.infra.md b/docs/changelog.d/+valkey-rebuild-on-main.infra.md deleted file mode 100644 index c743e61..0000000 --- a/docs/changelog.d/+valkey-rebuild-on-main.infra.md +++ /dev/null @@ -1 +0,0 @@ -Rebuild valkey container from squashed main commit (both arm64 dagger and amd64 nix variants), and update paperless + immich-ringtail kustomizations to the main-SHA tags `v8.1.7-ecded30` and `v8.1.7-ecded30-nix`. diff --git a/docs/changelog.d/+wave1-decommission-followups.infra.md b/docs/changelog.d/+wave1-decommission-followups.infra.md deleted file mode 100644 index 7b54d52..0000000 --- a/docs/changelog.d/+wave1-decommission-followups.infra.md +++ /dev/null @@ -1,8 +0,0 @@ -Fix three follow-ups from the wave-1 decommission: grant the local -break-glass `admin` account ArgoCD admin rights (`g, admin, role:admin` — -previously only the Authentik `admins` group had access, so admin was -locked out whenever its token expired), and repoint the alloy blackbox -probe for teslamate from the deleted minikube service to -`https://tesla.ops.eblu.me/` (through Caddy over Tailscale). The orphaned -paperless/teslamate roles + ExternalSecrets left on the minikube -blumeops-pg are also cleaned up. 
diff --git a/docs/changelog.d/+zot-ci-rotation-op-syntax.doc.md b/docs/changelog.d/+zot-ci-rotation-op-syntax.doc.md deleted file mode 100644 index ec8834f..0000000 --- a/docs/changelog.d/+zot-ci-rotation-op-syntax.doc.md +++ /dev/null @@ -1 +0,0 @@ -Fixed the `op item edit` invocation in the [[zot]] API-key rotation procedure: the previous `pbpaste | op item edit ... "field[password]=-"` stdin syntax is rejected by op 2.34 as "invalid JSON" (recent op versions treat piped input as a full JSON template, not a single field value). Procedure now reads the clipboard into a local fish variable and passes it as an inline assignment. diff --git a/docs/changelog.d/+zot-v2.1.16.infra.md b/docs/changelog.d/+zot-v2.1.16.infra.md deleted file mode 100644 index f007164..0000000 --- a/docs/changelog.d/+zot-v2.1.16.infra.md +++ /dev/null @@ -1 +0,0 @@ -Upgraded zot on indri from v2.1.15 to v2.1.16 (security fixes: TLS verification on metrics client, CORS Allow-Credentials suppression on wildcard origins, manifest/API-key body size limits). diff --git a/docs/changelog.d/alloy-v1.16.0.infra.md b/docs/changelog.d/alloy-v1.16.0.infra.md deleted file mode 100644 index cd9a1ef..0000000 --- a/docs/changelog.d/alloy-v1.16.0.infra.md +++ /dev/null @@ -1,5 +0,0 @@ -Upgrade Grafana Alloy v1.14.0 → v1.16.0 across all four service deployments -(alloy-k8s, alloy-ringtail, alloy-tracing-ringtail on k8s; alloy native on -indri). Pulls in stable database observability (v1.15) and the OTel Collector -v0.147.0 bump. Container build also migrated from Dockerfile to native Dagger -`container.py` per the build-container-image migration playbook. 
diff --git a/docs/changelog.d/backup-grafana-ringtail-blumeops-pg.infra.md b/docs/changelog.d/backup-grafana-ringtail-blumeops-pg.infra.md deleted file mode 100644 index 33b041f..0000000 --- a/docs/changelog.d/backup-grafana-ringtail-blumeops-pg.infra.md +++ /dev/null @@ -1,8 +0,0 @@ -Wire the ringtail `blumeops-pg` cluster (which holds the wave-1-migrated -paperless + teslamate databases) into backups and Grafana. Adds a Tailscale -LoadBalancer Service (`blumeops-pg-ringtail.tail8d86e.ts.net`) and a Caddy L4 -route (`pg.ops.eblu.me:5434`), then repoints borgmatic's `teslamate` + -`paperless` postgres dumps and the `mealie` SQLite dump at ringtail, and the -Grafana TeslaMate datasource at the ringtail DB. Closes the backup gap that -opened at cutover (the migrated live data was still being backed up from the -now-frozen minikube copies) and unblocks the wave-1 decommission. diff --git a/docs/changelog.d/cleanup-cv-docs-minikube-artifacts.misc.md b/docs/changelog.d/cleanup-cv-docs-minikube-artifacts.misc.md deleted file mode 100644 index 79a81cf..0000000 --- a/docs/changelog.d/cleanup-cv-docs-minikube-artifacts.misc.md +++ /dev/null @@ -1 +0,0 @@ -Removed the dead minikube manifests, container builds, and tooling shims left behind after the cv + docs migration to indri-native (#342). Deletes `argocd/{apps,manifests}/{cv,docs}/`, `containers/{cv,quartz}/`, and the `quartz`→`docs` mapping in `mise-tasks/container-version-check`. Bumps `docs.current-version` to `v1.16.0` (the blumeops release tag) now that the legacy nginx-base version pin is gone. 
diff --git a/docs/changelog.d/dagger-0-20-6-runner-image-alpine.infra.md b/docs/changelog.d/dagger-0-20-6-runner-image-alpine.infra.md deleted file mode 100644 index 35f77c2..0000000 --- a/docs/changelog.d/dagger-0-20-6-runner-image-alpine.infra.md +++ /dev/null @@ -1 +0,0 @@ -Upgraded Dagger from v0.20.1 to v0.20.6 (engine, CLI pin, and SDK regen) and migrated `runner-job-image` from a Debian-based Dockerfile to a native Dagger `container.py` on Alpine 3.23, reusing the shared `alpine_runtime` helper. diff --git a/docs/changelog.d/decommission-wave1-minikube.infra.md b/docs/changelog.d/decommission-wave1-minikube.infra.md deleted file mode 100644 index 63b3ab5..0000000 --- a/docs/changelog.d/decommission-wave1-minikube.infra.md +++ /dev/null @@ -1,8 +0,0 @@ -Decommission the wave-1 services on minikube-indri now that paperless, -teslamate, and mealie run on ringtail with their data backed up. Removes the -minikube `paperless`/`teslamate`/`mealie` manifest dirs + ArgoCD app -definitions (pruning the parked Deployments, Services, and the redundant -minikube mealie/paperless PVCs), and drops the `paperless`/`teslamate` roles -from the minikube `blumeops-pg` cluster. The `paperless` and `teslamate` -databases are dropped from indri's blumeops-pg as the finalization step. -miniflux + authentik remain on the minikube cluster (later waves). diff --git a/docs/changelog.d/doc-review-replicating-blumeops.doc.md b/docs/changelog.d/doc-review-replicating-blumeops.doc.md deleted file mode 100644 index e9e6d0f..0000000 --- a/docs/changelog.d/doc-review-replicating-blumeops.doc.md +++ /dev/null @@ -1 +0,0 @@ -Reviewed `replicating-blumeops` tutorial: fixed "BluemeOps" typos (also in `contributing.md`) and added `last-reviewed` frontmatter. 
diff --git a/docs/changelog.d/fix-borgmatic-shower-via-ssh.bugfix.md b/docs/changelog.d/fix-borgmatic-shower-via-ssh.bugfix.md deleted file mode 100644 index e18272c..0000000 --- a/docs/changelog.d/fix-borgmatic-shower-via-ssh.bugfix.md +++ /dev/null @@ -1,14 +0,0 @@ -Fix nightly borgmatic backups failing for 2 days. The shower SQLite -dump hook referenced `kubectl --context=k3s-ringtail`, but indri's -kubeconfig deliberately doesn't carry the ringtail credentials. The -`before_backup` hook's failure aborted the entire run, taking out -*both* the local sifaka repo and the BorgBase offsite. Replaced -the inline-shell dump with a `~/bin/borgmatic-k8s-sqlite-dump` -helper deployed by the ansible role. Each dump entry now declares a -`target` of either `local:` (mealie — kubectl uses indri's -kubeconfig) or `ssh:` (shower — ssh into ringtail and -run `k3s kubectl` there, no indri-side kubeconfig needed; k3s.yaml -on ringtail is mode 644 so no sudo required). Bytes stream back via -`kubectl exec ... -- cat` rather than `kubectl cp`, since `kubectl -cp` requires `tar` inside the pod and nix-built images like shower -don't bundle it. diff --git a/docs/changelog.d/forgejo-runner-v12-8-server-connections.infra.md b/docs/changelog.d/forgejo-runner-v12-8-server-connections.infra.md deleted file mode 100644 index cc35684..0000000 --- a/docs/changelog.d/forgejo-runner-v12-8-server-connections.infra.md +++ /dev/null @@ -1 +0,0 @@ -Upgraded the k8s Forgejo runner to the v12.8 line, switched it from first-boot registration to declarative `server.connections` credentials from 1Password, and consolidated the supporting runner how-to documentation. diff --git a/docs/changelog.d/homepage-to-ringtail.infra.md b/docs/changelog.d/homepage-to-ringtail.infra.md deleted file mode 100644 index 1e3e795..0000000 --- a/docs/changelog.d/homepage-to-ringtail.infra.md +++ /dev/null @@ -1,8 +0,0 @@ -Migrated homepage dashboard from minikube (indri/arm64) to k3s (ringtail/amd64). 
-The container is now built via nix (`containers/homepage/default.nix`), adapted -from nixpkgs `homepage-dashboard` with the upstream Next.js cache patches and -wrapped with `dockerTools.buildLayeredImage`. Autodiscovery shifts: services on -minikube (ArgoCD, Immich, Kiwix, Mealie, Miniflux, Grafana, Prometheus, -Navidrome, Paperless, TeslaMate, Transmission) become explicit static entries -in `services.yaml`; ringtail services (Authentik, Frigate/NVR, Ntfy, Ollama) -auto-populate via Ingress annotations. diff --git a/docs/changelog.d/migrate-cv-docs-to-indri.infra.md b/docs/changelog.d/migrate-cv-docs-to-indri.infra.md deleted file mode 100644 index 608a6b9..0000000 --- a/docs/changelog.d/migrate-cv-docs-to-indri.infra.md +++ /dev/null @@ -1 +0,0 @@ -Migrated CV (`cv.eblu.me`) and Docs (`docs.eblu.me`) from minikube Deployments to indri-native ansible roles. Caddy now serves the extracted release tarballs directly via a new `kind: static` service-block in the Caddy template — no daemon, no container — replacing the prior nginx-in-a-pod layer. Removes a network hop on every request and shrinks minikube's footprint. See [[cv-on-indri]] and [[docs-on-indri]]. Part of the broader minikube wind-down. diff --git a/docs/changelog.d/migrate-devpi-to-indri.infra.md b/docs/changelog.d/migrate-devpi-to-indri.infra.md deleted file mode 100644 index 418db70..0000000 --- a/docs/changelog.d/migrate-devpi-to-indri.infra.md +++ /dev/null @@ -1 +0,0 @@ -Migrated devpi (PyPI mirror at `pypi.ops.eblu.me`) from a minikube StatefulSet to a launchd-managed service on indri. devpi-server now runs in a uv-managed venv with pinned `devpi-server` and `devpi-web` versions, listens on `127.0.0.1:3141`, and is fronted by Caddy. The minikube StatefulSet was crash-looping under memory pressure (and breaking the Python toolchain everywhere); the new layout removes a layer of dependency on cluster health for critical-path tooling. See [[devpi-on-indri]]. 
diff --git a/docs/changelog.d/migrate-immich-to-ringtail.infra.md b/docs/changelog.d/migrate-immich-to-ringtail.infra.md deleted file mode 100644 index b47742f..0000000 --- a/docs/changelog.d/migrate-immich-to-ringtail.infra.md +++ /dev/null @@ -1,13 +0,0 @@ -Move the entire Immich stack — server, machine-learning, valkey, -and the PostgreSQL+VectorChord cluster — off `minikube-indri` and -onto `k3s-ringtail`. Postgres data migrated zero-loss via CNPG -`pg_basebackup` (replica catch-up then promote); row counts on -`asset`, `user`, `album`, `smart_search`, `activity`, `asset_face` -verified equal between source and replica before cutover. The ML -pod now uses ringtail's RTX 4080 via the nvidia-device-plugin -(time-slicing bumped 2 → 4 to share with frigate + ollama). Caddy -routing at `photos.ops.eblu.me` is unchanged (still -`photos.tail8d86e.ts.net`, the device just lives on ringtail now). -Borgmatic backups continue against the same `immich-pg` tailnet -hostname. First concrete chain in the broader indri-k8s -decommission effort. diff --git a/docs/changelog.d/migrate-wave1-ringtail.infra.md b/docs/changelog.d/migrate-wave1-ringtail.infra.md deleted file mode 100644 index c44263a..0000000 --- a/docs/changelog.d/migrate-wave1-ringtail.infra.md +++ /dev/null @@ -1,13 +0,0 @@ -Move paperless, teslamate, and mealie off `minikube-indri` onto -`k3s-ringtail`, shedding ~1.1 GiB of resident load from the -OOM-thrashing 8 GiB minikube node (the kernel OOM killer had been -killing `kube-apiserver`/`dockerd`/argocd, flapping every -minikube-hosted service at once). paperless + teslamate databases -move into a fresh CNPG `blumeops-pg` cluster on ringtail via a cold -`pg_dump`/`pg_restore` from the quiesced source — row counts verified -equal before any routing flip; source DBs dropped only after the -ringtail side serves traffic. mealie's SQLite PVC is copied as-is. -paperless media stays on sifaka NFS. 
Downtime-tolerant cold cutover -(no streaming replication); rollback is repoint-and-scale-up with the -source untouched. Second chain in the indri-k8s decommission after -[[migrate-immich-to-ringtail]]. diff --git a/docs/changelog.d/mirror-tailscale-container.infra.md b/docs/changelog.d/mirror-tailscale-container.infra.md deleted file mode 100644 index 54ca3ba..0000000 --- a/docs/changelog.d/mirror-tailscale-container.infra.md +++ /dev/null @@ -1 +0,0 @@ -Add local nix container build for `tailscale` (`containers/tailscale/default.nix`) so ringtail's tailscale-operator ProxyClass proxy pods pull from the forge mirror instead of `docker.io/tailscale/tailscale`. Pinned at v1.94.2 to match `service-versions.yaml`. Indri's tailscale-operator continues to use upstream during the k8s-to-ringtail migration. diff --git a/docs/changelog.d/prowler-iac-mutelist.infra.md b/docs/changelog.d/prowler-iac-mutelist.infra.md deleted file mode 100644 index 077cfa8..0000000 --- a/docs/changelog.d/prowler-iac-mutelist.infra.md +++ /dev/null @@ -1 +0,0 @@ -Address the 6 critical Prowler IaC findings against `argocd/manifests/`. Prowler's IaC provider hardcodes `self._mutelist = None` and delegates filtering to Trivy, but doesn't plumb `--ignorefile` through — so the documented "use Trivy filtering" path is actually broken. Added a shim around `trivy` in the Prowler image that injects `--ignorefile $TRIVY_IGNOREFILE` for `trivy fs` invocations when the env var points at a real file. The IaC cronjob now mounts `mutelist/trivyignore.yaml` (Trivy's per-path schema) and sets the env var, muting the `external-secrets` and `kube-state-metrics` Secret-access findings (KSV-0041, KSV-0114). Separately, `grafana-clusterrole` is tightened to remove `secrets` access entirely: the dashboard sidecar already only consumes ConfigMap-labeled dashboards, so its `RESOURCE` env var is now `configmap` instead of `both`. 
diff --git a/docs/changelog.d/recurring-maintenance-2026-05-27.doc.md b/docs/changelog.d/recurring-maintenance-2026-05-27.doc.md deleted file mode 100644 index af30489..0000000 --- a/docs/changelog.d/recurring-maintenance-2026-05-27.doc.md +++ /dev/null @@ -1 +0,0 @@ -Reviewed [[indri]] reference card: added `devpi`, `cv`, and `docs` to the native-services list; widened the k8s note to reflect the growing set of apps now on ringtail and the planned indri-minikube decommission; added CPU/RAM specs. diff --git a/docs/changelog.d/recurring-maintenance-2026-05-27.infra.md b/docs/changelog.d/recurring-maintenance-2026-05-27.infra.md deleted file mode 100644 index f2d48ad..0000000 --- a/docs/changelog.d/recurring-maintenance-2026-05-27.infra.md +++ /dev/null @@ -1,4 +0,0 @@ -Recurring maintenance batch: - -- Ringtail flake inputs refreshed (`disko`, `home-manager`, `nixpkgs`). -- Tooling deps bumped: prek hooks (trufflehog v3.95.3, kingfisher v1.101.0, ruff v0.15.14, `ansible-core` 2.21.0); fly proxy base images (nginx 1.30.1-alpine, alloy v1.16.1); `typer==0.26.2` in mise tasks. diff --git a/docs/changelog.d/review-ringtail-flake-2026-05-11.infra.md b/docs/changelog.d/review-ringtail-flake-2026-05-11.infra.md deleted file mode 100644 index f39f9f4..0000000 --- a/docs/changelog.d/review-ringtail-flake-2026-05-11.infra.md +++ /dev/null @@ -1 +0,0 @@ -Updated `nixos/ringtail/flake.lock` (weekly cadence): `disko`, `home-manager`, and `nixpkgs` inputs refreshed. `nixpkgs-services` skipped per overlay convention. diff --git a/docs/changelog.d/ringtail-static-ip.infra.md b/docs/changelog.d/ringtail-static-ip.infra.md deleted file mode 100644 index 8474b0a..0000000 --- a/docs/changelog.d/ringtail-static-ip.infra.md +++ /dev/null @@ -1 +0,0 @@ -Pin ringtail's wired IP to `192.168.1.21` via NixOS scripted networking; NetworkManager no longer manages `enp5s0`. Removes DHCP lease renewal as a failure mode after a silent lease teardown took ringtail offline. 
Also explicitly enables `net.ipv4.ip_forward` (previously set implicitly by scripted-DHCP) so k3s pod networking and Tailscale routing continue to work with static networking. diff --git a/docs/changelog.d/rip-out-compensating-controls.infra.md b/docs/changelog.d/rip-out-compensating-controls.infra.md deleted file mode 100644 index d41fd1a..0000000 --- a/docs/changelog.d/rip-out-compensating-controls.infra.md +++ /dev/null @@ -1 +0,0 @@ -Ripped out the compensating-controls (CC) framework: deleted `compensating-controls.yaml`, the `review-compensating-controls` mise task, and the associated how-to / explanation docs. Prowler and Kingfisher continue to run weekly and produce reports; the Prowler mutelist YAML files remain in place but no longer carry `CC: ` prefixes — each entry just keeps a free-form `Description` of why the finding is muted. The CC review cadence proved to be more overhead than this single-operator homelab needed. diff --git a/docs/changelog.d/service-review-mealie-2026-05-11.infra.md b/docs/changelog.d/service-review-mealie-2026-05-11.infra.md deleted file mode 100644 index 074cd21..0000000 --- a/docs/changelog.d/service-review-mealie-2026-05-11.infra.md +++ /dev/null @@ -1 +0,0 @@ -Reviewed `mealie` service version freshness; upstream is 5 minor versions ahead (v3.17.0 vs deployed v3.12.0). Marked reviewed; upgrade deferred. diff --git a/docs/changelog.d/shower-app-deploy.bugfix.md b/docs/changelog.d/shower-app-deploy.bugfix.md deleted file mode 100644 index 91d2b3b..0000000 --- a/docs/changelog.d/shower-app-deploy.bugfix.md +++ /dev/null @@ -1,13 +0,0 @@ -Shower app container now bakes the wheel + Python deps into the image -at build time via `buildPythonPackage` instead of pip-installing on -first boot. Boots are deterministic and don't depend on forge PyPI -being reachable from the pod. 
The `wheelHash` in -`containers/shower/default.nix` is the sha256 sourced from the -[forge PyPI simple index](https://forge.eblu.me/api/packages/eblume/pypi/simple/adelaide-baby-shower-app/); -bumping the version means bumping that hash too. - -Borgmatic now covers the shower app: SQLite is dumped from the live -pod via `kubectl exec` (mirroring the existing mealie entry, with -`context: k3s-ringtail`), and the prize-photo media share is picked up -through `/Volumes/shower` (sifaka SMB mount on indri, same pattern as -`/Volumes/photos`). diff --git a/docs/changelog.d/shower-app-deploy.feature.md b/docs/changelog.d/shower-app-deploy.feature.md deleted file mode 100644 index 96218be..0000000 --- a/docs/changelog.d/shower-app-deploy.feature.md +++ /dev/null @@ -1,4 +0,0 @@ -Deploy the Adelaide / Heidi / Addie baby shower app — guest splash, raffle -picker, and prize assignment console — on ringtail k3s with `shower.eblu.me` -as the public entry and `shower.ops.eblu.me` as the tailnet admin host. App -source: [`adelaide-baby-shower-app`](https://forge.eblu.me/eblume/adelaide-baby-shower-app). diff --git a/docs/changelog.d/shower-app-deploy.infra.md b/docs/changelog.d/shower-app-deploy.infra.md deleted file mode 100644 index 157a068..0000000 --- a/docs/changelog.d/shower-app-deploy.infra.md +++ /dev/null @@ -1,9 +0,0 @@ -Wire shower app for public exposure: fly nginx `shower.eblu.me` server -block as a guest-only surface — splash page, `/prizes//`, static -assets, media. Everything authenticated (`/admin/`, `/host/`, -`/accounts/`) returns 403 with a "tailnet only" pointer. Staff hit -`shower.ops.eblu.me` for the operator console + admin; the app's -v1.0.1 `DJANGO_PUBLIC_URL_BASE` setting makes QR codes generated on -the tailnet point back at the WAN host for guests. Plus a Caddy route -on indri, Pulumi Gandi CNAME, and a Grafana APM dashboard tracking -request rate, error rate, latency, bandwidth, and access logs. 
diff --git a/docs/changelog.d/shower-v1.1.0.feature.md b/docs/changelog.d/shower-v1.1.0.feature.md deleted file mode 100644 index d2c3400..0000000 --- a/docs/changelog.d/shower-v1.1.0.feature.md +++ /dev/null @@ -1,15 +0,0 @@ -Deploy adelaide-baby-shower-app v1.1.0 to ringtail k3s. Replaces the -boolean lock with a four-phase `ShowerState` (`pre_event` → `party` → -`prizes_locked` → `event_locked`), adds an append-only "guest memories" -panel where guests can leave photos and comments for the baby, and -polishes the admin and QR views. Three Django migrations -(`0009_shower_phase`, `0010_guest_memories`, `0011_book_description`) -run automatically in the entrypoint against the SQLite PV. No config -or env-var changes. - -Container build also gains a Forgejo-PyPI workaround: Forgejo's simple -index returns absolute file URLs hardcoded to the public ROOT_URL -(`forge.eblu.me`), which the Fly edge 403s on `/api/packages/*`. The -wheel and sdist are now both pulled via direct `fetchurl` against -`forge.ops.eblu.me` (tailnet-only) and the wheel is handed to pip as -a local path. diff --git a/docs/changelog.d/shower-v1.1.2.infra.md b/docs/changelog.d/shower-v1.1.2.infra.md deleted file mode 100644 index aa2db0d..0000000 --- a/docs/changelog.d/shower-v1.1.2.infra.md +++ /dev/null @@ -1 +0,0 @@ -Deploy shower v1.1.2 — bump container build to new app release. diff --git a/docs/changelog.d/unpoller-v3.infra.md b/docs/changelog.d/unpoller-v3.infra.md deleted file mode 100644 index fa6eaf9..0000000 --- a/docs/changelog.d/unpoller-v3.infra.md +++ /dev/null @@ -1 +0,0 @@ -Upgrade unpoller v2.34.0 → v3.2.0 and migrate container build from Dockerfile to native Dagger (container.py). v3.0.0 carries breaking UniFi API changes; v3.2.0 introduces a 60s background poll (cached scrapes) by default — set `interval = 0` in `up.conf` to restore on-demand polling. 
diff --git a/docs/changelog.d/update-tooling-deps-2026-04.doc.md b/docs/changelog.d/update-tooling-deps-2026-04.doc.md deleted file mode 100644 index 141e975..0000000 --- a/docs/changelog.d/update-tooling-deps-2026-04.doc.md +++ /dev/null @@ -1 +0,0 @@ -New how-to: rotate-fly-deploy-token. Documents the 75-day rotation cadence, why we use `org`-scoped tokens (silences the cosmetic metrics-token warning on `fly status` with marginal blast-radius cost given the single-app personal org), and the procedure for rotation + Forgejo Actions secret sync. diff --git a/docs/changelog.d/update-tooling-deps-2026-04.infra.md b/docs/changelog.d/update-tooling-deps-2026-04.infra.md deleted file mode 100644 index 4731eca..0000000 --- a/docs/changelog.d/update-tooling-deps-2026-04.infra.md +++ /dev/null @@ -1 +0,0 @@ -Monthly tooling dependency refresh: prek hooks (trufflehog, kingfisher, ruff, shfmt, prettier, actionlint, ansible-lint), fly proxy base images (nginx 1.30.0, tailscale v1.94.2, alloy v1.16.0), normalize pyyaml lower bound in mise-tasks. diff --git a/docs/changelog.d/valkey-mirror.infra.md b/docs/changelog.d/valkey-mirror.infra.md deleted file mode 100644 index 06f8d98..0000000 --- a/docs/changelog.d/valkey-mirror.infra.md +++ /dev/null @@ -1 +0,0 @@ -Mirror Valkey 8.1 locally as `registry.ops.eblu.me/blumeops/valkey`. Replaces direct pulls of `docker.io/valkey/valkey:8.1-alpine` for paperless and immich sidecars. Built via native Dagger pipeline on Alpine 3.22. Stateless swap — no data migration. Authentik's nix-built Redis remains separate. diff --git a/docs/changelog.d/valkey-nix.infra.md b/docs/changelog.d/valkey-nix.infra.md deleted file mode 100644 index e41eb63..0000000 --- a/docs/changelog.d/valkey-nix.infra.md +++ /dev/null @@ -1 +0,0 @@ -Add nix-built amd64 valkey for ringtail (`containers/valkey/default.nix`) so immich-ringtail can stop pulling the upstream multi-arch `docker.io/valkey/valkey` image. 
Existing `container.py` continues to build Alpine arm64 for paperless on indri. Both bump to valkey 8.1.7 (Alpine 3.22 8.1.7-r0 / nixpkgs 8.1.7). From 02ea1cc72af43928e9105479ded91da9b51ca18a Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 4 Jun 2026 12:39:50 -0700 Subject: [PATCH 112/122] C0: point tailscale-operator base mirror fetch at tailnet forge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The public forge.eblu.me now black-holes /mirrors/ at the Fly edge (AI-scraper mitigation), so the in-cluster ArgoCD repo-server got a 403 fetching the upstream operator manifest — leaving tailscale-operator and tailscale-operator-ringtail in Unknown sync. Use forge.ops.eblu.me. Co-Authored-By: Claude Opus 4.8 (1M context) --- argocd/manifests/tailscale-operator-base/kustomization.yaml | 5 ++++- .../+tailscale-operator-mirror-tailnet-url.bugfix.md | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) create mode 100644 docs/changelog.d/+tailscale-operator-mirror-tailnet-url.bugfix.md diff --git a/argocd/manifests/tailscale-operator-base/kustomization.yaml b/argocd/manifests/tailscale-operator-base/kustomization.yaml index 4519af6..9d117ef 100644 --- a/argocd/manifests/tailscale-operator-base/kustomization.yaml +++ b/argocd/manifests/tailscale-operator-base/kustomization.yaml @@ -6,8 +6,11 @@ namespace: tailscale # Upstream Tailscale operator manifest from forge mirror. # To upgrade: update the ref in the URL AND the newTag below. +# Must use the tailnet host forge.ops.eblu.me — the public forge.eblu.me +# black-holes /mirrors/ at the Fly edge (AI-scraper mitigation), which the +# in-cluster ArgoCD repo-server would otherwise hit and fail with a 403. 
resources: - - https://forge.eblu.me/mirrors/tailscale/raw/tag/v1.94.2/cmd/k8s-operator/deploy/manifests/operator.yaml + - https://forge.ops.eblu.me/mirrors/tailscale/raw/tag/v1.94.2/cmd/k8s-operator/deploy/manifests/operator.yaml - proxyclass.yaml - dnsconfig.yaml diff --git a/docs/changelog.d/+tailscale-operator-mirror-tailnet-url.bugfix.md b/docs/changelog.d/+tailscale-operator-mirror-tailnet-url.bugfix.md new file mode 100644 index 0000000..cc29cf7 --- /dev/null +++ b/docs/changelog.d/+tailscale-operator-mirror-tailnet-url.bugfix.md @@ -0,0 +1 @@ +Fixed the `tailscale-operator` and `tailscale-operator-ringtail` ArgoCD apps showing `Unknown` sync status. Their shared base kustomization fetched the upstream operator manifest from the public `forge.eblu.me/mirrors/...`, which the AI-scraper mitigation now black-holes (403). Pointed the remote resource at the tailnet host `forge.ops.eblu.me` instead, which the in-cluster repo-server can reach. From bb55fa95667903e1b38c084a46690e7da61eef0d Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 4 Jun 2026 13:37:02 -0700 Subject: [PATCH 113/122] Recurring review sweep: 4 doc cards + nvidia-device-plugin v0.19.2 (#366) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Knocks out the two daily recurring review tasks (doc review + service review) in one PR. ## Doc review (4 never-reviewed reference cards, `last-reviewed: 2026-06-04`) - **cluster.md** — Kubernetes version v1.34.0 → **v1.35.0**; refreshed the stale ringtail workload list and noted the in-progress minikube→k3s migration (points to `[[ringtail]]` as the canonical list). - **ntfy.md / tempo.md / alloy.md** — corrected image references: these are now **locally-built `registry.ops.eblu.me/blumeops/*` nix containers** (ntfy v2.19.2, tempo v2.10.3, alloy-k8s v1.16.0), not upstream Docker Hub. Fly.io alloy binary bumped to v1.16.1. ## Service review - **nvidia-device-plugin** (ringtail GPU): v0.19.0 → **v0.19.2**. 
Upstream patch releases — CDI/Tegra fixes + dependency bumps, no breaking changes for our manifest-based CDI + RuntimeClass setup (the service-account change in the notes is helm-only). ## Not in this PR (need container rebuilds, deferred) The other stale services are locally-built nix images, so upgrading them is a forge-runner rebuild rather than a clean tag bump — left untouched (not date-bumped, so they resurface): **prometheus** (v3.10.0→v3.12.0), **loki** (3.6.7→3.7.2), **kube-state-metrics**, **homepage**. Happy to do these as a follow-up rebuild PR. ## Deploy / verify Not yet deployed — `nvidia-device-plugin` still points at `main`. After review: ``` argocd app set nvidia-device-plugin --revision reviews-jun4 && argocd app sync nvidia-device-plugin # after merge: argocd app set nvidia-device-plugin --revision main && argocd app sync nvidia-device-plugin ``` 🤖 Generated with [Claude Code](https://claude.com/claude-code) Reviewed-on: https://forge.eblu.me/eblume/blumeops/pulls/366 --- argocd/manifests/nvidia-device-plugin/kustomization.yaml | 2 +- docs/changelog.d/reviews-jun4.doc.md | 1 + docs/changelog.d/reviews-jun4.infra.md | 1 + docs/reference/kubernetes/cluster.md | 9 ++++++--- docs/reference/services/alloy.md | 7 ++++--- docs/reference/services/ntfy.md | 5 +++-- docs/reference/services/tempo.md | 5 +++-- service-versions.yaml | 4 ++-- 8 files changed, 21 insertions(+), 13 deletions(-) create mode 100644 docs/changelog.d/reviews-jun4.doc.md create mode 100644 docs/changelog.d/reviews-jun4.infra.md diff --git a/argocd/manifests/nvidia-device-plugin/kustomization.yaml b/argocd/manifests/nvidia-device-plugin/kustomization.yaml index a46edf6..f5a33ae 100644 --- a/argocd/manifests/nvidia-device-plugin/kustomization.yaml +++ b/argocd/manifests/nvidia-device-plugin/kustomization.yaml @@ -10,4 +10,4 @@ resources: images: - name: nvcr.io/nvidia/k8s-device-plugin - newTag: v0.19.0 + newTag: v0.19.2 diff --git a/docs/changelog.d/reviews-jun4.doc.md 
b/docs/changelog.d/reviews-jun4.doc.md new file mode 100644 index 0000000..f1aeaa8 --- /dev/null +++ b/docs/changelog.d/reviews-jun4.doc.md @@ -0,0 +1 @@ +Reviewed four never-reviewed reference cards (`cluster`, `ntfy`, `tempo`, `alloy`) and corrected drift: minikube is now Kubernetes v1.35.0; ntfy, tempo, and alloy-k8s images are now locally-built `registry.ops.eblu.me/blumeops/*` nix containers (v2.19.2, v2.10.3, v1.16.0) rather than upstream Docker Hub; the Fly.io alloy binary is v1.16.1; and the ringtail workload list reflects the in-progress minikube→k3s migration. diff --git a/docs/changelog.d/reviews-jun4.infra.md b/docs/changelog.d/reviews-jun4.infra.md new file mode 100644 index 0000000..c128e70 --- /dev/null +++ b/docs/changelog.d/reviews-jun4.infra.md @@ -0,0 +1 @@ +Upgraded the nvidia-device-plugin on ringtail from v0.19.0 to v0.19.2 (upstream patch release: CDI/Tegra fixes and dependency bumps, no breaking changes for our manifest-based CDI + RuntimeClass setup). diff --git a/docs/reference/kubernetes/cluster.md b/docs/reference/kubernetes/cluster.md index 9b632bd..07c14af 100644 --- a/docs/reference/kubernetes/cluster.md +++ b/docs/reference/kubernetes/cluster.md @@ -1,6 +1,7 @@ --- title: Cluster -modified: 2026-02-19 +modified: 2026-06-04 +last-reviewed: 2026-06-04 tags: - kubernetes --- @@ -15,7 +16,7 @@ BlumeOps runs two Kubernetes clusters: a Minikube cluster on [[indri]] (most ser |----------|-------| | **Driver** | docker | | **Container Runtime** | docker | -| **Kubernetes Version** | v1.34.0 | +| **Kubernetes Version** | v1.35.0 | | **CPUs** | 6 | | **Memory** | 11GB | | **Disk** | 200GB | @@ -41,7 +42,9 @@ Single-node k3s cluster for workloads requiring amd64 or GPU access. 
See [[ringt |----------|-------| | **Context** | `k3s-ringtail` | | **API Server** | `https://ringtail.tail8d86e.ts.net:6443` | -| **Workloads** | Frigate (GPU), ntfy, frigate-notify, nvidia-device-plugin | +| **Workloads** | GPU workloads (Frigate, Ollama), notifications (ntfy, frigate-notify), [[authentik]], and services migrated off indri minikube (Immich, Mealie, Paperless, TeslaMate). See [[ringtail]] for the authoritative list. | + +Services are being progressively migrated from indri's minikube to ringtail's k3s; the split above reflects an in-progress state, not a fixed boundary. ## Related diff --git a/docs/reference/services/alloy.md b/docs/reference/services/alloy.md index d781f2f..97d1e77 100644 --- a/docs/reference/services/alloy.md +++ b/docs/reference/services/alloy.md @@ -1,6 +1,7 @@ --- title: Alloy -modified: 2026-03-13 +modified: 2026-06-04 +last-reviewed: 2026-06-04 tags: - service - observability @@ -20,10 +21,10 @@ Unified observability collector for metrics and logs with three deployments: | **Indri Binary** | `~/.local/bin/alloy` | | **Indri Config** | `~/.config/grafana-alloy/config.alloy` | | **K8s Namespace** | `alloy` | -| **K8s Image** | `grafana/alloy:v1.14.0` | +| **K8s Image** | `registry.ops.eblu.me/blumeops/alloy:v1.16.0-9564435` (locally built) | | **ArgoCD App** | `alloy-k8s` | | **Fly.io Config** | `fly/alloy.river` | -| **Fly.io Image** | `grafana/alloy:v1.5.1` (binary copied into nginx container) | +| **Fly.io Image** | `grafana/alloy:v1.16.1` (binary copied into nginx container, sha-pinned) | ## Metrics Collected diff --git a/docs/reference/services/ntfy.md b/docs/reference/services/ntfy.md index b549a6d..1bf45af 100644 --- a/docs/reference/services/ntfy.md +++ b/docs/reference/services/ntfy.md @@ -1,6 +1,7 @@ --- title: Ntfy -modified: 2026-02-17 +modified: 2026-06-04 +last-reviewed: 2026-06-04 tags: - service - notifications @@ -17,7 +18,7 @@ Self-hosted push notification service. 
Ntfy receives HTTP POST messages and deli | **URL** | https://ntfy.ops.eblu.me | | **Tailscale URL** | https://ntfy.tail8d86e.ts.net | | **Namespace** | `ntfy` | -| **Image** | `binwiederhier/ntfy:v2.17.0` | +| **Image** | `registry.ops.eblu.me/blumeops/ntfy:v2.19.2-fd0bebb-nix` (locally built) | | **Upstream** | https://github.com/binwiederhier/ntfy | | **Manifests** | `argocd/manifests/ntfy/` | diff --git a/docs/reference/services/tempo.md b/docs/reference/services/tempo.md index 771b97f..5eb5d87 100644 --- a/docs/reference/services/tempo.md +++ b/docs/reference/services/tempo.md @@ -1,6 +1,7 @@ --- title: Tempo -modified: 2026-03-05 +modified: 2026-06-04 +last-reviewed: 2026-06-04 tags: - service - observability @@ -18,7 +19,7 @@ Distributed tracing backend for BlumeOps infrastructure. Receives traces via OTL | **Tailscale URL** | https://tempo.tail8d86e.ts.net | | **OTLP Endpoint** | https://tempo-otlp.tail8d86e.ts.net | | **Namespace** | `monitoring` | -| **Image** | `grafana/tempo:2.10.1` | +| **Image** | `registry.ops.eblu.me/blumeops/tempo:v2.10.3-75f9ba4` (locally built) | | **Storage** | 10Gi PVC (local filesystem) | | **Retention** | 7 days | diff --git a/service-versions.yaml b/service-versions.yaml index 699f89c..11ec9f9 100644 --- a/service-versions.yaml +++ b/service-versions.yaml @@ -56,8 +56,8 @@ services: - name: nvidia-device-plugin type: argocd - last-reviewed: 2026-03-27 - current-version: "v0.19.0" + last-reviewed: 2026-06-04 + current-version: "v0.19.2" upstream-source: https://github.com/NVIDIA/k8s-device-plugin/releases notes: DaemonSet + RuntimeClass on ringtail for GPU workloads From 0e70a1b5242183170a5d7d8ac96ee864063f65bb Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 4 Jun 2026 14:55:55 -0700 Subject: [PATCH 114/122] Localize external-secrets container (native container.py build) (#367) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Knocks out the weekly "pick one non-local container 
and make it local" task by moving **external-secrets** off `ghcr.io` onto a locally-built image, under our own supply-chain control. Doubles as its overdue service review. ## What changed - **`containers/external-secrets/container.py`** (new) — native Dagger build (the Dockerfile→container.py migration pattern). Clones the forge mirror at `v2.2.0` and builds the single `all_providers` static Go binary, faithful to upstream's `make build` (CGO off, no version ldflags upstream). ENTRYPOINT is `/bin/external-secrets` so the controller/webhook/cert-controller Deployments select their role via `args:` exactly as before. - **`argocd/manifests/external-secrets/kustomization.yaml`** — image swapped to `registry.ops.eblu.me/blumeops/external-secrets:v2.2.0-2985007`. **Like-for-like (v2.2.0)**, not an upgrade. - **`service-versions.yaml`** — marked reviewed (2026-06-04), noted the local build. ## Build Built on the indri forge runner (run #579, ~4 min) → pushed to Zot. Image config verified: `Entrypoint=/bin/external-secrets`, `User=65534`, version label `v2.2.0`. ## Deployed from branch & verified - All 3 pods (controller / webhook / cert-controller) rolled to the local image, `1/1 Running` - Controller + webhook logs clean (no errors; webhook serving TLS) - **End-to-end secret fetch proven:** force-synced `monitoring/grafana-admin` → `refreshTime` advanced to now, `Ready=True` - All 10 ExternalSecrets cluster-wide remain `SecretSynced=True` — no collateral damage - App `Healthy` ## Post-merge `external-secrets` currently points at this branch (so `apps` reads OutOfSync — expected). 
After merge: ``` argocd app set external-secrets --revision main && argocd app sync external-secrets ``` 🤖 Generated with [Claude Code](https://claude.com/claude-code) Reviewed-on: https://forge.eblu.me/eblume/blumeops/pulls/367 --- .../external-secrets/kustomization.yaml | 3 +- containers/external-secrets/container.py | 51 +++++++++++++++++++ .../local-external-secrets.infra.md | 1 + service-versions.yaml | 7 ++- 4 files changed, 59 insertions(+), 3 deletions(-) create mode 100644 containers/external-secrets/container.py create mode 100644 docs/changelog.d/local-external-secrets.infra.md diff --git a/argocd/manifests/external-secrets/kustomization.yaml b/argocd/manifests/external-secrets/kustomization.yaml index 574aaa7..c25a7d5 100644 --- a/argocd/manifests/external-secrets/kustomization.yaml +++ b/argocd/manifests/external-secrets/kustomization.yaml @@ -12,4 +12,5 @@ resources: images: - name: ghcr.io/external-secrets/external-secrets - newTag: v2.2.0 + newName: registry.ops.eblu.me/blumeops/external-secrets + newTag: v2.2.0-2985007 diff --git a/containers/external-secrets/container.py b/containers/external-secrets/container.py new file mode 100644 index 0000000..6be5765 --- /dev/null +++ b/containers/external-secrets/container.py @@ -0,0 +1,51 @@ +"""External Secrets Operator — native Dagger build. + +Two-stage build: Go binary (all providers), Alpine runtime. +Source cloned from forge mirror. + +A single binary serves as the controller, webhook, and cert-controller; the +Deployments select the role via a subcommand passed in `args:`, so the image +ENTRYPOINT must be the binary itself (matching upstream's distroless image). 
+""" + +import dagger + +from blumeops.containers import ( + alpine_runtime, + clone_from_forge, + go_build, + oci_labels, +) + +VERSION = "v2.2.0" + + +async def build(src: dagger.Directory) -> dagger.Container: + source = clone_from_forge("external-secrets", VERSION) + + # Upstream `make build` compiles every secret provider into a single + # static binary (`-tags all_providers`, CGO disabled). Mirror that so the + # local image is functionally identical to ghcr.io/.../external-secrets. + backend = go_build( + source, + "/external-secrets", + tags="all_providers", + ) + + runtime = alpine_runtime( + extra_apk=["ca-certificates"], + create_user=False, + ) + runtime = oci_labels( + runtime, + title="External Secrets Operator", + description=( + "Kubernetes operator that integrates external secret management systems" + ), + version=VERSION, + ) + return ( + runtime.with_file("/bin/external-secrets", backend.file("/external-secrets")) + .with_user("65534") + .with_entrypoint(["/bin/external-secrets"]) + ) diff --git a/docs/changelog.d/local-external-secrets.infra.md b/docs/changelog.d/local-external-secrets.infra.md new file mode 100644 index 0000000..13cbb05 --- /dev/null +++ b/docs/changelog.d/local-external-secrets.infra.md @@ -0,0 +1 @@ +Localized the external-secrets controller image. It now builds from the forge mirror via a native Dagger `container.py` (single `all_providers` static Go binary, faithful to upstream's `make build`) and is served from `registry.ops.eblu.me/blumeops/external-secrets` instead of `ghcr.io`, bringing another platform component under local supply-chain control. 
diff --git a/service-versions.yaml b/service-versions.yaml index 11ec9f9..cc9dc9e 100644 --- a/service-versions.yaml +++ b/service-versions.yaml @@ -159,10 +159,13 @@ services: - name: external-secrets type: argocd - last-reviewed: 2026-03-25 + last-reviewed: 2026-06-04 current-version: "v2.2.0" upstream-source: https://github.com/external-secrets/external-secrets/releases - notes: Static kustomize manifests rendered from upstream Helm chart + notes: >- + Static kustomize manifests rendered from upstream Helm chart. Controller + image is locally built from the forge mirror via containers/external-secrets/container.py + (single all_providers static Go binary). - name: 1password-connect type: argocd From 30c82079b9dbb8e2492586d979cd4ec5b04cd08d Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 4 Jun 2026 14:59:17 -0700 Subject: [PATCH 115/122] C0: rebuild external-secrets image off main (v2.2.0-0e70a1b) Repoint to the main-branch-built image so the deployed tag traces to a main commit rather than the merged feature branch. Same v2.2.0 source, stable provenance. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- argocd/manifests/external-secrets/kustomization.yaml | 2 +- docs/changelog.d/+external-secrets-main-sha-rebuild.infra.md | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 docs/changelog.d/+external-secrets-main-sha-rebuild.infra.md diff --git a/argocd/manifests/external-secrets/kustomization.yaml b/argocd/manifests/external-secrets/kustomization.yaml index c25a7d5..8b1aea5 100644 --- a/argocd/manifests/external-secrets/kustomization.yaml +++ b/argocd/manifests/external-secrets/kustomization.yaml @@ -13,4 +13,4 @@ resources: images: - name: ghcr.io/external-secrets/external-secrets newName: registry.ops.eblu.me/blumeops/external-secrets - newTag: v2.2.0-2985007 + newTag: v2.2.0-0e70a1b diff --git a/docs/changelog.d/+external-secrets-main-sha-rebuild.infra.md b/docs/changelog.d/+external-secrets-main-sha-rebuild.infra.md new file mode 100644 index 0000000..2e931d4 --- /dev/null +++ b/docs/changelog.d/+external-secrets-main-sha-rebuild.infra.md @@ -0,0 +1 @@ +Rebuilt the locally-built external-secrets image from the `main` branch so the deployed tag (`v2.2.0-0e70a1b`) traces to a `main` commit rather than the now-merged feature branch, giving a stable provenance reference. From 13895bb04a5afcbb723d7ab3355d228431d76a5d Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 4 Jun 2026 15:37:42 -0700 Subject: [PATCH 116/122] Localize external-secrets on ringtail (amd64 nix build) (#368) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow-up to #367. That PR localized external-secrets but the Dagger build (on indri's Apple Silicon runner) only produces an **arm64** image — and external-secrets also runs on **ringtail (amd64)** via the same shared manifest. This completes the localization so both clusters run the local binary on their native arch. 
## Approach (matches the kube-state-metrics dual-build pattern) - **`containers/external-secrets/default.nix`** (new) — builds the **amd64** image on ringtail's nix-container-builder. `buildGoModule` with Go 1.26 (v2.2.0 requires ≥1.26.1; nixpkgs default is 1.25.x) and `-tags all_providers`, faithful to upstream. Same v2.2.0 source from the forge mirror. - **`argocd/manifests/external-secrets-ringtail/`** (new) — thin kustomize overlay that reuses the shared indri manifest as a base and overrides **only** the image to the `-nix` (amd64) tag. No manifest duplication. - **`argocd/apps/external-secrets-ringtail.yaml`** — repointed at the new overlay. Result: indri → `v2.2.0-…` (arm64, Dagger), ringtail → `v2.2.0-…-nix` (amd64, nix). ## Build Run #581 built both arches at the branch commit. Verified the nix image is `linux/amd64`, entrypoint = the binary, user 65534. ## Deployed from branch & verified on ringtail (k3s, amd64) - All 3 pods rolled to the nix amd64 image, `1/1 Running` (no exec-format error → arch correct) - Controller logs clean - **Live secret fetch proven:** force-synced `homepage/homepage-grafana` → `refreshTime` advanced, `Ready=True` - **All 20** ringtail ExternalSecrets remain `SecretSynced=True` ## Post-merge The `external-secrets-ringtail` app is temporarily pointed at this branch + overlay path (apps app left on `main`, manual-sync, untouched). After merge: ``` argocd app sync apps # picks up the new Application path on main argocd app set external-secrets-ringtail --revision main && argocd app sync external-secrets-ringtail ``` I'll also rebuild off `main` so both clusters land on stable main-sha tags (as done for indri in #367). 
🤖 Generated with [Claude Code](https://claude.com/claude-code) Reviewed-on: https://forge.eblu.me/eblume/blumeops/pulls/368 --- argocd/apps/external-secrets-ringtail.yaml | 2 +- .../kustomization.yaml | 16 ++++++ containers/external-secrets/default.nix | 56 +++++++++++++++++++ .../external-secrets-ringtail-nix.infra.md | 1 + 4 files changed, 74 insertions(+), 1 deletion(-) create mode 100644 argocd/manifests/external-secrets-ringtail/kustomization.yaml create mode 100644 containers/external-secrets/default.nix create mode 100644 docs/changelog.d/external-secrets-ringtail-nix.infra.md diff --git a/argocd/apps/external-secrets-ringtail.yaml b/argocd/apps/external-secrets-ringtail.yaml index e2f5898..0bb8bd7 100644 --- a/argocd/apps/external-secrets-ringtail.yaml +++ b/argocd/apps/external-secrets-ringtail.yaml @@ -15,7 +15,7 @@ spec: source: repoURL: ssh://forgejo@forge.ops.eblu.me:2222/eblume/blumeops.git targetRevision: main - path: argocd/manifests/external-secrets + path: argocd/manifests/external-secrets-ringtail destination: server: https://ringtail.tail8d86e.ts.net:6443 namespace: external-secrets diff --git a/argocd/manifests/external-secrets-ringtail/kustomization.yaml b/argocd/manifests/external-secrets-ringtail/kustomization.yaml new file mode 100644 index 0000000..05b6b54 --- /dev/null +++ b/argocd/manifests/external-secrets-ringtail/kustomization.yaml @@ -0,0 +1,16 @@ +# Ringtail (amd64) overlay for external-secrets. +# +# Reuses the shared indri manifest as a base and only overrides the controller +# image to the nix-built amd64 variant (`-nix` tag). The base sets the arm64 +# image (built via containers/external-secrets/container.py on indri's Dagger +# runner); ringtail's k3s is amd64 and needs the image built by +# containers/external-secrets/default.nix on the nix-container-builder. 
+apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../external-secrets + +images: + - name: registry.ops.eblu.me/blumeops/external-secrets + newTag: v2.2.0-59dace8-nix diff --git a/containers/external-secrets/default.nix b/containers/external-secrets/default.nix new file mode 100644 index 0000000..eabe03d --- /dev/null +++ b/containers/external-secrets/default.nix @@ -0,0 +1,56 @@ +# Nix-built External Secrets Operator (amd64, for ringtail k3s). +# Builds v2.2.0 from the forge mirror with all secret providers compiled in, +# faithful to upstream's `make build` (-tags all_providers). The container.py +# sibling builds the arm64 image for indri's minikube; this default.nix builds +# the amd64 image on ringtail's nix-container-builder. +{ pkgs ? import <nixpkgs> { } }: + +let + version = "2.2.0"; + + src = pkgs.fetchgit { + url = "https://forge.ops.eblu.me/mirrors/external-secrets.git"; + rev = "v${version}"; + hash = "sha256-eAocOAp5s4CFRrpKfQr2lf3Ji+6nQQ1A5/eTw5B7v9U="; + }; + + # external-secrets v2.2.0 requires Go >= 1.26.1; nixpkgs default go is 1.25.x. + external-secrets = (pkgs.buildGoModule.override { go = pkgs.go_1_26; }) { + inherit src version; + pname = "external-secrets"; + vendorHash = "sha256-0xuBK3fjAplPLAElHvKB6d+2lDz+De/s91fV4dPZwjE="; + + doCheck = false; + + subPackages = [ "."
]; + + tags = [ "all_providers" ]; + + ldflags = [ "-s" "-w" ]; + + meta = with pkgs.lib; { + description = "Kubernetes operator that integrates external secret management systems"; + homepage = "https://github.com/external-secrets/external-secrets"; + license = licenses.asl20; + mainProgram = "external-secrets"; + }; + }; +in + +pkgs.dockerTools.buildLayeredImage { + name = "blumeops/external-secrets"; + contents = [ + external-secrets + pkgs.cacert + pkgs.tzdata + ]; + + config = { + Entrypoint = [ "${external-secrets}/bin/external-secrets" ]; + Env = [ + "SSL_CERT_FILE=${pkgs.cacert}/etc/ssl/certs/ca-bundle.crt" + "TZDIR=${pkgs.tzdata}/share/zoneinfo" + ]; + User = "65534"; + }; +} diff --git a/docs/changelog.d/external-secrets-ringtail-nix.infra.md b/docs/changelog.d/external-secrets-ringtail-nix.infra.md new file mode 100644 index 0000000..9ce3f85 --- /dev/null +++ b/docs/changelog.d/external-secrets-ringtail-nix.infra.md @@ -0,0 +1 @@ +Completed the external-secrets localization for the ringtail (amd64) cluster. The indri Dagger build (`container.py`) only produces an arm64 image; added `containers/external-secrets/default.nix` to build the amd64 variant on ringtail's nix-container-builder, and gave `external-secrets-ringtail` a thin kustomize overlay that reuses the shared manifest and points at the `-nix` image. Both clusters now run the locally-built external-secrets binary on their native architecture. From f6c926f1f594a0ee019bca5d31cdcc4225f6d6cf Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 4 Jun 2026 16:19:20 -0700 Subject: [PATCH 117/122] C0: rebuild external-secrets off main, repoint both clusters to stable tags indri -> v2.2.0-13895bb (arm64), ringtail -> v2.2.0-13895bb-nix (amd64). Both deployed images now trace to main commit 13895bb instead of earlier branch builds. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- argocd/manifests/external-secrets-ringtail/kustomization.yaml | 2 +- argocd/manifests/external-secrets/kustomization.yaml | 2 +- docs/changelog.d/+external-secrets-stable-main-sha.infra.md | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) create mode 100644 docs/changelog.d/+external-secrets-stable-main-sha.infra.md diff --git a/argocd/manifests/external-secrets-ringtail/kustomization.yaml b/argocd/manifests/external-secrets-ringtail/kustomization.yaml index 05b6b54..9fd4e2f 100644 --- a/argocd/manifests/external-secrets-ringtail/kustomization.yaml +++ b/argocd/manifests/external-secrets-ringtail/kustomization.yaml @@ -13,4 +13,4 @@ resources: images: - name: registry.ops.eblu.me/blumeops/external-secrets - newTag: v2.2.0-59dace8-nix + newTag: v2.2.0-13895bb-nix diff --git a/argocd/manifests/external-secrets/kustomization.yaml b/argocd/manifests/external-secrets/kustomization.yaml index 8b1aea5..639db66 100644 --- a/argocd/manifests/external-secrets/kustomization.yaml +++ b/argocd/manifests/external-secrets/kustomization.yaml @@ -13,4 +13,4 @@ resources: images: - name: ghcr.io/external-secrets/external-secrets newName: registry.ops.eblu.me/blumeops/external-secrets - newTag: v2.2.0-0e70a1b + newTag: v2.2.0-13895bb diff --git a/docs/changelog.d/+external-secrets-stable-main-sha.infra.md b/docs/changelog.d/+external-secrets-stable-main-sha.infra.md new file mode 100644 index 0000000..fbe3c21 --- /dev/null +++ b/docs/changelog.d/+external-secrets-stable-main-sha.infra.md @@ -0,0 +1 @@ +Rebuilt the external-secrets images off `main` and repointed both clusters to the stable main-sha tags (`v2.2.0-13895bb` arm64 / `v2.2.0-13895bb-nix` amd64), so the deployed images on indri and ringtail trace to the same `main` commit rather than earlier feature-branch builds. 
From a2f1e062243a47c7c68b5a57617f14102b798503 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Fri, 5 Jun 2026 06:46:58 -0700 Subject: [PATCH 118/122] Add hephaestus sync hub to indri (launchagent, PWA, device-code OIDC) (#369) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Makes indri the canonical **heph** hub for the hub-and-spoke task/context system, deployed as a self-updating LaunchAgent managed by Ansible. Other devices (gilbert) attach as offline-capable spokes. ## What's here - **`ansible/roles/heph`** (tag `heph`) — bootstrap `cargo install hephd` (only if absent; `--self-update` keeps it current after), version-pinned `heph-pwa` checkout served via `--web-root`, launchagent `mcquack.eblume.heph`: ``` hephd --mode server --http-addr 0.0.0.0:8787 --db … --web-root … --oidc-issuer …/o/heph/ --oidc-audience heph --self-update --self-update-interval-secs 600 ``` `~/.cargo/bin` is on the agent `PATH` so self-update's `cargo install` works. - **Caddy** — `heph.ops.eblu.me → localhost:8787` (TLS for the PWA secure context). - **Authentik** — new `heph` **public device-code** OIDC app + `default-device-code-flow` bound to the default brand's `flow_device_code` (verified live: brand `authentik-default`, field currently unset → additive). - **Docs** — `services/hephaestus.md` (Path-A seeding runbook + spoke caveat), `indri.md`, changelog fragment. ## Three features requested - **Autoupdate** — 10-min interval (`--self-update-interval-secs 600`). - **PWA** — `--web-root` (confirmed shipped in v1.2.0). - **Spoke** — gilbert reconfig documented (post-merge step). ## Deploy plan (not done yet — awaiting review) 1. Seed from gilbert (Path A): `heph daemon stop` → copy `heph.db` → `DELETE FROM meta WHERE key='origin'`. 2. Sync Authentik `apps`/blueprint; verify blueprint status via API (not just logs). 3. `provision-indri --tags heph,caddy` from this branch. 4. Point gilbert at the hub + `heph auth login`. 
## Known follow-ups (heph-side, tracked in the Hephaestus project) - `heph daemon` can't bake hub/spoke config or pass `--self-update-interval-secs` → worked around by the ansible plist. - Path-A seeding lacks a clean `hephd --owner-id`/seed command → manual `meta.origin` reset for now. - Self-update moves hephd ahead of the ansible-pinned PWA shell over time (drift; tolerated by the SW cache, revisit on next release). 🤖 Generated with [Claude Code](https://claude.com/claude-code) Reviewed-on: https://forge.eblu.me/eblume/blumeops/pulls/369 --- ansible/playbooks/indri.yml | 2 + ansible/roles/caddy/defaults/main.yml | 3 + ansible/roles/heph/defaults/main.yml | 49 +++++++ ansible/roles/heph/handlers/main.yml | 6 + ansible/roles/heph/tasks/main.yml | 82 +++++++++++ ansible/roles/heph/templates/heph.plist.j2 | 50 +++++++ .../authentik/configmap-blueprint.yaml | 79 +++++++++++ docs/changelog.d/heph-indri-hub.infra.md | 1 + docs/reference/infrastructure/indri.md | 1 + docs/reference/services/hephaestus.md | 130 ++++++++++++++++++ 10 files changed, 403 insertions(+) create mode 100644 ansible/roles/heph/defaults/main.yml create mode 100644 ansible/roles/heph/handlers/main.yml create mode 100644 ansible/roles/heph/tasks/main.yml create mode 100644 ansible/roles/heph/templates/heph.plist.j2 create mode 100644 docs/changelog.d/heph-indri-hub.infra.md create mode 100644 docs/reference/services/hephaestus.md diff --git a/ansible/playbooks/indri.yml b/ansible/playbooks/indri.yml index ddb57f8..1e33bb1 100644 --- a/ansible/playbooks/indri.yml +++ b/ansible/playbooks/indri.yml @@ -260,5 +260,7 @@ tags: cv - role: docs tags: docs + - role: heph + tags: heph - role: caddy tags: caddy diff --git a/ansible/roles/caddy/defaults/main.yml b/ansible/roles/caddy/defaults/main.yml index 363d09e..e6d7385 100644 --- a/ansible/roles/caddy/defaults/main.yml +++ b/ansible/roles/caddy/defaults/main.yml @@ -52,6 +52,9 @@ caddy_services: - name: devpi host: "pypi.{{ caddy_domain }}" backend: 
"http://localhost:3141" + - name: heph + host: "heph.{{ caddy_domain }}" + backend: "http://localhost:8787" # hephaestus hub (server mode) + PWA shell - name: kiwix host: "kiwix.{{ caddy_domain }}" backend: "https://kiwix.tail8d86e.ts.net" diff --git a/ansible/roles/heph/defaults/main.yml b/ansible/roles/heph/defaults/main.yml new file mode 100644 index 0000000..e5eea36 --- /dev/null +++ b/ansible/roles/heph/defaults/main.yml @@ -0,0 +1,49 @@ +--- +# hephaestus hub — the canonical heph replica (server mode) on indri. +# Other devices (e.g. gilbert) are spokes that sync against this hub. +# See [[set-up-sync-hub]] and [[host-heph-pwa]] in the hephaestus repo. + +# Pinned release used for the initial `cargo install` and the PWA shell. +# After bootstrap, hephd's own --self-update keeps the binary current; this +# pin only governs the first install and the bundled PWA shell version. +heph_version: v1.2.0 + +# Anonymous public HTTPS clone — matches hephd's INSTALL_GIT_URL so the initial +# install and unattended self-update build from the same source (no ssh-agent). +heph_repo_url: https://forge.eblu.me/eblume/hephaestus.git + +heph_bin_dir: /Users/erichblume/.cargo/bin +heph_binary: "{{ heph_bin_dir }}/hephd" + +# rustc/cargo here are rustup shims. The bare (non-mise) environment that the +# launchagent and ansible run in falls back to rustup's *default* toolchain, +# which can lag behind heph's rust-version floor (Cargo.toml: 1.89). Pin the +# channel explicitly so both the bootstrap build and unattended self-update +# always use a current toolchain regardless of the host's rustup default. +heph_rust_toolchain: stable + +heph_data_dir: /Users/erichblume/.local/share/heph +heph_db: "{{ heph_data_dir }}/heph.db" +heph_socket: "{{ heph_data_dir }}/hephd.sock" +heph_log_dir: /Users/erichblume/Library/Logs + +# Version-pinned source checkout; the PWA static shell is served directly from +# its heph-pwa/ subdir (no copy), keeping shell and hub in lockstep at heph_version. 
+heph_pwa_src_dir: /Users/erichblume/.cache/heph-pwa-src +heph_web_root: "{{ heph_pwa_src_dir }}/heph-pwa" + +# Hub listens on all interfaces so tailnet spokes can reach it directly +# (http://indri.tail8d86e.ts.net:8787) and Caddy can proxy heph.ops.eblu.me. +# Access is gated by Authentik OIDC regardless — tailnet reachability is not +# enough (this is the owner's most sensitive data). +heph_http_addr: 0.0.0.0:8787 +heph_port: 8787 +heph_external_url: https://heph.ops.eblu.me + +# Authentik OIDC — issuer + audience together turn hub auth on. The audience is +# the device-code client id (see argocd/manifests/authentik heph blueprint). +heph_oidc_issuer: https://authentik.ops.eblu.me/application/o/heph/ +heph_oidc_audience: heph + +# Self-update poll interval (seconds). 10 minutes. +heph_self_update_interval_secs: 600 diff --git a/ansible/roles/heph/handlers/main.yml b/ansible/roles/heph/handlers/main.yml new file mode 100644 index 0000000..92fe9d7 --- /dev/null +++ b/ansible/roles/heph/handlers/main.yml @@ -0,0 +1,6 @@ +--- +- name: Restart heph + ansible.builtin.shell: | + launchctl unload ~/Library/LaunchAgents/mcquack.eblume.heph.plist 2>/dev/null || true + launchctl load ~/Library/LaunchAgents/mcquack.eblume.heph.plist + changed_when: true diff --git a/ansible/roles/heph/tasks/main.yml b/ansible/roles/heph/tasks/main.yml new file mode 100644 index 0000000..7a45fe3 --- /dev/null +++ b/ansible/roles/heph/tasks/main.yml @@ -0,0 +1,82 @@ +--- +# hephaestus hub (server mode) on indri. +# +# DATA SEEDING (one-time, Path A — do this BEFORE the first provision so the hub +# adopts gilbert's existing data instead of being born empty): +# +# 1. On the seed device (gilbert): heph daemon stop +# 2. Copy its store to indri: scp ~/.local/share/heph/heph.db \ +# indri:~/.local/share/heph/heph.db +# 3. 
On indri, give the hub its OWN device origin (keeps gilbert's owner_id + +# data; hephd regenerates a fresh origin on next start when it is missing): +# sqlite3 ~/.local/share/heph/heph.db "DELETE FROM meta WHERE key='origin';" +# 4. Run this role (installs hephd, stages the PWA, loads the launchagent). +# +# hephd auto-creates an empty store on first start if none exists, so seeding is +# optional — skip it only if you intend a fresh, empty hub. + +- name: Ensure heph data directory exists + ansible.builtin.file: + path: "{{ heph_data_dir }}" + state: directory + mode: '0700' + +- name: Check for installed hephd binary + ansible.builtin.stat: + path: "{{ heph_binary }}" + register: heph_binary_stat + +# Bootstrap install only when hephd is absent. Thereafter hephd's own +# --self-update keeps it current; ansible must not fight (or downgrade) it. +# This builds from source and can take several minutes on a cold cargo cache. +- name: Bootstrap-install heph + hephd from the forge ({{ heph_version }}) + ansible.builtin.command: + cmd: >- + {{ heph_bin_dir }}/cargo install --locked + --git {{ heph_repo_url }} + --tag {{ heph_version }} + heph hephd + environment: + PATH: "{{ heph_bin_dir }}:/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin" + RUSTUP_TOOLCHAIN: "{{ heph_rust_toolchain }}" + when: not heph_binary_stat.stat.exists + changed_when: true + notify: Restart heph + +# Checkout provides the PWA shell at {{ heph_web_root }} (heph-pwa/ subdir), +# served directly by hephd. Static files are read from disk per request, so a +# version bump needs no restart; the service worker (CACHE = "heph-pwa-vN") +# evicts stale assets on next load. 
+- name: Ensure heph cache parent directory exists + ansible.builtin.file: + path: "{{ heph_pwa_src_dir | dirname }}" + state: directory + mode: '0755' + +- name: Stage heph-pwa source at {{ heph_version }} + ansible.builtin.git: + repo: "{{ heph_repo_url }}" + dest: "{{ heph_pwa_src_dir }}" + version: "{{ heph_version }}" + depth: 1 + single_branch: true + force: true + +- name: Deploy heph LaunchAgent plist + ansible.builtin.template: + src: heph.plist.j2 + dest: ~/Library/LaunchAgents/mcquack.eblume.heph.plist + mode: '0644' + notify: Restart heph + +- name: Check if heph LaunchAgent is loaded + ansible.builtin.command: launchctl list mcquack.eblume.heph + register: heph_launchctl_check + changed_when: false + failed_when: false + +- name: Load heph LaunchAgent if not loaded + ansible.builtin.command: launchctl load ~/Library/LaunchAgents/mcquack.eblume.heph.plist + when: heph_launchctl_check.rc != 0 + changed_when: true + failed_when: false diff --git a/ansible/roles/heph/templates/heph.plist.j2 b/ansible/roles/heph/templates/heph.plist.j2 new file mode 100644 index 0000000..19a2367 --- /dev/null +++ b/ansible/roles/heph/templates/heph.plist.j2 @@ -0,0 +1,50 @@ + + + + + + Label + mcquack.eblume.heph + ProgramArguments + + {{ heph_binary }} + --mode + server + --http-addr + {{ heph_http_addr }} + --db + {{ heph_db }} + --socket + {{ heph_socket }} + --web-root + {{ heph_web_root }} + --oidc-issuer + {{ heph_oidc_issuer }} + --oidc-audience + {{ heph_oidc_audience }} + --self-update + --self-update-interval-secs + {{ heph_self_update_interval_secs }} + + RunAtLoad + + KeepAlive + + EnvironmentVariables + + + PATH + {{ heph_bin_dir }}:/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin + HOME + /Users/erichblume + + RUSTUP_TOOLCHAIN + {{ heph_rust_toolchain }} + + StandardOutPath + {{ heph_log_dir }}/mcquack.heph.out.log + StandardErrorPath + {{ heph_log_dir }}/mcquack.heph.err.log + + diff --git a/argocd/manifests/authentik/configmap-blueprint.yaml 
b/argocd/manifests/authentik/configmap-blueprint.yaml index fcbb99b..56d9110 100644 --- a/argocd/manifests/authentik/configmap-blueprint.yaml +++ b/argocd/manifests/authentik/configmap-blueprint.yaml @@ -434,3 +434,82 @@ data: provider: !KeyOf mealie-provider meta_launch_url: https://meals.ops.eblu.me policy_engine_mode: all + + heph.yaml: | + version: 1 + metadata: + name: BlumeOps Heph SSO + labels: + blueprints.goauthentik.io/description: "Hephaestus hub OIDC (device-code) provider, application, and device-code flow" + entries: + # Device-code flow (RFC 8628). authentik ships no default for this, so we + # create one and bind it to the brand below. An empty stage_configuration + # flow is sufficient: the already-authenticated user just confirms the code. + - model: authentik_flows.flow + id: device-code-flow + identifiers: + slug: default-device-code-flow + attrs: + name: Device code flow + title: Device code flow + slug: default-device-code-flow + designation: stage_configuration + authentication: require_authenticated + + # Enable the device-code grant globally by binding the flow to the default + # brand (domain authentik-default). Partial update — only sets this field. + - model: authentik_brands.brand + identifiers: + domain: authentik-default + attrs: + flow_device_code: !KeyOf device-code-flow + + # OAuth2 provider for heph — PUBLIC client (device-code + PKCE, no secret). + # client_id doubles as the token audience the hub verifies (--oidc-audience heph), + # and the app slug 'heph' is the issuer path (/application/o/heph/). 
+ - model: authentik_providers_oauth2.oauth2provider + id: heph-provider + identifiers: + name: Heph + attrs: + name: Heph + authorization_flow: !Find [authentik_flows.flow, [slug, default-provider-authorization-implicit-consent]] + invalidation_flow: !Find [authentik_flows.flow, [slug, default-provider-invalidation-flow]] + client_type: public + client_id: heph + # Device-code (RFC 8628) + PKCE use no redirect, but the provider + # serializer requires the field — an empty list satisfies it. + redirect_uris: [] + signing_key: !Find [authentik_crypto.certificatekeypair, [name, authentik Self-signed Certificate]] + property_mappings: + - !Find [authentik_providers_oauth2.scopemapping, [scope_name, openid]] + - !Find [authentik_providers_oauth2.scopemapping, [scope_name, email]] + - !Find [authentik_providers_oauth2.scopemapping, [scope_name, profile]] + sub_mode: hashed_user_id + include_claims_in_id_token: true + + # Heph application — linked to the OAuth2 provider + - model: authentik_core.application + id: heph-app + identifiers: + slug: heph + attrs: + name: Hephaestus + slug: heph + provider: !KeyOf heph-provider + meta_launch_url: https://heph.ops.eblu.me + policy_engine_mode: any + + # Policy binding — restrict heph to admins group (single-owner, sensitive data) + - model: authentik_policies.policybinding + identifiers: + order: 0 + target: !KeyOf heph-app + group: !Find [authentik_core.group, [name, admins]] + attrs: + target: !KeyOf heph-app + group: !Find [authentik_core.group, [name, admins]] + order: 0 + enabled: true + negate: false + timeout: 30 diff --git a/docs/changelog.d/heph-indri-hub.infra.md b/docs/changelog.d/heph-indri-hub.infra.md new file mode 100644 index 0000000..6761cb7 --- /dev/null +++ b/docs/changelog.d/heph-indri-hub.infra.md @@ -0,0 +1 @@ +Added the [[hephaestus]] (`heph`) sync hub to indri as a self-updating LaunchAgent managed by Ansible (`ansible/roles/heph`, tag `heph`). 
The hub runs `hephd --mode server` behind `heph.ops.eblu.me` (Caddy TLS), with self-update on a 10-minute interval and the heph-pwa mobile shell served from `--web-root`. Access is gated by a new Authentik device-code (RFC 8628) OIDC application. Indri is now the canonical hub; other devices (e.g. gilbert) attach as offline-capable spokes. The hub's store was seeded from gilbert via the data-safe Path A bring-up (copy store, reset `meta.origin`). diff --git a/docs/reference/infrastructure/indri.md b/docs/reference/infrastructure/indri.md index 67652ca..8364ba0 100644 --- a/docs/reference/infrastructure/indri.md +++ b/docs/reference/infrastructure/indri.md @@ -33,6 +33,7 @@ Primary BlumeOps server. Mac Mini M1 (2020). - [[alloy|Alloy]] - Metrics/logs collector - [[caddy]] - Reverse proxy for `*.ops.eblu.me` - [[devpi]] - PyPI mirror (LaunchAgent) +- [[hephaestus]] - heph task/context sync hub (LaunchAgent, self-updating) - [[cv]] - Static CV site, served by Caddy - [[docs]] - Quartz-built docs site, served by Caddy diff --git a/docs/reference/services/hephaestus.md b/docs/reference/services/hephaestus.md new file mode 100644 index 0000000..1754ea0 --- /dev/null +++ b/docs/reference/services/hephaestus.md @@ -0,0 +1,130 @@ +--- +title: Hephaestus +modified: 2026-06-04 +last-reviewed: 2026-06-04 +tags: + - service + - hephaestus +--- + +# Hephaestus + +[hephaestus](https://github.com/eblume/hephaestus) (`heph`) is the user's +self-hosted task + context/knowledge system. It is **hub-and-spoke**: each device +runs a full local SQLite replica (`hephd --mode local`) and background-syncs +against one canonical **hub**. Indri runs that hub. 
+ +## Quick Reference + +| Property | Value | +|----------|-------| +| **PWA URL** | https://heph.ops.eblu.me (browser PWA, Caddy TLS) | +| **Spoke sync URL** | http://indri.tail8d86e.ts.net:8787 (direct, tailnet) | +| **Local Port** | 8787 (`hephd --mode server`, bound `0.0.0.0`) | +| **Binary** | `~/.cargo/bin/hephd` (self-updating) | +| **Data** | `~/.local/share/heph/heph.db` | +| **PWA shell** | `~/.cache/heph-pwa-src/heph-pwa` | +| **Logs** | `~/Library/Logs/mcquack.heph.{out,err}.log` | +| **LaunchAgent** | `mcquack.eblume.heph` | +| **Ansible role** | `ansible/roles/heph` (tag `heph`) | + +## What runs on indri + +The launchagent runs the hub in server mode with three features enabled: + +``` +hephd --mode server --http-addr 0.0.0.0:8787 --db ~/.local/share/heph/heph.db + --web-root ~/.cache/heph-pwa-src/heph-pwa + --oidc-issuer https://authentik.ops.eblu.me/application/o/heph/ + --oidc-audience heph + --self-update --self-update-interval-secs 600 +``` + +- **Server mode** exposes the HTTP sync endpoint (`/rpc`, `/sync/*`) that spokes + reconcile their op-log against. +- **Self-update** (10-minute poll) rebuilds `hephd` from the forge when a newer + release tag appears (`cargo install --git https://forge.eblu.me/eblume/hephaestus.git`). + Indri's Rust toolchain (`~/.cargo/bin`) is on the agent's `PATH` for this, and + the plist pins `RUSTUP_TOOLCHAIN=stable` — the + launchagent runs without mise, so a bare `cargo` shim would otherwise fall back + to rustup's *default* toolchain, which can lag behind heph's `rust-version` floor + (1.89) and silently fail the build. +- **PWA** (`--web-root`) serves the [heph-pwa] mobile shell; Caddy terminates TLS + at `heph.ops.eblu.me` so the PWA runs in a secure context (service worker, + install-to-home-screen, voice capture).
+ +[heph-pwa]: https://github.com/eblume/hephaestus + +The hub binds `0.0.0.0` so tailnet spokes can also sync directly +(`http://indri.tail8d86e.ts.net:8787`); access is gated by Authentik OIDC either +way — tailnet reachability alone is not enough. + +## Authentication (Authentik OIDC, device-code) + +The hub verifies an OIDC bearer token on every sync. The `heph` application is a +**public** OAuth2 client using the **device-code flow** (RFC 8628), provisioned +in the [[authentik]] blueprint (`argocd/manifests/authentik/configmap-blueprint.yaml`): + +- Issuer: `https://authentik.ops.eblu.me/application/o/heph/` +- Audience / client id: `heph` +- Restricted to the `admins` group (single-owner, sensitive data). + +Because Authentik ships no device-code flow by default, the blueprint +also creates `default-device-code-flow` and binds it to the default brand's +`flow_device_code`. Devices obtain a token with `heph auth login`; the PWA +currently takes a pasted token (in-app device-code login is an upstream follow-up). + +## Data seeding (Path A, one-time) + +The hub was seeded from the existing `gilbert` device so no task history was +lost. heph's data-safe bring-up ("Path A") has the hub **adopt the device's +identity** rather than rewriting the device: + +1. Quiesce the seed device: `heph daemon stop` (on gilbert). +2. Copy its store to indri: `scp ~/.local/share/heph/heph.db indri:~/.local/share/heph/heph.db`. +3. Give the hub its **own device origin** (keeps gilbert's `owner_id` + data; + `hephd` regenerates a fresh `origin` on next start when it is missing): + ```fish + ssh indri "sqlite3 ~/.local/share/heph/heph.db \"DELETE FROM meta WHERE key='origin';\"" + ``` +4. `mise run provision-indri -- --tags heph` (installs hephd, stages the PWA, + loads the launchagent → hub starts on the seeded store). + +Only `meta.origin` changes; `owner_id`, nodes, op-log, and links are copied +untouched.
A clean `hephd --owner-id` / seed command is tracked upstream as +hephaestus follow-up — until then this manual reset is the documented path. + +## Connecting a spoke (e.g. gilbert) + +A device joins by running its local daemon with the hub URL + OIDC client and +logging in once: + +```bash +hephd --mode local --hub-url http://indri.tail8d86e.ts.net:8787 \ + --oidc-issuer https://authentik.ops.eblu.me/application/o/heph/ \ + --oidc-client-id heph +heph auth login --hub-url http://indri.tail8d86e.ts.net:8787 \ + --issuer https://authentik.ops.eblu.me/application/o/heph/ --client-id heph +``` + +> **Use the direct `http://…:8787` tailnet URL for sync, not the Caddy HTTPS +> URL.** hephd's sync client is plain-HTTP-only; pointing `--hub-url` at +> `https://heph.ops.eblu.me` fails with a confusing `error sending request` +> (the HTTP connector rejects the `https` scheme before connecting). Tailscale +> encrypts the transport, and the OIDC bearer token still gates every request. +> `heph.ops.eblu.me` (Caddy TLS) exists only for the browser PWA, which needs a +> secure context. The cached token is keyed by the exact `--hub-url`, so use the +> same value for `hephd` and `heph auth login`. + +> **Caveat:** `heph daemon` cannot yet bake hub/spoke flags into the generated +> launchd plist (upstream gap). On a spoke whose plist is managed by `heph +> daemon`, the hub/OIDC flags must be hand-added — and a later `heph daemon +> start/restart` will regenerate the plist and drop them. Avoid `heph daemon` +> subcommands on a configured spoke until that gap is closed; reload via +> `launchctl` instead. 
+ +## Related + +- [[indri]] — host +- [[authentik]] — OIDC provider +- [[caddy]] — TLS termination for `heph.ops.eblu.me` From 6576880b0e8e80cd88452add47627c3b4e6d6435 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Fri, 5 Jun 2026 07:30:31 -0700 Subject: [PATCH 119/122] heph Authentik: register heph-pwa redirect URIs (PKCE login) (#370) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the heph-pwa redirect URIs to the Authentik `heph` OAuth2 provider so the new browser **Login with Authentik** flow (Authorization Code + PKCE, hephaestus PR #9) can redirect back and exchange the code: - `https://heph.ops.eblu.me/` (the PWA origin) - `http://localhost:8787/` (local dev: `hephd --web-root`) Authentik also keys token-endpoint CORS off these origins, so they're required for the browser token exchange. Additive (the provider was `redirect_uris: []`); harmless until the PWA feature deploys. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Reviewed-on: https://forge.eblu.me/eblume/blumeops/pulls/370 --- argocd/manifests/authentik/configmap-blueprint.yaml | 13 ++++++++++--- docs/changelog.d/heph-pwa-redirect-uris.infra.md | 1 + 2 files changed, 11 insertions(+), 3 deletions(-) create mode 100644 docs/changelog.d/heph-pwa-redirect-uris.infra.md diff --git a/argocd/manifests/authentik/configmap-blueprint.yaml b/argocd/manifests/authentik/configmap-blueprint.yaml index 56d9110..9da2f70 100644 --- a/argocd/manifests/authentik/configmap-blueprint.yaml +++ b/argocd/manifests/authentik/configmap-blueprint.yaml @@ -477,9 +477,16 @@ data: invalidation_flow: !Find [authentik_flows.flow, [slug, default-provider-invalidation-flow]] client_type: public client_id: heph - # Device-code (RFC 8628) + PKCE use no redirect, but the provider - # serializer requires the field — an empty list satisfies it. - redirect_uris: [] + # CLI/TUI use the device-code grant (no redirect). 
The heph-pwa browser + # login uses Authorization Code + PKCE, which DOES redirect back to the + # app's origin — register those here (Authentik also keys token-endpoint + # CORS off these origins). Trailing slash matters: the PWA's redirect_uri + # is its base dir, e.g. https://heph.ops.eblu.me/. + redirect_uris: + - matching_mode: strict + url: https://heph.ops.eblu.me/ + - matching_mode: strict + url: http://localhost:8787/ # local dev (hephd --web-root) signing_key: !Find [authentik_crypto.certificatekeypair, [name, authentik Self-signed Certificate]] property_mappings: - !Find [authentik_providers_oauth2.scopemapping, [scope_name, openid]] diff --git a/docs/changelog.d/heph-pwa-redirect-uris.infra.md b/docs/changelog.d/heph-pwa-redirect-uris.infra.md new file mode 100644 index 0000000..f887eed --- /dev/null +++ b/docs/changelog.d/heph-pwa-redirect-uris.infra.md @@ -0,0 +1 @@ +Registered the heph-pwa redirect URIs (`https://heph.ops.eblu.me/`, plus `http://localhost:8787/` for dev) on the Authentik `heph` OAuth2 provider, enabling the PWA's new Authorization Code + PKCE "Login with Authentik" flow (and the token-endpoint CORS it needs). Pairs with hephaestus PR #9. From 3abe80523a0b402c40a0bd3d825e5d81b87939d8 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Fri, 5 Jun 2026 07:40:51 -0700 Subject: [PATCH 120/122] C0: bump indri heph hub to v1.2.1 (PWA Authentik login + /config) Co-Authored-By: Claude Opus 4.8 (1M context) --- ansible/roles/heph/defaults/main.yml | 2 +- docs/changelog.d/+heph-hub-v1.2.1.infra.md | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 docs/changelog.d/+heph-hub-v1.2.1.infra.md diff --git a/ansible/roles/heph/defaults/main.yml b/ansible/roles/heph/defaults/main.yml index e5eea36..88d2240 100644 --- a/ansible/roles/heph/defaults/main.yml +++ b/ansible/roles/heph/defaults/main.yml @@ -6,7 +6,7 @@ # Pinned release used for the initial `cargo install` and the PWA shell. 
# After bootstrap, hephd's own --self-update keeps the binary current; this # pin only governs the first install and the bundled PWA shell version. -heph_version: v1.2.0 +heph_version: v1.2.1 # Anonymous public HTTPS clone — matches hephd's INSTALL_GIT_URL so the initial # install and unattended self-update build from the same source (no ssh-agent). diff --git a/docs/changelog.d/+heph-hub-v1.2.1.infra.md b/docs/changelog.d/+heph-hub-v1.2.1.infra.md new file mode 100644 index 0000000..c203323 --- /dev/null +++ b/docs/changelog.d/+heph-hub-v1.2.1.infra.md @@ -0,0 +1 @@ +Bumped the indri heph hub to v1.2.1, which adds the hub `GET /config` endpoint and ships the heph-pwa **Login with Authentik** flow (Authorization Code + PKCE). Pairs with the Authentik `heph` provider redirect URIs registered earlier. From cf63fcb5b5cf379700efe3ce0986b18ec4d76625 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Fri, 5 Jun 2026 08:22:46 -0700 Subject: [PATCH 121/122] C0: track heph in service-versions (self-updating; note drift task) Co-Authored-By: Claude Opus 4.8 (1M context) --- service-versions.yaml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/service-versions.yaml b/service-versions.yaml index cc9dc9e..866c687 100644 --- a/service-versions.yaml +++ b/service-versions.yaml @@ -414,6 +414,23 @@ services: upstream-source: https://github.com/caddyserver/caddy/releases notes: Built from source with Gandi DNS and Layer 4 plugins + - name: heph + type: ansible + last-reviewed: 2026-06-05 + current-version: "v1.2.1" + upstream-source: https://forge.eblu.me/eblume/hephaestus/releases + notes: >- + hephaestus task/context sync hub on indri (server-mode launchagent, + ansible/roles/heph; cargo-built from the forge). SELF-UPDATING: hephd + polls the forge for newer releases every 10 min and rebuilds + restarts + itself, so the running version drifts AHEAD of the ansible heph_version + pin. 
current-version here is the last observed/deployed tag, not a hard + pin — confirm the hub is up with `curl https://heph.ops.eblu.me/config`, + then read the live version from the hub log's `current=` line. Reconciling this + self-update vs IaC-pin drift is tracked in the heph "Hephaestus" project: + "Reconcile hephd self-update with ansible-pinned version (drift on indri + hub)" (node 01KTBXWT6XTHNDH92CVJY88E5K). + + - name: borgmatic type: ansible last-reviewed: 2026-04-15 From 50a36ff93a9d1c697c976a1db498bc5633f2cd7c Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Sat, 6 Jun 2026 18:07:13 -0700 Subject: [PATCH 122/122] heph Authentik: grant offline_access scope (fixes spoke sync refresh-token 400) The heph CLI requests scope "openid offline_access", but the Authentik heph OAuth2 provider only mapped openid/email/profile. Without the offline_access mapping the issued refresh token is bound to the login session rather than the 30-day refresh-token window; once the session lapses, hephd's refresh_token grant returns 400 Bad Request and spoke sync silently degrades (heph sync --status -> auth_failure: true). Add the built-in offline_access scope mapping to the provider's property_mappings and document the requirement in the service reference.
Co-Authored-By: Claude Opus 4.8 (1M context) --- argocd/manifests/authentik/configmap-blueprint.yaml | 4 ++++ docs/changelog.d/heph-offline-access.bugfix.md | 1 + docs/reference/services/hephaestus.md | 11 +++++++++++ 3 files changed, 16 insertions(+) create mode 100644 docs/changelog.d/heph-offline-access.bugfix.md diff --git a/argocd/manifests/authentik/configmap-blueprint.yaml b/argocd/manifests/authentik/configmap-blueprint.yaml index 9da2f70..cc97dea 100644 --- a/argocd/manifests/authentik/configmap-blueprint.yaml +++ b/argocd/manifests/authentik/configmap-blueprint.yaml @@ -492,6 +492,10 @@ data: - !Find [authentik_providers_oauth2.scopemapping, [scope_name, openid]] - !Find [authentik_providers_oauth2.scopemapping, [scope_name, email]] - !Find [authentik_providers_oauth2.scopemapping, [scope_name, profile]] + # offline_access: heph CLI requests "openid offline_access"; without + # this mapping the refresh token is session-bound and hephd's + # refresh_token grant 400s once the session lapses (spoke sync dies). + - !Find [authentik_providers_oauth2.scopemapping, [scope_name, offline_access]] sub_mode: hashed_user_id include_claims_in_id_token: true diff --git a/docs/changelog.d/heph-offline-access.bugfix.md b/docs/changelog.d/heph-offline-access.bugfix.md new file mode 100644 index 0000000..e9721bc --- /dev/null +++ b/docs/changelog.d/heph-offline-access.bugfix.md @@ -0,0 +1 @@ +Granted the `offline_access` scope on the Authentik `heph` OAuth2 provider so hephaestus spokes receive a durable 30-day refresh token. Previously the refresh token was session-bound, so spoke sync would silently fail with a `400 Bad Request` on the `refresh_token` grant once the Authentik session lapsed. 
diff --git a/docs/reference/services/hephaestus.md b/docs/reference/services/hephaestus.md index 1754ea0..7abc35b 100644 --- a/docs/reference/services/hephaestus.md +++ b/docs/reference/services/hephaestus.md @@ -68,6 +68,17 @@ in the [[authentik]] blueprint (`argocd/manifests/authentik/configmap-blueprint. - Issuer: `https://authentik.ops.eblu.me/application/o/heph/` - Audience / client id: `heph` - Restricted to the `admins` group (single-owner, sensitive data). +- Scope mappings: `openid`, `email`, `profile`, **`offline_access`**. + +> **`offline_access` is required for durable sync.** The `heph` CLI requests +> `scope = "openid offline_access"`, and a refresh token is only issued for the +> 30-day refresh-token window when the provider actually grants `offline_access`. +> Without that scope mapping the refresh token is bound to the login **session**; +> once the session lapses, hephd's `refresh_token` grant returns `400 Bad +> Request`, the bearer can't be refreshed, and spoke sync silently degrades +> (`heph sync --status` → `auth_failure: true`). `heph auth login` papers over it +> until the next session expiry. Keep `offline_access` in the provider's +> `property_mappings`. Because no Authentik instance ships a device-code flow by default, the blueprint also creates `default-device-code-flow` and binds it to the default brand's