diff --git a/.forgejo/workflows/build-blumeops.yaml b/.forgejo/workflows/build-blumeops.yaml index 383542f..c6e6c3c 100644 --- a/.forgejo/workflows/build-blumeops.yaml +++ b/.forgejo/workflows/build-blumeops.yaml @@ -178,10 +178,11 @@ jobs: echo "## Documentation" echo "" - echo "Download \`$TARBALL\` and configure the quartz container with:" + echo "Download \`$TARBALL\` directly, or bump \`docs_version\`" + echo "in \`ansible/roles/docs/defaults/main.yml\` and run:" echo "" echo "\`\`\`" - echo "DOCS_RELEASE_URL=https://forge.eblu.me/eblume/blumeops/releases/download/$VERSION/$TARBALL" + echo "mise run provision-indri -- --tags docs" echo "\`\`\`" } > /tmp/release_body.txt @@ -223,18 +224,16 @@ jobs: echo "" echo "Release created successfully!" - - name: Update docs deployment + - name: Bump docs_version in ansible role run: | VERSION="${{ steps.version.outputs.version }}" - TARBALL="docs-${VERSION}.tar.gz" - DEPLOYMENT_FILE="argocd/manifests/docs/deployment.yaml" - RELEASE_URL="https://forge.eblu.me/eblume/blumeops/releases/download/${VERSION}/${TARBALL}" + DEFAULTS_FILE="ansible/roles/docs/defaults/main.yml" - echo "Updating $DEPLOYMENT_FILE with new release URL..." - yq -i "(.spec.template.spec.containers[0].env[] | select(.name == \"DOCS_RELEASE_URL\")).value = \"${RELEASE_URL}\"" "$DEPLOYMENT_FILE" + echo "Bumping docs_version in $DEFAULTS_FILE to ${VERSION}..." 
+ yq -i ".docs_version = \"${VERSION}\"" "$DEFAULTS_FILE" - echo "Updated deployment:" - grep -A1 "DOCS_RELEASE_URL" "$DEPLOYMENT_FILE" + echo "Updated defaults:" + grep -E "^docs_version:" "$DEFAULTS_FILE" - name: Commit release changes env: @@ -248,7 +247,7 @@ jobs: git config user.email "actions@forge.ops.eblu.me" # Stage deployment changes - git add argocd/manifests/docs/deployment.yaml + git add ansible/roles/docs/defaults/main.yml # Stage changelog changes if updated if [ "$CHANGELOG_UPDATED" = "true" ]; then @@ -270,34 +269,6 @@ jobs: echo "Changes committed and pushed" fi - - name: Deploy docs - env: - ARGOCD_AUTH_TOKEN: ${{ secrets.ARGOCD_AUTH_TOKEN }} - run: | - echo "Syncing docs app via ArgoCD..." - - # Sync docs app (uses ARGOCD_AUTH_TOKEN env var for auth) - argocd app sync docs \ - --server argocd.ops.eblu.me \ - --grpc-web \ - --prune - - # Wait for sync to complete - argocd app wait docs \ - --server argocd.ops.eblu.me \ - --grpc-web \ - --timeout 120 - - echo "Docs app synced successfully!" - - - name: Purge Fly.io proxy cache - env: - FLY_API_TOKEN: ${{ secrets.FLY_DEPLOY_TOKEN }} - run: | - echo "Purging nginx cache on Fly.io proxy..." 
- fly ssh console -a blumeops-proxy -C "sh -c 'rm -rf /tmp/cache && nginx -s reload'" - echo "Cache purged" - - name: Summary run: | VERSION="${{ steps.version.outputs.version }}" @@ -309,5 +280,12 @@ jobs: echo "Release URL:" echo " https://forge.eblu.me/eblume/blumeops/releases/tag/$VERSION" echo "" - echo "Asset URL (for DOCS_RELEASE_URL ConfigMap):" + echo "Asset URL:" echo " https://forge.eblu.me/eblume/blumeops/releases/download/$VERSION/$TARBALL" + echo "" + echo "To deploy on indri, run from gilbert:" + echo " mise run provision-indri -- --tags docs" + echo "" + echo "Then purge the Fly.io proxy cache:" + echo " fly ssh console -a blumeops-proxy -C \\" + echo " \"sh -c 'rm -rf /tmp/cache && nginx -s reload'\"" diff --git a/.forgejo/workflows/cv-deploy.yaml b/.forgejo/workflows/cv-deploy.yaml index f99352d..001aa36 100644 --- a/.forgejo/workflows/cv-deploy.yaml +++ b/.forgejo/workflows/cv-deploy.yaml @@ -1,12 +1,14 @@ # CV Deploy Workflow # -# Updates the CV deployment to a specific package version, commits -# the change, and syncs via ArgoCD. +# Bumps cv_version in ansible/roles/cv/defaults/main.yml and pushes the change. +# Deployment to indri is manual (runner has no SSH access to indri): +# mise run provision-indri -- --tags cv # # Usage: # 1. Release a new CV package from the cv repo first # 2. Go to Actions > Deploy CV > Run workflow # 3. Enter the version to deploy, or leave as "latest" +# 4. 
Run the command above on gilbert to apply name: Deploy CV @@ -60,18 +62,16 @@ jobs: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - - name: Update CV deployment + - name: Bump cv_version in ansible role run: | VERSION="${{ steps.version.outputs.version }}" - TARBALL="cv-${VERSION}.tar.gz" - DEPLOYMENT_FILE="argocd/manifests/cv/deployment.yaml" - RELEASE_URL="https://forge.eblu.me/api/packages/eblume/generic/cv/${VERSION}/${TARBALL}" + DEFAULTS_FILE="ansible/roles/cv/defaults/main.yml" - echo "Updating $DEPLOYMENT_FILE with CV_RELEASE_URL..." - yq -i "(.spec.template.spec.containers[0].env[] | select(.name == \"CV_RELEASE_URL\")).value = \"${RELEASE_URL}\"" "$DEPLOYMENT_FILE" + echo "Bumping cv_version in $DEFAULTS_FILE to ${VERSION}..." + yq -i ".cv_version = \"${VERSION}\"" "$DEFAULTS_FILE" - echo "Updated deployment:" - grep -A1 "CV_RELEASE_URL" "$DEPLOYMENT_FILE" + echo "Updated defaults:" + grep -E "^cv_version:" "$DEFAULTS_FILE" - name: Commit release changes env: @@ -82,7 +82,7 @@ jobs: git config user.name "Forgejo Actions" git config user.email "actions@forge.ops.eblu.me" - git add argocd/manifests/cv/deployment.yaml + git add ansible/roles/cv/defaults/main.yml if git diff --cached --quiet; then echo "No changes to commit (already at $VERSION)" @@ -94,38 +94,16 @@ jobs: echo "Changes committed and pushed" fi - - name: Deploy CV - env: - ARGOCD_AUTH_TOKEN: ${{ secrets.ARGOCD_AUTH_TOKEN }} - run: | - echo "Syncing CV app via ArgoCD..." - - argocd app sync cv \ - --server argocd.ops.eblu.me \ - --grpc-web \ - --prune - - argocd app wait cv \ - --server argocd.ops.eblu.me \ - --grpc-web \ - --timeout 120 - - echo "CV app synced successfully!" - - - name: Purge Fly.io proxy cache - env: - FLY_API_TOKEN: ${{ secrets.FLY_DEPLOY_TOKEN }} - run: | - echo "Purging nginx cache on Fly.io proxy..." 
- fly ssh console -a blumeops-proxy -C "sh -c 'rm -rf /tmp/cache && nginx -s reload'" - echo "Cache purged" - - name: Summary run: | VERSION="${{ steps.version.outputs.version }}" echo "================================================" - echo "CV Deployed: $VERSION" + echo "CV version bumped: $VERSION" echo "================================================" echo "" - echo "CV should now be live at:" - echo " https://cv.ops.eblu.me/" + echo "To deploy on indri, run from gilbert:" + echo " mise run provision-indri -- --tags cv" + echo "" + echo "Then purge the Fly.io proxy cache:" + echo " fly ssh console -a blumeops-proxy -C \\" + echo " \"sh -c 'rm -rf /tmp/cache && nginx -s reload'\"" diff --git a/.gitignore b/.gitignore index acfafba..09e937c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ .claude/settings.local.json .claude/agent-memory/ +.claude/scheduled_tasks.lock # Python __pycache__/ @@ -12,3 +13,5 @@ __pycache__/ # OS .DS_Store +/**/__pycache__ +/.env diff --git a/AGENTS.md b/AGENTS.md index 80f9852..c64af40 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -65,7 +65,7 @@ See [[agent-change-process]] for the full methodology. ./pulumi/ # Pulumi IaC (tailnet ACLs, dns, cloud) ~/.config/{nvim,fish} # user's shell config, managed by chezmoi ~/code/personal/ # user's projects -~/code/personal/zk # user's Obsidian-sync managed zettelkasten. Potential source for reference data. +~/code/personal/zk # user's zettelkasten (Obsidian-sync). Reference-data source; migrating into heph docs (hephaestus). ~/code/3rd/ # mirrored external projects ~/code/work # FORBIDDEN ``` @@ -86,7 +86,7 @@ Most services run in minikube on indri via ArgoCD (app-of-apps, manual sync). GP **Commands:** `argocd app list|get|diff|sync ` -**Login:** `argocd login argocd.ops.eblu.me --username admin --password "$(op read 'op://vg6xf6vvfmoh5hqjjhlhbeoaie/srogeebssulhtb6tnqd7ls6qey/password')"` +**Login:** `argocd login argocd.ops.eblu.me --sso` (opens browser for Authentik SSO). 
Admin fallback for break-glass: `argocd login argocd.ops.eblu.me --username admin --password "$(op read 'op://vg6xf6vvfmoh5hqjjhlhbeoaie/srogeebssulhtb6tnqd7ls6qey/password')"` ### Indri (Ansible) @@ -147,10 +147,16 @@ Create a new spork: `mise run spork-create ` ## Task Discovery +BlumeOps tasks live in [hephaestus](https://github.com/eblume/hephaestus) (`heph`), +the user's self-hosted context/task system. Fetch them with the CLI: + ```fish -mise run blumeops-tasks # fetch from Todoist, sorted by priority +heph list --project Blumeops --json # outstanding Blumeops tasks as JSON ``` -Most tasks are stored in `./mise-tasks/`. For scripts with any logic or + +(This replaced the retired `blumeops-tasks` mise task, which read from Todoist.) + +Most operational scripts are stored in `./mise-tasks/`. For scripts with any logic or complexity, use uv run --script 's with explicit dependencies. Complex workflows with artifacts should become dagger pipelines. Mise tasks are for development processes and operations - tools for the user or the agent. diff --git a/CHANGELOG.md b/CHANGELOG.md index 7ae5f8e..0499154 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,259 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). +## [v1.17.0] - 2026-06-03 + +### Features + +- Deploy the Adelaide / Heidi / Addie baby shower app — guest splash, raffle + picker, and prize assignment console — on ringtail k3s with `shower.eblu.me` + as the public entry and `shower.ops.eblu.me` as the tailnet admin host. App + source: [`adelaide-baby-shower-app`](https://forge.eblu.me/eblume/adelaide-baby-shower-app). +- Deploy adelaide-baby-shower-app v1.1.0 to ringtail k3s. Replaces the + boolean lock with a four-phase `ShowerState` (`pre_event` → `party` → + `prizes_locked` → `event_locked`), adds an append-only "guest memories" + panel where guests can leave photos and comments for the baby, and + polishes the admin and QR views. 
Three Django migrations + (`0009_shower_phase`, `0010_guest_memories`, `0011_book_description`) + run automatically in the entrypoint against the SQLite PV. No config + or env-var changes. + + Container build also gains a Forgejo-PyPI workaround: Forgejo's simple + index returns absolute file URLs hardcoded to the public ROOT_URL + (`forge.eblu.me`), which the Fly edge 403s on `/api/packages/*`. The + wheel and sdist are now both pulled via direct `fetchurl` against + `forge.ops.eblu.me` (tailnet-only) and the wheel is handed to pip as + a local path. +- `review-compliance-reports` now also fetches and summarizes the weekly Prowler container-image and IaC scans (previously only the K8s CIS in-cluster scan was processed). For each scan it shows status counts, severity breakdown, week-over-week delta, and — for the high-volume image/IaC scans — top-N tables grouped by check ID and resource instead of per-finding listings. +- runner-logs now authenticates with Forgejo API token and auto-detects the repo from git remote. Job logs are fetched via SSH to indri (reading Forgejo's on-disk zstd log files) instead of the web endpoint, which doesn't support token auth for private repos. + +### Bug Fixes + +- Fix nightly borgmatic backups failing for 2 days. The shower SQLite + dump hook referenced `kubectl --context=k3s-ringtail`, but indri's + kubeconfig deliberately doesn't carry the ringtail credentials. The + `before_backup` hook's failure aborted the entire run, taking out + *both* the local sifaka repo and the BorgBase offsite. Replaced + the inline-shell dump with a `~/bin/borgmatic-k8s-sqlite-dump` + helper deployed by the ansible role. Each dump entry now declares a + `target` of either `local:` (mealie — kubectl uses indri's + kubeconfig) or `ssh:` (shower — ssh into ringtail and + run `k3s kubectl` there, no indri-side kubeconfig needed; k3s.yaml + on ringtail is mode 644 so no sudo required). Bytes stream back via + `kubectl exec ... 
-- cat` rather than `kubectl cp`, since `kubectl + cp` requires `tar` inside the pod and nix-built images like shower + don't bundle it. +- Shower app container now bakes the wheel + Python deps into the image + at build time via `buildPythonPackage` instead of pip-installing on + first boot. Boots are deterministic and don't depend on forge PyPI + being reachable from the pod. The `wheelHash` in + `containers/shower/default.nix` is the sha256 sourced from the + [forge PyPI simple index](https://forge.eblu.me/api/packages/eblume/pypi/simple/adelaide-baby-shower-app/); + bumping the version means bumping that hash too. + + Borgmatic now covers the shower app: SQLite is dumped from the live + pod via `kubectl exec` (mirroring the existing mealie entry, with + `context: k3s-ringtail`), and the prize-photo media share is picked up + through `/Volumes/shower` (sifaka SMB mount on indri, same pattern as + `/Volumes/photos`). +- Disabled adaptive sync (VRR) on ringtail's DP-1 output. The OMEN 27i IPS panel pumps brightness when its refresh rate swings into the low VRR range during low-framerate content (e.g. game cutscenes), producing a flicker that worsened over a session until a reboot. Pinning the panel to a fixed 165Hz eliminates it. +- Fixed forge.eblu.me static assets (CSS, JS, images, fonts) not loading — the proxy's static asset cache block was missing the `Host` header, so Caddy couldn't route the requests. +- Fixed homepage container EACCES on cold start: the nix-built image now chowns + `/app/config` to uid 1000 at build time via `fakeRootCommands`, matching the + behavior of the old Dockerfile. Without this, homepage couldn't seed missing + skeleton configs (proxmox.yaml etc.) or create `/app/config/logs`, crashing on + its first uncached request. Caught during the ringtail cutover. 
+- Fixed sway keybindings on ringtail — the home-manager `keybindings` block was replacing the module's defaults entirely, leaving only explicit overrides (no workspace switching, focus, move, splits, resize mode, etc). Switched to `lib.mkOptionDefault` with `lib.mkForce` on the conflicting custom binds (`Mod+Return`, `Mod+d`, `Mod+space`, `Mod+l`) so defaults merge back in. Also added `Mod+F1` to show a filterable fuzzel list of current keybindings. + + Fixed fuzzel config errors on launch — `border-radius` and `border-width` were under `[main]`, but fuzzel expects them as `radius`/`width` under a `[border]` section. +- Pin the Quartz docs build to v4.5.2. The Dagger `build_docs` pipeline cloned Quartz from the default branch unpinned; Quartz v5.0.0 restructured its config layout (`.quartz/plugins`, `../quartz` imports) and broke the docs build against our existing `quartz.config.ts`/`quartz.layout.ts`. + +### Infrastructure + +- Wire the ringtail `blumeops-pg` cluster (which holds the wave-1-migrated + paperless + teslamate databases) into backups and Grafana. Adds a Tailscale + LoadBalancer Service (`blumeops-pg-ringtail.tail8d86e.ts.net`) and a Caddy L4 + route (`pg.ops.eblu.me:5434`), then repoints borgmatic's `teslamate` + + `paperless` postgres dumps and the `mealie` SQLite dump at ringtail, and the + Grafana TeslaMate datasource at the ringtail DB. Closes the backup gap that + opened at cutover (the migrated live data was still being backed up from the + now-frozen minikube copies) and unblocks the wave-1 decommission. +- Migrated homepage dashboard from minikube (indri/arm64) to k3s (ringtail/amd64). + The container is now built via nix (`containers/homepage/default.nix`), adapted + from nixpkgs `homepage-dashboard` with the upstream Next.js cache patches and + wrapped with `dockerTools.buildLayeredImage`. 
Autodiscovery shifts: services on + minikube (ArgoCD, Immich, Kiwix, Mealie, Miniflux, Grafana, Prometheus, + Navidrome, Paperless, TeslaMate, Transmission) become explicit static entries + in `services.yaml`; ringtail services (Authentik, Frigate/NVR, Ntfy, Ollama) + auto-populate via Ingress annotations. +- Migrated CV (`cv.eblu.me`) and Docs (`docs.eblu.me`) from minikube Deployments to indri-native ansible roles. Caddy now serves the extracted release tarballs directly via a new `kind: static` service-block in the Caddy template — no daemon, no container — replacing the prior nginx-in-a-pod layer. Removes a network hop on every request and shrinks minikube's footprint. See [[cv-on-indri]] and [[docs-on-indri]]. Part of the broader minikube wind-down. +- Migrated devpi (PyPI mirror at `pypi.ops.eblu.me`) from a minikube StatefulSet to a launchd-managed service on indri. devpi-server now runs in a uv-managed venv with pinned `devpi-server` and `devpi-web` versions, listens on `127.0.0.1:3141`, and is fronted by Caddy. The minikube StatefulSet was crash-looping under memory pressure (and breaking the Python toolchain everywhere); the new layout removes a layer of dependency on cluster health for critical-path tooling. See [[devpi-on-indri]]. +- Move the entire Immich stack — server, machine-learning, valkey, + and the PostgreSQL+VectorChord cluster — off `minikube-indri` and + onto `k3s-ringtail`. Postgres data migrated zero-loss via CNPG + `pg_basebackup` (replica catch-up then promote); row counts on + `asset`, `user`, `album`, `smart_search`, `activity`, `asset_face` + verified equal between source and replica before cutover. The ML + pod now uses ringtail's RTX 4080 via the nvidia-device-plugin + (time-slicing bumped 2 → 4 to share with frigate + ollama). Caddy + routing at `photos.ops.eblu.me` is unchanged (still + `photos.tail8d86e.ts.net`, the device just lives on ringtail now). + Borgmatic backups continue against the same `immich-pg` tailnet + hostname. 
First concrete chain in the broader indri-k8s + decommission effort. +- Add local nix container build for `tailscale` (`containers/tailscale/default.nix`) so ringtail's tailscale-operator ProxyClass proxy pods pull from the forge mirror instead of `docker.io/tailscale/tailscale`. Pinned at v1.94.2 to match `service-versions.yaml`. Indri's tailscale-operator continues to use upstream during the k8s-to-ringtail migration. +- Address the 6 critical Prowler IaC findings against `argocd/manifests/`. Prowler's IaC provider hardcodes `self._mutelist = None` and delegates filtering to Trivy, but doesn't plumb `--ignorefile` through — so the documented "use Trivy filtering" path is actually broken. Added a shim around `trivy` in the Prowler image that injects `--ignorefile $TRIVY_IGNOREFILE` for `trivy fs` invocations when the env var points at a real file. The IaC cronjob now mounts `mutelist/trivyignore.yaml` (Trivy's per-path schema) and sets the env var, muting the `external-secrets` and `kube-state-metrics` Secret-access findings (KSV-0041, KSV-0114). Separately, `grafana-clusterrole` is tightened to remove `secrets` access entirely: the dashboard sidecar already only consumes ConfigMap-labeled dashboards, so its `RESOURCE` env var is now `configmap` instead of `both`. +- Pin ringtail's wired IP to `192.168.1.21` via NixOS scripted networking; NetworkManager no longer manages `enp5s0`. Removes DHCP lease renewal as a failure mode after a silent lease teardown took ringtail offline. Also explicitly enables `net.ipv4.ip_forward` (previously set implicitly by scripted-DHCP) so k3s pod networking and Tailscale routing continue to work with static networking. +- Ripped out the compensating-controls (CC) framework: deleted `compensating-controls.yaml`, the `review-compensating-controls` mise task, and the associated how-to / explanation docs. 
Prowler and Kingfisher continue to run weekly and produce reports; the Prowler mutelist YAML files remain in place but no longer carry `CC: <id>` prefixes — each entry just keeps a free-form `Description` of why the finding is muted. The CC review cadence proved to be more overhead than this single-operator homelab needed. +- Wire shower app for public exposure: fly nginx `shower.eblu.me` server + block as a guest-only surface — splash page, `/prizes/<id>/`, static + assets, media. Everything authenticated (`/admin/`, `/host/`, + `/accounts/`) returns 403 with a "tailnet only" pointer. Staff hit + `shower.ops.eblu.me` for the operator console + admin; the app's + v1.0.1 `DJANGO_PUBLIC_URL_BASE` setting makes QR codes generated on + the tailnet point back at the WAN host for guests. Plus a Caddy route + on indri, Pulumi Gandi CNAME, and a Grafana APM dashboard tracking + request rate, error rate, latency, bandwidth, and access logs. +- Mirror Valkey 8.1 locally as `registry.ops.eblu.me/blumeops/valkey`. Replaces direct pulls of `docker.io/valkey/valkey:8.1-alpine` for paperless and immich sidecars. Built via native Dagger pipeline on Alpine 3.22. Stateless swap — no data migration. Authentik's nix-built Redis remains separate. +- Add nix-built amd64 valkey for ringtail (`containers/valkey/default.nix`) so immich-ringtail can stop pulling the upstream multi-arch `docker.io/valkey/valkey` image. Existing `container.py` continues to build Alpine arm64 for paperless on indri. Both bump to valkey 8.1.7 (Alpine 3.22 8.1.7-r0 / nixpkgs 8.1.7). +- Upgrade Grafana Alloy v1.14.0 → v1.16.0 across all four service deployments + (alloy-k8s, alloy-ringtail, alloy-tracing-ringtail on k8s; alloy native on + indri). Pulls in stable database observability (v1.15) and the OTel Collector + v0.147.0 bump. Container build also migrated from Dockerfile to native Dagger + `container.py` per the build-container-image migration playbook. 
+- Upgraded Dagger from v0.20.1 to v0.20.6 (engine, CLI pin, and SDK regen) and migrated `runner-job-image` from a Debian-based Dockerfile to a native Dagger `container.py` on Alpine 3.23, reusing the shared `alpine_runtime` helper. +- Decommission the wave-1 services on minikube-indri now that paperless, + teslamate, and mealie run on ringtail with their data backed up. Removes the + minikube `paperless`/`teslamate`/`mealie` manifest dirs + ArgoCD app + definitions (pruning the parked Deployments, Services, and the redundant + minikube mealie/paperless PVCs), and drops the `paperless`/`teslamate` roles + from the minikube `blumeops-pg` cluster. The `paperless` and `teslamate` + databases are dropped from indri's blumeops-pg as the finalization step. + miniflux + authentik remain on the minikube cluster (later waves). +- Upgraded the k8s Forgejo runner to the v12.8 line, switched it from first-boot registration to declarative `server.connections` credentials from 1Password, and consolidated the supporting runner how-to documentation. +- Move paperless, teslamate, and mealie off `minikube-indri` onto + `k3s-ringtail`, shedding ~1.1 GiB of resident load from the + OOM-thrashing 8 GiB minikube node (the kernel OOM killer had been + killing `kube-apiserver`/`dockerd`/argocd, flapping every + minikube-hosted service at once). paperless + teslamate databases + move into a fresh CNPG `blumeops-pg` cluster on ringtail via a cold + `pg_dump`/`pg_restore` from the quiesced source — row counts verified + equal before any routing flip; source DBs dropped only after the + ringtail side serves traffic. mealie's SQLite PVC is copied as-is. + paperless media stays on sifaka NFS. Downtime-tolerant cold cutover + (no streaming replication); rollback is repoint-and-scale-up with the + source untouched. Second chain in the indri-k8s decommission after + [[migrate-immich-to-ringtail]]. 
+- Recurring maintenance batch: + + - Ringtail flake inputs refreshed (`disko`, `home-manager`, `nixpkgs`). + - Tooling deps bumped: prek hooks (trufflehog v3.95.3, kingfisher v1.101.0, ruff v0.15.14, `ansible-core` 2.21.0); fly proxy base images (nginx 1.30.1-alpine, alloy v1.16.1); `typer==0.26.2` in mise tasks. +- Updated `nixos/ringtail/flake.lock` (weekly cadence): `disko`, `home-manager`, and `nixpkgs` inputs refreshed. `nixpkgs-services` skipped per overlay convention. +- Reviewed `mealie` service version freshness; upstream is 5 minor versions ahead (v3.17.0 vs deployed v3.12.0). Marked reviewed; upgrade deferred. +- Deploy shower v1.1.2 — bump container build to new app release. +- Upgrade unpoller v2.34.0 → v3.2.0 and migrate container build from Dockerfile to native Dagger (container.py). v3.0.0 carries breaking UniFi API changes; v3.2.0 introduces a 60s background poll (cached scrapes) by default — set `interval = 0` in `up.conf` to restore on-demand polling. +- Monthly tooling dependency refresh: prek hooks (trufflehog, kingfisher, ruff, shfmt, prettier, actionlint, ansible-lint), fly proxy base images (nginx 1.30.0, tailscale v1.94.2, alloy v1.16.0), normalize pyyaml lower bound in mise-tasks. +- Add GE-Proton (`pkgs.proton-ge-bin`) to `programs.steam.extraCompatPackages` + on ringtail. Subnautica 2 hangs at Mercuna plugin init under Proton + Experimental + DXVK D3D12; GE-Proton is available as a Steam per-game + compatibility option to work around it. +- Add `sn2-prelaunch` Steam launch wrapper on ringtail that removes + Subnautica 2's stale `Saved/running.dat` and `Saved/beforelobby.dat` + lockfiles before each launch. SN2 pops up an invisible (0×0-sized) + Error dialog when it detects an unclean exit, blocking GameThread + forever; this is observable only as a black screen with a spinning + loader. Use via Steam launch option: `sn2-prelaunch %command%`. 
+- Add local nix container build for `frigate-notify` (`containers/frigate-notify/default.nix`) so the Frigate→ntfy bridge is rebuilt on ringtail from the forge mirror instead of pulled from `ghcr.io/0x2142/frigate-notify`. +- Add resource limits to all ArgoCD pods to prevent unbounded resource consumption during node-wide pressure events. +- Black-hole the `/mirrors/*` repositories at the Fly proxy edge (`return 403` → `forge.ops.eblu.me`). A surprise $29.60 Fly bill traced to ~1.24 TB/30d of egress on `forge.eblu.me`, 99.95% of all proxy egress — of which ~71% was AI scrapers (Meta `meta-externalagent`, OpenAI `GPTBot`, Amazonbot) crawling the near-infinite git-history URL space of the public mirror repos and timing out Forgejo in the process. Mirrors exist for supply-chain control and are consumed over the tailnet, so their public web UI had no legitimate audience. `robots.txt` already disallowed `/mirrors/`, but the offending agents ignore it. Tier-2 mitigations (user-agent denylist, Anubis proof-of-work gateway) are documented in `docs/explanation/ai-scraper-mitigation.md`. +- Bump paperless and immich kustomizations to the main-SHA-built valkey tag (`v8.1.6-r0-fabca04`). Routine post-merge follow-up to keep production manifests pointing at images built from a commit on main. +- Bump shower container to v1.1.1 (probe FOD hash). +- Bumped shower app to v1.1.3 (wheel/sdist + FOD hashes probed on ringtail). +- Cap systemd-coredump on ringtail (ProcessSizeMax/ExternalSizeMax 1G, MaxUse 2G) so multi-GB Wine/Proton game crash dumps no longer thrash the disk and lock up the desktop. +- Deploy shower v1.1.1 to ringtail (kustomize newTag bump). +- Deployed shower v1.1.3 to ringtail (image built and pushed from ringtail; runner bypassed due to indri overload). 
+- Fix three follow-ups from the wave-1 decommission: grant the local + break-glass `admin` account ArgoCD admin rights (`g, admin, role:admin` — + previously only the Authentik `admins` group had access, so admin was + locked out whenever its token expired), and repoint the alloy blackbox + probe for teslamate from the deleted minikube service to + `https://tesla.ops.eblu.me/` (through Caddy over Tailscale). The orphaned + paperless/teslamate roles + ExternalSecrets left on the minikube + blumeops-pg are also cleaned up. +- Moved the Immich blackbox health probe from indri's alloy to ringtail's alloy. After the immich migration to ringtail, the probe still targeted `immich-server.immich.svc.cluster.local` on indri's cluster where the service no longer exists, causing a persistent `ServiceProbeFailure` alert. +- Pin shower v1.1.1 FOD outputHash (probed locally on ringtail). +- Rebuild Prowler container against main HEAD (v5.23.0-495e45d) after merging the IaC mutelist Dockerfile changes. +- Rebuild and retag alloy v1.16.0 container images from the main-branch SHA + following the squash-merge of #345, per the build-container-image + squash-merge convention. Both images (`registry.ops.eblu.me/blumeops/alloy`) + now reference `9564435` rather than the branch SHA `26a3ab5`, restoring + source traceability after branch cleanup. +- Rebuild shower from the post-merge commit on main so the container's + SHA tag points at a commit that will still exist after the 30-day + branch-cleanup window. Functionally identical to the branch-tag image + already deployed, just preserves source traceability per + [[build-container-image#Squash-merge and container tags]]. +- Rebuild unpoller container from squashed main commit so the image SHA tag matches a commit in main's history (was tagged with the pre-squash branch SHA). 
+- Rebuild valkey container from squashed main commit (both arm64 dagger and amd64 nix variants), and update paperless + immich-ringtail kustomizations to the main-SHA tags `v8.1.7-ecded30` and `v8.1.7-ecded30-nix`. +- Retired the `blumeops-tasks` mise task (Todoist API) in favor of `heph list --project Blumeops --json` from the self-hosted [hephaestus](https://github.com/eblume/hephaestus) system. Updated docs to point task discovery and rotation reminders at heph, and noted that the `~/code/personal/zk` zettelkasten is migrating into heph docs. +- Switch the Fly proxy deploy strategy from `bluegreen` to `immediate` in `fly/fly.toml`. With a single proxy machine, bluegreen offers little benefit — the green machine routinely failed to reach "started" inside Fly's default 5-minute deploy timeout (the cold-start sequence of `tailscaled` → `tailscale up` → wait-for-MagicDNS → nginx startup eats most of the budget), and the failed deploys would roll back. `immediate` replaces the machine in place with a brief downtime (~5–10s) but actually completes. +- Switch the ringtail provisioning playbook's blumeops clone URL from `forge.eblu.me` (public, via Fly proxy) to `forge.ops.eblu.me` (tailnet, direct via Caddy on indri). Ringtail is always on the tailnet, so the WAN round-trip is pure overhead — it also made `provision-ringtail` brittle whenever the Fly proxy was slow or down. +- Switched Grafana's deployment strategy from `RollingUpdate` to `Recreate`. With an RWO PVC holding the SQLite database and Bleve search index, `RollingUpdate` reliably crashloops the new pod on the index lock until rollout timeout. `Recreate` terminates the old pod first so the new one acquires the lock cleanly. +- Update `tailscale-operator-ringtail` ProxyClass to reference the `0108b68` main-SHA build of the tailscale container. Routine post-merge cleanup so the deployed image traces to a commit that survives PR branch cleanup. 
+- Update the ringtail NixOS flake lockfile (`nixos/ringtail/flake.lock`): bump + `nixpkgs` (b77b3de → 25f5383) and `disko` (5ba0c95 → 115e521) to latest. + `nixpkgs-services` was intentionally left pinned (skipped by the + `flake-update` pipeline). Routine recurring maintenance per [[manage-lockfile]]. +- Upgrade native macOS Alloy on indri to v1.16.0. Built on gilbert with Go + 1.26.2 + CGO (required for the macOS native DNS resolver, which Tailscale + MagicDNS depends on), scp'd to `~/.local/bin/alloy` on indri, codesigned, + and the LaunchAgent reloaded. Completes the v1.16.0 fleet upgrade started + in #345 — all four Alloy services (alloy-k8s, alloy-ringtail, + alloy-tracing-ringtail, alloy ansible) now run v1.16.0. +- Upgraded zot on indri from v2.1.15 to v2.1.16 (security fixes: TLS verification on metrics client, CORS Allow-Credentials suppression on wildcard origins, manifest/API-key body size limits). + +### Documentation + +- Reviewed `replicating-blumeops` tutorial: fixed "BluemeOps" typos (also in `contributing.md`) and added `last-reviewed` frontmatter. +- Reviewed [[indri]] reference card: added `devpi`, `cv`, and `docs` to the native-services list; widened the k8s note to reflect the growing set of apps now on ringtail and the planned indri-minikube decommission; added CPU/RAM specs. +- New how-to: rotate-fly-deploy-token. Documents the 75-day rotation cadence, why we use `org`-scoped tokens (silences the cosmetic metrics-token warning on `fly status` with marginal blast-radius cost given the single-app personal org), and the procedure for rotation + Forgejo Actions secret sync. +- Add `docs/explanation/ai-scraper-mitigation.md` — the egress-cost / AI-crawler threat model for the public Fly proxy, the tiered mitigation plan (Tier 1: mirror black-hole, shipped; Tier 2: user-agent denylist + Anubis; Tier 3: Cloudflare, rejected on principle), and the data behind it. 
+- Fix manage-forgejo-mirrors verify step — sync button is on the repo settings page ("Synchronize now"), not the main repo page. +- Fixed the `op item edit` invocation in the [[zot]] API-key rotation procedure: the previous `pbpaste | op item edit ... "field[password]=-"` stdin syntax is rejected by op 2.34 as "invalid JSON" (recent op versions treat piped input as a full JSON template, not a single field value). Procedure now reads the clipboard into a local fish variable and passes it as an inline assignment. +- Fixed the export-filename step in [[run-1password-backup]]: 1Password's desktop app names the export `1PasswordExport-<account>-<timestamp>.1pux` automatically rather than letting you save to a fixed name, so the procedure now points the task at that glob instead of pretending the default name is `1Password-export.1pux`. +- Refresh the contributing tutorial: add `last-reviewed`, include the `.ai.md` changelog fragment type, and clarify that `prek` is pinned via `mise`. +- Review and refresh the Navidrome reference card: add `last-reviewed`, correct the scanner env var name, document the current image/version, and record routing and runtime details from the manifests. +- Review and refresh the Ollama reference card: add `last-reviewed`, bump the documented image tag to 0.20.4, and add the two `qwen3.5` models now declared in `models.txt`. +- Reviewed [[1password]] reference card: added the `blumeops` vs `Personal` vault split, noted that `onepassword-connect` runs on both indri and ringtail (not just one cluster), and pulled the `op read` vs `op item get --fields` guidance up from agent memory into the card. +- Reviewed `index.md`; added ringtail to the infrastructure overview and stamped `last-reviewed`. +- Reviewed transmission card: corrected storage layout (`/config/` is emptyDir, watch dir disabled) and noted the Prometheus exporter sidecar. 
+- rotate-fly-deploy-token: combine mint+store into one command with both fish and bash forms; document the `op item edit` "Password item requires ps value" validator gotcha and the placeholder-password workaround. + +### AI Assistance + +- Adopt `AGENTS.md` as the canonical agent instruction file, keep `CLAUDE.md` as a compatibility shim, and update docs to reference the neutral file and the correct agent-change-process path. +- CLAUDE.md now imports AGENTS.md via `@AGENTS.md` instead of telling agents to go read it. Claude Code only auto-loads CLAUDE.md, so the prose shim was easy to skip; the import inlines AGENTS.md into the session prompt unconditionally. + +### Miscellaneous + +- Removed the dead minikube manifests, container builds, and tooling shims left behind after the cv + docs migration to indri-native (#342). Deletes `argocd/{apps,manifests}/{cv,docs}/`, `containers/{cv,quartz}/`, and the `quartz`→`docs` mapping in `mise-tasks/container-version-check`. Bumps `docs.current-version` to `v1.16.0` (the blumeops release tag) now that the legacy nginx-base version pin is gone. +- Rebuild shower v1.1.0 container from main HEAD (`3c7967e`) and bump the + kustomization tag to `v1.1.0-3c7967e-nix`. The PR was squash-merged, so + the branch commit `444ff91` baked into the prior tag isn't reachable + from main's history. The new tag points at a commit that exists on + main; image content is byte-identical because the FOD output is content + addressed and the inputs didn't change. +- Rebuild shower v1.1.2 from main HEAD (a33fa47) and retag — PR #358 was squash-merged so the branch SHA baked into the prior image tag isn't reachable from main. FOD is content-addressed, so image bytes are identical; only provenance changes. +- Remove the duplicate Homepage tiles for Mealie, Paperless, Immich, and + TeslaMate. 
Homepage runs on ringtail and autodiscovers ringtail Ingresses via + `gethomepage.dev/*` annotations; once these services migrated to ringtail they + were discovered automatically, making their leftover static `services.yaml` + entries (needed only while they lived on minikube) redundant. +- Removed the now-unused `containers/devpi/` Dagger build artifact. Devpi runs natively on indri via uv venv; the container image is no longer referenced anywhere. Doc examples in `docs/reference/tools/dagger.md` updated to use `miniflux` as the example container name. +- `container-build-and-release` now prints the specific `mise run runner-logs <run>` command after dispatching, polling the Forgejo API to resolve the run number for the commit it just triggered. +- `mise run runner-logs -j <job>` now reports a clear error when the log file doesn't exist on indri (e.g. a runner crash that left `action_task.log_in_storage = 0`). Previously it printed only the header and exited 0, because `zstdcat` exits 0 with a "can't stat … -- ignored" stderr message and ssh+fish on indri swallows the remote exit code. + + ## [v1.16.0] - 2026-04-18 ### Infrastructure diff --git a/CLAUDE.md b/CLAUDE.md index d825c0f..43c994c 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,7 +1 @@ -# CLAUDE.md - -Claude Code compatibility shim. - -The canonical agent instructions for this repository now live in [`AGENTS.md`](AGENTS.md). - -If a tool specifically looks for `CLAUDE.md`, read `AGENTS.md` and follow that file as the source of truth. 
+@AGENTS.md diff --git a/ansible/playbooks/indri.yml b/ansible/playbooks/indri.yml index ce6a930..1e33bb1 100644 --- a/ansible/playbooks/indri.yml +++ b/ansible/playbooks/indri.yml @@ -212,6 +212,23 @@ no_log: true tags: [forgejo_metrics] + # Devpi root password (PyPI mirror admin) + - name: Fetch devpi root password + ansible.builtin.command: + cmd: op read "op://vg6xf6vvfmoh5hqjjhlhbeoaie/kyhzfifryqnuk7jeyibmmjvxxm/add more/root password" + delegate_to: localhost + register: _devpi_root_password + changed_when: false + no_log: true + check_mode: false + tags: [devpi] + + - name: Set devpi root password fact + ansible.builtin.set_fact: + devpi_root_password: "{{ _devpi_root_password.stdout }}" + no_log: true + tags: [devpi] + roles: - role: alloy tags: alloy @@ -227,6 +244,8 @@ tags: zot - role: zot_metrics tags: zot_metrics + - role: devpi + tags: devpi - role: minikube tags: minikube - role: minikube_metrics @@ -237,5 +256,11 @@ tags: jellyfin_metrics - role: forgejo_metrics tags: forgejo_metrics + - role: cv + tags: cv + - role: docs + tags: docs + - role: heph + tags: heph - role: caddy tags: caddy diff --git a/ansible/playbooks/ringtail.yml b/ansible/playbooks/ringtail.yml index ee5604b..b05d67a 100644 --- a/ansible/playbooks/ringtail.yml +++ b/ansible/playbooks/ringtail.yml @@ -57,7 +57,7 @@ tasks: - name: Ensure blumeops repo is present ansible.builtin.git: - repo: "https://forge.eblu.me/eblume/blumeops.git" + repo: "https://forge.ops.eblu.me/eblume/blumeops.git" dest: /etc/blumeops version: "{{ ringtail_commit | default('main') }}" force: true diff --git a/ansible/roles/borgmatic/defaults/main.yml b/ansible/roles/borgmatic/defaults/main.yml index 25d0149..a743161 100644 --- a/ansible/roles/borgmatic/defaults/main.yml +++ b/ansible/roles/borgmatic/defaults/main.yml @@ -27,6 +27,9 @@ borgmatic_source_directories: - /Users/erichblume/.config/borgmatic - /Users/erichblume/Documents - /Users/erichblume/.local/share/borgmatic/k8s-dumps + # Shower app prize-photo 
uploads (sifaka SMB mount). Mounted manually + # on indri via Finder — see docs/how-to/operations/shower-app.md. + - /Volumes/shower # Backup repositories borgmatic_repositories: @@ -53,7 +56,17 @@ borgmatic_k8s_sqlite_dumps: namespace: mealie label_selector: app=mealie db_path: /app/data/mealie.db - context: minikube + # migrated to ringtail (wave-1); ssh to ringtail and run k3s kubectl + # there, same as shower below. + target: ssh:eblume@ringtail + - name: shower + namespace: shower + label_selector: app=shower + db_path: /app/data/db.sqlite3 + # ssh to ringtail and run k3s kubectl there — avoids needing a + # ringtail kubeconfig on indri. k3s.yaml on ringtail is + # world-readable (mode 644), so no sudo required. + target: ssh:eblume@ringtail # Exclude patterns borgmatic_exclude_patterns: [] @@ -90,17 +103,18 @@ borgmatic_postgresql_databases: hostname: pg.ops.eblu.me port: 5432 username: borgmatic - - name: teslamate - hostname: pg.ops.eblu.me - port: 5432 - username: borgmatic - name: authentik hostname: pg.ops.eblu.me port: 5432 username: borgmatic + # migrated to ringtail blumeops-pg (wave-1); port 5434 = Caddy L4 route + - name: teslamate + hostname: pg.ops.eblu.me + port: 5434 + username: borgmatic - name: paperless hostname: pg.ops.eblu.me - port: 5432 + port: 5434 username: borgmatic # immich-pg cluster (VectorChord) via Caddy L4 on port 5433 - name: immich diff --git a/ansible/roles/borgmatic/tasks/main.yml b/ansible/roles/borgmatic/tasks/main.yml index eacefa5..36d3bb6 100644 --- a/ansible/roles/borgmatic/tasks/main.yml +++ b/ansible/roles/borgmatic/tasks/main.yml @@ -19,8 +19,10 @@ ansible.builtin.copy: content: | # Managed by ansible (borgmatic role) - k8s PostgreSQL backup credentials + # 5432 = minikube blumeops-pg, 5433 = immich-pg, 5434 = ringtail blumeops-pg pg.ops.eblu.me:5432:*:borgmatic:{{ borgmatic_db_password }} pg.ops.eblu.me:5433:*:borgmatic:{{ borgmatic_db_password }} + pg.ops.eblu.me:5434:*:borgmatic:{{ borgmatic_db_password }} dest: 
~/.pgpass mode: '0600' no_log: true @@ -49,6 +51,20 @@ mode: '0700' when: borgmatic_k8s_sqlite_dumps | length > 0 +- name: Ensure ~/bin exists + ansible.builtin.file: + path: "{{ ansible_env.HOME }}/bin" + state: directory + mode: '0755' + when: borgmatic_k8s_sqlite_dumps | length > 0 + +- name: Deploy k8s SQLite dump helper script + ansible.builtin.template: + src: k8s-sqlite-dump.sh.j2 + dest: "{{ ansible_env.HOME }}/bin/borgmatic-k8s-sqlite-dump" + mode: '0755' + when: borgmatic_k8s_sqlite_dumps | length > 0 + - name: Deploy borgmatic configuration ansible.builtin.template: src: config.yaml.j2 diff --git a/ansible/roles/borgmatic/templates/config.yaml.j2 b/ansible/roles/borgmatic/templates/config.yaml.j2 index 85804b7..0893dbc 100644 --- a/ansible/roles/borgmatic/templates/config.yaml.j2 +++ b/ansible/roles/borgmatic/templates/config.yaml.j2 @@ -32,12 +32,20 @@ exclude_patterns: encryption_passcommand: {{ borgmatic_encryption_passcommand }} {% if borgmatic_k8s_sqlite_dumps %} -# Pre-backup: dump SQLite databases from k8s pods -# Uses sqlite3 .backup for a safe, consistent copy (no corruption from concurrent writes) +# Pre-backup: dump SQLite databases from k8s pods. +# Uses sqlite3.backup() for a safe, consistent copy. +# +# Quoting/escaping is delegated to ~/bin/borgmatic-k8s-sqlite-dump +# (deployed by the borgmatic ansible role). Each entry's `target` +# is either: +# - local:<context> -> local kubectl with --context=<context> (e.g. minikube) +# - ssh:<host> -> ssh + k3s kubectl on the cluster host, +# used for ringtail since indri's kubeconfig +# deliberately doesn't carry that context. 
before_backup: - mkdir -p {{ borgmatic_k8s_dump_dir }} {% for db in borgmatic_k8s_sqlite_dumps %} - - /opt/homebrew/bin/kubectl --context={{ db.context }} exec -n {{ db.namespace }} deploy/{{ db.name }} -- python3 -c "import sqlite3; sqlite3.connect('{{ db.db_path }}').backup(sqlite3.connect('/tmp/{{ db.name }}-backup.db'))" && /opt/homebrew/bin/kubectl --context={{ db.context }} cp {{ db.namespace }}/$(/opt/homebrew/bin/kubectl --context={{ db.context }} get pod -n {{ db.namespace }} -l {{ db.label_selector }} -o jsonpath='{.items[0].metadata.name}'):/tmp/{{ db.name }}-backup.db {{ borgmatic_k8s_dump_dir }}/{{ db.name }}.db + - {{ ansible_env.HOME }}/bin/borgmatic-k8s-sqlite-dump {{ db.target }} {{ db.namespace }} {{ db.label_selector }} {{ db.db_path }} {{ db.name }} {{ borgmatic_k8s_dump_dir }}/{{ db.name }}.db {% endfor %} {% endif %} diff --git a/ansible/roles/borgmatic/templates/k8s-sqlite-dump.sh.j2 b/ansible/roles/borgmatic/templates/k8s-sqlite-dump.sh.j2 new file mode 100644 index 0000000..9cc24da --- /dev/null +++ b/ansible/roles/borgmatic/templates/k8s-sqlite-dump.sh.j2 @@ -0,0 +1,73 @@ +#!/usr/bin/env bash +# {{ ansible_managed }} +# +# Helper script invoked by borgmatic's before_backup hook to capture a +# k8s pod's SQLite database. Keeps the borgmatic config readable by +# pulling all the quoting out of YAML. +# +# Usage: +# borgmatic-k8s-sqlite-dump <target> <namespace> <selector> <db_path> <name> \ +# <dump_target> +# +# <target> is one of: +# local:<context> - run local kubectl with --context=<context> +# ssh:<host> - ssh to host and run k3s kubectl there +# (no indri-side kubeconfig needed) +# +# <namespace> - k8s namespace of the pod +# <selector> - label selector to find the pod (e.g. 
app=shower) +# <db_path> - absolute path inside the pod to the SQLite DB +# <name> - short name used for temp filenames +# <dump_target> - file on this host to receive the dump +set -euo pipefail + +target=${1:?missing target} +namespace=${2:?missing namespace} +selector=${3:?missing selector} +db_path=${4:?missing db path} +name=${5:?missing name} +dump_target=${6:?missing dump target} + +# Stage the backup next to the source DB (a guaranteed-writable volume); +# minimal nix images (e.g. mealie) have no /tmp. +pod_tmp="$(dirname "$db_path")/.borgmatic-backup-${name}.db" + +python_backup='import sqlite3; sqlite3.connect("'"$db_path"'").backup(sqlite3.connect("'"$pod_tmp"'"))' + +mode=${target%%:*} +ref=${target#*:} + +case "$mode" in + local) + # Pulls dump bytes out via "kubectl exec -- cat" rather than + # "kubectl cp", which would otherwise need tar inside the pod + # (nix-built images like shower don't bundle tar). + context=$ref + kubectl="/opt/homebrew/bin/kubectl --context=$context -n $namespace" + pod=$($kubectl get pod -l "$selector" \ + -o jsonpath='{.items[0].metadata.name}') + $kubectl exec "$pod" -- python3 -c "$python_backup" + $kubectl exec "$pod" -- cat "$pod_tmp" > "$dump_target" + $kubectl exec "$pod" -- rm -f "$pod_tmp" + ;; + ssh) + host=$ref + # Force bash on the remote (user's login shell on ringtail is + # fish). Pipe the script via stdin to dodge nested quoting. + # The dump bytes come back over the ssh stdout stream — no + # intermediate scp, no tar requirement in the pod. 
+ ssh "$host" bash <<EOF > "$dump_target" +set -euo pipefail +export KUBECONFIG=/etc/rancher/k3s/k3s.yaml +pod=\$(k3s kubectl -n "$namespace" get pod -l "$selector" -o jsonpath='{.items[0].metadata.name}') +k3s kubectl -n "$namespace" exec "\$pod" -- python3 -c '$python_backup' 1>&2 +k3s kubectl -n "$namespace" exec "\$pod" -- cat "$pod_tmp" +k3s kubectl -n "$namespace" exec "\$pod" -- rm -f "$pod_tmp" 1>&2 +EOF + ;; + *) + echo "borgmatic-k8s-sqlite-dump: unknown target mode: $mode" >&2 + echo " expected local:<context> or ssh:<host>" >&2 + exit 1 + ;; +esac diff --git a/ansible/roles/caddy/defaults/main.yml b/ansible/roles/caddy/defaults/main.yml index ebb210b..e6d7385 100644 --- a/ansible/roles/caddy/defaults/main.yml +++ b/ansible/roles/caddy/defaults/main.yml @@ -51,7 +51,10 @@ caddy_services: backend: "https://feed.tail8d86e.ts.net" - name: devpi host: "pypi.{{ caddy_domain }}" - backend: "https://pypi.tail8d86e.ts.net" + backend: "http://localhost:3141" + - name: heph + host: "heph.{{ caddy_domain }}" + backend: "http://localhost:8787" # hephaestus hub (server mode) + PWA shell - name: kiwix host: "kiwix.{{ caddy_domain }}" backend: "https://kiwix.tail8d86e.ts.net" @@ -72,10 +75,16 @@ caddy_services: backend: "https://go.tail8d86e.ts.net" - name: docs host: "docs.{{ caddy_domain }}" - backend: "https://docs.tail8d86e.ts.net" + kind: static + root: "{{ docs_content_dir }}" + try_html: true # Quartz: path → path/ → path.html → 404.html - name: cv host: "cv.{{ caddy_domain }}" - backend: "https://cv.tail8d86e.ts.net" + kind: static + root: "{{ cv_content_dir }}" + download_paths: + - path: /resume.pdf + filename: erich-blume-resume.pdf - name: nvr host: "nvr.{{ caddy_domain }}" backend: "https://nvr.tail8d86e.ts.net" @@ -95,6 +104,9 @@ caddy_services: - name: paperless host: "paperless.{{ caddy_domain }}" backend: "https://paperless.tail8d86e.ts.net" + - name: shower + host: "shower.{{ caddy_domain }}" + backend: "https://shower.tail8d86e.ts.net" - name: 
sifaka host: "{{ 
caddy_domain }}" backend: "http://sifaka:5000" @@ -108,6 +120,8 @@ caddy_tcp_services: backend: "pg.tail8d86e.ts.net:5432" # PostgreSQL (blumeops-pg) - port: 5433 backend: "immich-pg.tail8d86e.ts.net:5432" # PostgreSQL (immich-pg) + - port: 5434 + backend: "blumeops-pg-ringtail.tail8d86e.ts.net:5432" # PostgreSQL (blumeops-pg on ringtail) - port: "{{ sifaka_node_exporter_port }}" backend: "sifaka:{{ sifaka_node_exporter_port }}" # Sifaka node_exporter - port: "{{ sifaka_smartctl_exporter_port }}" diff --git a/ansible/roles/caddy/templates/Caddyfile.j2 b/ansible/roles/caddy/templates/Caddyfile.j2 index 4f103f1..f6b5f64 100644 --- a/ansible/roles/caddy/templates/Caddyfile.j2 +++ b/ansible/roles/caddy/templates/Caddyfile.j2 @@ -31,6 +31,25 @@ {% for service in caddy_services %} @{{ service.name }} host {{ service.host }} handle @{{ service.name }} { +{% if service.kind | default('proxy') == 'static' %} + root * {{ service.root }} + encode gzip + # Long-cache fingerprinted assets; everything else stays default. + @{{ service.name }}_assets path_regexp \.(css|js|png|jpg|jpeg|gif|ico|svg|woff|woff2)$ + header @{{ service.name }}_assets Cache-Control "public, max-age=31536000, immutable" +{% for dl in service.download_paths | default([]) %} + @{{ service.name }}_dl{{ loop.index }} path {{ dl.path }} + header @{{ service.name }}_dl{{ loop.index }} Content-Disposition `attachment; filename="{{ dl.filename }}"` +{% endfor %} +{% if service.try_html | default(false) %} + # Quartz clean URLs: path → path/ → path.html → /404.html (200). + # Caddy's handle_errors is a top-level directive and can't live in + # this nested handle, so the 404 page rides as the final try_files + # candidate (served with 200 — acceptable for a human-facing 404). + try_files {path} {path}/ {path}.html /404.html +{% endif %} + file_server +{% else %} {% if service.cache_policy | default('') == 'spa' %} # SPA cache policy: hashed static assets are immutable, HTML must revalidate. 
# Prevents stale HTML from referencing chunk hashes that no longer exist. @@ -47,6 +66,7 @@ } {% else %} reverse_proxy {{ service.backend }} +{% endif %} {% endif %} } diff --git a/ansible/roles/cv/defaults/main.yml b/ansible/roles/cv/defaults/main.yml new file mode 100644 index 0000000..a18cc82 --- /dev/null +++ b/ansible/roles/cv/defaults/main.yml @@ -0,0 +1,10 @@ +--- +# CV / resume static site (native, replaces minikube Deployment) +# Caddy serves cv_content_dir directly via the static-kind service block. + +cv_version: "v1.0.3" +cv_release_url: "https://forge.ops.eblu.me/api/packages/eblume/generic/cv/{{ cv_version }}/cv-{{ cv_version }}.tar.gz" + +cv_home: /Users/erichblume/blumeops/cv +cv_content_dir: "{{ cv_home }}/content" +cv_version_sentinel: "{{ cv_home }}/.installed-version" diff --git a/ansible/roles/cv/tasks/main.yml b/ansible/roles/cv/tasks/main.yml new file mode 100644 index 0000000..c254325 --- /dev/null +++ b/ansible/roles/cv/tasks/main.yml @@ -0,0 +1,57 @@ +--- +# cv role — download and extract the CV release tarball into cv_content_dir. +# Caddy serves the directory directly; there is no daemon to manage. +# +# Idempotency: a sentinel file records the installed cv_version. The +# download/extract steps only run when the sentinel doesn't match cv_version. +# +# We use curl rather than ansible.builtin.get_url because the forge generic- +# packages endpoint returns 405 on HEAD requests, which get_url issues before +# downloading. 
+ +- name: Ensure cv home exists + ansible.builtin.file: + path: "{{ cv_home }}" + state: directory + mode: '0755' + +- name: Read installed cv version sentinel + ansible.builtin.slurp: + src: "{{ cv_version_sentinel }}" + register: cv_installed_raw + failed_when: false + changed_when: false + +- name: Set installed cv version fact + ansible.builtin.set_fact: + cv_installed_version: >- + {{ (cv_installed_raw.content | b64decode).strip() + if (cv_installed_raw.content is defined) else '' }} + +- name: Recreate cv content dir + ansible.builtin.file: + path: "{{ cv_content_dir }}" + state: "{{ item }}" + mode: '0755' + loop: + - absent + - directory + when: cv_installed_version != cv_version + +- name: Download and extract cv release tarball + ansible.builtin.shell: + cmd: >- + set -euo pipefail; + curl -fsSL {{ cv_release_url | quote }} -o {{ cv_home }}/cv.tar.gz && + tar -xzf {{ cv_home }}/cv.tar.gz -C {{ cv_content_dir }} && + rm -f {{ cv_home }}/cv.tar.gz + executable: /bin/bash + when: cv_installed_version != cv_version + changed_when: true + +- name: Write cv version sentinel + ansible.builtin.copy: + content: "{{ cv_version }}\n" + dest: "{{ cv_version_sentinel }}" + mode: '0644' + when: cv_installed_version != cv_version diff --git a/ansible/roles/devpi/defaults/main.yml b/ansible/roles/devpi/defaults/main.yml new file mode 100644 index 0000000..6d52b9b --- /dev/null +++ b/ansible/roles/devpi/defaults/main.yml @@ -0,0 +1,21 @@ +--- +# devpi PyPI caching mirror (native launchd, replaces minikube StatefulSet) + +devpi_home: /Users/erichblume/devpi +devpi_venv: "{{ devpi_home }}/venv" +devpi_server_dir: "{{ devpi_home }}/server-dir" +devpi_binary: "{{ devpi_venv }}/bin/devpi-server" +devpi_init_binary: "{{ devpi_venv }}/bin/devpi-init" + +devpi_python_version: "3.12" +devpi_server_version: "6.19.3" +devpi_web_version: "5.0.2" + +devpi_host: 127.0.0.1 +devpi_port: 3141 +devpi_outside_url: "https://pypi.ops.eblu.me" + +devpi_log_dir: /Users/erichblume/Library/Logs 
+ +# uv binary on indri — mise shim so version bumps via `mise upgrade uv` flow through transparently +devpi_uv_binary: /Users/erichblume/.local/share/mise/shims/uv diff --git a/ansible/roles/devpi/handlers/main.yml b/ansible/roles/devpi/handlers/main.yml new file mode 100644 index 0000000..2765850 --- /dev/null +++ b/ansible/roles/devpi/handlers/main.yml @@ -0,0 +1,6 @@ +--- +- name: Restart devpi + ansible.builtin.shell: | + launchctl unload ~/Library/LaunchAgents/mcquack.eblume.devpi.plist 2>/dev/null || true + launchctl load ~/Library/LaunchAgents/mcquack.eblume.devpi.plist + changed_when: true diff --git a/ansible/roles/devpi/tasks/main.yml b/ansible/roles/devpi/tasks/main.yml new file mode 100644 index 0000000..985ca46 --- /dev/null +++ b/ansible/roles/devpi/tasks/main.yml @@ -0,0 +1,71 @@ +--- +# devpi role — devpi-server in a uv-managed venv, run via LaunchAgent. +# Replaces the prior minikube StatefulSet; see [[devpi-on-indri]]. +# +# The root password is fetched in the indri.yml playbook pre_tasks and +# exposed as `devpi_root_password`. + +- name: Ensure devpi home exists + ansible.builtin.file: + path: "{{ devpi_home }}" + state: directory + mode: '0755' + +- name: Ensure devpi server-dir exists + ansible.builtin.file: + path: "{{ devpi_server_dir }}" + state: directory + mode: '0700' + +- name: Create devpi venv if missing + ansible.builtin.command: + cmd: "{{ devpi_uv_binary }} venv --python {{ devpi_python_version }} {{ devpi_venv }}" + creates: "{{ devpi_venv }}/bin/python" + +- name: Install devpi-server and devpi-web into venv + # Always bootstrap from upstream PyPI — devpi is the index it would otherwise resolve through, + # and that's a circular dependency (devpi cannot install itself from itself). 
+ ansible.builtin.command: + cmd: >- + {{ devpi_uv_binary }} pip install + --python {{ devpi_venv }}/bin/python + --index-url https://pypi.org/simple/ + devpi-server=={{ devpi_server_version }} + devpi-web=={{ devpi_web_version }} + register: devpi_pip_install + changed_when: "'Installed' in (devpi_pip_install.stdout ~ devpi_pip_install.stderr) or 'Uninstalled' in (devpi_pip_install.stdout ~ devpi_pip_install.stderr)" + notify: Restart devpi + +- name: Check if devpi server-dir is initialized + ansible.builtin.stat: + path: "{{ devpi_server_dir }}/.serverversion" + register: devpi_serverversion + +- name: Initialize devpi server-dir + ansible.builtin.command: + cmd: >- + {{ devpi_init_binary }} + --serverdir {{ devpi_server_dir }} + --root-passwd {{ devpi_root_password | quote }} + when: not devpi_serverversion.stat.exists + changed_when: true + no_log: true + +- name: Deploy devpi LaunchAgent plist + ansible.builtin.template: + src: devpi.plist.j2 + dest: ~/Library/LaunchAgents/mcquack.eblume.devpi.plist + mode: '0644' + notify: Restart devpi + +- name: Check if devpi LaunchAgent is loaded + ansible.builtin.command: launchctl list mcquack.eblume.devpi + register: devpi_launchctl_check + changed_when: false + failed_when: false + +- name: Load devpi LaunchAgent if not loaded + ansible.builtin.command: launchctl load ~/Library/LaunchAgents/mcquack.eblume.devpi.plist + when: devpi_launchctl_check.rc != 0 + changed_when: true + failed_when: false diff --git a/ansible/roles/devpi/templates/devpi.plist.j2 b/ansible/roles/devpi/templates/devpi.plist.j2 new file mode 100644 index 0000000..b9485e6 --- /dev/null +++ b/ansible/roles/devpi/templates/devpi.plist.j2 @@ -0,0 +1,34 @@ + + + + + + Label + mcquack.eblume.devpi + ProgramArguments + + {{ devpi_binary }} + --serverdir + {{ devpi_server_dir }} + --host + {{ devpi_host }} + --port + {{ devpi_port }} + --outside-url + {{ devpi_outside_url }} + + RunAtLoad + + KeepAlive + + EnvironmentVariables + + PATH + {{ devpi_venv 
}}/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin + + 
StandardOutPath + {{ devpi_log_dir }}/mcquack.devpi.out.log + StandardErrorPath + {{ devpi_log_dir }}/mcquack.devpi.err.log + + diff --git a/ansible/roles/docs/defaults/main.yml b/ansible/roles/docs/defaults/main.yml new file mode 100644 index 0000000..a5a1a8a --- /dev/null +++ b/ansible/roles/docs/defaults/main.yml @@ -0,0 +1,10 @@ +--- +# Docs (Quartz-built static site) — replaces minikube Deployment. +# Caddy serves docs_content_dir directly via the static-kind service block, +# with Quartz-style try_files (path → path/ → path.html → 404). + +docs_version: "v1.17.0" +docs_release_url: "https://forge.eblu.me/eblume/blumeops/releases/download/{{ docs_version }}/docs-{{ docs_version }}.tar.gz" +docs_home: /Users/erichblume/blumeops/docs +docs_content_dir: "{{ docs_home }}/content" +docs_version_sentinel: "{{ docs_home }}/.installed-version" diff --git a/ansible/roles/docs/tasks/main.yml b/ansible/roles/docs/tasks/main.yml new file mode 100644 index 0000000..dec775e --- /dev/null +++ b/ansible/roles/docs/tasks/main.yml @@ -0,0 +1,57 @@ +--- +# docs role — download and extract the Quartz-built docs tarball into +# docs_content_dir. Caddy serves the directory directly with Quartz-style +# try_files; there is no daemon to manage. +# +# Idempotency: a sentinel file records the installed docs_version. The +# download/extract steps only run when the sentinel doesn't match docs_version. +# +# Mirrors the cv role's curl-based download for consistency, even though the +# forge releases endpoint here does support HEAD. 
+ +- name: Ensure docs home exists + ansible.builtin.file: + path: "{{ docs_home }}" + state: directory + mode: '0755' + +- name: Read installed docs version sentinel + ansible.builtin.slurp: + src: "{{ docs_version_sentinel }}" + register: docs_installed_raw + failed_when: false + changed_when: false + +- name: Set installed docs version fact + ansible.builtin.set_fact: + docs_installed_version: >- + {{ (docs_installed_raw.content | b64decode).strip() + if (docs_installed_raw.content is defined) else '' }} + +- name: Recreate docs content dir + ansible.builtin.file: + path: "{{ docs_content_dir }}" + state: "{{ item }}" + mode: '0755' + loop: + - absent + - directory + when: docs_installed_version != docs_version + +- name: Download and extract docs release tarball + ansible.builtin.shell: + cmd: >- + set -euo pipefail; + curl -fsSL {{ docs_release_url | quote }} -o {{ docs_home }}/docs.tar.gz && + tar -xzf {{ docs_home }}/docs.tar.gz -C {{ docs_content_dir }} && + rm -f {{ docs_home }}/docs.tar.gz + executable: /bin/bash + when: docs_installed_version != docs_version + changed_when: true + +- name: Write docs version sentinel + ansible.builtin.copy: + content: "{{ docs_version }}\n" + dest: "{{ docs_version_sentinel }}" + mode: '0644' + when: docs_installed_version != docs_version diff --git a/ansible/roles/heph/defaults/main.yml b/ansible/roles/heph/defaults/main.yml new file mode 100644 index 0000000..88d2240 --- /dev/null +++ b/ansible/roles/heph/defaults/main.yml @@ -0,0 +1,49 @@ +--- +# hephaestus hub — the canonical heph replica (server mode) on indri. +# Other devices (e.g. gilbert) are spokes that sync against this hub. +# See [[set-up-sync-hub]] and [[host-heph-pwa]] in the hephaestus repo. + +# Pinned release used for the initial `cargo install` and the PWA shell. +# After bootstrap, hephd's own --self-update keeps the binary current; this +# pin only governs the first install and the bundled PWA shell version. 
+heph_version: v1.2.1 + +# Anonymous public HTTPS clone — matches hephd's INSTALL_GIT_URL so the initial +# install and unattended self-update build from the same source (no ssh-agent). +heph_repo_url: https://forge.eblu.me/eblume/hephaestus.git + +heph_bin_dir: /Users/erichblume/.cargo/bin +heph_binary: "{{ heph_bin_dir }}/hephd" + +# rustc/cargo here are rustup shims. The bare (non-mise) environment that the +# launchagent and ansible run in falls back to rustup's *default* toolchain, +# which can lag behind heph's rust-version floor (Cargo.toml: 1.89). Pin the +# channel explicitly so both the bootstrap build and unattended self-update +# always use a current toolchain regardless of the host's rustup default. +heph_rust_toolchain: stable + +heph_data_dir: /Users/erichblume/.local/share/heph +heph_db: "{{ heph_data_dir }}/heph.db" +heph_socket: "{{ heph_data_dir }}/hephd.sock" +heph_log_dir: /Users/erichblume/Library/Logs + +# Version-pinned source checkout; the PWA static shell is served directly from +# its heph-pwa/ subdir (no copy), keeping shell and hub in lockstep at heph_version. +heph_pwa_src_dir: /Users/erichblume/.cache/heph-pwa-src +heph_web_root: "{{ heph_pwa_src_dir }}/heph-pwa" + +# Hub listens on all interfaces so tailnet spokes can reach it directly +# (http://indri.tail8d86e.ts.net:8787) and Caddy can proxy heph.ops.eblu.me. +# Access is gated by Authentik OIDC regardless — tailnet reachability is not +# enough (this is the owner's most sensitive data). +heph_http_addr: 0.0.0.0:8787 +heph_port: 8787 +heph_external_url: https://heph.ops.eblu.me + +# Authentik OIDC — issuer + audience together turn hub auth on. The audience is +# the device-code client id (see argocd/manifests/authentik heph blueprint). +heph_oidc_issuer: https://authentik.ops.eblu.me/application/o/heph/ +heph_oidc_audience: heph + +# Self-update poll interval (seconds). 10 minutes. 
+heph_self_update_interval_secs: 600 diff --git a/ansible/roles/heph/handlers/main.yml b/ansible/roles/heph/handlers/main.yml new file mode 100644 index 0000000..92fe9d7 --- /dev/null +++ b/ansible/roles/heph/handlers/main.yml @@ -0,0 +1,6 @@ +--- +- name: Restart heph + ansible.builtin.shell: | + launchctl unload ~/Library/LaunchAgents/mcquack.eblume.heph.plist 2>/dev/null || true + launchctl load ~/Library/LaunchAgents/mcquack.eblume.heph.plist + changed_when: true diff --git a/ansible/roles/heph/tasks/main.yml b/ansible/roles/heph/tasks/main.yml new file mode 100644 index 0000000..7a45fe3 --- /dev/null +++ b/ansible/roles/heph/tasks/main.yml @@ -0,0 +1,82 @@ +--- +# hephaestus hub (server mode) on indri. +# +# DATA SEEDING (one-time, Path A — do this BEFORE the first provision so the hub +# adopts gilbert's existing data instead of being born empty): +# +# 1. On the seed device (gilbert): heph daemon stop +# 2. Copy its store to indri: scp ~/.local/share/heph/heph.db \ +# indri:~/.local/share/heph/heph.db +# 3. On indri, give the hub its OWN device origin (keeps gilbert's owner_id + +# data; hephd regenerates a fresh origin on next start when it is missing): +# sqlite3 ~/.local/share/heph/heph.db "DELETE FROM meta WHERE key='origin';" +# 4. Run this role (installs hephd, stages the PWA, loads the launchagent). +# +# hephd auto-creates an empty store on first start if none exists, so seeding is +# optional — skip it only if you intend a fresh, empty hub. + +- name: Ensure heph data directory exists + ansible.builtin.file: + path: "{{ heph_data_dir }}" + state: directory + mode: '0700' + +- name: Check for installed hephd binary + ansible.builtin.stat: + path: "{{ heph_binary }}" + register: heph_binary_stat + +# Bootstrap install only when hephd is absent. Thereafter hephd's own +# --self-update keeps it current; ansible must not fight (or downgrade) it. +# This builds from source and can take several minutes on a cold cargo cache. 
+- name: Bootstrap-install heph + hephd from the forge ({{ heph_version }}) + ansible.builtin.command: + cmd: >- + {{ heph_bin_dir }}/cargo install --locked + --git {{ heph_repo_url }} + --tag {{ heph_version }} + heph hephd + environment: + PATH: "{{ heph_bin_dir }}:/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin" + RUSTUP_TOOLCHAIN: "{{ heph_rust_toolchain }}" + when: not heph_binary_stat.stat.exists + changed_when: true + notify: Restart heph + +# Checkout provides the PWA shell at {{ heph_web_root }} (heph-pwa/ subdir), +# served directly by hephd. Static files are read from disk per request, so a +# version bump needs no restart; the service worker (CACHE = "heph-pwa-vN") +# evicts stale assets on next load. +- name: Ensure heph cache parent directory exists + ansible.builtin.file: + path: "{{ heph_pwa_src_dir | dirname }}" + state: directory + mode: '0755' + +- name: Stage heph-pwa source at {{ heph_version }} + ansible.builtin.git: + repo: "{{ heph_repo_url }}" + dest: "{{ heph_pwa_src_dir }}" + version: "{{ heph_version }}" + depth: 1 + single_branch: true + force: true + +- name: Deploy heph LaunchAgent plist + ansible.builtin.template: + src: heph.plist.j2 + dest: ~/Library/LaunchAgents/mcquack.eblume.heph.plist + mode: '0644' + notify: Restart heph + +- name: Check if heph LaunchAgent is loaded + ansible.builtin.command: launchctl list mcquack.eblume.heph + register: heph_launchctl_check + changed_when: false + failed_when: false + +- name: Load heph LaunchAgent if not loaded + ansible.builtin.command: launchctl load ~/Library/LaunchAgents/mcquack.eblume.heph.plist + when: heph_launchctl_check.rc != 0 + changed_when: true + failed_when: false diff --git a/ansible/roles/heph/templates/heph.plist.j2 b/ansible/roles/heph/templates/heph.plist.j2 new file mode 100644 index 0000000..19a2367 --- /dev/null +++ b/ansible/roles/heph/templates/heph.plist.j2 @@ -0,0 +1,50 @@ + + + + + + Label + mcquack.eblume.heph + ProgramArguments + + {{ heph_binary }} + --mode + 
server + --http-addr + {{ heph_http_addr }} + --db + {{ heph_db }} + --socket + {{ heph_socket }} + --web-root + {{ heph_web_root }} + --oidc-issuer + {{ heph_oidc_issuer }} + --oidc-audience + {{ heph_oidc_audience }} + --self-update + --self-update-interval-secs + {{ heph_self_update_interval_secs }} + + RunAtLoad + + KeepAlive + + EnvironmentVariables + + + PATH + {{ heph_bin_dir }}:/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin + HOME + /Users/erichblume + + RUSTUP_TOOLCHAIN + {{ heph_rust_toolchain }} + + StandardOutPath + {{ heph_log_dir }}/mcquack.heph.out.log + StandardErrorPath + {{ heph_log_dir }}/mcquack.heph.err.log + + diff --git a/argocd/apps/cloudnative-pg-ringtail.yaml b/argocd/apps/cloudnative-pg-ringtail.yaml new file mode 100644 index 0000000..fa7bba0 --- /dev/null +++ b/argocd/apps/cloudnative-pg-ringtail.yaml @@ -0,0 +1,27 @@ +# CloudNativePG Operator for ringtail k3s cluster +# Deploys the operator only; PostgreSQL clusters are created separately +# +# Sibling of cloudnative-pg.yaml (minikube). Same mirror, same release, +# different destination. Both apps will coexist during the immich +# migration; the minikube one is removed at the end of the broader +# indri-k8s decommission. 
+apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: cloudnative-pg-ringtail + namespace: argocd +spec: + project: default + source: + repoURL: ssh://forgejo@forge.ops.eblu.me:2222/mirrors/cloudnative-pg.git + targetRevision: v1.27.1 + path: releases + directory: + include: 'cnpg-1.27.1.yaml' + destination: + server: https://ringtail.tail8d86e.ts.net:6443 + namespace: cnpg-system + syncPolicy: + syncOptions: + - CreateNamespace=true + - ServerSideApply=true # Required for large CRDs that exceed annotation size limit diff --git a/argocd/apps/cv.yaml b/argocd/apps/cv.yaml deleted file mode 100644 index ad09a8d..0000000 --- a/argocd/apps/cv.yaml +++ /dev/null @@ -1,18 +0,0 @@ ---- -apiVersion: argoproj.io/v1alpha1 -kind: Application -metadata: - name: cv - namespace: argocd -spec: - project: default - source: - repoURL: ssh://forgejo@forge.ops.eblu.me:2222/eblume/blumeops.git - targetRevision: main - path: argocd/manifests/cv - destination: - server: https://kubernetes.default.svc - namespace: cv - syncPolicy: - syncOptions: - - CreateNamespace=true diff --git a/argocd/apps/databases-ringtail.yaml b/argocd/apps/databases-ringtail.yaml new file mode 100644 index 0000000..00de4e3 --- /dev/null +++ b/argocd/apps/databases-ringtail.yaml @@ -0,0 +1,26 @@ +# Databases on ringtail k3s. +# +# Today: immich-pg + blumeops-pg (CNPG Clusters) and immich-pg's borgmatic ExternalSecret. +# More databases may move here as the indri-k8s decommission proceeds. 
+# +# Prerequisites: +# - cloudnative-pg-ringtail (operator must exist before the Cluster CR) +# - external-secrets-ringtail + 1password-connect-ringtail (for the +# immich-pg-borgmatic ExternalSecret to sync) +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: databases-ringtail + namespace: argocd +spec: + project: default + source: + repoURL: ssh://forgejo@forge.ops.eblu.me:2222/eblume/blumeops.git + targetRevision: main + path: argocd/manifests/databases-ringtail + destination: + server: https://ringtail.tail8d86e.ts.net:6443 + namespace: databases + syncPolicy: + syncOptions: + - CreateNamespace=true diff --git a/argocd/apps/devpi.yaml b/argocd/apps/devpi.yaml deleted file mode 100644 index 4a15672..0000000 --- a/argocd/apps/devpi.yaml +++ /dev/null @@ -1,29 +0,0 @@ -# devpi PyPI Caching Proxy -# Provides PyPI cache and private package hosting -# -# After first deployment, initialize devpi: -# kubectl -n devpi exec -it devpi-0 -- devpi-init --serverdir /devpi --root-passwd -# kubectl -n devpi rollout restart statefulset devpi -# -# Then create user/index: -# uvx devpi use https://pypi.tail8d86e.ts.net -# uvx devpi login root -# uvx devpi user -c eblume email=blume.erich@gmail.com -# uvx devpi index -c eblume/dev bases=root/pypi -apiVersion: argoproj.io/v1alpha1 -kind: Application -metadata: - name: devpi - namespace: argocd -spec: - project: default - source: - repoURL: ssh://forgejo@forge.ops.eblu.me:2222/eblume/blumeops.git - targetRevision: main - path: argocd/manifests/devpi - destination: - server: https://kubernetes.default.svc - namespace: devpi - syncPolicy: - syncOptions: - - CreateNamespace=true diff --git a/argocd/apps/docs.yaml b/argocd/apps/docs.yaml deleted file mode 100644 index cd8db35..0000000 --- a/argocd/apps/docs.yaml +++ /dev/null @@ -1,18 +0,0 @@ ---- -apiVersion: argoproj.io/v1alpha1 -kind: Application -metadata: - name: docs - namespace: argocd -spec: - project: default - source: - repoURL: 
ssh://forgejo@forge.ops.eblu.me:2222/eblume/blumeops.git - targetRevision: main - path: argocd/manifests/docs - destination: - server: https://kubernetes.default.svc - namespace: docs - syncPolicy: - syncOptions: - - CreateNamespace=true diff --git a/argocd/apps/external-secrets-ringtail.yaml b/argocd/apps/external-secrets-ringtail.yaml index e2f5898..0bb8bd7 100644 --- a/argocd/apps/external-secrets-ringtail.yaml +++ b/argocd/apps/external-secrets-ringtail.yaml @@ -15,7 +15,7 @@ spec: source: repoURL: ssh://forgejo@forge.ops.eblu.me:2222/eblume/blumeops.git targetRevision: main - path: argocd/manifests/external-secrets + path: argocd/manifests/external-secrets-ringtail destination: server: https://ringtail.tail8d86e.ts.net:6443 namespace: external-secrets diff --git a/argocd/apps/homepage.yaml b/argocd/apps/homepage.yaml index 86a0f8d..22147f2 100644 --- a/argocd/apps/homepage.yaml +++ b/argocd/apps/homepage.yaml @@ -14,7 +14,7 @@ spec: targetRevision: main path: argocd/manifests/homepage destination: - server: https://kubernetes.default.svc + server: https://ringtail.tail8d86e.ts.net:6443 namespace: homepage syncPolicy: syncOptions: diff --git a/argocd/apps/immich-ringtail.yaml b/argocd/apps/immich-ringtail.yaml new file mode 100644 index 0000000..c93cbee --- /dev/null +++ b/argocd/apps/immich-ringtail.yaml @@ -0,0 +1,31 @@ +# Immich on ringtail k3s. +# +# Staging deployment; the minikube `immich` app remains in parallel +# until cutover. See [[immich-cutover-and-decommission]] for the +# routing flip + minikube cleanup. +# +# Prerequisites: +# - cloudnative-pg-ringtail + databases-ringtail (postgres) +# - 1password-connect-ringtail + external-secrets-ringtail (not used +# by this app today — immich-db Secret is created manually, +# matching the minikube pattern) +# - The immich-db Secret in the immich namespace, holding the +# password for the `immich` postgres role (copied from the source +# immich-pg-app Secret at migration time). 
+apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: immich-ringtail + namespace: argocd +spec: + project: default + source: + repoURL: ssh://forgejo@forge.ops.eblu.me:2222/eblume/blumeops.git + targetRevision: main + path: argocd/manifests/immich-ringtail + destination: + server: https://ringtail.tail8d86e.ts.net:6443 + namespace: immich + syncPolicy: + syncOptions: + - CreateNamespace=true diff --git a/argocd/apps/immich.yaml b/argocd/apps/immich.yaml deleted file mode 100644 index 7efd263..0000000 --- a/argocd/apps/immich.yaml +++ /dev/null @@ -1,30 +0,0 @@ -# Immich - Self-hosted photo and video management -# High-performance Google Photos/iCloud alternative with AI features -# -# Kustomize manifests in argocd/manifests/immich/ -# Components: server, machine-learning, valkey (Redis) -# -# Prerequisites: -# 1. Create immich namespace and secrets: -# kubectl create namespace immich -# kubectl --context=minikube-indri create secret generic immich-db -n immich \ -# --from-literal=password="$(kubectl --context=minikube-indri -n databases get secret immich-pg-app -o jsonpath='{.data.password}' | base64 -d)" -# 2. Create immich-pg database and user (see immich-pg app) -# 3. NFS share on sifaka at /volume1/photos with read/write for indri -apiVersion: argoproj.io/v1alpha1 -kind: Application -metadata: - name: immich - namespace: argocd -spec: - project: default - source: - repoURL: ssh://forgejo@forge.ops.eblu.me:2222/eblume/blumeops.git - targetRevision: main - path: argocd/manifests/immich - destination: - server: https://kubernetes.default.svc - namespace: immich - syncPolicy: - syncOptions: - - CreateNamespace=true diff --git a/argocd/apps/mealie-ringtail.yaml b/argocd/apps/mealie-ringtail.yaml new file mode 100644 index 0000000..2f014a9 --- /dev/null +++ b/argocd/apps/mealie-ringtail.yaml @@ -0,0 +1,26 @@ +# Mealie on ringtail k3s. +# +# Wave-1 indri-k8s decommission. 
Staging deployment; the minikube `mealie` +# app stays in parallel until cutover (copy SQLite PVC, drop the minikube +# tailscale ingress, flip Caddy). See [[migrate-wave1-ringtail]]. +# +# Prerequisites: +# - external-secrets-ringtail (onepassword-blumeops ClusterSecretStore) +# - mealie-data PVC contents copied from minikube at cutover +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: mealie-ringtail + namespace: argocd +spec: + project: default + source: + repoURL: ssh://forgejo@forge.ops.eblu.me:2222/eblume/blumeops.git + targetRevision: main + path: argocd/manifests/mealie-ringtail + destination: + server: https://ringtail.tail8d86e.ts.net:6443 + namespace: mealie + syncPolicy: + syncOptions: + - CreateNamespace=true diff --git a/argocd/apps/mealie.yaml b/argocd/apps/mealie.yaml deleted file mode 100644 index af33469..0000000 --- a/argocd/apps/mealie.yaml +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: argoproj.io/v1alpha1 -kind: Application -metadata: - name: mealie - namespace: argocd -spec: - project: default - source: - repoURL: ssh://forgejo@forge.ops.eblu.me:2222/eblume/blumeops.git - targetRevision: main - path: argocd/manifests/mealie - destination: - server: https://kubernetes.default.svc - namespace: mealie - syncPolicy: - syncOptions: - - CreateNamespace=true diff --git a/argocd/apps/paperless-ringtail.yaml b/argocd/apps/paperless-ringtail.yaml new file mode 100644 index 0000000..bec98e9 --- /dev/null +++ b/argocd/apps/paperless-ringtail.yaml @@ -0,0 +1,28 @@ +# Paperless-ngx on ringtail k3s. +# +# Wave-1 indri-k8s decommission. Staging deployment; the minikube +# `paperless` app stays in parallel until cutover (drop the minikube +# tailscale ingress to free the name, then flip Caddy). See +# [[migrate-wave1-ringtail]]. 
+# +# Prerequisites: +# - databases-ringtail blumeops-pg (paperless database + role) +# - external-secrets-ringtail (onepassword-blumeops ClusterSecretStore) +# - sifaka NFS rule granting ringtail access to /volume1/paperless +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: paperless-ringtail + namespace: argocd +spec: + project: default + source: + repoURL: ssh://forgejo@forge.ops.eblu.me:2222/eblume/blumeops.git + targetRevision: main + path: argocd/manifests/paperless-ringtail + destination: + server: https://ringtail.tail8d86e.ts.net:6443 + namespace: paperless + syncPolicy: + syncOptions: + - CreateNamespace=true diff --git a/argocd/apps/paperless.yaml b/argocd/apps/paperless.yaml deleted file mode 100644 index 88437eb..0000000 --- a/argocd/apps/paperless.yaml +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: argoproj.io/v1alpha1 -kind: Application -metadata: - name: paperless - namespace: argocd -spec: - project: default - source: - repoURL: ssh://forgejo@forge.ops.eblu.me:2222/eblume/blumeops.git - targetRevision: main - path: argocd/manifests/paperless - destination: - server: https://kubernetes.default.svc - namespace: paperless - syncPolicy: - syncOptions: - - CreateNamespace=true diff --git a/argocd/apps/shower.yaml b/argocd/apps/shower.yaml new file mode 100644 index 0000000..c4a7a62 --- /dev/null +++ b/argocd/apps/shower.yaml @@ -0,0 +1,20 @@ +# Adelaide / Heidi / Addie baby shower app — Django guest/raffle/prize system. +# Public landing page at shower.eblu.me (via fly proxy), staff console + admin +# at shower.ops.eblu.me (tailnet only). Built from forge PyPI wheel. 
+apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: shower + namespace: argocd +spec: + project: default + source: + repoURL: ssh://forgejo@forge.ops.eblu.me:2222/eblume/blumeops.git + targetRevision: main + path: argocd/manifests/shower + destination: + server: https://ringtail.tail8d86e.ts.net:6443 + namespace: shower + syncPolicy: + syncOptions: + - CreateNamespace=true diff --git a/argocd/apps/teslamate-ringtail.yaml b/argocd/apps/teslamate-ringtail.yaml new file mode 100644 index 0000000..b7b3491 --- /dev/null +++ b/argocd/apps/teslamate-ringtail.yaml @@ -0,0 +1,28 @@ +# TeslaMate on ringtail k3s. +# +# Wave-1 indri-k8s decommission. Staging deployment; the minikube +# `teslamate` app stays in parallel until cutover (migrate the teslamate +# database, drop the minikube tailscale ingress, flip Caddy). See +# [[migrate-wave1-ringtail]]. +# +# Prerequisites: +# - databases-ringtail blumeops-pg (teslamate database + role; cube + +# earthdistance extensions created by superuser at cutover) +# - external-secrets-ringtail (onepassword-blumeops ClusterSecretStore) +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: teslamate-ringtail + namespace: argocd +spec: + project: default + source: + repoURL: ssh://forgejo@forge.ops.eblu.me:2222/eblume/blumeops.git + targetRevision: main + path: argocd/manifests/teslamate-ringtail + destination: + server: https://ringtail.tail8d86e.ts.net:6443 + namespace: teslamate + syncPolicy: + syncOptions: + - CreateNamespace=true diff --git a/argocd/apps/teslamate.yaml b/argocd/apps/teslamate.yaml deleted file mode 100644 index 60247da..0000000 --- a/argocd/apps/teslamate.yaml +++ /dev/null @@ -1,32 +0,0 @@ -# TeslaMate Tesla Data Logger -# Requires: CloudNativePG PostgreSQL cluster and manual secret setup -# -# Before syncing, create the namespace and secrets: -# kubectl create namespace teslamate -# op inject -i argocd/manifests/databases/secret-teslamate.yaml.tpl | kubectl apply -f - -# op inject 
-i argocd/manifests/teslamate/secret-encryption-key.yaml.tpl | kubectl apply -f - -# op inject -i argocd/manifests/teslamate/secret-db.yaml.tpl | kubectl apply -f - -# -# Then create the database: -# PGPASSWORD=$(op read "op://blumeops/postgres/password") \ -# psql -h pg.ops.eblu.me -U eblume -c "CREATE DATABASE teslamate OWNER teslamate;" -# -# After syncing, access the TeslaMate UI at https://tesla.tail8d86e.ts.net to complete -# Tesla API authentication via OAuth flow. -apiVersion: argoproj.io/v1alpha1 -kind: Application -metadata: - name: teslamate - namespace: argocd -spec: - project: default - source: - repoURL: ssh://forgejo@forge.ops.eblu.me:2222/eblume/blumeops.git - targetRevision: main - path: argocd/manifests/teslamate - destination: - server: https://kubernetes.default.svc - namespace: teslamate - syncPolicy: - syncOptions: - - CreateNamespace=true diff --git a/argocd/manifests/alloy-k8s/config.alloy b/argocd/manifests/alloy-k8s/config.alloy index a716ddc..2940b0b 100644 --- a/argocd/manifests/alloy-k8s/config.alloy +++ b/argocd/manifests/alloy-k8s/config.alloy @@ -159,8 +159,10 @@ prometheus.exporter.blackbox "services" { } target { + // devpi runs natively on indri (LaunchAgent), not in-cluster. + // We probe through Caddy (https://pypi.ops.eblu.me) which the cluster can reach via Tailscale. name = "devpi" - address = "http://devpi.devpi.svc.cluster.local:3141/+api" + address = "https://pypi.ops.eblu.me/+api" module = "http_2xx" } @@ -189,14 +191,9 @@ prometheus.exporter.blackbox "services" { } target { + // Migrated to ringtail (wave-1); probe through Caddy over Tailscale. 
name = "teslamate" - address = "http://teslamate.teslamate.svc.cluster.local:4000/" - module = "http_2xx" - } - - target { - name = "immich" - address = "http://immich-server.immich.svc.cluster.local:2283/api/server/ping" + address = "https://tesla.ops.eblu.me/" module = "http_2xx" } diff --git a/argocd/manifests/alloy-k8s/kustomization.yaml b/argocd/manifests/alloy-k8s/kustomization.yaml index f51bd3a..3503ead 100644 --- a/argocd/manifests/alloy-k8s/kustomization.yaml +++ b/argocd/manifests/alloy-k8s/kustomization.yaml @@ -10,7 +10,7 @@ resources: images: - name: registry.ops.eblu.me/blumeops/alloy - newTag: v1.14.0-fd0bebb + newTag: v1.16.0-9564435 configMapGenerator: - name: alloy-config diff --git a/argocd/manifests/alloy-ringtail/config.alloy b/argocd/manifests/alloy-ringtail/config.alloy index e92ab0f..e5cc045 100644 --- a/argocd/manifests/alloy-ringtail/config.alloy +++ b/argocd/manifests/alloy-ringtail/config.alloy @@ -45,6 +45,26 @@ prometheus.scrape "kube_state_metrics" { forward_to = [prometheus.remote_write.prometheus.receiver] } +// ============== SERVICE HEALTH PROBES ============== + +// Blackbox-style HTTP probes for in-cluster services on ringtail +prometheus.exporter.blackbox "services" { + config = "{ modules: { http_2xx: { prober: http, timeout: 5s } } }" + + target { + name = "immich" + address = "http://immich-server.immich.svc.cluster.local:2283/api/server/ping" + module = "http_2xx" + } +} + +// Scrape blackbox probe results +prometheus.scrape "blackbox" { + targets = prometheus.exporter.blackbox.services.targets + scrape_interval = "30s" + forward_to = [prometheus.remote_write.prometheus.receiver] +} + // Push metrics to indri Prometheus prometheus.remote_write "prometheus" { external_labels = { cluster = "ringtail" } diff --git a/argocd/manifests/alloy-ringtail/kustomization.yaml b/argocd/manifests/alloy-ringtail/kustomization.yaml index df472aa..526fec5 100644 --- a/argocd/manifests/alloy-ringtail/kustomization.yaml +++ 
b/argocd/manifests/alloy-ringtail/kustomization.yaml @@ -10,7 +10,7 @@ resources: images: - name: registry.ops.eblu.me/blumeops/alloy - newTag: v1.14.0-fd0bebb-nix + newTag: v1.16.0-9564435-nix configMapGenerator: - name: alloy-config diff --git a/argocd/manifests/alloy-tracing-ringtail/kustomization.yaml b/argocd/manifests/alloy-tracing-ringtail/kustomization.yaml index 5c8e683..b1e6338 100644 --- a/argocd/manifests/alloy-tracing-ringtail/kustomization.yaml +++ b/argocd/manifests/alloy-tracing-ringtail/kustomization.yaml @@ -9,7 +9,7 @@ resources: images: - name: registry.ops.eblu.me/blumeops/alloy - newTag: v1.14.0-fd0bebb-nix + newTag: v1.16.0-9564435-nix configMapGenerator: - name: alloy-tracing-config diff --git a/argocd/manifests/argocd/README.md b/argocd/manifests/argocd/README.md index 615e3bb..2eaf4d4 100644 --- a/argocd/manifests/argocd/README.md +++ b/argocd/manifests/argocd/README.md @@ -25,7 +25,7 @@ kubectl wait --for=condition=available deployment/argocd-server -n argocd --time kubectl -n argocd get secret argocd-initial-admin-secret -o jsonpath="{.data.password}" | base64 -d && echo # 5. Login and change password -argocd login argocd.tail8d86e.ts.net --username admin --grpc-web +argocd login argocd.tail8d86e.ts.net --username admin argocd account update-password # 6. Apply repo-creds-forge credential template for SSH access to all forge repos @@ -114,4 +114,4 @@ spec: Future improvement: integrate with a secrets operator (e.g., External Secrets). - The credential template (`repo-creds`) uses a URL prefix to match all repos on forge. - ArgoCD uses Tailscale Ingress with Let's Encrypt for TLS termination. -- The `--grpc-web` flag is required for CLI access through the Tailscale ingress. +- After Authentik is up, prefer `argocd login argocd.ops.eblu.me --sso` over the admin password login above; admin is only needed during bootstrap or as break-glass. 
diff --git a/argocd/manifests/argocd/argocd-cm-patch.yaml b/argocd/manifests/argocd/argocd-cm-patch.yaml index cb7e27f..54e4ede 100644 --- a/argocd/manifests/argocd/argocd-cm-patch.yaml +++ b/argocd/manifests/argocd/argocd-cm-patch.yaml @@ -16,7 +16,6 @@ data: name: Authentik issuer: https://authentik.ops.eblu.me/application/o/argocd/ clientID: argocd - clientSecret: $argocd-oidc-authentik:client-secret requestedScopes: - openid - profile diff --git a/argocd/manifests/argocd/argocd-rbac-cm-patch.yaml b/argocd/manifests/argocd/argocd-rbac-cm-patch.yaml index c2ea095..4914587 100644 --- a/argocd/manifests/argocd/argocd-rbac-cm-patch.yaml +++ b/argocd/manifests/argocd/argocd-rbac-cm-patch.yaml @@ -2,6 +2,9 @@ # # - workflow-bot: minimal CI/CD permissions (sync, get) # - admins: Authentik admins group mapped to ArgoCD admin role +# - admin: local break-glass account — keeps ArgoCD admin rights for when +# Authentik SSO is unavailable (without this it has no permissions, since +# policy.default is unset) # apiVersion: v1 kind: ConfigMap @@ -14,3 +17,4 @@ data: p, role:workflow-bot, applications, get, *, allow g, workflow-bot, role:workflow-bot g, admins, role:admin + g, admin, role:admin diff --git a/argocd/manifests/argocd/external-secret-oidc-authentik.yaml b/argocd/manifests/argocd/external-secret-oidc-authentik.yaml deleted file mode 100644 index 475a713..0000000 --- a/argocd/manifests/argocd/external-secret-oidc-authentik.yaml +++ /dev/null @@ -1,31 +0,0 @@ -# ExternalSecret for ArgoCD OIDC client secret (Authentik) -# -# Referenced from argocd-cm as $argocd-oidc-authentik:client-secret -# Must have app.kubernetes.io/part-of: argocd label for ArgoCD to read it -# ---- -apiVersion: external-secrets.io/v1 -kind: ExternalSecret -metadata: - name: argocd-oidc-authentik - namespace: argocd -spec: - refreshInterval: 1h - secretStoreRef: - kind: ClusterSecretStore - name: onepassword-blumeops - target: - name: argocd-oidc-authentik - creationPolicy: Owner - template: - 
metadata: - labels: - app.kubernetes.io/part-of: argocd - data: - - secretKey: client-secret - remoteRef: - conversionStrategy: Default - decodingStrategy: None - key: "Authentik (blumeops)" - metadataPolicy: None - property: argocd-client-secret diff --git a/argocd/manifests/argocd/kustomization.yaml b/argocd/manifests/argocd/kustomization.yaml index 9bdac10..6deb7ec 100644 --- a/argocd/manifests/argocd/kustomization.yaml +++ b/argocd/manifests/argocd/kustomization.yaml @@ -9,7 +9,6 @@ resources: - https://raw.githubusercontent.com/argoproj/argo-cd/998fb59dc355653c0657908a6ea2f87136e022d1/manifests/install.yaml - ingress-tailscale.yaml - external-secret-repo-forge.yaml - - external-secret-oidc-authentik.yaml patches: - path: argocd-cmd-params-cm.yaml diff --git a/argocd/manifests/authentik/configmap-blueprint.yaml b/argocd/manifests/authentik/configmap-blueprint.yaml index 27910ef..cc97dea 100644 --- a/argocd/manifests/authentik/configmap-blueprint.yaml +++ b/argocd/manifests/authentik/configmap-blueprint.yaml @@ -262,14 +262,15 @@ data: name: ArgoCD authorization_flow: !Find [authentik_flows.flow, [slug, default-provider-authorization-implicit-consent]] invalidation_flow: !Find [authentik_flows.flow, [slug, default-provider-invalidation-flow]] - client_type: confidential + client_type: public client_id: argocd - client_secret: !Env AUTHENTIK_ARGOCD_CLIENT_SECRET redirect_uris: - matching_mode: strict url: https://argocd.ops.eblu.me/auth/callback - matching_mode: strict url: https://argocd.tail8d86e.ts.net/auth/callback + - matching_mode: strict + url: http://localhost:8085/auth/callback signing_key: !Find [authentik_crypto.certificatekeypair, [name, authentik Self-signed Certificate]] property_mappings: - !Find [authentik_providers_oauth2.scopemapping, [scope_name, openid]] @@ -433,3 +434,93 @@ data: provider: !KeyOf mealie-provider meta_launch_url: https://meals.ops.eblu.me policy_engine_mode: all + + heph.yaml: | + version: 1 + metadata: + name: BlumeOps Heph 
SSO + labels: + blueprints.goauthentik.io/description: "Hephaestus hub OIDC (device-code) provider, application, and device-code flow" + entries: + # Device-code flow (RFC 8628). authentik ships no default for this, so we + # create one and bind it to the brand below. An empty stage_configuration + # flow is sufficient: the already-authenticated user just confirms the code. + - model: authentik_flows.flow + id: device-code-flow + identifiers: + slug: default-device-code-flow + attrs: + name: Device code flow + title: Device code flow + slug: default-device-code-flow + designation: stage_configuration + authentication: require_authenticated + + # Enable the device-code grant globally by binding the flow to the default + # brand (domain authentik-default). Partial update — only sets this field. + - model: authentik_brands.brand + identifiers: + domain: authentik-default + attrs: + flow_device_code: !KeyOf device-code-flow + + # OAuth2 provider for heph — PUBLIC client (device-code + PKCE, no secret). + # client_id doubles as the token audience the hub verifies (--oidc-audience heph), + # and the app slug 'heph' is the issuer path (/application/o/heph/). + - model: authentik_providers_oauth2.oauth2provider + id: heph-provider + identifiers: + name: Heph + attrs: + name: Heph + authorization_flow: !Find [authentik_flows.flow, [slug, default-provider-authorization-implicit-consent]] + invalidation_flow: !Find [authentik_flows.flow, [slug, default-provider-invalidation-flow]] + client_type: public + client_id: heph + # CLI/TUI use the device-code grant (no redirect). The heph-pwa browser + # login uses Authorization Code + PKCE, which DOES redirect back to the + # app's origin — register those here (Authentik also keys token-endpoint + # CORS off these origins). Trailing slash matters: the PWA's redirect_uri + # is its base dir, e.g. https://heph.ops.eblu.me/. 
+ redirect_uris: + - matching_mode: strict + url: https://heph.ops.eblu.me/ + - matching_mode: strict + url: http://localhost:8787/ # local dev (hephd --web-root) + signing_key: !Find [authentik_crypto.certificatekeypair, [name, authentik Self-signed Certificate]] + property_mappings: + - !Find [authentik_providers_oauth2.scopemapping, [scope_name, openid]] + - !Find [authentik_providers_oauth2.scopemapping, [scope_name, email]] + - !Find [authentik_providers_oauth2.scopemapping, [scope_name, profile]] + # offline_access: heph CLI requests "openid offline_access"; without + # this mapping the refresh token is session-bound and hephd's + # refresh_token grant 400s once the session lapses (spoke sync dies). + - !Find [authentik_providers_oauth2.scopemapping, [scope_name, offline_access]] + sub_mode: hashed_user_id + include_claims_in_id_token: true + + # Heph application — linked to the OAuth2 provider + - model: authentik_core.application + id: heph-app + identifiers: + slug: heph + attrs: + name: Hephaestus + slug: heph + provider: !KeyOf heph-provider + meta_launch_url: https://heph.ops.eblu.me + policy_engine_mode: any + + # Policy binding — restrict heph to admins group (single-owner, sensitive data) + - model: authentik_policies.policybinding + identifiers: + order: 0 + target: !KeyOf heph-app + group: !Find [authentik_core.group, [name, admins]] + attrs: + target: !KeyOf heph-app + group: !Find [authentik_core.group, [name, admins]] + order: 0 + enabled: true + negate: false + timeout: 30 diff --git a/argocd/manifests/authentik/deployment-worker.yaml b/argocd/manifests/authentik/deployment-worker.yaml index b81ec32..053fa3d 100644 --- a/argocd/manifests/authentik/deployment-worker.yaml +++ b/argocd/manifests/authentik/deployment-worker.yaml @@ -75,11 +75,6 @@ spec: secretKeyRef: name: authentik-config key: jellyfin-client-secret - - name: AUTHENTIK_ARGOCD_CLIENT_SECRET - valueFrom: - secretKeyRef: - name: authentik-config - key: argocd-client-secret - name: 
AUTHENTIK_MEALIE_CLIENT_SECRET valueFrom: secretKeyRef: diff --git a/argocd/manifests/authentik/external-secret.yaml b/argocd/manifests/authentik/external-secret.yaml index 9abf699..93de499 100644 --- a/argocd/manifests/authentik/external-secret.yaml +++ b/argocd/manifests/authentik/external-secret.yaml @@ -53,10 +53,6 @@ spec: remoteRef: key: "Authentik (blumeops)" property: jellyfin-client-secret - - secretKey: argocd-client-secret - remoteRef: - key: "Authentik (blumeops)" - property: argocd-client-secret - secretKey: mealie-client-secret remoteRef: key: "Authentik (blumeops)" diff --git a/argocd/manifests/cv/deployment.yaml b/argocd/manifests/cv/deployment.yaml deleted file mode 100644 index f2b00e6..0000000 --- a/argocd/manifests/cv/deployment.yaml +++ /dev/null @@ -1,51 +0,0 @@ ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: cv - namespace: cv -spec: - replicas: 2 - strategy: - type: RollingUpdate - rollingUpdate: - maxUnavailable: 0 - maxSurge: 1 - selector: - matchLabels: - app: cv - template: - metadata: - labels: - app: cv - spec: - securityContext: - seccompProfile: - type: RuntimeDefault - containers: - - name: cv - image: registry.ops.eblu.me/blumeops/cv:kustomized - ports: - - containerPort: 80 - name: http - env: - - name: CV_RELEASE_URL - value: "https://forge.eblu.me/api/packages/eblume/generic/cv/v1.0.3/cv-v1.0.3.tar.gz" - resources: - requests: - memory: "64Mi" - cpu: "10m" - limits: - memory: "128Mi" - livenessProbe: - httpGet: - path: /healthz - port: 80 - initialDelaySeconds: 10 - periodSeconds: 30 - readinessProbe: - httpGet: - path: /healthz - port: 80 - initialDelaySeconds: 5 - periodSeconds: 10 diff --git a/argocd/manifests/cv/ingress-tailscale.yaml b/argocd/manifests/cv/ingress-tailscale.yaml deleted file mode 100644 index 489f95a..0000000 --- a/argocd/manifests/cv/ingress-tailscale.yaml +++ /dev/null @@ -1,27 +0,0 @@ ---- -apiVersion: networking.k8s.io/v1 -kind: Ingress -metadata: - name: cv-tailscale - namespace: cv - 
annotations: - tailscale.com/proxy-class: "default" - tailscale.com/proxy-group: "ingress" - tailscale.com/tags: "tag:k8s,tag:flyio-target" - gethomepage.dev/enabled: "true" - gethomepage.dev/name: "CV" - gethomepage.dev/group: "Services" - gethomepage.dev/icon: "mdi-file-document" - gethomepage.dev/description: "Resume / CV" - gethomepage.dev/href: "https://cv.eblu.me" - gethomepage.dev/pod-selector: "app=cv" -spec: - ingressClassName: tailscale - defaultBackend: - service: - name: cv - port: - number: 80 - tls: - - hosts: - - cv diff --git a/argocd/manifests/cv/kustomization.yaml b/argocd/manifests/cv/kustomization.yaml deleted file mode 100644 index 199108d..0000000 --- a/argocd/manifests/cv/kustomization.yaml +++ /dev/null @@ -1,12 +0,0 @@ ---- -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization -namespace: cv -resources: - - deployment.yaml - - service.yaml - - ingress-tailscale.yaml - - pdb.yaml -images: - - name: registry.ops.eblu.me/blumeops/cv - newTag: v1.0.3-613f05d diff --git a/argocd/manifests/cv/pdb.yaml b/argocd/manifests/cv/pdb.yaml deleted file mode 100644 index db5240d..0000000 --- a/argocd/manifests/cv/pdb.yaml +++ /dev/null @@ -1,10 +0,0 @@ ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - name: cv -spec: - minAvailable: 1 - selector: - matchLabels: - app: cv diff --git a/argocd/manifests/cv/service.yaml b/argocd/manifests/cv/service.yaml deleted file mode 100644 index 23e0e94..0000000 --- a/argocd/manifests/cv/service.yaml +++ /dev/null @@ -1,13 +0,0 @@ ---- -apiVersion: v1 -kind: Service -metadata: - name: cv - namespace: cv -spec: - selector: - app: cv - ports: - - name: http - port: 80 - targetPort: 80 diff --git a/argocd/manifests/databases-ringtail/blumeops-pg.yaml b/argocd/manifests/databases-ringtail/blumeops-pg.yaml new file mode 100644 index 0000000..3a37249 --- /dev/null +++ b/argocd/manifests/databases-ringtail/blumeops-pg.yaml @@ -0,0 +1,97 @@ +# PostgreSQL Cluster for blumeops services on ringtail 
k3s. +# +# Wave-1 indri-k8s decommission target (see [[migrate-wave1-ringtail]]). +# Holds the paperless and teslamate databases migrated off the minikube +# blumeops-pg via cold pg_dump/pg_restore at cutover. miniflux + authentik +# stay where they are for now (later waves), so this cluster only carries +# the wave-1 roles. +# +# Apps reach this in-cluster at blumeops-pg-rw.databases.svc.cluster.local +# — the same name they used on minikube, so teslamate's DATABASE_HOST is +# unchanged. +# +# Database creation is deferred to cutover, mirroring the minikube cluster +# (where only the bootstrap database is declared and the rest were created +# out-of-band): +# - paperless: the bootstrap database below (restored into at cutover). +# - teslamate: created at its cutover by the eblume superuser, because the +# dump's `earthdistance` extension is untrusted and CREATE EXTENSION +# needs superuser. (cube + earthdistance ownership then transferred to +# the teslamate role so it can ALTER EXTENSION UPDATE.) 
+apiVersion: postgresql.cnpg.io/v1 +kind: Cluster +metadata: + name: blumeops-pg + namespace: databases +spec: + instances: 1 + imageName: ghcr.io/cloudnative-pg/postgresql:18.3 + + storage: + size: 10Gi + storageClass: local-path + + bootstrap: + initdb: + database: paperless + owner: paperless + + managed: + roles: + # eblume superuser for admin + privileged restore steps (extensions) + - name: eblume + login: true + superuser: true + createdb: true + createrole: true + connectionLimit: -1 + ensure: present + inherit: true + passwordSecret: + name: blumeops-pg-eblume + # borgmatic read-only user for backups + - name: borgmatic + login: true + connectionLimit: -1 + ensure: present + inherit: true + inRoles: + - pg_read_all_data + passwordSecret: + name: blumeops-pg-borgmatic + # paperless user (also the bootstrap database owner above; the + # managed role sets its password from the 1Password-backed secret) + - name: paperless + login: true + connectionLimit: -1 + ensure: present + inherit: true + passwordSecret: + name: blumeops-pg-paperless + # teslamate user. Extension ownership (cube, earthdistance) is + # transferred to this role at cutover so it can ALTER EXTENSION UPDATE. + - name: teslamate + login: true + connectionLimit: -1 + ensure: present + inherit: true + passwordSecret: + name: blumeops-pg-teslamate + + resources: + requests: + memory: "256Mi" + cpu: "100m" + limits: + memory: "1Gi" + cpu: "500m" + + postgresql: + parameters: + max_connections: "50" + shared_buffers: "128MB" + password_encryption: "scram-sha-256" + pg_hba: + # Password auth from anywhere; network security is via Tailscale. 
+ - host all all 0.0.0.0/0 scram-sha-256 + - host all all ::/0 scram-sha-256 diff --git a/argocd/manifests/databases/external-secret-immich-borgmatic.yaml b/argocd/manifests/databases-ringtail/external-secret-borgmatic.yaml similarity index 73% rename from argocd/manifests/databases/external-secret-immich-borgmatic.yaml rename to argocd/manifests/databases-ringtail/external-secret-borgmatic.yaml index 8801c1a..ee600e3 100644 --- a/argocd/manifests/databases/external-secret-immich-borgmatic.yaml +++ b/argocd/manifests/databases-ringtail/external-secret-borgmatic.yaml @@ -1,13 +1,14 @@ -# ExternalSecret for borgmatic backup user password on immich-pg cluster +# ExternalSecret for borgmatic backup user password +# +# Replaces the manual op inject workflow from secret-borgmatic.yaml.tpl # -# Reuses the same 1Password item as blumeops-pg-borgmatic. # 1Password item: "borgmatic" in blumeops vault # Field: "db-password" # apiVersion: external-secrets.io/v1 kind: ExternalSecret metadata: - name: immich-pg-borgmatic + name: blumeops-pg-borgmatic namespace: databases spec: refreshInterval: 1h @@ -15,7 +16,7 @@ spec: kind: ClusterSecretStore name: onepassword-blumeops target: - name: immich-pg-borgmatic + name: blumeops-pg-borgmatic creationPolicy: Owner template: type: kubernetes.io/basic-auth diff --git a/argocd/manifests/databases-ringtail/external-secret-eblume.yaml b/argocd/manifests/databases-ringtail/external-secret-eblume.yaml new file mode 100644 index 0000000..a324c7d --- /dev/null +++ b/argocd/manifests/databases-ringtail/external-secret-eblume.yaml @@ -0,0 +1,30 @@ +# ExternalSecret for eblume superuser password +# +# Replaces the manual op inject workflow from secret-eblume.yaml.tpl +# +# 1Password item: "postgres" in blumeops vault +# Field: "password" +# +apiVersion: external-secrets.io/v1 +kind: ExternalSecret +metadata: + name: blumeops-pg-eblume + namespace: databases +spec: + refreshInterval: 1h + secretStoreRef: + kind: ClusterSecretStore + name: 
onepassword-blumeops + target: + name: blumeops-pg-eblume + creationPolicy: Owner + template: + type: kubernetes.io/basic-auth + data: + username: eblume + password: "{{ .password }}" + data: + - secretKey: password + remoteRef: + key: postgres + property: password diff --git a/argocd/manifests/databases-ringtail/external-secret-immich-borgmatic.yaml b/argocd/manifests/databases-ringtail/external-secret-immich-borgmatic.yaml new file mode 100644 index 0000000..3d1fc14 --- /dev/null +++ b/argocd/manifests/databases-ringtail/external-secret-immich-borgmatic.yaml @@ -0,0 +1,32 @@ +# ExternalSecret for borgmatic backup user password on immich-pg cluster +# (ringtail k3s). +# +# Mirror of argocd/manifests/databases/external-secret-immich-borgmatic.yaml. +# The onepassword-blumeops ClusterSecretStore exists on ringtail via the +# external-secrets-ringtail app. +# +# 1Password item: "borgmatic" in blumeops vault +# Field: "db-password" +apiVersion: external-secrets.io/v1 +kind: ExternalSecret +metadata: + name: immich-pg-borgmatic + namespace: databases +spec: + refreshInterval: 1h + secretStoreRef: + kind: ClusterSecretStore + name: onepassword-blumeops + target: + name: immich-pg-borgmatic + creationPolicy: Owner + template: + type: kubernetes.io/basic-auth + data: + username: borgmatic + password: "{{ .password }}" + data: + - secretKey: password + remoteRef: + key: borgmatic + property: db-password diff --git a/argocd/manifests/databases/external-secret-paperless.yaml b/argocd/manifests/databases-ringtail/external-secret-paperless.yaml similarity index 100% rename from argocd/manifests/databases/external-secret-paperless.yaml rename to argocd/manifests/databases-ringtail/external-secret-paperless.yaml diff --git a/argocd/manifests/databases/external-secret-teslamate.yaml b/argocd/manifests/databases-ringtail/external-secret-teslamate.yaml similarity index 100% rename from argocd/manifests/databases/external-secret-teslamate.yaml rename to 
argocd/manifests/databases-ringtail/external-secret-teslamate.yaml diff --git a/argocd/manifests/databases-ringtail/immich-pg.yaml b/argocd/manifests/databases-ringtail/immich-pg.yaml new file mode 100644 index 0000000..982bc43 --- /dev/null +++ b/argocd/manifests/databases-ringtail/immich-pg.yaml @@ -0,0 +1,53 @@ +# PostgreSQL Cluster for Immich on ringtail k3s. +# +# Initially bootstrapped via CNPG pg_basebackup from the minikube +# immich-pg cluster on 2026-05-13, then promoted to primary. The +# externalClusters + bootstrap.pg_basebackup blocks have been pruned +# from this manifest now that the migration is complete — leaving +# them around is a footgun (re-enabling replica.enabled=true would +# try to demote this cluster against a stale source). See +# [[immich-pg-data-migration]] for the procedure used. +apiVersion: postgresql.cnpg.io/v1 +kind: Cluster +metadata: + name: immich-pg + namespace: databases +spec: + instances: 1 + imageName: ghcr.io/tensorchord/cloudnative-vectorchord:17-0.5.0 + + storage: + size: 10Gi + storageClass: local-path + + # Managed roles + managed: + roles: + - name: borgmatic + login: true + connectionLimit: -1 + ensure: present + inherit: true + inRoles: + - pg_read_all_data + passwordSecret: + name: immich-pg-borgmatic + + resources: + requests: + memory: "256Mi" + cpu: "100m" + limits: + memory: "1Gi" + cpu: "500m" + + postgresql: + shared_preload_libraries: + - "vchord.so" + parameters: + max_connections: "50" + shared_buffers: "128MB" + password_encryption: "scram-sha-256" + pg_hba: + - host all all 0.0.0.0/0 scram-sha-256 + - host all all ::/0 scram-sha-256 diff --git a/argocd/manifests/databases-ringtail/kustomization.yaml b/argocd/manifests/databases-ringtail/kustomization.yaml new file mode 100644 index 0000000..143345c --- /dev/null +++ b/argocd/manifests/databases-ringtail/kustomization.yaml @@ -0,0 +1,16 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: databases + +resources: + - 
immich-pg.yaml + - external-secret-immich-borgmatic.yaml + - service-immich-pg-tailscale.yaml + # wave-1 indri-k8s decommission: blumeops-pg (paperless + teslamate) + - blumeops-pg.yaml + - service-blumeops-pg-tailscale.yaml + - external-secret-eblume.yaml + - external-secret-borgmatic.yaml + - external-secret-paperless.yaml + - external-secret-teslamate.yaml diff --git a/argocd/manifests/databases-ringtail/service-blumeops-pg-tailscale.yaml b/argocd/manifests/databases-ringtail/service-blumeops-pg-tailscale.yaml new file mode 100644 index 0000000..f7ca5ef --- /dev/null +++ b/argocd/manifests/databases-ringtail/service-blumeops-pg-tailscale.yaml @@ -0,0 +1,24 @@ +# Tailscale LoadBalancer for the ringtail blumeops-pg cluster. +# Canonical hostname: blumeops-pg-ringtail.tail8d86e.ts.net (distinct from +# the minikube blumeops-pg, which still owns pg.tail8d86e.ts.net until the +# wave-1 decommission). Borgmatic on indri and the Grafana TeslaMate +# datasource reach it via the Caddy L4 route pg.ops.eblu.me:5434. 
+apiVersion: v1 +kind: Service +metadata: + name: blumeops-pg-tailscale + namespace: databases + annotations: + tailscale.com/hostname: "blumeops-pg-ringtail" + tailscale.com/proxy-class: "default" +spec: + type: LoadBalancer + loadBalancerClass: tailscale + selector: + cnpg.io/cluster: blumeops-pg + role: primary + ports: + - name: postgresql + port: 5432 + targetPort: 5432 + protocol: TCP diff --git a/argocd/manifests/databases/service-immich-pg-tailscale.yaml b/argocd/manifests/databases-ringtail/service-immich-pg-tailscale.yaml similarity index 57% rename from argocd/manifests/databases/service-immich-pg-tailscale.yaml rename to argocd/manifests/databases-ringtail/service-immich-pg-tailscale.yaml index 78891dd..92deb14 100644 --- a/argocd/manifests/databases/service-immich-pg-tailscale.yaml +++ b/argocd/manifests/databases-ringtail/service-immich-pg-tailscale.yaml @@ -1,6 +1,8 @@ -# Tailscale LoadBalancer for immich-pg PostgreSQL access -# Canonical hostname: immich-pg.tail8d86e.ts.net -# Caddy L4 proxies pg.ops.eblu.me:5433 → this service for borgmatic backups +# Tailscale LoadBalancer for immich-pg PostgreSQL access on ringtail. +# Canonical hostname: immich-pg.tail8d86e.ts.net (claimed from the +# minikube side after the minikube service was removed during the +# immich-to-ringtail migration). Borgmatic on indri uses this +# hostname for nightly backups. apiVersion: v1 kind: Service metadata: diff --git a/argocd/manifests/databases/blumeops-pg.yaml b/argocd/manifests/databases/blumeops-pg.yaml index 58c771a..37aef23 100644 --- a/argocd/manifests/databases/blumeops-pg.yaml +++ b/argocd/manifests/databases/blumeops-pg.yaml @@ -44,18 +44,9 @@ spec: - pg_read_all_data passwordSecret: name: blumeops-pg-borgmatic - # teslamate user for TeslaMate Tesla data logger - # Superuser removed. Extension ownership (cube, earthdistance) - # transferred manually so teslamate can ALTER EXTENSION UPDATE. 
- # earthdistance is untrusted — DROP+CREATE needs temporary - # superuser escalation during upgrades. - - name: teslamate - login: true - connectionLimit: -1 - ensure: present - inherit: true - passwordSecret: - name: blumeops-pg-teslamate + # teslamate + paperless roles removed: migrated to ringtail blumeops-pg + # (wave-1 decommission). Their databases were dropped from this cluster + # after the cutover was verified and backed up. # authentik user for Authentik identity provider (runs on ringtail) - name: authentik login: true @@ -65,14 +56,6 @@ spec: createdb: true passwordSecret: name: blumeops-pg-authentik - # paperless user for Paperless-ngx document management - - name: paperless - login: true - connectionLimit: -1 - ensure: present - inherit: true - passwordSecret: - name: blumeops-pg-paperless # Resource limits for minikube environment resources: diff --git a/argocd/manifests/databases/immich-pg.yaml b/argocd/manifests/databases/immich-pg.yaml deleted file mode 100644 index 74c6f4e..0000000 --- a/argocd/manifests/databases/immich-pg.yaml +++ /dev/null @@ -1,69 +0,0 @@ -# PostgreSQL Cluster for Immich -# Uses VectorChord (successor to pgvecto.rs) for AI-powered vector search -# See: https://github.com/immich-app/immich/discussions/9060 -# Managed by CloudNativePG operator -apiVersion: postgresql.cnpg.io/v1 -kind: Cluster -metadata: - name: immich-pg - namespace: databases -spec: - instances: 1 - # VectorChord image for PostgreSQL 17 with VectorChord 0.5.0 - # Immich v2.4.1 requires VectorChord >=0.3 <0.6 - # See: https://github.com/tensorchord/VectorChord - imageName: ghcr.io/tensorchord/cloudnative-vectorchord:17-0.5.0 - - storage: - size: 10Gi - storageClass: standard - - # Bootstrap creates initial database and owner - bootstrap: - initdb: - database: immich - owner: immich - postInitSQL: - # Extensions required by Immich - - CREATE EXTENSION IF NOT EXISTS vector; - - CREATE EXTENSION IF NOT EXISTS vchord CASCADE; - - CREATE EXTENSION IF NOT EXISTS 
cube CASCADE; - - CREATE EXTENSION IF NOT EXISTS earthdistance CASCADE; - - # Managed roles - # Note: connectionLimit, ensure, inherit are CNPG defaults added to prevent ArgoCD drift - managed: - roles: - # borgmatic read-only user for backups - - name: borgmatic - login: true - connectionLimit: -1 - ensure: present - inherit: true - inRoles: - - pg_read_all_data - passwordSecret: - name: immich-pg-borgmatic - - # Resource limits for minikube environment - resources: - requests: - memory: "256Mi" - cpu: "100m" - limits: - memory: "1Gi" - cpu: "500m" - - # PostgreSQL configuration - postgresql: - # VectorChord requires vchord.so in shared_preload_libraries - shared_preload_libraries: - - "vchord.so" - parameters: - max_connections: "50" - shared_buffers: "128MB" - password_encryption: "scram-sha-256" - pg_hba: - # Allow connections from k8s pods - - host all all 0.0.0.0/0 scram-sha-256 - - host all all ::/0 scram-sha-256 diff --git a/argocd/manifests/databases/kustomization.yaml b/argocd/manifests/databases/kustomization.yaml index b25e09e..0393757 100644 --- a/argocd/manifests/databases/kustomization.yaml +++ b/argocd/manifests/databases/kustomization.yaml @@ -5,13 +5,8 @@ namespace: databases resources: - blumeops-pg.yaml - - immich-pg.yaml - service-tailscale.yaml - - service-immich-pg-tailscale.yaml - service-metrics-tailscale.yaml - external-secret-eblume.yaml - external-secret-borgmatic.yaml - - external-secret-immich-borgmatic.yaml - - external-secret-teslamate.yaml - external-secret-authentik.yaml - - external-secret-paperless.yaml diff --git a/argocd/manifests/devpi/README.md b/argocd/manifests/devpi/README.md deleted file mode 100644 index 11fd697..0000000 --- a/argocd/manifests/devpi/README.md +++ /dev/null @@ -1,72 +0,0 @@ -# devpi PyPI Caching Proxy - -devpi-server running in Kubernetes, providing: -- PyPI caching proxy at `root/pypi` -- Private package hosting at `eblume/dev` - -## Setup - -### 1. 
Create the root password secret - -```fish -kubectl create namespace devpi -op inject -i argocd/manifests/devpi/secret-root.yaml.tpl | kubectl apply -f - -``` - -### 2. Deploy via ArgoCD - -```fish -argocd app sync apps -argocd app sync devpi -``` - -The container will auto-initialize on first startup using the root password from the secret. - -### 3. Create user and index (first time only) - -After the pod is running: - -```fish -# Login to devpi as root -uvx --from devpi-client devpi use https://pypi.tail8d86e.ts.net -uvx --from devpi-client devpi login root -# Enter root password when prompted - -# Create eblume user (prompts for password - use the one from 1Password) -uvx --from devpi-client devpi user -c eblume email=blume.erich@gmail.com - -# Create private index inheriting from PyPI -uvx --from devpi-client devpi index -c eblume/dev bases=root/pypi -``` - -## Usage - -### As pip index (caching proxy) - -Configure `~/.config/pip/pip.conf`: - -```ini -[global] -index-url = https://pypi.tail8d86e.ts.net/root/pypi/+simple/ -trusted-host = pypi.tail8d86e.ts.net -``` - -### Upload private packages - -```fish -cd ~/code/personal/your-package -uv build -uv publish --publish-url https://pypi.tail8d86e.ts.net/eblume/dev/ -``` - -## URLs - -- Web UI: https://pypi.tail8d86e.ts.net -- PyPI cache: https://pypi.tail8d86e.ts.net/root/pypi/+simple/ -- Private index: https://pypi.tail8d86e.ts.net/eblume/dev/+simple/ - -## Credentials - -Stored in 1Password vault `blumeops`, item `kyhzfifryqnuk7jeyibmmjvxxm`: -- `root password` - devpi root user -- `password` - eblume user password diff --git a/argocd/manifests/devpi/external-secret.yaml b/argocd/manifests/devpi/external-secret.yaml deleted file mode 100644 index 290ea67..0000000 --- a/argocd/manifests/devpi/external-secret.yaml +++ /dev/null @@ -1,25 +0,0 @@ -# ExternalSecret for devpi root password -# -# Replaces the manual op inject workflow from secret-root.yaml.tpl -# -# 1Password item: "devpi" in blumeops vault -# Field: 
"root password" -# -apiVersion: external-secrets.io/v1 -kind: ExternalSecret -metadata: - name: devpi-root - namespace: devpi -spec: - refreshInterval: 1h - secretStoreRef: - kind: ClusterSecretStore - name: onepassword-blumeops - target: - name: devpi-root - creationPolicy: Owner - data: - - secretKey: password - remoteRef: - key: devpi - property: root password diff --git a/argocd/manifests/devpi/ingress-tailscale.yaml b/argocd/manifests/devpi/ingress-tailscale.yaml deleted file mode 100644 index 474bf72..0000000 --- a/argocd/manifests/devpi/ingress-tailscale.yaml +++ /dev/null @@ -1,25 +0,0 @@ -apiVersion: networking.k8s.io/v1 -kind: Ingress -metadata: - name: devpi-tailscale - namespace: devpi - annotations: - tailscale.com/proxy-class: "default" - tailscale.com/proxy-group: "ingress" - gethomepage.dev/enabled: "true" - gethomepage.dev/name: "PyPI" - gethomepage.dev/group: "Infrastructure" - gethomepage.dev/icon: "pypi.png" - gethomepage.dev/description: "PyPI cache" - gethomepage.dev/href: "https://pypi.ops.eblu.me" - gethomepage.dev/pod-selector: "app=devpi" -spec: - ingressClassName: tailscale - defaultBackend: - service: - name: devpi - port: - number: 3141 - tls: - - hosts: - - pypi diff --git a/argocd/manifests/devpi/kustomization.yaml b/argocd/manifests/devpi/kustomization.yaml deleted file mode 100644 index 2083aaa..0000000 --- a/argocd/manifests/devpi/kustomization.yaml +++ /dev/null @@ -1,14 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization - -namespace: devpi - -resources: - - statefulset.yaml - - service.yaml - - ingress-tailscale.yaml - - external-secret.yaml - -images: - - name: registry.ops.eblu.me/blumeops/devpi - newTag: v6.19.3-37b8a21 diff --git a/argocd/manifests/devpi/statefulset.yaml b/argocd/manifests/devpi/statefulset.yaml deleted file mode 100644 index 91875df..0000000 --- a/argocd/manifests/devpi/statefulset.yaml +++ /dev/null @@ -1,64 +0,0 @@ -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: devpi - 
namespace: devpi -spec: - serviceName: devpi - replicas: 1 - selector: - matchLabels: - app: devpi - template: - metadata: - labels: - app: devpi - spec: - securityContext: - fsGroup: 1000 - seccompProfile: - type: RuntimeDefault - containers: - - name: devpi - image: registry.ops.eblu.me/blumeops/devpi:kustomized - env: - - name: DEVPI_ROOT_PASSWORD - valueFrom: - secretKeyRef: - name: devpi-root - key: password - - name: DEVPI_OUTSIDE_URL - value: "https://pypi.ops.eblu.me" - ports: - - containerPort: 3141 - name: http - volumeMounts: - - name: data - mountPath: /devpi - resources: - requests: - memory: "256Mi" - cpu: "100m" - limits: - memory: "2Gi" # High limit for initial PyPI index build, reclaimed after - cpu: "500m" - livenessProbe: - httpGet: - path: /+api - port: 3141 - initialDelaySeconds: 30 - periodSeconds: 30 - readinessProbe: - httpGet: - path: /+api - port: 3141 - initialDelaySeconds: 10 - periodSeconds: 10 - volumeClaimTemplates: - - metadata: - name: data - spec: - accessModes: ["ReadWriteOnce"] - resources: - requests: - storage: 50Gi diff --git a/argocd/manifests/docs/deployment.yaml b/argocd/manifests/docs/deployment.yaml deleted file mode 100644 index c477b83..0000000 --- a/argocd/manifests/docs/deployment.yaml +++ /dev/null @@ -1,51 +0,0 @@ ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: docs - namespace: docs -spec: - replicas: 2 - strategy: - type: RollingUpdate - rollingUpdate: - maxUnavailable: 0 - maxSurge: 1 - selector: - matchLabels: - app: docs - template: - metadata: - labels: - app: docs - spec: - securityContext: - seccompProfile: - type: RuntimeDefault - containers: - - name: docs - image: registry.ops.eblu.me/blumeops/quartz:kustomized - ports: - - containerPort: 80 - name: http - env: - - name: DOCS_RELEASE_URL - value: "https://forge.eblu.me/eblume/blumeops/releases/download/v1.16.0/docs-v1.16.0.tar.gz" - resources: - requests: - memory: "64Mi" - cpu: "10m" - limits: - memory: "128Mi" - livenessProbe: - httpGet: 
- path: /healthz - port: 80 - initialDelaySeconds: 10 - periodSeconds: 30 - readinessProbe: - httpGet: - path: /healthz - port: 80 - initialDelaySeconds: 5 - periodSeconds: 10 diff --git a/argocd/manifests/docs/ingress-tailscale.yaml b/argocd/manifests/docs/ingress-tailscale.yaml deleted file mode 100644 index 047e823..0000000 --- a/argocd/manifests/docs/ingress-tailscale.yaml +++ /dev/null @@ -1,27 +0,0 @@ ---- -apiVersion: networking.k8s.io/v1 -kind: Ingress -metadata: - name: docs-tailscale - namespace: docs - annotations: - tailscale.com/proxy-class: "default" - tailscale.com/proxy-group: "ingress" - tailscale.com/tags: "tag:k8s,tag:flyio-target" - gethomepage.dev/enabled: "true" - gethomepage.dev/name: "Docs" - gethomepage.dev/group: "Services" - gethomepage.dev/icon: "mdi-book-open-page-variant" - gethomepage.dev/description: "BlumeOps Documentation" - gethomepage.dev/href: "https://docs.eblu.me" - gethomepage.dev/pod-selector: "app=docs" -spec: - ingressClassName: tailscale - defaultBackend: - service: - name: docs - port: - number: 80 - tls: - - hosts: - - docs diff --git a/argocd/manifests/docs/kustomization.yaml b/argocd/manifests/docs/kustomization.yaml deleted file mode 100644 index a16185f..0000000 --- a/argocd/manifests/docs/kustomization.yaml +++ /dev/null @@ -1,12 +0,0 @@ ---- -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization -namespace: docs -resources: - - deployment.yaml - - service.yaml - - ingress-tailscale.yaml - - pdb.yaml -images: - - name: registry.ops.eblu.me/blumeops/quartz - newTag: v1.28.2-613f05d diff --git a/argocd/manifests/docs/pdb.yaml b/argocd/manifests/docs/pdb.yaml deleted file mode 100644 index a87b8e9..0000000 --- a/argocd/manifests/docs/pdb.yaml +++ /dev/null @@ -1,10 +0,0 @@ ---- -apiVersion: policy/v1 -kind: PodDisruptionBudget -metadata: - name: docs -spec: - minAvailable: 1 - selector: - matchLabels: - app: docs diff --git a/argocd/manifests/docs/service.yaml b/argocd/manifests/docs/service.yaml deleted 
file mode 100644 index 62b0f83..0000000 --- a/argocd/manifests/docs/service.yaml +++ /dev/null @@ -1,13 +0,0 @@ ---- -apiVersion: v1 -kind: Service -metadata: - name: docs - namespace: docs -spec: - selector: - app: docs - ports: - - name: http - port: 80 - targetPort: 80 diff --git a/argocd/manifests/external-secrets-ringtail/kustomization.yaml b/argocd/manifests/external-secrets-ringtail/kustomization.yaml new file mode 100644 index 0000000..9fd4e2f --- /dev/null +++ b/argocd/manifests/external-secrets-ringtail/kustomization.yaml @@ -0,0 +1,16 @@ +# Ringtail (amd64) overlay for external-secrets. +# +# Reuses the shared indri manifest as a base and only overrides the controller +# image to the nix-built amd64 variant (`-nix` tag). The base sets the arm64 +# image (built via containers/external-secrets/container.py on indri's Dagger +# runner); ringtail's k3s is amd64 and needs the image built by +# containers/external-secrets/default.nix on the nix-container-builder. +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +resources: + - ../external-secrets + +images: + - name: registry.ops.eblu.me/blumeops/external-secrets + newTag: v2.2.0-13895bb-nix diff --git a/argocd/manifests/external-secrets/kustomization.yaml b/argocd/manifests/external-secrets/kustomization.yaml index 574aaa7..639db66 100644 --- a/argocd/manifests/external-secrets/kustomization.yaml +++ b/argocd/manifests/external-secrets/kustomization.yaml @@ -12,4 +12,5 @@ resources: images: - name: ghcr.io/external-secrets/external-secrets - newTag: v2.2.0 + newName: registry.ops.eblu.me/blumeops/external-secrets + newTag: v2.2.0-13895bb diff --git a/argocd/manifests/forgejo-runner/config.yaml b/argocd/manifests/forgejo-runner/config.yaml index 4894825..01ede7c 100644 --- a/argocd/manifests/forgejo-runner/config.yaml +++ b/argocd/manifests/forgejo-runner/config.yaml @@ -1,9 +1,8 @@ -# Reviewed against v12.7.3 defaults (2026-03-30) +# Reviewed against v12.8.2 defaults (2026-04-20) log: 
level: info runner: - file: /data/.runner capacity: 2 timeout: 3h shutdown_timeout: 3h @@ -13,7 +12,15 @@ runner: TZ: America/Los_Angeles container: - # Job execution image is set via RUNNER_LABELS in deployment.yaml network: "host" # Connect to DinD sidecar via TCP (not socket) docker_host: tcp://127.0.0.1:2375 + +server: + connections: + forgejo: + url: https://forge.ops.eblu.me/ + uuid: ${FORGEJO_RUNNER_UUID} + token: ${FORGEJO_RUNNER_TOKEN} + labels: + - k8s:docker://registry.ops.eblu.me/blumeops/runner-job-image:v0.20.6-50f8c2a diff --git a/argocd/manifests/forgejo-runner/deployment.yaml b/argocd/manifests/forgejo-runner/deployment.yaml index c793895..7db7798 100644 --- a/argocd/manifests/forgejo-runner/deployment.yaml +++ b/argocd/manifests/forgejo-runner/deployment.yaml @@ -25,14 +25,6 @@ spec: env: - name: TZ value: America/Los_Angeles - - name: DOCKER_HOST - value: tcp://localhost:2375 - - name: FORGEJO_URL - value: "https://forge.ops.eblu.me" - - name: RUNNER_NAME - value: "k8s-runner" - - name: RUNNER_LABELS - value: "k8s:docker://registry.ops.eblu.me/blumeops/runner-job-image:v0.20.1-24f7512" command: - /bin/sh - -c @@ -44,19 +36,11 @@ spec: done echo "Docker daemon ready" - # Register if not already registered - if [ ! -f /data/.runner ]; then - echo "Registering runner..." - forgejo-runner register \ - --instance "$FORGEJO_URL" \ - --token "$RUNNER_TOKEN" \ - --name "$RUNNER_NAME" \ - --labels "$RUNNER_LABELS" \ - --no-interactive - fi + # Render config with credentials from ExternalSecret. 
+ envsubst < /config/config.yaml > /tmp/config.yaml # Start daemon - exec forgejo-runner daemon --config /config/config.yaml + exec forgejo-runner daemon --config /tmp/config.yaml envFrom: - secretRef: name: forgejo-runner-env diff --git a/argocd/manifests/forgejo-runner/external-secret.yaml b/argocd/manifests/forgejo-runner/external-secret.yaml index fce28bb..ab7a691 100644 --- a/argocd/manifests/forgejo-runner/external-secret.yaml +++ b/argocd/manifests/forgejo-runner/external-secret.yaml @@ -1,11 +1,7 @@ -# ExternalSecret for Forgejo Runner token +# ExternalSecret for Forgejo Runner credentials # # 1Password item: "Forgejo Secrets" in blumeops vault -# Field: runner_reg (runner registration token) -# -# Non-secret env vars (FORGEJO_URL, RUNNER_NAME, RUNNER_LABELS) live in the -# deployment spec so that changes (e.g. image version bumps) trigger a rollout -# automatically. +# Fields: runner_k8s_uuid, runner_k8s_token # apiVersion: external-secrets.io/v1 kind: ExternalSecret @@ -21,7 +17,11 @@ spec: name: forgejo-runner-env creationPolicy: Owner data: - - secretKey: RUNNER_TOKEN + - secretKey: FORGEJO_RUNNER_UUID remoteRef: key: Forgejo Secrets - property: runner_reg + property: runner_k8s_uuid + - secretKey: FORGEJO_RUNNER_TOKEN + remoteRef: + key: Forgejo Secrets + property: runner_k8s_token diff --git a/argocd/manifests/forgejo-runner/kustomization.yaml b/argocd/manifests/forgejo-runner/kustomization.yaml index f8d9377..93cd33b 100644 --- a/argocd/manifests/forgejo-runner/kustomization.yaml +++ b/argocd/manifests/forgejo-runner/kustomization.yaml @@ -11,7 +11,7 @@ resources: images: - name: code.forgejo.org/forgejo/runner newName: registry.ops.eblu.me/blumeops/forgejo-runner - newTag: v12.7.3-352b95c + newTag: v12.8.2-1425bf1 - name: docker newTag: 27-dind diff --git a/argocd/manifests/frigate/deployment-notify.yaml b/argocd/manifests/frigate/deployment-notify.yaml index 740d104..91f4237 100644 --- a/argocd/manifests/frigate/deployment-notify.yaml +++ 
b/argocd/manifests/frigate/deployment-notify.yaml @@ -16,7 +16,7 @@ spec: spec: containers: - name: frigate-notify - image: ghcr.io/0x2142/frigate-notify:kustomized + image: registry.ops.eblu.me/blumeops/frigate-notify:kustomized env: - name: TZ value: America/Los_Angeles diff --git a/argocd/manifests/frigate/kustomization.yaml b/argocd/manifests/frigate/kustomization.yaml index b424bd0..a61c758 100644 --- a/argocd/manifests/frigate/kustomization.yaml +++ b/argocd/manifests/frigate/kustomization.yaml @@ -17,8 +17,8 @@ images: newTag: "1.37" - name: ghcr.io/blakeblackshear/frigate newTag: 0.17.1-tensorrt - - name: ghcr.io/0x2142/frigate-notify - newTag: v0.5.4 + - name: registry.ops.eblu.me/blumeops/frigate-notify + newTag: v0.5.4-e928054-nix configMapGenerator: - name: frigate-config diff --git a/argocd/manifests/grafana-config/dashboards/configmap-shower-apm.yaml b/argocd/manifests/grafana-config/dashboards/configmap-shower-apm.yaml new file mode 100644 index 0000000..96348e8 --- /dev/null +++ b/argocd/manifests/grafana-config/dashboards/configmap-shower-apm.yaml @@ -0,0 +1,229 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-shower-apm + namespace: monitoring + labels: + grafana_dashboard: "1" +data: + shower-apm.json: | + { + "annotations": { "list": [] }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisLabel": "req/s", + "drawStyle": "line", + "fillOpacity": 20, + "lineInterpolation": "linear", + "lineWidth": 1, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "normal" } + }, + "mappings": [], + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 16, "x": 0, "y": 0 }, + 
"id": 1, + "options": { + "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "right", "showLegend": true }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (status) (rate(flyio_nginx_http_requests_total{host=\"shower.eblu.me\"}[5m]))", "legendFormat": "{{status}}", "refId": "A" } + ], + "title": "Request Rate by Status", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 0.01 }, { "color": "red", "value": 0.05 }] }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 8, "x": 16, "y": 0 }, + "id": 2, + "options": { + "colorMode": "background", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum(rate(flyio_nginx_http_requests_total{host=\"shower.eblu.me\",status=~\"5..\"}[5m])) / sum(rate(flyio_nginx_http_requests_total{host=\"shower.eblu.me\"}[5m]))", "refId": "A" } + ], + "title": "Error Rate (5xx)", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }, { "color": "red", "value": 5 }] }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 16, "y": 4 }, + "id": 3, + "options": { + "colorMode": "background", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { "calcs": 
["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum(increase(flyio_nginx_http_requests_total{host=\"shower.eblu.me\",request_uri=~\"/admin/login.*\",status=~\"4..\"}[$__range]))", "refId": "A" } + ], + "title": "Failed admin logins (range)", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 20, "y": 4 }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "auto" + }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum(rate(flyio_nginx_http_requests_total{host=\"shower.eblu.me\"}[5m]))", "refId": "A" } + ], + "title": "Current RPS", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisLabel": "seconds", + "drawStyle": "line", + "fillOpacity": 10, + "lineInterpolation": "linear", + "lineWidth": 1, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" } + }, + "mappings": [], + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, + "id": 5, + "options": { + "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "right", "showLegend": true }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": 
"prometheus" }, "expr": "histogram_quantile(0.50, sum by (le) (rate(flyio_nginx_http_request_duration_seconds_bucket{host=\"shower.eblu.me\"}[5m])))", "legendFormat": "p50", "refId": "A" }, + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "histogram_quantile(0.90, sum by (le) (rate(flyio_nginx_http_request_duration_seconds_bucket{host=\"shower.eblu.me\"}[5m])))", "legendFormat": "p90", "refId": "B" }, + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "histogram_quantile(0.99, sum by (le) (rate(flyio_nginx_http_request_duration_seconds_bucket{host=\"shower.eblu.me\"}[5m])))", "legendFormat": "p99", "refId": "C" } + ], + "title": "Latency Percentiles", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisLabel": "", + "drawStyle": "line", + "fillOpacity": 20, + "lineInterpolation": "linear", + "lineWidth": 1, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" } + }, + "mappings": [], + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 }, + "id": 6, + "options": { + "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "right", "showLegend": true }, + "tooltip": { "mode": "single", "sort": "none" } + }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum(rate(flyio_nginx_http_response_bytes_total{host=\"shower.eblu.me\"}[5m]))", "legendFormat": "Bandwidth", "refId": "A" } + ], + "title": "Bandwidth", + "type": "timeseries" + }, + { + "datasource": { "type": "loki", "uid": "loki" }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 16 }, + "id": 7, + "options": { + "dedupStrategy": "none", + "enableLogDetails": true, + "prettifyLogMessage": false, + 
"showCommonLabels": false, + "showLabels": false, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": false + }, + "targets": [ + { "datasource": { "type": "loki", "uid": "loki" }, "expr": "{instance=\"flyio-proxy\", job=\"flyio-nginx\"} |= \"shower.eblu.me\" | json | line_format \"{{.client_ip}} {{.request_method}} {{.request_uri}} {{.status}} {{.request_time}}s\"", "refId": "A" } + ], + "title": "Recent Access Logs", + "type": "logs" + } + ], + "refresh": "30s", + "schemaVersion": 38, + "tags": ["shower", "flyio", "apm"], + "templating": { "list": [] }, + "time": { "from": "now-6h", "to": "now" }, + "timepicker": {}, + "timezone": "", + "title": "Shower APM", + "uid": "shower-apm", + "version": 1, + "weekStart": "" + } diff --git a/argocd/manifests/grafana-config/kustomization.yaml b/argocd/manifests/grafana-config/kustomization.yaml index a6e8000..b518043 100644 --- a/argocd/manifests/grafana-config/kustomization.yaml +++ b/argocd/manifests/grafana-config/kustomization.yaml @@ -22,6 +22,7 @@ resources: - dashboards/configmap-transmission.yaml - dashboards/configmap-cv-apm.yaml - dashboards/configmap-docs-apm.yaml + - dashboards/configmap-shower-apm.yaml - dashboards/configmap-flyio.yaml - dashboards/configmap-sifaka-disks.yaml - dashboards/configmap-forgejo.yaml diff --git a/argocd/manifests/grafana/datasources.yaml b/argocd/manifests/grafana/datasources.yaml index 5a3d0f3..64ed2bf 100644 --- a/argocd/manifests/grafana/datasources.yaml +++ b/argocd/manifests/grafana/datasources.yaml @@ -63,5 +63,7 @@ datasources: password: $TESLAMATE_DB_PASSWORD type: postgres uid: TeslaMate - url: blumeops-pg-rw.databases.svc.cluster.local:5432 + # teslamate DB migrated to ringtail blumeops-pg (wave-1); reached via the + # Caddy L4 route on indri (pg.ops.eblu.me:5434 -> blumeops-pg-ringtail). 
+ url: pg.ops.eblu.me:5434 user: teslamate diff --git a/argocd/manifests/grafana/deployment.yaml b/argocd/manifests/grafana/deployment.yaml index 848503e..cbba267 100644 --- a/argocd/manifests/grafana/deployment.yaml +++ b/argocd/manifests/grafana/deployment.yaml @@ -14,7 +14,9 @@ spec: app.kubernetes.io/name: grafana app.kubernetes.io/instance: grafana strategy: - type: RollingUpdate + # RWO PVC for SQLite + Bleve index — RollingUpdate spawns the new pod + # before the old one terminates, and it crashloops on the index lock. + type: Recreate template: metadata: labels: @@ -156,7 +158,9 @@ spec: - name: FOLDER value: /tmp/dashboards - name: RESOURCE - value: both + # ConfigMap-only — no dashboards are sourced from Secrets, + # so the ServiceAccount has no read access to secrets. + value: configmap - name: FOLDER_ANNOTATION value: grafana_folder securityContext: @@ -183,7 +187,7 @@ spec: - name: FOLDER value: /tmp/dashboards - name: RESOURCE - value: both + value: configmap - name: FOLDER_ANNOTATION value: grafana_folder - name: REQ_USERNAME diff --git a/argocd/manifests/grafana/rbac.yaml b/argocd/manifests/grafana/rbac.yaml index d0d0c843..1c2dee3 100644 --- a/argocd/manifests/grafana/rbac.yaml +++ b/argocd/manifests/grafana/rbac.yaml @@ -7,7 +7,7 @@ metadata: app.kubernetes.io/instance: grafana rules: - apiGroups: [""] - resources: ["configmaps", "secrets"] + resources: ["configmaps"] verbs: ["get", "watch", "list"] --- apiVersion: rbac.authorization.k8s.io/v1 diff --git a/argocd/manifests/homepage/kustomization.yaml b/argocd/manifests/homepage/kustomization.yaml index 27de0eb..31b6847 100644 --- a/argocd/manifests/homepage/kustomization.yaml +++ b/argocd/manifests/homepage/kustomization.yaml @@ -17,7 +17,7 @@ resources: images: - name: registry.ops.eblu.me/blumeops/homepage - newTag: v1.11.0-e375859 + newTag: v1.11.0-678f26b-nix configMapGenerator: - name: homepage-config diff --git a/argocd/manifests/homepage/services.yaml 
b/argocd/manifests/homepage/services.yaml index 58b8bb7..cc1adf4 100644 --- a/argocd/manifests/homepage/services.yaml +++ b/argocd/manifests/homepage/services.yaml @@ -1,3 +1,6 @@ +# Homepage runs on ringtail (k3s) — its k8s autodiscovery only sees ringtail +# Ingresses (frigate→NVR, authentik, ntfy, ollama). Services that live on +# minikube (and indri-native) need explicit static entries here. - Host Services: - Forgejo: href: https://forge.eblu.me @@ -12,6 +15,10 @@ href: https://registry.ops.eblu.me icon: zot-registry description: Container registry + - Devpi: + href: https://pypi.ops.eblu.me + icon: mdi-language-python + description: PyPI caching mirror - Sifaka NAS: href: https://nas.ops.eblu.me icon: synology @@ -53,10 +60,6 @@ # type: caddy # url: http://indri.tail8d86e.ts.net:2019 - Home: - - NVR: - href: https://nvr.ops.eblu.me - icon: frigate.png - description: Network video recorder - Jellyfin: href: https://jellyfin.ops.eblu.me icon: jellyfin @@ -68,12 +71,62 @@ enableBlocks: true enableNowPlaying: false fields: ["movies", "series", "episodes"] + - DJ: + href: https://dj.ops.eblu.me + icon: navidrome.png + description: Music streaming server + widget: + type: navidrome + url: https://dj.ops.eblu.me + user: "{{HOMEPAGE_VAR_NAVIDROME_USER}}" + token: "{{HOMEPAGE_VAR_NAVIDROME_TOKEN}}" + salt: "{{HOMEPAGE_VAR_NAVIDROME_SALT}}" +- Content: + - Kiwix: + href: https://kiwix.ops.eblu.me + icon: kiwix.png + description: Offline Wikipedia + - Miniflux: + href: https://feed.ops.eblu.me + icon: miniflux.png + description: RSS reader + widget: + type: miniflux + url: https://feed.ops.eblu.me + key: "{{HOMEPAGE_VAR_MINIFLUX_API_KEY}}" + fields: ["unread"] - Infrastructure: - - Authentik: - href: https://authentik.ops.eblu.me - icon: authentik - description: Identity provider - - Ntfy: - href: https://ntfy.ops.eblu.me - icon: ntfy.png - description: Push notifications + - ArgoCD: + href: https://argocd.ops.eblu.me + icon: argo-cd.png + description: GitOps CD + - 
Grafana: + href: https://grafana.ops.eblu.me + icon: grafana.png + description: Metrics dashboards + widget: + type: grafana + url: https://grafana.ops.eblu.me + username: "{{HOMEPAGE_VAR_GRAFANA_USERNAME}}" + password: "{{HOMEPAGE_VAR_GRAFANA_PASSWORD}}" + fields: ["dashboards", "totalalerts", "alertstriggered"] + - Prometheus: + href: https://prometheus.ops.eblu.me + icon: prometheus.png + description: Metrics storage +- Services: + # CV and Docs were previously auto-discovered from k8s Ingresses; after + # the indri-native migration ([[cv-on-indri]], [[docs-on-indri]]) there + # is no Ingress to discover, so they live here as static entries. + - CV: + href: https://cv.eblu.me + icon: mdi-file-document + description: Resume / CV + - Docs: + href: https://docs.eblu.me + icon: mdi-book-open-page-variant + description: BlumeOps Documentation + - Transmission: + href: https://torrent.ops.eblu.me + icon: transmission.png + description: Torrent client diff --git a/argocd/manifests/immich/deployment-ml.yaml b/argocd/manifests/immich-ringtail/deployment-ml.yaml similarity index 83% rename from argocd/manifests/immich/deployment-ml.yaml rename to argocd/manifests/immich-ringtail/deployment-ml.yaml index 57c4242..5ea8035 100644 --- a/argocd/manifests/immich/deployment-ml.yaml +++ b/argocd/manifests/immich-ringtail/deployment-ml.yaml @@ -16,11 +16,16 @@ spec: app: immich component: machine-learning spec: + runtimeClassName: nvidia securityContext: seccompProfile: type: RuntimeDefault containers: - name: machine-learning + # ringtail uses the -cuda tag (set in kustomization.yaml) + # to take advantage of the RTX 4080 via the nvidia + # device plugin. Time-slicing is configured for 4 replicas + # so frigate + ollama + this pod can share. 
image: ghcr.io/immich-app/immich-machine-learning:kustomized ports: - name: http @@ -57,6 +62,7 @@ spec: cpu: "100m" limits: memory: "4Gi" + nvidia.com/gpu: "1" volumes: - name: cache persistentVolumeClaim: diff --git a/argocd/manifests/immich/deployment-server.yaml b/argocd/manifests/immich-ringtail/deployment-server.yaml similarity index 100% rename from argocd/manifests/immich/deployment-server.yaml rename to argocd/manifests/immich-ringtail/deployment-server.yaml diff --git a/argocd/manifests/immich/deployment-valkey.yaml b/argocd/manifests/immich-ringtail/deployment-valkey.yaml similarity index 100% rename from argocd/manifests/immich/deployment-valkey.yaml rename to argocd/manifests/immich-ringtail/deployment-valkey.yaml diff --git a/argocd/manifests/immich/ingress-tailscale.yaml b/argocd/manifests/immich-ringtail/ingress-tailscale.yaml similarity index 62% rename from argocd/manifests/immich/ingress-tailscale.yaml rename to argocd/manifests/immich-ringtail/ingress-tailscale.yaml index 59a4c05..f0b5fe1 100644 --- a/argocd/manifests/immich/ingress-tailscale.yaml +++ b/argocd/manifests/immich-ringtail/ingress-tailscale.yaml @@ -1,6 +1,9 @@ -# Tailscale Ingress for Immich -# Exposes Immich at photos.tail8d86e.ts.net -# Caddy will proxy photos.ops.eblu.me to this endpoint +# Tailscale ProxyGroup Ingress for Immich on ringtail. +# +# Production hostname: photos.tail8d86e.ts.net +# (during the cutover window this was photos-ringtail; the minikube +# ingress was torn down before this was renamed to photos to avoid +# the Tailscale device-name collision.) 
apiVersion: networking.k8s.io/v1 kind: Ingress metadata: @@ -16,12 +19,6 @@ metadata: gethomepage.dev/description: "Photo management" gethomepage.dev/href: "https://photos.ops.eblu.me" gethomepage.dev/pod-selector: "app=immich,component=server" - # TODO: Add Immich widget - requires API key from Account Settings > API Keys - # See: https://gethomepage.dev/widgets/services/immich/ - # gethomepage.dev/widget.type: "immich" - # gethomepage.dev/widget.url: "https://photos.ops.eblu.me" - # gethomepage.dev/widget.key: "{{HOMEPAGE_VAR_IMMICH_API_KEY}}" - # gethomepage.dev/widget.version: "2" spec: ingressClassName: tailscale rules: diff --git a/argocd/manifests/immich/kustomization.yaml b/argocd/manifests/immich-ringtail/kustomization.yaml similarity index 56% rename from argocd/manifests/immich/kustomization.yaml rename to argocd/manifests/immich-ringtail/kustomization.yaml index c7c54e1..2fa131c 100644 --- a/argocd/manifests/immich/kustomization.yaml +++ b/argocd/manifests/immich-ringtail/kustomization.yaml @@ -1,7 +1,8 @@ ---- apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization + namespace: immich + resources: - deployment-server.yaml - deployment-ml.yaml @@ -13,10 +14,16 @@ resources: - pv-nfs.yaml - pvc.yaml - ingress-tailscale.yaml + images: - name: ghcr.io/immich-app/immich-server newTag: v2.6.3 - name: ghcr.io/immich-app/immich-machine-learning - newTag: v2.6.3 + # CUDA variant of the same release — ringtail has an RTX 4080 + newTag: v2.6.3-cuda + # amd64 valkey built via nix on the ringtail nix-container-builder + # (see containers/valkey/default.nix). The Alpine container.py build + # is arm64-only and serves paperless on indri. 
- name: docker.io/valkey/valkey - newTag: "8.1-alpine" + newName: registry.ops.eblu.me/blumeops/valkey + newTag: v8.1.7-ecded30-nix diff --git a/argocd/manifests/immich-ringtail/pv-nfs.yaml b/argocd/manifests/immich-ringtail/pv-nfs.yaml new file mode 100644 index 0000000..3d5a682 --- /dev/null +++ b/argocd/manifests/immich-ringtail/pv-nfs.yaml @@ -0,0 +1,29 @@ +# NFS PersistentVolume for Immich photo library on ringtail k3s. +# +# Mirror of argocd/manifests/immich/pv-nfs.yaml (minikube) but with +# a distinct name (minikube and ringtail are separate clusters, so PV +# names don't collide cluster-side, but using the same name in two +# manifests is confusing). +# +# The sifaka NFS export for /volume1/photos already permits +# 192.168.1.0/24 + 100.64.0.0/10. Ringtail's wired IP (192.168.1.21) +# falls in the first CIDR, so no DSM rule changes are needed. +# +# Verified 2026-05-13: ringtail pod can read existing dirs, write +# new files, and delete them. DNS resolves sifaka to 192.168.1.203 +# (LAN), so NFS traffic stays off the tailnet — avoids the known +# sifaka-tailscale-userspace bite. 
+apiVersion: v1 +kind: PersistentVolume +metadata: + name: immich-library-nfs-pv-ringtail +spec: + capacity: + storage: 2Ti + accessModes: + - ReadWriteMany + persistentVolumeReclaimPolicy: Retain + storageClassName: "" + nfs: + server: sifaka + path: /volume1/photos diff --git a/argocd/manifests/immich/pvc-ml-cache.yaml b/argocd/manifests/immich-ringtail/pvc-ml-cache.yaml similarity index 100% rename from argocd/manifests/immich/pvc-ml-cache.yaml rename to argocd/manifests/immich-ringtail/pvc-ml-cache.yaml diff --git a/argocd/manifests/immich/pvc.yaml b/argocd/manifests/immich-ringtail/pvc.yaml similarity index 54% rename from argocd/manifests/immich/pvc.yaml rename to argocd/manifests/immich-ringtail/pvc.yaml index c764636..5bfc052 100644 --- a/argocd/manifests/immich/pvc.yaml +++ b/argocd/manifests/immich-ringtail/pvc.yaml @@ -1,5 +1,5 @@ -# PersistentVolumeClaim for Immich photo library -# Binds to the NFS PV for sifaka:/volume1/photos +# PersistentVolumeClaim for Immich photo library on ringtail. +# Binds to immich-library-nfs-pv-ringtail (sifaka:/volume1/photos). 
apiVersion: v1 kind: PersistentVolumeClaim metadata: @@ -9,7 +9,7 @@ spec: accessModes: - ReadWriteMany storageClassName: "" - volumeName: immich-library-nfs-pv + volumeName: immich-library-nfs-pv-ringtail resources: requests: storage: 2Ti diff --git a/argocd/manifests/immich/service-ml.yaml b/argocd/manifests/immich-ringtail/service-ml.yaml similarity index 100% rename from argocd/manifests/immich/service-ml.yaml rename to argocd/manifests/immich-ringtail/service-ml.yaml diff --git a/argocd/manifests/immich/service-valkey.yaml b/argocd/manifests/immich-ringtail/service-valkey.yaml similarity index 100% rename from argocd/manifests/immich/service-valkey.yaml rename to argocd/manifests/immich-ringtail/service-valkey.yaml diff --git a/argocd/manifests/immich/service.yaml b/argocd/manifests/immich-ringtail/service.yaml similarity index 100% rename from argocd/manifests/immich/service.yaml rename to argocd/manifests/immich-ringtail/service.yaml diff --git a/argocd/manifests/immich/README.md b/argocd/manifests/immich/README.md deleted file mode 100644 index a82a856..0000000 --- a/argocd/manifests/immich/README.md +++ /dev/null @@ -1,115 +0,0 @@ -# Immich - -Self-hosted photo and video management solution with AI-powered search and face recognition. - -## Prerequisites - -1. **NFS Share**: Create `/volume1/photos` on sifaka with NFS permissions for indri -2. **PostgreSQL**: The `immich-pg` cluster (with pgvecto.rs) must be healthy -3. **Secrets**: Create the database password secret - -## Deployment Order - -1. Sync `blumeops-pg` (to get CloudNativePG operator if not already running) -2. Wait for `immich-pg` cluster to be healthy -3. Create secrets (see below) -4. Sync `immich` (deploys all resources: storage, services, deployments) -5. 
Run `mise run provision-indri -- --tags caddy` to update Caddy config - -## Components - -| Component | Deployment | Service | Port | -|-----------|------------|---------|------| -| Server (web/API) | `immich-server` | `immich-server` | 2283 | -| Machine Learning | `immich-machine-learning` | `immich-machine-learning` | 3003 | -| Valkey (Redis) | `immich-valkey` | `immich-valkey` | 6379 | - -## Secret Setup - -The `immich-db` secret contains the database password, which is auto-generated by CloudNativePG -in the `immich-pg-app` secret. To create or regenerate the secret: - -```bash -# Create namespace if needed -kubectl --context=minikube-indri create namespace immich - -# Copy password from CNPG secret to immich namespace -kubectl --context=minikube-indri create secret generic immich-db -n immich \ - --from-literal=password="$(kubectl --context=minikube-indri -n databases get secret immich-pg-app -o jsonpath='{.data.password}' | base64 -d)" -``` - -Note: This secret is not managed by ExternalSecrets since the source of truth is the CNPG-generated secret. - -## Access - -- **URL**: https://photos.ops.eblu.me (after Caddy is updated) -- **Tailscale**: https://photos.tail8d86e.ts.net (direct) - -## First-Time Setup - -1. Navigate to https://photos.ops.eblu.me -2. Create an admin account -3. Configure external library (optional - for importing existing photos) - -## External Library (iCloud Photos) - -To import existing photos from iCloud sync on indri: - -1. In Immich Admin > External Libraries, create a new library -2. Set the import path to the location where iCloud photos sync -3. 
Configure scan schedule or trigger manual scan - -## Architecture - -``` -┌─────────────────┐ ┌─────────────────┐ -│ immich-server │────▶│ immich-pg │ -│ (web/api) │ │ (PostgreSQL │ -└────────┬────────┘ │ + pgvecto.rs) │ - │ └─────────────────┘ - │ -┌────────▼────────┐ ┌─────────────────┐ -│ immich-ml │ │ valkey │ -│ (ML inference) │ │ (Redis cache) │ -└─────────────────┘ └─────────────────┘ - │ -┌────────▼────────┐ -│ sifaka NFS │ -│ /volume1/photos│ -└─────────────────┘ -``` - -## Version Management - -Image versions are controlled via `kustomization.yaml`: - -```yaml -images: - - name: ghcr.io/immich-app/immich-server - newTag: v2.6.3 - - name: ghcr.io/immich-app/immich-machine-learning - newTag: v2.6.3 - - name: docker.io/valkey/valkey - newTag: "8.1-alpine" -``` - -To upgrade, update `newTag` values and sync via ArgoCD. - -## Troubleshooting - -```bash -# Check pods -kubectl --context=minikube-indri -n immich get pods - -# Check immich-pg cluster -kubectl --context=minikube-indri -n databases get cluster immich-pg - -# View server logs -kubectl --context=minikube-indri -n immich logs -l app=immich,component=server - -# View ML logs -kubectl --context=minikube-indri -n immich logs -l app=immich,component=machine-learning - -# Check PVC binding -kubectl --context=minikube-indri -n immich get pvc -``` diff --git a/argocd/manifests/immich/pv-nfs.yaml b/argocd/manifests/immich/pv-nfs.yaml deleted file mode 100644 index 0bd6ee2..0000000 --- a/argocd/manifests/immich/pv-nfs.yaml +++ /dev/null @@ -1,22 +0,0 @@ -# NFS PersistentVolume for Immich photo library -# Requires: NFS share on sifaka at /volume1/photos with NFS permissions for indri -# -# To create on Synology: -# 1. Control Panel > Shared Folder > Create -# 2. Name: photos, Location: Volume 1 -# 3. Control Panel > File Services > NFS > NFS Rules -# 4. 
Add rule for "photos" share: Hostname=indri, Privilege=Read/Write, Squash=No mapping -apiVersion: v1 -kind: PersistentVolume -metadata: - name: immich-library-nfs-pv -spec: - capacity: - storage: 2Ti - accessModes: - - ReadWriteMany - persistentVolumeReclaimPolicy: Retain - storageClassName: "" - nfs: - server: sifaka - path: /volume1/photos diff --git a/argocd/manifests/mealie/deployment.yaml b/argocd/manifests/mealie-ringtail/deployment.yaml similarity index 89% rename from argocd/manifests/mealie/deployment.yaml rename to argocd/manifests/mealie-ringtail/deployment.yaml index bdcf91e..10d06ab 100644 --- a/argocd/manifests/mealie/deployment.yaml +++ b/argocd/manifests/mealie-ringtail/deployment.yaml @@ -1,3 +1,9 @@ +# Mealie on ringtail k3s — Nix image. +# +# Single gunicorn process (the Nix image's default `mealie-run` entrypoint +# runs init_db then gunicorn), serving the prebuilt frontend. DB is SQLite +# on the mealie-data PVC; its contents are copied from the minikube PVC at +# cutover. See [[migrate-wave1-ringtail]]. 
apiVersion: apps/v1 kind: Deployment metadata: @@ -5,6 +11,8 @@ metadata: namespace: mealie spec: replicas: 1 + strategy: + type: Recreate selector: matchLabels: app: mealie diff --git a/argocd/manifests/mealie/external-secret.yaml b/argocd/manifests/mealie-ringtail/external-secret.yaml similarity index 100% rename from argocd/manifests/mealie/external-secret.yaml rename to argocd/manifests/mealie-ringtail/external-secret.yaml diff --git a/argocd/manifests/mealie/ingress-tailscale.yaml b/argocd/manifests/mealie-ringtail/ingress-tailscale.yaml similarity index 100% rename from argocd/manifests/mealie/ingress-tailscale.yaml rename to argocd/manifests/mealie-ringtail/ingress-tailscale.yaml diff --git a/argocd/manifests/mealie/kustomization.yaml b/argocd/manifests/mealie-ringtail/kustomization.yaml similarity index 88% rename from argocd/manifests/mealie/kustomization.yaml rename to argocd/manifests/mealie-ringtail/kustomization.yaml index fb0713b..ad65785 100644 --- a/argocd/manifests/mealie/kustomization.yaml +++ b/argocd/manifests/mealie-ringtail/kustomization.yaml @@ -12,4 +12,4 @@ resources: images: - name: registry.ops.eblu.me/blumeops/mealie - newTag: v3.12.0-613f05d + newTag: v3.16.0-e0057b4-nix diff --git a/argocd/manifests/mealie/pvc.yaml b/argocd/manifests/mealie-ringtail/pvc.yaml similarity index 50% rename from argocd/manifests/mealie/pvc.yaml rename to argocd/manifests/mealie-ringtail/pvc.yaml index f473e07..89c38ef 100644 --- a/argocd/manifests/mealie/pvc.yaml +++ b/argocd/manifests/mealie-ringtail/pvc.yaml @@ -1,4 +1,5 @@ ---- +# SQLite data volume for Mealie on ringtail. Contents copied from the +# minikube mealie-data PVC at cutover (recipes, meal plans, uploaded media). 
apiVersion: v1 kind: PersistentVolumeClaim metadata: @@ -7,7 +8,7 @@ metadata: spec: accessModes: - ReadWriteOnce - storageClassName: standard + storageClassName: local-path resources: requests: storage: 2Gi diff --git a/argocd/manifests/mealie/service.yaml b/argocd/manifests/mealie-ringtail/service.yaml similarity index 100% rename from argocd/manifests/mealie/service.yaml rename to argocd/manifests/mealie-ringtail/service.yaml diff --git a/argocd/manifests/nvidia-device-plugin/kustomization.yaml b/argocd/manifests/nvidia-device-plugin/kustomization.yaml index a46edf6..f5a33ae 100644 --- a/argocd/manifests/nvidia-device-plugin/kustomization.yaml +++ b/argocd/manifests/nvidia-device-plugin/kustomization.yaml @@ -10,4 +10,4 @@ resources: images: - name: nvcr.io/nvidia/k8s-device-plugin - newTag: v0.19.0 + newTag: v0.19.2 diff --git a/argocd/manifests/nvidia-device-plugin/time-slicing-config.yaml b/argocd/manifests/nvidia-device-plugin/time-slicing-config.yaml index dee2fd7..100e7a9 100644 --- a/argocd/manifests/nvidia-device-plugin/time-slicing-config.yaml +++ b/argocd/manifests/nvidia-device-plugin/time-slicing-config.yaml @@ -11,4 +11,4 @@ data: timeSlicing: resources: - name: nvidia.com/gpu - replicas: 2 + replicas: 4 diff --git a/argocd/manifests/paperless/deployment.yaml b/argocd/manifests/paperless-ringtail/deployment.yaml similarity index 53% rename from argocd/manifests/paperless/deployment.yaml rename to argocd/manifests/paperless-ringtail/deployment.yaml index cc2c013..de4f456 100644 --- a/argocd/manifests/paperless/deployment.yaml +++ b/argocd/manifests/paperless-ringtail/deployment.yaml @@ -1,3 +1,17 @@ +# Paperless-ngx on ringtail k3s — Nix image, multi-process. +# +# The upstream s6 image ran web + worker + scheduler + consumer (and DB +# migrations) in one container. 
The Nix image (containers/paperless/ +# default.nix) ships the binaries but no supervisor, so we run those as +# four containers in one pod, sharing the local data/consume dirs +# (emptyDir) and the NFS media volume; redis is colocated so +# PAPERLESS_REDIS=localhost works for all. A migrate initContainer runs +# DB migrations once before the app containers start. +# +# DB points in-cluster at the ringtail blumeops-pg (was pg.ops.eblu.me on +# indri). PAPERLESS_{DATA_DIR,MEDIA_ROOT,CONSUMPTION_DIR} are set +# explicitly because the Nix package does not default to the upstream +# /usr/src/paperless paths. apiVersion: apps/v1 kind: Deployment metadata: @@ -5,6 +19,8 @@ metadata: namespace: paperless spec: replicas: 1 + strategy: + type: Recreate selector: matchLabels: app: paperless @@ -16,27 +32,38 @@ spec: securityContext: seccompProfile: type: RuntimeDefault - containers: - - name: paperless - image: registry.ops.eblu.me/blumeops/paperless:kustomized + initContainers: + # redis as a native sidecar (restartPolicy: Always): starts before + # the migrate init and stays running for the app containers, so all + # of them reach PAPERLESS_REDIS=localhost:6379. 
+ - name: redis + image: docker.io/library/redis:kustomized + restartPolicy: Always ports: - - containerPort: 8000 - name: http - env: + - containerPort: 6379 + volumeMounts: + - name: redis-data + mountPath: /data + resources: + requests: + memory: "32Mi" + cpu: "10m" + limits: + memory: "128Mi" + - name: migrate + image: registry.ops.eblu.me/blumeops/paperless:kustomized + command: ["paperless-ngx", "migrate", "--no-input"] + env: &paperless-env - name: PAPERLESS_URL value: "https://paperless.ops.eblu.me" - name: PAPERLESS_REDIS value: "redis://localhost:6379" - name: PAPERLESS_DBHOST - value: "pg.ops.eblu.me" + value: "blumeops-pg-rw.databases.svc.cluster.local" - name: PAPERLESS_DBPORT value: "5432" - name: PAPERLESS_DBNAME value: "paperless" - # Explicit port to override k8s-injected PAPERLESS_PORT env var - # (k8s sets PAPERLESS_PORT=tcp://... for a service named 'paperless') - - name: PAPERLESS_PORT - value: "8000" - name: PAPERLESS_DBUSER value: "paperless" - name: PAPERLESS_DBPASS @@ -44,6 +71,16 @@ spec: secretKeyRef: name: paperless-secrets key: db-password + # Explicit port to override the k8s-injected PAPERLESS_PORT + # (service named 'paperless' would set PAPERLESS_PORT=tcp://...) 
+ - name: PAPERLESS_PORT + value: "8000" + - name: PAPERLESS_DATA_DIR + value: "/usr/src/paperless/data" + - name: PAPERLESS_MEDIA_ROOT + value: "/usr/src/paperless/media" + - name: PAPERLESS_CONSUMPTION_DIR + value: "/usr/src/paperless/consume" - name: PAPERLESS_SECRET_KEY valueFrom: secretKeyRef: @@ -55,7 +92,6 @@ spec: value: "eng" - name: PAPERLESS_TASK_WORKERS value: "1" - # Admin account (created on first startup) - name: PAPERLESS_ADMIN_USER value: "eblume" - name: PAPERLESS_ADMIN_PASSWORD @@ -65,8 +101,6 @@ spec: key: admin-password - name: PAPERLESS_ADMIN_MAIL value: "blume.erich@gmail.com" - # OIDC via Authentik - # Full JSON blob pulled from 1Password (includes client secret) - name: PAPERLESS_APPS value: "allauth.socialaccount.providers.openid_connect" - name: PAPERLESS_SOCIALACCOUNT_PROVIDERS @@ -82,19 +116,27 @@ spec: value: "false" - name: PAPERLESS_REDIRECT_LOGIN_TO_SSO value: "false" - volumeMounts: + volumeMounts: &paperless-mounts - name: data mountPath: /usr/src/paperless/data - name: media mountPath: /usr/src/paperless/media - name: consume mountPath: /usr/src/paperless/consume + containers: + - name: web + image: registry.ops.eblu.me/blumeops/paperless:kustomized + ports: + - containerPort: 8000 + name: http + env: *paperless-env + volumeMounts: *paperless-mounts resources: requests: memory: "256Mi" cpu: "100m" limits: - memory: "2Gi" + memory: "1Gi" cpu: "1000m" livenessProbe: httpGet: @@ -109,16 +151,42 @@ spec: initialDelaySeconds: 30 periodSeconds: 10 - - name: redis - image: docker.io/library/redis:kustomized - ports: - - containerPort: 6379 + - name: worker + image: registry.ops.eblu.me/blumeops/paperless:kustomized + command: ["celery", "--app", "paperless", "worker", "--loglevel", "INFO"] + env: *paperless-env + volumeMounts: *paperless-mounts resources: requests: - memory: "32Mi" - cpu: "10m" + memory: "256Mi" + cpu: "100m" limits: + memory: "1Gi" + cpu: "1000m" + + - name: beat + image: 
registry.ops.eblu.me/blumeops/paperless:kustomized + command: ["celery", "--app", "paperless", "beat", "--loglevel", "INFO"] + env: *paperless-env + volumeMounts: *paperless-mounts + resources: + requests: + memory: "64Mi" + cpu: "20m" + limits: + memory: "256Mi" + + - name: consumer + image: registry.ops.eblu.me/blumeops/paperless:kustomized + command: ["paperless-ngx", "document_consumer"] + env: *paperless-env + volumeMounts: *paperless-mounts + resources: + requests: memory: "128Mi" + cpu: "50m" + limits: + memory: "512Mi" volumes: - name: data @@ -128,3 +196,6 @@ spec: claimName: paperless-media - name: consume emptyDir: {} + - name: redis-data + emptyDir: + sizeLimit: 1Gi diff --git a/argocd/manifests/paperless/external-secret.yaml b/argocd/manifests/paperless-ringtail/external-secret.yaml similarity index 100% rename from argocd/manifests/paperless/external-secret.yaml rename to argocd/manifests/paperless-ringtail/external-secret.yaml diff --git a/argocd/manifests/paperless/ingress-tailscale.yaml b/argocd/manifests/paperless-ringtail/ingress-tailscale.yaml similarity index 100% rename from argocd/manifests/paperless/ingress-tailscale.yaml rename to argocd/manifests/paperless-ringtail/ingress-tailscale.yaml diff --git a/argocd/manifests/paperless-ringtail/kustomization.yaml b/argocd/manifests/paperless-ringtail/kustomization.yaml new file mode 100644 index 0000000..41665b8 --- /dev/null +++ b/argocd/manifests/paperless-ringtail/kustomization.yaml @@ -0,0 +1,21 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: paperless + +resources: + - deployment.yaml + - service.yaml + - pv-nfs.yaml + - pvc.yaml + - ingress-tailscale.yaml + - external-secret.yaml + +images: + - name: registry.ops.eblu.me/blumeops/paperless + newTag: v2.20.15-fcac8e5-nix + # amd64 valkey built via nix (the v8.1.7-ecded30 tag without -nix is the + # arm64 Alpine build for indri and fails on ringtail with exec format error) + - name: docker.io/library/redis + 
newName: registry.ops.eblu.me/blumeops/valkey + newTag: v8.1.7-ecded30-nix diff --git a/argocd/manifests/paperless-ringtail/pv-nfs.yaml b/argocd/manifests/paperless-ringtail/pv-nfs.yaml new file mode 100644 index 0000000..2990d1a --- /dev/null +++ b/argocd/manifests/paperless-ringtail/pv-nfs.yaml @@ -0,0 +1,22 @@ +# NFS PersistentVolume for the Paperless document library, mounted from +# ringtail. Same sifaka export (/volume1/paperless) as the minikube PV, +# but a distinct PV name so both clusters can declare it during the +# parallel-run before cutover. +# +# Prerequisite: sifaka must have an NFS rule granting ringtail Read/Write +# (Squash=No mapping) on the paperless share — the same step done for +# immich. See [[sifaka-nfs-from-ringtail]]. +apiVersion: v1 +kind: PersistentVolume +metadata: + name: paperless-media-nfs-pv-ringtail +spec: + capacity: + storage: 500Gi + accessModes: + - ReadWriteMany + persistentVolumeReclaimPolicy: Retain + storageClassName: "" + nfs: + server: sifaka + path: /volume1/paperless diff --git a/argocd/manifests/paperless/pvc.yaml b/argocd/manifests/paperless-ringtail/pvc.yaml similarity index 55% rename from argocd/manifests/paperless/pvc.yaml rename to argocd/manifests/paperless-ringtail/pvc.yaml index 4365c9f..8b44660 100644 --- a/argocd/manifests/paperless/pvc.yaml +++ b/argocd/manifests/paperless-ringtail/pvc.yaml @@ -1,5 +1,5 @@ -# PersistentVolumeClaim for Paperless document library -# Binds to the NFS PV for sifaka:/volume1/paperless +# PersistentVolumeClaim for the Paperless document library on ringtail. +# Binds the NFS PV for sifaka:/volume1/paperless. 
apiVersion: v1 kind: PersistentVolumeClaim metadata: @@ -9,7 +9,7 @@ spec: accessModes: - ReadWriteMany storageClassName: "" - volumeName: paperless-media-nfs-pv + volumeName: paperless-media-nfs-pv-ringtail resources: requests: storage: 500Gi diff --git a/argocd/manifests/paperless/service.yaml b/argocd/manifests/paperless-ringtail/service.yaml similarity index 100% rename from argocd/manifests/paperless/service.yaml rename to argocd/manifests/paperless-ringtail/service.yaml diff --git a/argocd/manifests/paperless/kustomization.yaml b/argocd/manifests/paperless/kustomization.yaml deleted file mode 100644 index 3e65578..0000000 --- a/argocd/manifests/paperless/kustomization.yaml +++ /dev/null @@ -1,22 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization - -namespace: paperless - -resources: - - deployment.yaml - - service.yaml - - pv-nfs.yaml - - pvc.yaml - - ingress-tailscale.yaml - - external-secret.yaml - -images: - - name: registry.ops.eblu.me/blumeops/paperless - newTag: v2.20.13-07f52e9 - # TODO(DR-2026-04): authentik-redis is amd64-only (nix-built on ringtail). - # Was running under QEMU emulation before. Switched to upstream valkey - # during DR recovery. Build a multi-arch blumeops/redis or keep upstream. - - name: docker.io/library/redis - newName: docker.io/valkey/valkey - newTag: "8.1-alpine" diff --git a/argocd/manifests/paperless/pv-nfs.yaml b/argocd/manifests/paperless/pv-nfs.yaml deleted file mode 100644 index 8ee7526..0000000 --- a/argocd/manifests/paperless/pv-nfs.yaml +++ /dev/null @@ -1,22 +0,0 @@ -# NFS PersistentVolume for Paperless document library -# Requires: NFS share on sifaka at /volume1/paperless with NFS permissions for indri -# -# To create on Synology: -# 1. Control Panel > Shared Folder > Create -# 2. Name: paperless, Location: Volume 1 -# 3. Control Panel > File Services > NFS > NFS Rules -# 4. 
Add rule for "paperless" share: Hostname=indri, Privilege=Read/Write, Squash=No mapping -apiVersion: v1 -kind: PersistentVolume -metadata: - name: paperless-media-nfs-pv -spec: - capacity: - storage: 500Gi - accessModes: - - ReadWriteMany - persistentVolumeReclaimPolicy: Retain - storageClassName: "" - nfs: - server: sifaka - path: /volume1/paperless diff --git a/argocd/manifests/prowler/cronjob-iac-scan.yaml b/argocd/manifests/prowler/cronjob-iac-scan.yaml index 49c8ce6..c1303a5 100644 --- a/argocd/manifests/prowler/cronjob-iac-scan.yaml +++ b/argocd/manifests/prowler/cronjob-iac-scan.yaml @@ -19,6 +19,13 @@ spec: - name: prowler image: registry.ops.eblu.me/blumeops/prowler:kustomized command: ["/bin/sh", "-c"] + # Prowler's --mutelist-file is a no-op for the IaC provider + # (it delegates to Trivy). The Prowler image's trivy shim + # injects --ignorefile $TRIVY_IGNOREFILE when set; see + # containers/prowler/Dockerfile. + env: + - name: TRIVY_IGNOREFILE + value: /mutelist/trivyignore.yaml args: - | DATEDIR=/reports/prowler-iac/$(date +%Y-%m-%d) @@ -31,8 +38,17 @@ spec: volumeMounts: - name: reports mountPath: /reports + - name: mutelist + mountPath: /mutelist + readOnly: true restartPolicy: OnFailure volumes: - name: reports persistentVolumeClaim: claimName: prowler-reports + - name: mutelist + configMap: + name: prowler-mutelist + items: + - key: trivyignore.yaml + path: trivyignore.yaml diff --git a/argocd/manifests/prowler/kustomization.yaml b/argocd/manifests/prowler/kustomization.yaml index 7024aff..1d92a6b 100644 --- a/argocd/manifests/prowler/kustomization.yaml +++ b/argocd/manifests/prowler/kustomization.yaml @@ -23,7 +23,8 @@ configMapGenerator: - mutelist/core-pod-security.yaml - mutelist/manual-node-checks.yaml - mutelist/rbac.yaml + - mutelist/trivyignore.yaml images: - name: registry.ops.eblu.me/blumeops/prowler - newTag: v5.23.0-7c1cd11 + newTag: v5.23.0-495e45d diff --git a/argocd/manifests/prowler/mutelist/apiserver.yaml 
b/argocd/manifests/prowler/mutelist/apiserver.yaml index 5a25d4f..fd077e8 100644 --- a/argocd/manifests/prowler/mutelist/apiserver.yaml +++ b/argocd/manifests/prowler/mutelist/apiserver.yaml @@ -6,48 +6,48 @@ Mutelist: "apiserver_always_pull_images_plugin": Regions: ["*"] Resources: ["^kube-apiserver-minikube$"] - Description: "CC: single-user-cluster, local-registry. Only the operator has cluster access; all images pulled from private zot registry." + Description: "Only the operator has cluster access; all images pulled from private zot registry." "apiserver_audit_log_maxage_set": Regions: ["*"] Resources: ["^kube-apiserver-minikube$"] - Description: "CC: observability-stack-audit. Alloy/Loki provides pod-level audit trail." + Description: "Alloy/Loki provides pod-level audit trail." "apiserver_audit_log_maxbackup_set": Regions: ["*"] Resources: ["^kube-apiserver-minikube$"] - Description: "CC: observability-stack-audit. Alloy/Loki provides pod-level audit trail." + Description: "Alloy/Loki provides pod-level audit trail." "apiserver_audit_log_maxsize_set": Regions: ["*"] Resources: ["^kube-apiserver-minikube$"] - Description: "CC: observability-stack-audit. Alloy/Loki provides pod-level audit trail." + Description: "Alloy/Loki provides pod-level audit trail." "apiserver_audit_log_path_set": Regions: ["*"] Resources: ["^kube-apiserver-minikube$"] - Description: "CC: observability-stack-audit. Alloy/Loki provides pod-level audit trail." + Description: "Alloy/Loki provides pod-level audit trail." "apiserver_deny_service_external_ips": Regions: ["*"] Resources: ["^kube-apiserver-minikube$"] - Description: "CC: tailscale-network-isolation. No external IPs routable; cluster only reachable via tailnet." + Description: "No external IPs routable; cluster only reachable via tailnet." "apiserver_disable_profiling": Regions: ["*"] Resources: ["^kube-apiserver-minikube$"] - Description: "CC: tailscale-network-isolation. Profiling endpoint unreachable from public internet." 
+ Description: "Profiling endpoint unreachable from public internet." "apiserver_encryption_provider_config_set": Regions: ["*"] Resources: ["^kube-apiserver-minikube$"] - Description: "CC: tailscale-network-isolation, single-user-cluster. Etcd not network-exposed; only operator has node access." + Description: "Etcd not network-exposed; only operator has node access." "apiserver_kubelet_cert_auth": Regions: ["*"] Resources: ["^kube-apiserver-minikube$"] - Description: "CC: tailscale-network-isolation. Kubelet API not exposed outside the node; minikube auto-generates certificates." + Description: "Kubelet API not exposed outside the node; minikube auto-generates certificates." "apiserver_request_timeout_set": Regions: ["*"] Resources: ["^kube-apiserver-minikube$"] - Description: "CC: tailscale-network-isolation. API server only reachable via tailnet; DoS risk limited to trusted clients." + Description: "API server only reachable via tailnet; DoS risk limited to trusted clients." "apiserver_service_account_lookup_true": Regions: ["*"] Resources: ["^kube-apiserver-minikube$"] - Description: "CC: single-user-cluster. Only operator manages service accounts; no revoked tokens in circulation." + Description: "Only operator manages service accounts; no revoked tokens in circulation." "apiserver_strong_ciphers_only": Regions: ["*"] Resources: ["^kube-apiserver-minikube$"] - Description: "CC: tailscale-network-isolation. API server traffic encrypted by WireGuard at the network layer." + Description: "API server traffic encrypted by WireGuard at the network layer." 
diff --git a/argocd/manifests/prowler/mutelist/control-plane.yaml b/argocd/manifests/prowler/mutelist/control-plane.yaml index 2056691..d3cc34a 100644 --- a/argocd/manifests/prowler/mutelist/control-plane.yaml +++ b/argocd/manifests/prowler/mutelist/control-plane.yaml @@ -6,12 +6,12 @@ Mutelist: "controllermanager_disable_profiling": Regions: ["*"] Resources: ["^kube-controller-manager-minikube$"] - Description: "CC: tailscale-network-isolation. Profiling endpoint unreachable from public internet." + Description: "Profiling endpoint unreachable from public internet." "scheduler_profiling": Regions: ["*"] Resources: ["^kube-scheduler-minikube$"] - Description: "CC: tailscale-network-isolation. Profiling endpoint unreachable from public internet." + Description: "Profiling endpoint unreachable from public internet." "kubelet_tls_cert_and_key": Regions: ["*"] Resources: ["^kubelet-config$"] - Description: "CC: tailscale-network-isolation, single-user-cluster. Kubelet API not exposed outside node; minikube auto-generates certificates." + Description: "Kubelet API not exposed outside node; minikube auto-generates certificates." diff --git a/argocd/manifests/prowler/mutelist/core-pod-security.yaml b/argocd/manifests/prowler/mutelist/core-pod-security.yaml index c39e0c6..b1e986e 100644 --- a/argocd/manifests/prowler/mutelist/core-pod-security.yaml +++ b/argocd/manifests/prowler/mutelist/core-pod-security.yaml @@ -17,9 +17,8 @@ Mutelist: - "^kindnet-" - "^storage-provisioner$" Description: >- - CC: tailscale-network-isolation. Control-plane and networking - pods require hostNetwork by design. Host network itself is - only reachable via tailnet. + Control-plane and networking pods require hostNetwork by design. + Host network itself is only reachable via tailnet. "core_minimize_privileged_containers": Regions: ["*"] Resources: @@ -31,7 +30,6 @@ Mutelist: # Forgejo runner - "^forgejo-runner-" Description: >- - CC: single-user-cluster, operator-managed-pods, trusted-ci-only. 
kube-proxy: system pod, single-user cluster. ts-*/ingress-*: Tailscale operator-managed. forgejo-runner: DinD limited to trusted private forge repos. @@ -49,25 +47,24 @@ Mutelist: - "^nameserver-" - "^ingress-" Description: >- - CC: single-user-cluster, operator-managed-pods. System pods - managed by minikube and Tailscale operator; seccomp profiles - set by upstream. Single-user cluster limits exploit surface. + System pods managed by minikube and Tailscale operator; + seccomp profiles set by upstream. Single-user cluster limits + exploit surface. "core_minimize_hostPID_containers": Regions: ["*"] Resources: - "^prowler-" Description: >- - CC: ephemeral-privileged-jobs. Prowler CIS scanner requires - hostPID for file permission checks. Runs as CronJob with - 7-day TTL, not a persistent workload. + Prowler CIS scanner requires hostPID for file permission + checks. Runs as CronJob with 7-day TTL, not a persistent + workload. "core_minimize_root_containers_admission": Regions: ["*"] Resources: - "^grafana-" Description: >- - CC: init-container-isolation. Root limited to init-chown-data - container; all runtime containers run as UID 472 with caps - dropped. + Root limited to init-chown-data container; all runtime + containers run as UID 472 with caps dropped. "core_minimize_containers_added_capabilities": Regions: ["*"] Resources: @@ -77,10 +74,9 @@ Mutelist: # Grafana init-chown-data - "^grafana-" Description: >- - CC: single-user-cluster, init-container-isolation. System - pods: capabilities required by function (minikube-managed). - Grafana: CHOWN limited to init phase; runtime containers - drop ALL. + System pods: capabilities required by function + (minikube-managed). Grafana: CHOWN limited to init phase; + runtime containers drop ALL. "core_minimize_containers_capabilities_assigned": Regions: ["*"] Resources: @@ -88,5 +84,4 @@ Mutelist: - "^kindnet-" - "^grafana-" Description: >- - CC: single-user-cluster, init-container-isolation. 
See - core_minimize_containers_added_capabilities. + See core_minimize_containers_added_capabilities. diff --git a/argocd/manifests/prowler/mutelist/manual-node-checks.yaml b/argocd/manifests/prowler/mutelist/manual-node-checks.yaml index 9c8354d..c91a2a6 100644 --- a/argocd/manifests/prowler/mutelist/manual-node-checks.yaml +++ b/argocd/manifests/prowler/mutelist/manual-node-checks.yaml @@ -1,7 +1,7 @@ # Node-level and RBAC checks that Prowler reports as MANUAL because it -# cannot evaluate them from inside a pod. Compensated by automated -# verification in `mise run review-compliance-reports`, which SSHes into -# the minikube node and checks each condition directly every week. +# cannot evaluate them from inside a pod. Verified out-of-band by the +# node-verification block in `mise run review-compliance-reports`, which +# SSHes into the minikube node and checks each condition directly. Mutelist: Accounts: "*": @@ -9,51 +9,51 @@ Mutelist: "etcd_unique_ca": Regions: ["*"] Resources: ["^etcd-minikube$"] - Description: "CC: node-config-automated-verification. Etcd CA fingerprint verified different from cluster CA by review-compliance-reports." + Description: "Etcd CA fingerprint verified different from cluster CA by review-compliance-reports." "kubelet_conf_file_ownership": Regions: ["*"] Resources: ["^kubelet-config$"] - Description: "CC: node-config-automated-verification. File ownership verified root:root by review-compliance-reports." + Description: "File ownership verified root:root by review-compliance-reports." "kubelet_conf_file_permissions": Regions: ["*"] Resources: ["^kubelet-config$"] - Description: "CC: node-config-automated-verification. File permissions verified 600 by review-compliance-reports." + Description: "File permissions verified 600 by review-compliance-reports." "kubelet_config_yaml_ownership": Regions: ["*"] Resources: ["^kubelet-config$"] - Description: "CC: node-config-automated-verification. 
File ownership verified root:root by review-compliance-reports." + Description: "File ownership verified root:root by review-compliance-reports." "kubelet_config_yaml_permissions": Regions: ["*"] Resources: ["^kubelet-config$"] - Description: "CC: node-config-automated-verification. File permissions verified 644 by review-compliance-reports." + Description: "File permissions verified 644 by review-compliance-reports." "kubelet_service_file_ownership_root": Regions: ["*"] Resources: ["^kubelet-config$"] - Description: "CC: node-config-automated-verification. File ownership verified root:root by review-compliance-reports." + Description: "File ownership verified root:root by review-compliance-reports." "kubelet_service_file_permissions": Regions: ["*"] Resources: ["^kubelet-config$"] - Description: "CC: node-config-automated-verification. File permissions verified 644 by review-compliance-reports." + Description: "File permissions verified 644 by review-compliance-reports." "kubelet_disable_read_only_port": Regions: ["*"] Resources: ["^kubelet-config$"] - Description: "CC: node-config-automated-verification. readOnlyPort absence (defaults to 0) verified by review-compliance-reports." + Description: "readOnlyPort absence (defaults to 0) verified by review-compliance-reports." "kubelet_event_record_qps": Regions: ["*"] Resources: ["^kubelet-config$"] - Description: "CC: node-config-automated-verification. eventRecordQPS absence (defaults to 5) verified by review-compliance-reports." + Description: "eventRecordQPS absence (defaults to 5) verified by review-compliance-reports." "kubelet_manage_iptables": Regions: ["*"] Resources: ["^kubelet-config$"] - Description: "CC: node-config-automated-verification. makeIPTablesUtilChains absence (defaults to true) verified by review-compliance-reports." + Description: "makeIPTablesUtilChains absence (defaults to true) verified by review-compliance-reports." 
"kubelet_strong_ciphers_only": Regions: ["*"] Resources: ["^kubelet-config$"] - Description: "CC: node-config-automated-verification, tailscale-network-isolation. Go default ciphers used; all traffic WireGuard-encrypted via tailnet." + Description: "Go default ciphers used; all traffic WireGuard-encrypted via tailnet." "rbac_cluster_admin_usage": Regions: ["*"] Resources: - "^cluster-admin$" - "^kubeadm:cluster-admins$" - "^minikube-rbac$" - Description: "CC: node-config-automated-verification, single-user-cluster. Only built-in/minikube cluster-admin bindings present; verified by review-compliance-reports." + Description: "Only built-in/minikube cluster-admin bindings present; verified by review-compliance-reports." diff --git a/argocd/manifests/prowler/mutelist/rbac.yaml b/argocd/manifests/prowler/mutelist/rbac.yaml index c9c52e4..324809d 100644 --- a/argocd/manifests/prowler/mutelist/rbac.yaml +++ b/argocd/manifests/prowler/mutelist/rbac.yaml @@ -13,9 +13,8 @@ Mutelist: # ArgoCD - "^argocd-" Description: >- - CC: single-user-cluster, sso-gated-admin-tools. Built-in - K8s roles: only operator can bind them. ArgoCD: requires - broad access but is SSO-gated via Authentik OIDC. + Built-in K8s roles: only operator can bind them. ArgoCD: + requires broad access but is SSO-gated via Authentik OIDC. "rbac_minimize_pod_creation_access": Regions: ["*"] Resources: @@ -26,14 +25,12 @@ Mutelist: # CloudNativePG operator - "^cnpg-manager$" Description: >- - CC: single-user-cluster. Built-in K8s roles and CNPG - operator. Only the operator can assign these roles; no - untrusted users have cluster access. + Built-in K8s roles and CNPG operator. Only the operator can + assign these roles; no untrusted users have cluster access. "rbac_minimize_service_account_token_creation": Regions: ["*"] Resources: - "^system:" Description: >- - CC: single-user-cluster. kube-controller-manager requires - token creation for SA management. Only operator manages - service accounts. 
+ kube-controller-manager requires token creation for SA + management. Only operator manages service accounts. diff --git a/argocd/manifests/prowler/mutelist/trivyignore.yaml b/argocd/manifests/prowler/mutelist/trivyignore.yaml new file mode 100644 index 0000000..87af966 --- /dev/null +++ b/argocd/manifests/prowler/mutelist/trivyignore.yaml @@ -0,0 +1,37 @@ +# Trivy ignorefile for Prowler IaC scan. +# +# Prowler's `--mutelist-file` flag is a no-op for the IaC provider +# (iac_provider.py sets self._mutelist = None and delegates to Trivy). +# Trivy in turn does not auto-discover this YAML form from cwd, so the +# Prowler image ships a shim wrapper around `trivy` that injects +# --ignorefile $TRIVY_IGNOREFILE when the env var is set. The cronjob +# mounts this file and sets TRIVY_IGNOREFILE accordingly. +# +# Schema: https://trivy.dev/latest/docs/configuration/filtering/ +# IDs use the hyphenated form Trivy displays (KSV-0041, not KSV0041). +misconfigurations: + - id: KSV-0041 + paths: + - "argocd/manifests/external-secrets/rbac.yaml" + statement: >- + external-secrets-operator's entire function is to read and + synthesize Secret objects; ClusterRole over secrets is its + purpose. Both the controller and cert-controller are + upstream-defined. + - id: KSV-0041 + paths: + - "argocd/manifests/kube-state-metrics/rbac.yaml" + - "argocd/manifests/kube-state-metrics-ringtail/rbac.yaml" + statement: >- + KSM exposes only Secret metadata (name, namespace, type, labels), + never the data field. list/watch on secrets is required for + kube_secret_info / kube_secret_labels metrics. + - id: KSV-0114 + paths: + - "argocd/manifests/external-secrets/rbac.yaml" + statement: >- + cert-controller manages the external-secrets validating webhook + configurations to inject its own rotating CA bundle. 
RBAC is + scoped to two named webhooks (secretstore-validate, + externalsecret-validate) via resourceNames; KSV-0114 doesn't see + the resourceNames restriction so reports the full ClusterRole. diff --git a/argocd/manifests/shower/configmap.yaml b/argocd/manifests/shower/configmap.yaml new file mode 100644 index 0000000..6102c1e --- /dev/null +++ b/argocd/manifests/shower/configmap.yaml @@ -0,0 +1,22 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: shower-app-config + namespace: shower +data: + DJANGO_DEBUG: "0" + # The app's settings.py hardcodes ALLOWED_HOSTS = ["shower.eblu.me", + # "localhost", "127.0.0.1"] and exposes this env var as a comma-separated + # extras list. shower.ops.eblu.me is what Caddy on indri and the + # Tailscale ProxyGroup both send as the Host header, so the app needs to + # accept it. + DJANGO_ALLOWED_HOSTS: "shower.ops.eblu.me" + # /host/, /admin/, and Django's login surface are all tailnet-only — the + # public proxy 403s everything outside of `/` and `/prizes//`. + # /host/'s "Django admin" link follows DJANGO_ADMIN_URL. + DJANGO_ADMIN_URL: "https://shower.ops.eblu.me/admin/" + # /host/ is served on shower.ops.eblu.me (tailnet), but the QR codes it + # generates need to point at the public WAN hostname so guest phones can + # reach them. PUBLIC_URL_BASE overrides Django's request.build_absolute_uri() + # in the QR views — see shower/views.py:_public_url. Added in app v1.0.1. + DJANGO_PUBLIC_URL_BASE: "https://shower.eblu.me" diff --git a/argocd/manifests/shower/deployment.yaml b/argocd/manifests/shower/deployment.yaml new file mode 100644 index 0000000..70547aa --- /dev/null +++ b/argocd/manifests/shower/deployment.yaml @@ -0,0 +1,81 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: shower + namespace: shower +spec: + replicas: 1 + # SQLite + RWO data PVC: only one writer at a time. Recreate ensures the + # old pod's lock on the local-path volume is released before the new one + # mounts it. 
+ strategy: + type: Recreate + selector: + matchLabels: + app: shower + template: + metadata: + labels: + app: shower + spec: + securityContext: + runAsUser: 1000 + runAsGroup: 1000 + fsGroup: 1000 + seccompProfile: + type: RuntimeDefault + containers: + - name: shower + image: registry.ops.eblu.me/blumeops/shower:kustomized + securityContext: + runAsNonRoot: true + allowPrivilegeEscalation: false + ports: + - containerPort: 8000 + name: http + envFrom: + - configMapRef: + name: shower-app-config + - secretRef: + name: shower-app-secrets + volumeMounts: + - name: media + mountPath: /app/media + - name: data + mountPath: /app/data + resources: + requests: + memory: "128Mi" + cpu: "50m" + limits: + memory: "512Mi" + cpu: "500m" + livenessProbe: + httpGet: + path: / + port: 8000 + httpHeaders: + - name: Host + value: shower.ops.eblu.me + - name: X-Forwarded-Proto + value: https + initialDelaySeconds: 30 + periodSeconds: 30 + readinessProbe: + httpGet: + path: / + port: 8000 + httpHeaders: + - name: Host + value: shower.ops.eblu.me + - name: X-Forwarded-Proto + value: https + initialDelaySeconds: 10 + periodSeconds: 10 + volumes: + - name: media + persistentVolumeClaim: + claimName: shower-media + - name: data + persistentVolumeClaim: + claimName: shower-data diff --git a/argocd/manifests/shower/external-secret.yaml b/argocd/manifests/shower/external-secret.yaml new file mode 100644 index 0000000..005a7e9 --- /dev/null +++ b/argocd/manifests/shower/external-secret.yaml @@ -0,0 +1,19 @@ +--- +apiVersion: external-secrets.io/v1 +kind: ExternalSecret +metadata: + name: shower-app-secrets + namespace: shower +spec: + refreshInterval: 1h + secretStoreRef: + kind: ClusterSecretStore + name: onepassword-blumeops + target: + name: shower-app-secrets + creationPolicy: Owner + data: + - secretKey: DJANGO_SECRET_KEY + remoteRef: + key: "Shower (blumeops)" + property: secret-key diff --git a/argocd/manifests/shower/ingress-tailscale.yaml 
b/argocd/manifests/shower/ingress-tailscale.yaml new file mode 100644 index 0000000..d09a696 --- /dev/null +++ b/argocd/manifests/shower/ingress-tailscale.yaml @@ -0,0 +1,30 @@ +# Tailscale Ingress for shower app. +# Exposes at shower.tail8d86e.ts.net. +# Caddy on indri proxies shower.ops.eblu.me here. The fly proxy then proxies +# shower.eblu.me through Caddy to this same endpoint (fly does not contact +# the k8s service directly — all traffic routes through indri's Caddy). +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: shower-tailscale + namespace: shower + annotations: + tailscale.com/proxy-class: "default" + tailscale.com/proxy-group: "ingress" + gethomepage.dev/enabled: "true" + gethomepage.dev/name: "Shower" + gethomepage.dev/group: "Home" + gethomepage.dev/icon: "mdi-baby" + gethomepage.dev/description: "Adelaide baby shower" + gethomepage.dev/href: "https://shower.ops.eblu.me" + gethomepage.dev/pod-selector: "app=shower" +spec: + ingressClassName: tailscale + defaultBackend: + service: + name: shower + port: + number: 8000 + tls: + - hosts: + - shower diff --git a/argocd/manifests/shower/kustomization.yaml b/argocd/manifests/shower/kustomization.yaml new file mode 100644 index 0000000..1c29224 --- /dev/null +++ b/argocd/manifests/shower/kustomization.yaml @@ -0,0 +1,17 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: shower + +resources: + - configmap.yaml + - external-secret.yaml + - pv-nfs.yaml + - pvc.yaml + - service.yaml + - ingress-tailscale.yaml + - deployment.yaml + +images: + - name: registry.ops.eblu.me/blumeops/shower + newTag: v1.1.3-3645098-nix diff --git a/argocd/manifests/shower/pv-nfs.yaml b/argocd/manifests/shower/pv-nfs.yaml new file mode 100644 index 0000000..7354fb5 --- /dev/null +++ b/argocd/manifests/shower/pv-nfs.yaml @@ -0,0 +1,24 @@ +# NFS PersistentVolume for shower app media uploads (prize photos). 
+# +# Requires the `shower` share on sifaka with NFS exports matching the +# blumeops standard (192.168.1.0/24 + 100.64.0.0/10, all_squash → admin). +# See docs/how-to/operations/shower-app.md for the Synology web-UI walk +# and docs/reference/storage/sifaka.md for the exports table. +# +# Because all_squash rewrites every NFS write to admin:users (1024:100), +# the in-pod runAsUser does NOT have to match an on-disk uid. Mode 0777 +# on /volume1/shower lets the pod read back what it wrote. +apiVersion: v1 +kind: PersistentVolume +metadata: + name: shower-media-nfs-pv +spec: + capacity: + storage: 10Gi + accessModes: + - ReadWriteMany + persistentVolumeReclaimPolicy: Retain + storageClassName: "" + nfs: + server: sifaka + path: /volume1/shower diff --git a/argocd/manifests/shower/pvc.yaml b/argocd/manifests/shower/pvc.yaml new file mode 100644 index 0000000..47fee54 --- /dev/null +++ b/argocd/manifests/shower/pvc.yaml @@ -0,0 +1,30 @@ +# Media PVC — RWX NFS share for /app/media (prize photo uploads). +# SQLite DB lives in a separate local-path PVC; NFS file locking is not +# reliable enough for SQLite's WAL/journal. +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: shower-media + namespace: shower +spec: + accessModes: + - ReadWriteMany + storageClassName: "" + volumeName: shower-media-nfs-pv + resources: + requests: + storage: 10Gi +--- +# Database PVC — k3s local-path (default storage class) for SQLite. +# RWO is fine: the deployment runs with a single replica. 
+apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: shower-data + namespace: shower +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 2Gi diff --git a/argocd/manifests/devpi/service.yaml b/argocd/manifests/shower/service.yaml similarity index 53% rename from argocd/manifests/devpi/service.yaml rename to argocd/manifests/shower/service.yaml index 42e1543..0a73aab 100644 --- a/argocd/manifests/devpi/service.yaml +++ b/argocd/manifests/shower/service.yaml @@ -1,13 +1,13 @@ apiVersion: v1 kind: Service metadata: - name: devpi - namespace: devpi + name: shower + namespace: shower spec: selector: - app: devpi + app: shower ports: - name: http - port: 3141 - targetPort: 3141 + port: 8000 + targetPort: 8000 protocol: TCP diff --git a/argocd/manifests/tailscale-operator-base/kustomization.yaml b/argocd/manifests/tailscale-operator-base/kustomization.yaml index 4519af6..9d117ef 100644 --- a/argocd/manifests/tailscale-operator-base/kustomization.yaml +++ b/argocd/manifests/tailscale-operator-base/kustomization.yaml @@ -6,8 +6,11 @@ namespace: tailscale # Upstream Tailscale operator manifest from forge mirror. # To upgrade: update the ref in the URL AND the newTag below. +# Must use the tailnet host forge.ops.eblu.me — the public forge.eblu.me +# black-holes /mirrors/ at the Fly edge (AI-scraper mitigation), which the +# in-cluster ArgoCD repo-server would otherwise hit and fail with a 403. 
resources: - - https://forge.eblu.me/mirrors/tailscale/raw/tag/v1.94.2/cmd/k8s-operator/deploy/manifests/operator.yaml + - https://forge.ops.eblu.me/mirrors/tailscale/raw/tag/v1.94.2/cmd/k8s-operator/deploy/manifests/operator.yaml - proxyclass.yaml - dnsconfig.yaml diff --git a/argocd/manifests/tailscale-operator-ringtail/kustomization.yaml b/argocd/manifests/tailscale-operator-ringtail/kustomization.yaml index a14ca81..2d9ceb2 100644 --- a/argocd/manifests/tailscale-operator-ringtail/kustomization.yaml +++ b/argocd/manifests/tailscale-operator-ringtail/kustomization.yaml @@ -8,3 +8,17 @@ resources: - ../tailscale-operator-base - proxygroup-ingress.yaml - external-secret.yaml + +# Rewrite the proxyclass image to our local nix-built mirror. +# Scoped to ringtail only; indri's tailscale-operator/kustomization.yaml still +# pulls from upstream docker.io. A strategic merge patch is used instead of +# kustomize's `images:` directive because that directive only rewrites images +# in standard k8s container fields, not custom-resource fields like +# ProxyClass.spec.statefulSet.pod.tailscaleContainer.image. 
+patches: + - path: proxyclass-image.yaml + target: + group: tailscale.com + version: v1alpha1 + kind: ProxyClass + name: default diff --git a/argocd/manifests/tailscale-operator-ringtail/proxyclass-image.yaml b/argocd/manifests/tailscale-operator-ringtail/proxyclass-image.yaml new file mode 100644 index 0000000..d1bf2a4 --- /dev/null +++ b/argocd/manifests/tailscale-operator-ringtail/proxyclass-image.yaml @@ -0,0 +1,11 @@ +apiVersion: tailscale.com/v1alpha1 +kind: ProxyClass +metadata: + name: default +spec: + statefulSet: + pod: + tailscaleContainer: + image: registry.ops.eblu.me/blumeops/tailscale:v1.94.2-0108b68-nix + tailscaleInitContainer: + image: registry.ops.eblu.me/blumeops/tailscale:v1.94.2-0108b68-nix diff --git a/argocd/manifests/teslamate/deployment.yaml b/argocd/manifests/teslamate-ringtail/deployment.yaml similarity index 81% rename from argocd/manifests/teslamate/deployment.yaml rename to argocd/manifests/teslamate-ringtail/deployment.yaml index 42859a7..cf8cc73 100644 --- a/argocd/manifests/teslamate/deployment.yaml +++ b/argocd/manifests/teslamate-ringtail/deployment.yaml @@ -1,3 +1,10 @@ +# TeslaMate on ringtail k3s — Nix image. +# +# The Nix image's Entrypoint waits for postgres, runs migrations +# (TeslaMate.Release.migrate), then starts the release — so no command +# override is needed. Stateless; all data lives in the teslamate database +# on the ringtail blumeops-pg (DATABASE_HOST already an in-cluster name, +# unchanged from minikube). See [[migrate-wave1-ringtail]]. 
apiVersion: apps/v1 kind: Deployment metadata: diff --git a/argocd/manifests/teslamate/external-secret-db.yaml b/argocd/manifests/teslamate-ringtail/external-secret-db.yaml similarity index 100% rename from argocd/manifests/teslamate/external-secret-db.yaml rename to argocd/manifests/teslamate-ringtail/external-secret-db.yaml diff --git a/argocd/manifests/teslamate/external-secret-encryption-key.yaml b/argocd/manifests/teslamate-ringtail/external-secret-encryption-key.yaml similarity index 100% rename from argocd/manifests/teslamate/external-secret-encryption-key.yaml rename to argocd/manifests/teslamate-ringtail/external-secret-encryption-key.yaml diff --git a/argocd/manifests/teslamate/ingress-tailscale.yaml b/argocd/manifests/teslamate-ringtail/ingress-tailscale.yaml similarity index 100% rename from argocd/manifests/teslamate/ingress-tailscale.yaml rename to argocd/manifests/teslamate-ringtail/ingress-tailscale.yaml diff --git a/argocd/manifests/teslamate/kustomization.yaml b/argocd/manifests/teslamate-ringtail/kustomization.yaml similarity index 90% rename from argocd/manifests/teslamate/kustomization.yaml rename to argocd/manifests/teslamate-ringtail/kustomization.yaml index a00586f..acb623e 100644 --- a/argocd/manifests/teslamate/kustomization.yaml +++ b/argocd/manifests/teslamate-ringtail/kustomization.yaml @@ -12,4 +12,4 @@ resources: images: - name: registry.ops.eblu.me/blumeops/teslamate - newTag: v3.0.0-08c698e + newTag: v3.0.0-fcac8e5-nix diff --git a/argocd/manifests/teslamate/service.yaml b/argocd/manifests/teslamate-ringtail/service.yaml similarity index 100% rename from argocd/manifests/teslamate/service.yaml rename to argocd/manifests/teslamate-ringtail/service.yaml diff --git a/argocd/manifests/teslamate/README.md b/argocd/manifests/teslamate/README.md deleted file mode 100644 index 7e1f9fc..0000000 --- a/argocd/manifests/teslamate/README.md +++ /dev/null @@ -1,69 +0,0 @@ -# TeslaMate - -TeslaMate is a self-hosted Tesla data logger that collects 
and visualizes vehicle data. - -## Prerequisites - -### 1. Create 1Password Secrets - -Create two items in the blumeops 1Password vault: - -1. **TeslaMate DB Password** - - Generate a secure password for the teslamate PostgreSQL user - - Add a field named `password` with the generated value - -2. **TeslaMate Encryption Key** - - Generate with: `openssl rand -base64 32` - - Add a field named `key` with the generated value - - This encrypts Tesla API tokens at rest in the database - -### 2. Apply Kubernetes Secrets - -```bash -# Create namespace -kubectl create namespace teslamate - -# Apply database user secret (for CNPG) -op inject -i argocd/manifests/databases/secret-teslamate.yaml.tpl | kubectl apply -f - - -# Apply teslamate secrets -op inject -i argocd/manifests/teslamate/secret-encryption-key.yaml.tpl | kubectl apply -f - -op inject -i argocd/manifests/teslamate/secret-db.yaml.tpl | kubectl apply -f - -``` - -### 3. Create Database - -After the teslamate user exists in PostgreSQL (sync blumeops-pg first): - -```bash -PGPASSWORD=$(op read "op://blumeops/postgres/password") \ - psql -h pg.ops.eblu.me -U eblume -c "CREATE DATABASE teslamate OWNER teslamate;" -``` - -## Deployment - -```bash -# Sync ArgoCD apps -argocd app sync apps -argocd app sync blumeops-pg teslamate grafana grafana-config -``` - -## Tesla API Setup - -1. Access TeslaMate UI at https://tesla.tail8d86e.ts.net -2. Click "Sign in with Tesla" -3. Complete OAuth flow in browser -4. Tokens are encrypted and stored in database -5. Verify vehicle appears and data collection starts - -## Grafana Dashboards - -TeslaMate dashboards are available in Grafana at https://grafana.tail8d86e.ts.net - -They use the "TeslaMate" PostgreSQL datasource (not Prometheus). 
- -## Notes - -- MQTT is disabled (can be enabled later for Home Assistant integration) -- Timezone is set to America/Los_Angeles -- Encryption key protects Tesla API tokens at rest diff --git a/argocd/manifests/unpoller/kustomization.yaml b/argocd/manifests/unpoller/kustomization.yaml index 5b7a9e2..bf776bb 100644 --- a/argocd/manifests/unpoller/kustomization.yaml +++ b/argocd/manifests/unpoller/kustomization.yaml @@ -10,7 +10,7 @@ resources: images: - name: registry.ops.eblu.me/blumeops/unpoller - newTag: v2.34.0-613f05d + newTag: v3.2.0-4d1f4af configMapGenerator: - name: unpoller-config diff --git a/compensating-controls.yaml b/compensating-controls.yaml deleted file mode 100644 index b441341..0000000 --- a/compensating-controls.yaml +++ /dev/null @@ -1,148 +0,0 @@ -# Compensating Controls -# -# Documents controls that mitigate risks from suppressed or accepted security -# findings. Referenced by security tools (Prowler mutelist, Kingfisher config, -# etc.) via "CC: " in finding descriptions or suppression notes. -# -# Used by `mise run review-compensating-controls` to surface stale controls. -# -# Fields: -# id - kebab-case unique identifier, referenced from tool configs -# description - what the control actually does to mitigate risk -# created - date (YYYY-MM-DD) the control was documented -# last-reviewed - date (YYYY-MM-DD) or null -# notes - optional context - -controls: - - id: single-user-cluster - description: >- - Only the cluster operator (eblume) has kubectl access. No untrusted - users can create pods, access cached images, or bind RBAC roles. - created: 2026-03-30 - last-reviewed: 2026-04-01 - notes: >- - Verify by checking kubeconfig distribution and Tailscale ACLs. - If additional users gain cluster access, re-evaluate all findings - muted under this control. - - - id: tailscale-network-isolation - description: >- - Cluster is not internet-exposed. All access requires Tailscale - identity with ACL enforcement. 
Profiling endpoints, debug ports, - and control-plane APIs are unreachable from the public internet. - created: 2026-03-30 - last-reviewed: 2026-04-06 - notes: >- - Verify with 'tailscale serve status --json' on indri and review - Tailscale ACLs in pulumi/tailscale/. Only tag:flyio-target services - are publicly routable. - - - id: local-registry - description: >- - Operator-built services use a private zot registry - (registry.ops.eblu.me) for supply-chain control. Remaining - images are pulled from public registries without stored - credentials. No shared registry secrets are cached on cluster - nodes. - created: 2026-03-30 - last-reviewed: 2026-04-12 - notes: >- - Verify by checking image prefixes in kustomization.yaml files. - Known external-image categories: (1) upstream apps not yet - mirrored — immich, ollama, frigate, frigate-notify, valkey; - (2) infrastructure components — tailscale operator/proxy, - external-secrets, 1password-connect, forgejo-runner, docker - DinD, nvidia-device-plugin; (3) utility base images — busybox, - alpine (grafana init containers). Track upstream versions in - service-versions.yaml. Goal is to progressively mirror these - into zot. - - - id: sso-gated-admin-tools - description: >- - ArgoCD requires SSO authentication via Authentik OIDC. Wildcard - RBAC roles are mitigated by requiring authenticated identity - before any API access. - created: 2026-03-30 - last-reviewed: 2026-04-14 - notes: >- - Verify Authentik OIDC provider config for ArgoCD and that - anonymous access is disabled. Check ArgoCD --auth-token isn't - leaked. The workflow-bot API key account is scoped to sync/get - only. - - - id: operator-managed-pods - description: >- - Tailscale operator manages proxy pod specs (ts-*, ingress-*, - operator-*, nameserver-*). Pod security settings are set by the - operator, not user manifests. Operator is tracked in - service-versions.yaml and regularly updated. 
- created: 2026-03-30 - last-reviewed: 2026-03-30 - notes: >- - Verify operator version is current via 'mise run service-review'. - Check Tailscale changelog for security fixes. If operator adds - seccomp support, remove these mutes. - - - id: ephemeral-privileged-jobs - description: >- - Prowler CIS scanner runs as a CronJob with 7-day TTL - auto-deletion, not as a persistent privileged workload. hostPID - exposure is time-bounded to scan duration (~20s). - created: 2026-03-30 - last-reviewed: 2026-03-30 - notes: >- - Verify TTL is set in cronjob.yaml. Check that no persistent - pods run with hostPID. - - - id: trusted-ci-only - description: >- - Forgejo runner only executes workflows from repos on the private - forge (forge.ops.eblu.me). No external or untrusted repos can - trigger privileged CI jobs. - created: 2026-03-30 - last-reviewed: 2026-03-30 - notes: >- - Verify runner registration is limited to the forge instance. - Check Forgejo runner config for repo allow-lists. - - - id: init-container-isolation - description: >- - Root privileges and added capabilities (CHOWN) are limited to - init containers that run once at pod startup. All runtime - containers run as non-root (UID 472) with all capabilities - dropped. - created: 2026-03-30 - last-reviewed: 2026-03-30 - notes: >- - Verify by inspecting grafana deployment.yaml securityContext - for both init and runtime containers. If fsGroup alone can - handle PVC ownership, remove init-chown-data and this control. - - - id: node-config-automated-verification - description: >- - Prowler reports certain node-level checks as MANUAL because it runs - inside a pod and cannot evaluate kubelet file permissions, kubelet - config arguments, etcd CA separation, or cluster-admin RBAC bindings. - The review-compliance-reports script SSHes into the minikube node - weekly and programmatically verifies each condition, failing loudly - if any check deviates from expected values. 
- created: 2026-04-14 - last-reviewed: 2026-04-14 - notes: >- - Verification runs as part of 'mise run review-compliance-reports'. - If minikube node is unreachable, all checks report as FAIL. If new - MANUAL findings appear in Prowler, add corresponding verification - logic to the script and update the mutelist. - - - id: observability-stack-audit - description: >- - Alloy collects pod logs and ships them to Loki, providing an - audit trail for cluster activity. Compensates for missing - apiserver audit logging which minikube does not configure. - created: 2026-03-30 - last-reviewed: 2026-03-30 - notes: >- - Verify Alloy DaemonSet is running and Loki is receiving logs. - Note this is weaker than native apiserver audit logs — it - captures pod stdout/stderr, not API request-level auditing. - Consider enabling minikube audit logging if supported. diff --git a/containers/alloy/Dockerfile b/containers/alloy/Dockerfile deleted file mode 100644 index f2f30f6..0000000 --- a/containers/alloy/Dockerfile +++ /dev/null @@ -1,68 +0,0 @@ -# Grafana Alloy telemetry collector -# Three-stage build: Web UI (Node), server (Go), runtime (Alpine) - -ARG CONTAINER_APP_VERSION=1.14.0 -ARG ALLOY_VERSION=v${CONTAINER_APP_VERSION} -ARG ALLOY_COMMIT=626a738319812d58ebc25ca6d71651f4925b8b18 - -FROM node:22-alpine AS ui-build - -ARG ALLOY_COMMIT -RUN apk add --no-cache git - -RUN mkdir /app && cd /app \ - && git init \ - && git remote add origin https://forge.ops.eblu.me/mirrors/alloy.git \ - && git fetch --depth 1 origin ${ALLOY_COMMIT} \ - && git checkout FETCH_HEAD - -WORKDIR /app/internal/web/ui -RUN npm ci -RUN npx tsc -b && npx vite build - -FROM golang:1.25-alpine3.22 AS build - -ARG ALLOY_VERSION -ARG ALLOY_COMMIT -RUN apk add --no-cache build-base git - -RUN mkdir /app && cd /app \ - && git init \ - && git remote add origin https://forge.ops.eblu.me/mirrors/alloy.git \ - && git fetch --depth 1 origin ${ALLOY_COMMIT} \ - && git checkout FETCH_HEAD - -WORKDIR /app - -# Copy pre-built 
web UI assets -COPY --from=ui-build /app/internal/web/ui/dist /app/internal/web/ui/dist - -ENV CGO_ENABLED=1 - -# promtail_journal_enabled omitted: requires systemd headers (libsystemd-dev) -# and our k8s deployments read pod logs from the filesystem, not journald -RUN RELEASE_BUILD=1 VERSION=${ALLOY_VERSION} \ - GO_TAGS="netgo embedalloyui" \ - SKIP_UI_BUILD=1 \ - make alloy - -FROM alpine:3.22 - -ARG CONTAINER_APP_VERSION -LABEL org.opencontainers.image.title="Alloy" -LABEL org.opencontainers.image.description="Grafana Alloy is an OpenTelemetry Collector distribution" -LABEL org.opencontainers.image.version="${CONTAINER_APP_VERSION}" -LABEL org.opencontainers.image.source="https://forge.eblu.me/eblume/blumeops" -LABEL org.opencontainers.image.vendor="blumeops" - -RUN apk --no-cache add ca-certificates tzdata \ - && addgroup -g 473 alloy \ - && adduser -D -u 473 -G alloy alloy \ - && mkdir -p /var/lib/alloy/data \ - && chown -R alloy:alloy /var/lib/alloy - -COPY --from=build --chown=473:473 /app/build/alloy /bin/alloy - -ENTRYPOINT ["/bin/alloy"] -ENV ALLOY_DEPLOY_MODE=docker -CMD ["run", "/etc/alloy/config.alloy", "--storage.path=/var/lib/alloy/data"] diff --git a/containers/alloy/container.py b/containers/alloy/container.py new file mode 100644 index 0000000..41d3995 --- /dev/null +++ b/containers/alloy/container.py @@ -0,0 +1,95 @@ +"""Grafana Alloy — telemetry collector, native Dagger build. + +Three-stage build: Node (UI), Go (server via upstream Makefile with embedded +UI assets), Alpine (runtime). Source cloned from forge mirror. + +Notes: + - Builds via `make alloy` rather than plain `go build` so version stamping, + release flags, and the netgo+embedalloyui tags match upstream releases. + - promtail_journal_enabled is intentionally omitted: it requires + libsystemd-dev and our k8s deployments read pod logs from the filesystem, + not journald. 
+ - Uses golang:alpine3.23 (currently Go 1.26.2 — matches alloy v1.16.0's + go.mod toolchain requirement and the go_build helper's image choice). +""" + +import dagger +from dagger import dag + +from blumeops.containers import ( + alpine_runtime, + clone_from_forge, + node_build, + oci_labels, +) + +VERSION = "v1.16.0" + + +async def build(src: dagger.Directory) -> dagger.Container: + source = clone_from_forge("alloy", VERSION) + + # Stage 1: Build the web UI (tsc + vite, not the package.json default). + ui = node_build( + source, + "internal/web/ui", + build_cmd=["sh", "-c", "npx tsc -b && npx vite build"], + ) + + # Stage 2: Build alloy via the upstream Makefile with embedded UI assets. + builder = ( + dag.container() + .from_("golang:alpine3.23") + .with_exec(["apk", "add", "--no-cache", "build-base", "git", "make"]) + .with_directory("/app", source) + .with_directory( + "/app/internal/web/ui/dist", + ui.directory("/app/internal/web/ui/dist"), + ) + .with_workdir("/app") + .with_env_variable("CGO_ENABLED", "1") + .with_env_variable("RELEASE_BUILD", "1") + .with_env_variable("VERSION", VERSION) + .with_env_variable("GO_TAGS", "netgo embedalloyui") + .with_env_variable("SKIP_UI_BUILD", "1") + .with_exec(["make", "alloy"]) + ) + + # Stage 3: Runtime as uid/gid 473 alloy. 
+ runtime = alpine_runtime( + extra_apk=["ca-certificates", "tzdata"], + uid=473, + gid=473, + username="alloy", + ) + runtime = oci_labels( + runtime, + title="Alloy", + description="Grafana Alloy is an OpenTelemetry Collector distribution", + version=VERSION, + ) + return ( + runtime.with_file( + "/bin/alloy", + builder.file("/app/build/alloy"), + permissions=0o555, + ) + .with_exec( + [ + "sh", + "-c", + "mkdir -p /var/lib/alloy/data && chown -R alloy:alloy /var/lib/alloy", + ] + ) + .with_env_variable("ALLOY_DEPLOY_MODE", "docker") + .with_exposed_port(12345) + .with_user("alloy") + .with_entrypoint(["/bin/alloy"]) + .with_default_args( + args=[ + "run", + "/etc/alloy/config.alloy", + "--storage.path=/var/lib/alloy/data", + ] + ) + ) diff --git a/containers/alloy/default.nix b/containers/alloy/default.nix index e508a10..c884704 100644 --- a/containers/alloy/default.nix +++ b/containers/alloy/default.nix @@ -1,24 +1,24 @@ # Nix-built Grafana Alloy telemetry collector -# Builds v1.14.0 from forge mirror with embedded web UI +# Builds v1.16.0 from forge mirror with embedded web UI # Uses stdenv + make (not buildGoModule) due to multi-module workspace # with local replace directives (collector/ -> ../, ../syntax, ../extension) # Built with dockerTools.buildLayeredImage for efficient layer caching { pkgs ? 
import { } }: let - version = "1.14.0"; + version = "1.16.0"; src = pkgs.fetchgit { url = "https://forge.ops.eblu.me/mirrors/alloy.git"; rev = "v${version}"; - hash = "sha256-gxNz4XDE8XSl6LsP3k8DERqDdMLcmbWKfXZGGyRULkg="; + hash = "sha256-q5R2noxBZ3OPyZqmB+bx3iJKWFxC2WIprcgh9RwjLzk="; }; ui = pkgs.buildNpmPackage { inherit version; pname = "alloy-ui"; src = "${src}/internal/web/ui"; - npmDepsHash = "sha256-GT0yisPn+3FCtWL3he0i5zPMlaWNparQDefU69G4Yis="; + npmDepsHash = "sha256-vResNUT4auDsK9ngnJYfMUUOYr/ikPhrvakqCjGq2Q8="; buildPhase = '' runHook preBuild @@ -40,11 +40,12 @@ let pname = "alloy-go-modules"; inherit src version; - nativeBuildInputs = with pkgs; [ go git cacert ]; + nativeBuildInputs = with pkgs; [ go_1_26 git cacert ]; buildPhase = '' export GOPATH=$TMPDIR/go export GOFLAGS=-modcacherw + export GOTOOLCHAIN=local # Download modules for all three go.mod files go mod download cd syntax && go mod download && cd .. @@ -56,7 +57,7 @@ let ''; outputHashMode = "recursive"; - outputHash = "sha256-rD7zqomSVv4d8NaC7jXXgihuQvK8guaAN0KrsBRWMVQ="; + outputHash = "sha256-9/v85HyDInJB+9qHauKVuDol6Yf5mkXfMWgCr7RdRTk="; outputHashAlgo = "sha256"; }; @@ -65,7 +66,7 @@ let pname = "alloy"; nativeBuildInputs = with pkgs; [ - go + go_1_26 git gnumake cacert @@ -77,6 +78,7 @@ let export HOME=$TMPDIR export GOPATH=$TMPDIR/go export GOFLAGS=-modcacherw + export GOTOOLCHAIN=local # Populate module cache from pre-fetched modules mkdir -p $GOPATH/pkg diff --git a/containers/cv/Dockerfile b/containers/cv/Dockerfile deleted file mode 100644 index 9bfebe0..0000000 --- a/containers/cv/Dockerfile +++ /dev/null @@ -1,30 +0,0 @@ -# CV/Resume Static Site Server -# Downloads and serves a CV site tarball (HTML+CSS+PDF) via nginx -# -# Configuration (via environment): -# CV_RELEASE_URL - URL to download the CV content tarball -# -# The container downloads the tarball on startup, extracts it, and serves with nginx. 
- -ARG CONTAINER_APP_VERSION=1.0.3 - -FROM nginx:alpine - -ARG CONTAINER_APP_VERSION -LABEL org.opencontainers.image.title="CV" -LABEL org.opencontainers.image.description="Static site server for CV/resume" -LABEL org.opencontainers.image.version="${CONTAINER_APP_VERSION}" -LABEL org.opencontainers.image.source="https://forge.eblu.me/eblume/blumeops" -LABEL org.opencontainers.image.vendor="blumeops" - -# Install curl for downloading release assets -RUN apk add --no-cache curl - -# Copy startup script and nginx config -COPY start.sh /start.sh -COPY default.conf /etc/nginx/conf.d/default.conf -RUN chmod +x /start.sh - -EXPOSE 80 - -CMD ["/start.sh"] diff --git a/containers/cv/default.conf b/containers/cv/default.conf deleted file mode 100644 index 7c89b08..0000000 --- a/containers/cv/default.conf +++ /dev/null @@ -1,33 +0,0 @@ -server { - listen 80; - server_name _; - root /usr/share/nginx/html; - index index.html; - - # Enable gzip compression - gzip on; - gzip_types text/plain text/css application/json application/javascript text/xml application/xml text/javascript; - - # Cache static assets - location ~* \.(css|png|jpg|jpeg|gif|ico|svg|woff|woff2)$ { - expires 1y; - add_header Cache-Control "public, immutable"; - } - - # Force PDF download - location = /resume.pdf { - add_header Content-Disposition 'attachment; filename="erich-blume-resume.pdf"'; - } - - # Serve files directly - location / { - try_files $uri $uri/ =404; - } - - # Health check endpoint - location /healthz { - access_log off; - return 200 "ok\n"; - add_header Content-Type text/plain; - } -} diff --git a/containers/cv/start.sh b/containers/cv/start.sh deleted file mode 100644 index bb81c20..0000000 --- a/containers/cv/start.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/sh -set -e - -HTML_DIR="/usr/share/nginx/html" - -# Check for required environment variable -if [ -z "$CV_RELEASE_URL" ]; then - echo "Error: CV_RELEASE_URL environment variable is required" - echo "Set it to the URL of the CV content 
tarball to serve" - exit 1 -fi - -echo "Downloading CV content from: $CV_RELEASE_URL" - -# Download the tarball -if ! curl -fsSL "$CV_RELEASE_URL" -o /tmp/cv.tar.gz; then - echo "Error: Failed to download CV content from $CV_RELEASE_URL" - exit 1 -fi - -# Clear existing content and extract -rm -rf "${HTML_DIR:?}"/* -echo "Extracting CV content to $HTML_DIR" -tar -xzf /tmp/cv.tar.gz -C "$HTML_DIR" -rm /tmp/cv.tar.gz - -echo "CV content extracted successfully" -echo "Starting nginx..." - -# Start nginx in foreground -exec nginx -g "daemon off;" diff --git a/containers/devpi/container.py b/containers/devpi/container.py deleted file mode 100644 index 0067e95..0000000 --- a/containers/devpi/container.py +++ /dev/null @@ -1,56 +0,0 @@ -"""devpi PyPI server and caching proxy — native Dagger build. - -Single-stage build: install devpi-server and devpi-web into a Python slim image. -""" - -import dagger -from dagger import dag - -from blumeops.containers import oci_labels - -VERSION = "6.19.3" - -DEVPI_WEB_VERSION = "5.0.2" -PYTHON_BASE = "python:3.12-slim" - - -async def build(src: dagger.Directory) -> dagger.Container: - ctr = ( - dag.container() - .from_(PYTHON_BASE) - .with_exec( - [ - "pip", - "install", - "--no-cache-dir", - f"devpi-server=={VERSION}", - f"devpi-web=={DEVPI_WEB_VERSION}", - ] - ) - .with_exec( - [ - "useradd", - "-r", - "-u", - "1000", - "devpi", - ] - ) - .with_exec(["mkdir", "-p", "/devpi"]) - .with_exec(["chown", "devpi:devpi", "/devpi"]) - .with_file( - "/usr/local/bin/start.sh", - src.file("containers/devpi/start.sh"), - ) - .with_exec(["chmod", "+x", "/usr/local/bin/start.sh"]) - .with_user("devpi") - .with_workdir("/devpi") - .with_exposed_port(3141) - .with_entrypoint(["/usr/local/bin/start.sh"]) - ) - return oci_labels( - ctr, - title="devpi", - description="devpi PyPI server and caching proxy", - version=VERSION, - ) diff --git a/containers/devpi/start.sh b/containers/devpi/start.sh deleted file mode 100644 index 8ed46a2..0000000 --- 
a/containers/devpi/start.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash -set -e - -SERVERDIR="${DEVPI_SERVERDIR:-/devpi}" -HOST="${DEVPI_HOST:-0.0.0.0}" -# Note: Can't use DEVPI_PORT - Kubernetes auto-sets it for service discovery -PORT="${DEVPI_LISTEN_PORT:-3141}" -OUTSIDE_URL="${DEVPI_OUTSIDE_URL:-}" - -# Check if devpi is initialized -if [ ! -f "$SERVERDIR/.serverversion" ]; then - echo "Initializing devpi server..." - - if [ -z "$DEVPI_ROOT_PASSWORD" ]; then - echo "ERROR: DEVPI_ROOT_PASSWORD environment variable must be set for initialization" - exit 1 - fi - - devpi-init --serverdir "$SERVERDIR" --root-passwd "$DEVPI_ROOT_PASSWORD" - echo "Devpi initialized successfully" -fi - -# Build command -CMD=(devpi-server --serverdir "$SERVERDIR" --host "$HOST" --port "$PORT") - -if [ -n "$OUTSIDE_URL" ]; then - CMD+=(--outside-url "$OUTSIDE_URL") -fi - -echo "Starting devpi-server..." -exec "${CMD[@]}" diff --git a/containers/external-secrets/container.py b/containers/external-secrets/container.py new file mode 100644 index 0000000..6be5765 --- /dev/null +++ b/containers/external-secrets/container.py @@ -0,0 +1,51 @@ +"""External Secrets Operator — native Dagger build. + +Two-stage build: Go binary (all providers), Alpine runtime. +Source cloned from forge mirror. + +A single binary serves as the controller, webhook, and cert-controller; the +Deployments select the role via a subcommand passed in `args:`, so the image +ENTRYPOINT must be the binary itself (matching upstream's distroless image). +""" + +import dagger + +from blumeops.containers import ( + alpine_runtime, + clone_from_forge, + go_build, + oci_labels, +) + +VERSION = "v2.2.0" + + +async def build(src: dagger.Directory) -> dagger.Container: + source = clone_from_forge("external-secrets", VERSION) + + # Upstream `make build` compiles every secret provider into a single + # static binary (`-tags all_providers`, CGO disabled). 
Mirror that so the + # local image is functionally identical to ghcr.io/.../external-secrets. + backend = go_build( + source, + "/external-secrets", + tags="all_providers", + ) + + runtime = alpine_runtime( + extra_apk=["ca-certificates"], + create_user=False, + ) + runtime = oci_labels( + runtime, + title="External Secrets Operator", + description=( + "Kubernetes operator that integrates external secret management systems" + ), + version=VERSION, + ) + return ( + runtime.with_file("/bin/external-secrets", backend.file("/external-secrets")) + .with_user("65534") + .with_entrypoint(["/bin/external-secrets"]) + ) diff --git a/containers/external-secrets/default.nix b/containers/external-secrets/default.nix new file mode 100644 index 0000000..eabe03d --- /dev/null +++ b/containers/external-secrets/default.nix @@ -0,0 +1,56 @@ +# Nix-built External Secrets Operator (amd64, for ringtail k3s). +# Builds v2.2.0 from the forge mirror with all secret providers compiled in, +# faithful to upstream's `make build` (-tags all_providers). The container.py +# sibling builds the arm64 image for indri's minikube; this default.nix builds +# the amd64 image on ringtail's nix-container-builder. +{ pkgs ? import { } }: + +let + version = "2.2.0"; + + src = pkgs.fetchgit { + url = "https://forge.ops.eblu.me/mirrors/external-secrets.git"; + rev = "v${version}"; + hash = "sha256-eAocOAp5s4CFRrpKfQr2lf3Ji+6nQQ1A5/eTw5B7v9U="; + }; + + # external-secrets v2.2.0 requires Go >= 1.26.1; nixpkgs default go is 1.25.x. + external-secrets = (pkgs.buildGoModule.override { go = pkgs.go_1_26; }) { + inherit src version; + pname = "external-secrets"; + vendorHash = "sha256-0xuBK3fjAplPLAElHvKB6d+2lDz+De/s91fV4dPZwjE="; + + doCheck = false; + + subPackages = [ "." 
]; + + tags = [ "all_providers" ]; + + ldflags = [ "-s" "-w" ]; + + meta = with pkgs.lib; { + description = "Kubernetes operator that integrates external secret management systems"; + homepage = "https://github.com/external-secrets/external-secrets"; + license = licenses.asl20; + mainProgram = "external-secrets"; + }; + }; +in + +pkgs.dockerTools.buildLayeredImage { + name = "blumeops/external-secrets"; + contents = [ + external-secrets + pkgs.cacert + pkgs.tzdata + ]; + + config = { + Entrypoint = [ "${external-secrets}/bin/external-secrets" ]; + Env = [ + "SSL_CERT_FILE=${pkgs.cacert}/etc/ssl/certs/ca-bundle.crt" + "TZDIR=${pkgs.tzdata}/share/zoneinfo" + ]; + User = "65534"; + }; +} diff --git a/containers/forgejo-runner/container.py b/containers/forgejo-runner/container.py index ffaca88..dfb2edf 100644 --- a/containers/forgejo-runner/container.py +++ b/containers/forgejo-runner/container.py @@ -13,7 +13,7 @@ from blumeops.containers import ( oci_labels, ) -VERSION = "12.7.3" +VERSION = "12.8.2" async def build(src: dagger.Directory) -> dagger.Container: @@ -34,7 +34,7 @@ async def build(src: dagger.Directory) -> dagger.Container: # Stage 2: Runtime runtime = alpine_runtime( - extra_apk=["git", "bash", "ca-certificates"], + extra_apk=["git", "bash", "ca-certificates", "gettext-envsubst"], uid=1000, gid=1000, username="runner", diff --git a/containers/frigate-notify/default.nix b/containers/frigate-notify/default.nix new file mode 100644 index 0000000..701b194 --- /dev/null +++ b/containers/frigate-notify/default.nix @@ -0,0 +1,66 @@ +# Nix-built frigate-notify — polls Frigate webapi and pushes alerts to ntfy. +{ pkgs ? 
import { } }: + +let + version = "0.5.4"; + + src = pkgs.fetchgit { + url = "https://forge.ops.eblu.me/mirrors/frigate-notify.git"; + rev = "v${version}"; + hash = "sha256-c/QOSQNNJ+ElMDm45lBOsru/ujBhCWethiRefj3hBOk="; + }; + + frigate-notify = pkgs.buildGoModule { + inherit src version; + pname = "frigate-notify"; + + vendorHash = "sha256-Ho9oaK01wJDPf3ufV2klV1dG4qFNVNJkWmWvEgAy10s="; + + doCheck = false; + subPackages = [ "." ]; + + # `goolm` swaps the matrix crypto backend from libolm (CGO) to pure-Go olm, + # avoiding the libolm.h dependency. Our deployment doesn't use matrix, but + # the package is imported unconditionally. + tags = [ "goolm" ]; + + ldflags = [ "-s" "-w" ]; + + meta = with pkgs.lib; { + description = "Bridge between Frigate NVR events and notification services"; + homepage = "https://github.com/0x2142/frigate-notify"; + license = licenses.mit; + mainProgram = "frigate-notify"; + }; + }; +in + +pkgs.dockerTools.buildLayeredImage { + name = "blumeops/frigate-notify"; + contents = [ + frigate-notify + pkgs.cacert + pkgs.tzdata + ]; + + # Upstream Dockerfile expects WORKDIR=/app (config at ./config.yml, logfile at + # ./log/app.log via lumberjack). Create /app world-writable so nonroot can + # write logs; the config is mounted in from a ConfigMap. 
+ extraCommands = '' + mkdir -p app + chmod 1777 app + ''; + + config = { + Entrypoint = [ "${frigate-notify}/bin/frigate-notify" ]; + WorkingDir = "/app"; + Env = [ + "SSL_CERT_FILE=${pkgs.cacert}/etc/ssl/certs/ca-bundle.crt" + "TZDIR=${pkgs.tzdata}/share/zoneinfo" + ]; + ExposedPorts = { + "8000/tcp" = { }; + }; + User = "65534"; + }; +} diff --git a/containers/homepage/Dockerfile b/containers/homepage/Dockerfile deleted file mode 100644 index 6e53e1c..0000000 --- a/containers/homepage/Dockerfile +++ /dev/null @@ -1,47 +0,0 @@ -# Homepage - self-hosted services dashboard -# Two-stage build: Node.js build, Alpine runtime - -ARG CONTAINER_APP_VERSION=v1.11.0 -ARG HOMEPAGE_VERSION=${CONTAINER_APP_VERSION} - -FROM node:24-slim AS builder - -ARG HOMEPAGE_VERSION -RUN apt-get update && apt-get install -y --no-install-recommends git ca-certificates \ - && rm -rf /var/lib/apt/lists/* - -RUN git clone --depth 1 --branch ${HOMEPAGE_VERSION} \ - https://forge.ops.eblu.me/mirrors/homepage.git /app - -WORKDIR /app -RUN mkdir -p config \ - && corepack enable && corepack prepare pnpm@latest --activate \ - && pnpm install --frozen-lockfile \ - && NEXT_TELEMETRY_DISABLED=1 pnpm run build - -FROM node:24-alpine - -ARG CONTAINER_APP_VERSION -LABEL org.opencontainers.image.title="Homepage" -LABEL org.opencontainers.image.description="A self-hosted services landing page" -LABEL org.opencontainers.image.version="${CONTAINER_APP_VERSION}" -LABEL org.opencontainers.image.source="https://forge.eblu.me/eblume/blumeops" -LABEL org.opencontainers.image.vendor="blumeops" - -WORKDIR /app - -COPY --from=builder --chown=1000:1000 /app/public ./public -COPY --from=builder --chown=1000:1000 /app/.next/standalone/ ./ -COPY --from=builder --chown=1000:1000 /app/.next/static/ ./.next/static - -RUN mkdir -p /app/config && chown 1000:1000 /app/config - -ENV NODE_ENV=production -ENV PORT=3000 -EXPOSE 3000 - -HEALTHCHECK --interval=10s --timeout=3s --start-period=20s \ - CMD wget --no-verbose --tries=1 
--spider http://127.0.0.1:3000/api/healthcheck || exit 1 - -USER 1000 -CMD ["node", "server.js"] diff --git a/containers/homepage/default.nix b/containers/homepage/default.nix new file mode 100644 index 0000000..6217847 --- /dev/null +++ b/containers/homepage/default.nix @@ -0,0 +1,130 @@ +# Nix-built gethomepage/homepage dashboard +# Builds v1.11.0 from forge mirror. +# +# Adapted from nixpkgs pkgs/by-name/ho/homepage-dashboard (commit master), +# changed to fetch from our forge mirror and wrap with dockerTools for an +# amd64 image runnable on ringtail's k3s. +# +# The preBuild substitutions are not optional — without them Next.js writes +# its file-system-cache to a read-only path and prerender state breaks after +# restart (nixpkgs issues #328621 and #458494). +{ pkgs ? import { } }: + +let + version = "1.11.0"; + + homepage = pkgs.stdenv.mkDerivation (finalAttrs: { + pname = "homepage-dashboard"; + inherit version; + + src = pkgs.fetchgit { + url = "https://forge.ops.eblu.me/mirrors/homepage.git"; + rev = "v${version}"; + hash = "sha256-jnv9PnClm/jIQ4uU6c4A1UiAmwoihG0l6k3fUbD47I4="; + }; + + pnpmDeps = pkgs.fetchPnpmDeps { + inherit (finalAttrs) pname version src; + pnpm = pkgs.pnpm_10; + fetcherVersion = 3; + hash = "sha256-X5j9XppbcasGuC7fUsj4XzbaQFM9WcRcXjgJHN/inR8="; + }; + + nativeBuildInputs = [ + pkgs.makeBinaryWrapper + pkgs.nodejs_24 + pkgs.pnpmConfigHook + pkgs.pnpm_10 + ]; + + buildInputs = [ + pkgs.nodePackages.node-gyp-build + ]; + + env.PYTHON = "${pkgs.python3}/bin/python"; + + preBuild = '' + substituteInPlace node_modules/next/dist/server/lib/incremental-cache/file-system-cache.js \ + --replace-fail 'this.serverDistDir = ctx.serverDistDir;' \ + 'this.serverDistDir = require("path").join((process.env.NIXPKGS_HOMEPAGE_CACHE_DIR || "/tmp/homepage-cache"), "homepage");' + + for bundle in node_modules/next/dist/compiled/next-server/*.runtime.prod.js; do + substituteInPlace "$bundle" \ + --replace-fail 'this.serverDistDir=e.serverDistDir' \ + 
'this.serverDistDir=(process.env.NIXPKGS_HOMEPAGE_CACHE_DIR||"/tmp/homepage-cache")+"/homepage"' + done + ''; + + buildPhase = '' + runHook preBuild + mkdir -p config + pnpm build + runHook postBuild + ''; + + installPhase = '' + runHook preInstall + + mkdir -p $out/{bin,share} + cp -r .next/standalone $out/share/homepage/ + cp -r public $out/share/homepage/public + chmod +x $out/share/homepage/server.js + + mkdir -p $out/share/homepage/.next + cp -r .next/static $out/share/homepage/.next/static + + makeWrapper "${pkgs.lib.getExe pkgs.nodejs_24}" $out/bin/homepage \ + --set-default PORT 3000 \ + --set-default HOMEPAGE_CONFIG_DIR /app/config \ + --set-default NIXPKGS_HOMEPAGE_CACHE_DIR /tmp/homepage-cache \ + --add-flags "$out/share/homepage/server.js" \ + --prefix PATH : "${pkgs.lib.makeBinPath [ pkgs.unixtools.ping ]}" + + runHook postInstall + ''; + + doDist = false; + }); +in + +pkgs.dockerTools.buildLayeredImage { + name = "blumeops/homepage"; + contents = [ + homepage + pkgs.cacert + pkgs.tzdata + ]; + + extraCommands = '' + mkdir -p tmp + chmod 1777 tmp + ''; + + # /app/config must be writable by the runtime user (1000): homepage seeds + # missing skeleton configs (proxmox.yaml, etc.) and writes /app/config/logs. + # The deployment mounts ConfigMap files at /app/config/.yaml via + # subPath, which leaves the parent dir as image filesystem — so its + # ownership has to be set at build time. 
+ fakeRootCommands = '' + mkdir -p app/config + chown -R 1000:1000 app + ''; + enableFakechroot = true; + + config = { + Entrypoint = [ "${homepage}/bin/homepage" ]; + Env = [ + "SSL_CERT_FILE=${pkgs.cacert}/etc/ssl/certs/ca-bundle.crt" + "TZDIR=${pkgs.tzdata}/share/zoneinfo" + "TMPDIR=/tmp" + "NIXPKGS_HOMEPAGE_CACHE_DIR=/tmp/homepage-cache" + "HOMEPAGE_CONFIG_DIR=/app/config" + "NEXT_TELEMETRY_DISABLED=1" + "PORT=3000" + ]; + ExposedPorts = { + "3000/tcp" = { }; + }; + User = "1000"; + }; +} diff --git a/containers/mealie/Dockerfile b/containers/mealie/Dockerfile deleted file mode 100644 index 8df38bf..0000000 --- a/containers/mealie/Dockerfile +++ /dev/null @@ -1,145 +0,0 @@ -# Mealie — self-hosted recipe manager -# Built from source via forge mirror of mealie-recipes/mealie -# Based on upstream docker/Dockerfile (multi-stage: Node frontend + Python backend) - -ARG CONTAINER_APP_VERSION=v3.12.0 - -############################################### -# Frontend Build -############################################### -FROM node:24-slim AS frontend-builder - -ARG CONTAINER_APP_VERSION -RUN apt-get update && apt-get install --no-install-recommends -y git ca-certificates && rm -rf /var/lib/apt/lists/* - -RUN git clone --depth 1 --branch ${CONTAINER_APP_VERSION} \ - https://forge.ops.eblu.me/mirrors/mealie.git /src - -WORKDIR /src/frontend - -RUN yarn install \ - --prefer-offline \ - --frozen-lockfile \ - --non-interactive \ - --production=false \ - --network-timeout 1000000 - -RUN yarn generate - -############################################### -# Python Base -############################################### -FROM python:3.12-slim AS python-base - -ENV MEALIE_HOME="/app" -ENV PYTHONUNBUFFERED=1 \ - PYTHONDONTWRITEBYTECODE=1 \ - PIP_NO_CACHE_DIR=off \ - PIP_DISABLE_PIP_VERSION_CHECK=on \ - PIP_DEFAULT_TIMEOUT=100 \ - VENV_PATH="/opt/mealie" - -ENV PATH="$VENV_PATH/bin:$PATH" - -RUN useradd -u 911 -U -d $MEALIE_HOME -s /bin/bash abc \ - && usermod -G users abc \ - && mkdir 
$MEALIE_HOME - -############################################### -# Backend Package Build -############################################### -FROM python-base AS backend-builder - -ARG CONTAINER_APP_VERSION -RUN apt-get update \ - && apt-get install --no-install-recommends -y curl git ca-certificates \ - && rm -rf /var/lib/apt/lists/* - -RUN pip install uv - -RUN git clone --depth 1 --branch ${CONTAINER_APP_VERSION} \ - https://forge.ops.eblu.me/mirrors/mealie.git /src - -WORKDIR /src - -COPY --from=frontend-builder /src/frontend/dist ./mealie/frontend - -RUN uv build --out-dir dist - -RUN uv export --no-editable --no-emit-project --extra pgsql --format requirements-txt --output-file dist/requirements.txt \ - && MEALIE_VERSION=$(python -c "import tomllib; print(tomllib.load(open('pyproject.toml', 'rb'))['project']['version'])") \ - && echo "mealie[pgsql]==${MEALIE_VERSION} \\" >> dist/requirements.txt \ - && pip hash dist/mealie-${MEALIE_VERSION}-py3-none-any.whl | tail -n1 | tr -d '\n' >> dist/requirements.txt \ - && echo " \\" >> dist/requirements.txt \ - && pip hash dist/mealie-${MEALIE_VERSION}.tar.gz | tail -n1 >> dist/requirements.txt - -############################################### -# Python Venv Build -############################################### -FROM python-base AS venv-builder - -RUN apt-get update \ - && apt-get install --no-install-recommends -y \ - build-essential \ - libpq-dev \ - libwebp-dev \ - ffmpeg \ - libsasl2-dev libldap2-dev libssl-dev \ - gnupg gnupg2 gnupg1 \ - && rm -rf /var/lib/apt/lists/* - -RUN python3 -m venv --upgrade-deps $VENV_PATH - -COPY --from=backend-builder /src/dist /dist - -RUN . 
$VENV_PATH/bin/activate \ - && pip install --require-hashes -r /dist/requirements.txt --find-links /dist - -############################################### -# Production Image -############################################### -FROM python-base AS production - -ENV PRODUCTION=true -ENV TESTING=false - -RUN apt-get update \ - && apt-get install --no-install-recommends -y \ - curl \ - ffmpeg \ - gosu \ - iproute2 \ - libldap-common \ - libldap2 \ - && rm -rf /var/lib/apt/lists/* - -RUN mkdir -p /run/secrets - -COPY --from=venv-builder $VENV_PATH $VENV_PATH - -ENV NLTK_DATA="/nltk_data/" -RUN mkdir -p $NLTK_DATA -RUN python -m nltk.downloader -d $NLTK_DATA averaged_perceptron_tagger_eng - -VOLUME ["$MEALIE_HOME/data/"] -ENV APP_PORT=9000 - -EXPOSE ${APP_PORT} - -COPY --from=backend-builder /src/docker/healthcheck.sh $MEALIE_HOME/healthcheck.sh -RUN chmod +x $MEALIE_HOME/healthcheck.sh -HEALTHCHECK CMD $MEALIE_HOME/healthcheck.sh - -ENV HOST=0.0.0.0 - -COPY --from=backend-builder /src/docker/entry.sh $MEALIE_HOME/run.sh -RUN chmod +x $MEALIE_HOME/run.sh - -ARG CONTAINER_APP_VERSION -LABEL org.opencontainers.image.title="Mealie" -LABEL org.opencontainers.image.description="Self-hosted recipe manager" -LABEL org.opencontainers.image.version="${CONTAINER_APP_VERSION}" -LABEL org.opencontainers.image.source="https://forge.eblu.me/eblume/blumeops" -LABEL org.opencontainers.image.vendor="blumeops" - -ENTRYPOINT ["/app/run.sh"] diff --git a/containers/mealie/default.nix b/containers/mealie/default.nix new file mode 100644 index 0000000..e55efe3 --- /dev/null +++ b/containers/mealie/default.nix @@ -0,0 +1,69 @@ +# Nix-built Mealie for ringtail (amd64). +# +# Replaces the from-source Dockerfile build (Node frontend + Python venv) +# with nixpkgs' mealie, which ships a single `mealie` gunicorn entrypoint +# serving the prebuilt frontend + backend — so this is a clean single- +# process wrap (unlike paperless, which is multi-process). 
+# +# Mealie stores its DB as SQLite under DATA_DIR (the mealie-data PVC at +# /app/data); there is no postgres. The run wrapper mirrors the nixpkgs +# mealie NixOS module: run `libexec/init_db` (Alembic migrations) first, +# then exec gunicorn. +# +# Self-pins nixos-unstable: stable nixpkgs lags at 3.9.2, unstable carries +# 3.16.0. This is a forward 4-minor bump from the v3.12.0 Dockerfile build +# (the deferred upgrade) — mealie auto-migrates the SQLite DB forward on +# startup via init_db; the source PVC is retained for rollback. The version +# assertion makes nix-build fail if a pin bump changes the version. +let + nixpkgs = fetchTarball { + url = "https://github.com/NixOS/nixpkgs/archive/331800de5053fcebacf6813adb5db9c9dca22a0c.tar.gz"; + sha256 = "1p54fm6dkbq62kpi55cr4wyx7b1nsajpsnjgs64cmp073fwi15f7"; + }; + pkgs = import nixpkgs { system = "x86_64-linux"; }; + + version = "3.16.0"; + + app = pkgs.mealie; + + # Mirror the NixOS module's mealie service: init_db (Alembic) then + # gunicorn bound to the app port. DATA_DIR/env come from the image + + # k8s manifest. + mealie-run = pkgs.writeShellScriptBin "mealie-run" '' + set -e + ${app}/libexec/init_db + exec ${pkgs.lib.getExe app} -b 0.0.0.0:9000 + ''; +in + +assert app.version == version; + +pkgs.dockerTools.buildLayeredImage { + name = "blumeops/mealie"; + + contents = [ + app + mealie-run + pkgs.bashInteractive + pkgs.coreutils + pkgs.cacert + pkgs.tzdata + # python3 (stdlib sqlite3) for the borgmatic k8s-sqlite-dump helper, + # which runs `python3 -c "...sqlite3...backup..."` inside the pod. + # Same nixpkgs python mealie is built against, so ~no added closure. 
+ pkgs.python3 + ]; + + config = { + Cmd = [ "${mealie-run}/bin/mealie-run" ]; + Env = [ + "DATA_DIR=/app/data" + "SSL_CERT_FILE=${pkgs.cacert}/etc/ssl/certs/ca-bundle.crt" + "PYTHONUNBUFFERED=1" + "PRODUCTION=true" + ]; + ExposedPorts = { + "9000/tcp" = { }; + }; + }; +} diff --git a/containers/paperless/Dockerfile b/containers/paperless/Dockerfile deleted file mode 100644 index a7b4e65..0000000 --- a/containers/paperless/Dockerfile +++ /dev/null @@ -1,156 +0,0 @@ -# syntax=docker/dockerfile:1 -# Paperless-ngx — self-hosted document management -# Built from source via forge mirror of paperless-ngx/paperless-ngx -# Closely follows upstream Dockerfile structure with git clone instead of COPY - -ARG CONTAINER_APP_VERSION=v2.20.13 - -############################################### -# Stage 1: Clone source (reused by later stages) -############################################### -FROM docker.io/library/alpine:3.22 AS source - -ARG CONTAINER_APP_VERSION -RUN apk add --no-cache git -RUN git clone --depth 1 --branch ${CONTAINER_APP_VERSION} \ - https://forge.ops.eblu.me/mirrors/paperless-ngx.git /src - -############################################### -# Stage 2: Compile frontend -############################################### -FROM --platform=$BUILDPLATFORM docker.io/node:20-trixie-slim AS compile-frontend - -COPY --from=source /src/src-ui /src/src-ui -WORKDIR /src/src-ui - -RUN set -eux \ - && npm update -g pnpm \ - && npm install -g corepack@latest \ - && corepack enable \ - && pnpm install - -RUN set -eux \ - && ./node_modules/.bin/ng build --configuration production - -############################################### -# Stage 3: s6-overlay base -############################################### -FROM ghcr.io/astral-sh/uv:0.9.15-python3.12-trixie-slim AS s6-overlay-base - -WORKDIR /usr/src/s6 - -ENV S6_BEHAVIOUR_IF_STAGE2_FAILS=2 \ - S6_CMD_WAIT_FOR_SERVICES_MAXTIME=0 \ - S6_VERBOSITY=1 \ - PATH=/command:$PATH - -ARG TARGETARCH -ARG TARGETVARIANT -ARG 
S6_OVERLAY_VERSION=3.2.1.0 - -RUN set -eux \ - && apt-get update \ - && apt-get install --yes --quiet --no-install-recommends curl xz-utils \ - && S6_ARCH="" \ - && if [ "${TARGETARCH}${TARGETVARIANT}" = "amd64" ]; then S6_ARCH="x86_64"; \ - elif [ "${TARGETARCH}${TARGETVARIANT}" = "arm64" ]; then S6_ARCH="aarch64"; fi \ - && if [ -z "${S6_ARCH}" ]; then echo "Error: Cannot determine arch"; exit 1; fi \ - && curl --fail --silent --show-error --location --remote-name-all --parallel \ - "https://github.com/just-containers/s6-overlay/releases/download/v${S6_OVERLAY_VERSION}/s6-overlay-noarch.tar.xz" \ - "https://github.com/just-containers/s6-overlay/releases/download/v${S6_OVERLAY_VERSION}/s6-overlay-noarch.tar.xz.sha256" \ - "https://github.com/just-containers/s6-overlay/releases/download/v${S6_OVERLAY_VERSION}/s6-overlay-${S6_ARCH}.tar.xz" \ - "https://github.com/just-containers/s6-overlay/releases/download/v${S6_OVERLAY_VERSION}/s6-overlay-${S6_ARCH}.tar.xz.sha256" \ - && sha256sum --check ./*.sha256 \ - && tar --directory / -Jxpf s6-overlay-noarch.tar.xz \ - && tar --directory / -Jxpf s6-overlay-${S6_ARCH}.tar.xz \ - && rm ./*.tar.xz ./*.sha256 \ - && apt-get --yes purge curl xz-utils \ - && apt-get --yes autoremove --purge \ - && rm -rf /var/lib/apt/lists/* - -# Copy rootfs (s6 service definitions, init scripts) -COPY --from=source /src/docker/rootfs / - -############################################### -# Stage 4: Main application -############################################### -FROM s6-overlay-base AS main-app - -ARG CONTAINER_APP_VERSION -ARG DEBIAN_FRONTEND=noninteractive -ARG TARGETARCH -ARG JBIG2ENC_VERSION=0.30 - -ENV PYTHONDONTWRITEBYTECODE=1 \ - PYTHONUNBUFFERED=1 \ - PYTHONWARNINGS="ignore:::django.http.response:517" \ - PNGX_CONTAINERIZED=1 \ - UV_LINK_MODE=copy \ - UV_CACHE_DIR=/cache/uv/ - -# Runtime packages -RUN set -eux \ - && apt-get update \ - && apt-get install --yes --quiet --no-install-recommends \ - curl gosu tzdata fonts-liberation gettext 
ghostscript gnupg \ - icc-profiles-free imagemagick postgresql-client \ - tesseract-ocr tesseract-ocr-eng tesseract-ocr-deu tesseract-ocr-fra \ - tesseract-ocr-ita tesseract-ocr-spa unpaper pngquant jbig2dec \ - libxml2 libxslt1.1 qpdf file libmagic1 media-types zlib1g \ - libzbar0 poppler-utils \ - && curl --fail --silent --show-error --location --remote-name-all \ - "https://github.com/paperless-ngx/builder/releases/download/jbig2enc-trixie-v${JBIG2ENC_VERSION}/jbig2enc_${JBIG2ENC_VERSION}-1_${TARGETARCH}.deb" \ - && dpkg --install ./jbig2enc_${JBIG2ENC_VERSION}-1_${TARGETARCH}.deb \ - && cp /etc/ImageMagick-6/paperless-policy.xml /etc/ImageMagick-6/policy.xml \ - && rm --force *.deb \ - && rm -rf /var/lib/apt/lists/* - -WORKDIR /usr/src/paperless/src/ - -# Python dependencies -COPY --from=source /src/pyproject.toml /src/uv.lock /usr/src/paperless/src/ - -RUN --mount=type=cache,target=${UV_CACHE_DIR},id=python-cache \ - set -eux \ - && apt-get update \ - && apt-get install --yes --quiet --no-install-recommends \ - build-essential default-libmysqlclient-dev pkg-config \ - && uv export --quiet --no-dev --all-extras --format requirements-txt --output-file requirements.txt \ - && uv pip install --system --no-python-downloads --python-preference system --requirements requirements.txt \ - && python3 -W ignore::RuntimeWarning -m nltk.downloader -d "/usr/share/nltk_data" snowball_data \ - && python3 -W ignore::RuntimeWarning -m nltk.downloader -d "/usr/share/nltk_data" stopwords \ - && python3 -W ignore::RuntimeWarning -m nltk.downloader -d "/usr/share/nltk_data" punkt_tab \ - && apt-get --yes purge build-essential default-libmysqlclient-dev pkg-config \ - && apt-get --yes autoremove --purge \ - && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* - -# Copy backend source -COPY --from=source /src/src ./ - -# Copy compiled frontend -COPY --from=compile-frontend /src/src/documents/static/frontend/ ./documents/static/frontend/ - -# Create user and finalize -RUN set -eux \ - && 
addgroup --gid 1000 paperless \ - && useradd --uid 1000 --gid paperless --home-dir /usr/src/paperless paperless \ - && mkdir -p /usr/src/paperless/data /usr/src/paperless/media \ - /usr/src/paperless/consume /usr/src/paperless/export \ - && chown -R paperless:paperless /usr/src/paperless \ - && s6-setuidgid paperless python3 manage.py collectstatic --clear --no-input --link \ - && s6-setuidgid paperless python3 manage.py compilemessages - -VOLUME ["/usr/src/paperless/data", "/usr/src/paperless/media", \ - "/usr/src/paperless/consume", "/usr/src/paperless/export"] - -ENTRYPOINT ["/init"] -EXPOSE 8000 - -HEALTHCHECK --interval=30s --timeout=10s --retries=5 \ - CMD [ "curl", "-fs", "-S", "-L", "--max-time", "2", "http://localhost:8000" ] - -LABEL org.opencontainers.image.title="Paperless-ngx" -LABEL org.opencontainers.image.description="Self-hosted document management system" -LABEL org.opencontainers.image.version="${CONTAINER_APP_VERSION}" -LABEL org.opencontainers.image.source="https://forge.eblu.me/eblume/blumeops" -LABEL org.opencontainers.image.vendor="blumeops" diff --git a/containers/paperless/default.nix b/containers/paperless/default.nix new file mode 100644 index 0000000..734d909 --- /dev/null +++ b/containers/paperless/default.nix @@ -0,0 +1,77 @@ +# Nix-built Paperless-ngx for ringtail (amd64). +# +# Replaces the from-source Dockerfile build (s6-overlay) with nixpkgs' +# paperless-ngx, which already bundles the full OCR/imaging closure +# (tesseract, ghostscript, imagemagick, qpdf, poppler, jbig2enc) and the +# NLTK data via wrappers — so the image stays lean. +# +# Unlike the upstream s6 image, this image does NOT run all processes +# itself. 
Paperless is multi-process; on ringtail it runs as four +# containers sharing this one image, each with a different command: +# web -> paperless-web (granian, the wrapper below) +# worker -> celery --app paperless worker +# beat -> celery --app paperless beat +# consumer -> paperless-ngx document_consumer +# plus a redis/valkey sidecar. The PYTHONPATH/granian invocation mirrors +# the nixpkgs paperless NixOS module's paperless-web service exactly. +# +# Self-pins nixos-unstable: stable nixpkgs lags at 2.19.6, while unstable +# carries 2.20.15 — a same-minor forward patch bump from the previous +# Dockerfile build (v2.20.13). The version assertion makes nix-build fail +# if a pin bump changes the version, forcing an explicit acknowledgment +# here and in service-versions.yaml (enforced by container-version-check). +let + nixpkgs = fetchTarball { + url = "https://github.com/NixOS/nixpkgs/archive/331800de5053fcebacf6813adb5db9c9dca22a0c.tar.gz"; + sha256 = "1p54fm6dkbq62kpi55cr4wyx7b1nsajpsnjgs64cmp073fwi15f7"; + }; + pkgs = import nixpkgs { system = "x86_64-linux"; }; + + version = "2.20.15"; + + app = pkgs.paperless-ngx; + + # Mirror the NixOS module's paperless-web service: granian serving the + # ASGI app with the package's propagated deps + src on PYTHONPATH. 
+ pythonPath = + "${app.python.pkgs.makePythonPath app.propagatedBuildInputs}:${app}/lib/paperless-ngx/src"; + + paperless-web = pkgs.writeShellScriptBin "paperless-web" '' + export PYTHONPATH="${pythonPath}" + export PAPERLESS_NLTK_DIR="${app.nltkDataDir}" + exec ${app.python.pkgs.granian}/bin/granian \ + --interface asginl --ws \ + --host 0.0.0.0 --port 8000 \ + "paperless.asgi:application" + ''; +in + +assert app.version == version; + +pkgs.dockerTools.buildLayeredImage { + name = "blumeops/paperless"; + + contents = [ + app + paperless-web + pkgs.bashInteractive + pkgs.coreutils + pkgs.cacert + pkgs.tzdata + ]; + + config = { + # Default command is the web server; worker/beat/consumer containers + # override `command` in their k8s manifests. + Cmd = [ "${paperless-web}/bin/paperless-web" ]; + Env = [ + "PAPERLESS_NLTK_DIR=${app.nltkDataDir}" + "SSL_CERT_FILE=${pkgs.cacert}/etc/ssl/certs/ca-bundle.crt" + "PYTHONUNBUFFERED=1" + "PNGX_CONTAINERIZED=1" + ]; + ExposedPorts = { + "8000/tcp" = { }; + }; + }; +} diff --git a/containers/prowler/Dockerfile b/containers/prowler/Dockerfile index bd74bdb..c5157cb 100644 --- a/containers/prowler/Dockerfile +++ b/containers/prowler/Dockerfile @@ -44,10 +44,28 @@ RUN ARCH=$(dpkg --print-architecture) \ && apt-get update && apt-get install -y --no-install-recommends wget ca-certificates \ && wget -q "https://github.com/aquasecurity/trivy/releases/download/v${TRIVY_VERSION}/trivy_${TRIVY_VERSION}_${TRIVY_ARCH}.tar.gz" -O /tmp/trivy.tar.gz \ && tar xzf /tmp/trivy.tar.gz -C /usr/local/bin trivy \ - && chmod +x /usr/local/bin/trivy \ + && mv /usr/local/bin/trivy /usr/local/bin/trivy.real \ + && chmod +x /usr/local/bin/trivy.real \ && rm /tmp/trivy.tar.gz \ && apt-get purge -y wget && apt-get autoremove -y && rm -rf /var/lib/apt/lists/* +# Shim: Prowler's IaC provider invokes `trivy fs` directly with no +# --ignorefile flag, so any TRIVY_IGNOREFILE the user sets is ignored. 
+# This wrapper injects --ignorefile when the env var points at a real +# file and the invocation is `trivy fs ...`. Other subcommands and +# global-only invocations (--version, --help) pass through unchanged. +# TODO(upstream): contribute --ignorefile plumbing to prowler-cloud/prowler +# iac_provider.py so this shim isn't necessary. +RUN printf '%s\n' \ + '#!/bin/sh' \ + 'if [ "${1:-}" = "fs" ] && [ -n "${TRIVY_IGNOREFILE:-}" ] && [ -f "${TRIVY_IGNOREFILE}" ]; then' \ + ' shift' \ + ' exec /usr/local/bin/trivy.real fs --ignorefile "${TRIVY_IGNOREFILE}" "$@"' \ + 'fi' \ + 'exec /usr/local/bin/trivy.real "$@"' \ + > /usr/local/bin/trivy \ + && chmod +x /usr/local/bin/trivy + RUN addgroup --gid 1000 prowler \ && adduser --uid 1000 --gid 1000 --disabled-password --gecos "" prowler \ && mkdir -p /tmp/.cache/trivy && chown prowler:prowler /tmp/.cache/trivy diff --git a/containers/quartz/Dockerfile b/containers/quartz/Dockerfile deleted file mode 100644 index 8ffd44c..0000000 --- a/containers/quartz/Dockerfile +++ /dev/null @@ -1,31 +0,0 @@ -# Quartz Static Site Server -# Downloads and serves a Quartz-built static site from a release bundle -# -# Configuration (via environment): -# DOCS_RELEASE_URL - URL to download the static site tarball -# -# The container downloads the tarball on startup, extracts it, and serves with nginx. 
- -ARG CONTAINER_APP_VERSION=1.28.2 -ARG NGINX_VERSION=${CONTAINER_APP_VERSION} - -FROM nginx:${NGINX_VERSION}-alpine - -ARG CONTAINER_APP_VERSION -LABEL org.opencontainers.image.title="Quartz" -LABEL org.opencontainers.image.description="Static site server for Quartz-built documentation" -LABEL org.opencontainers.image.version="${CONTAINER_APP_VERSION}" -LABEL org.opencontainers.image.source="https://forge.eblu.me/eblume/blumeops" -LABEL org.opencontainers.image.vendor="blumeops" - -# Install curl for downloading release assets -RUN apk add --no-cache curl - -# Copy startup script and nginx config -COPY start.sh /start.sh -COPY default.conf /etc/nginx/conf.d/default.conf -RUN chmod +x /start.sh - -EXPOSE 80 - -CMD ["/start.sh"] diff --git a/containers/quartz/default.conf b/containers/quartz/default.conf deleted file mode 100644 index 64eec4e..0000000 --- a/containers/quartz/default.conf +++ /dev/null @@ -1,34 +0,0 @@ -server { - listen 80; - server_name _; - root /usr/share/nginx/html; - index index.html; - - # Enable gzip compression - gzip on; - gzip_types text/plain text/css application/json application/javascript text/xml application/xml text/javascript; - - # Cache static assets - location ~* \.(js|css|png|jpg|jpeg|gif|ico|svg|woff|woff2)$ { - expires 1y; - add_header Cache-Control "public, immutable"; - } - - # Static file serving — no SPA fallback. - # Quartz generates complete HTML for every page, so all valid URLs - # map to real files. Non-existent paths get 404.html (generated by - # Quartz's NotFoundPage plugin), preventing the spider-trap issue - # where crawlers would get index.html for fabricated URLs. 
- location / { - try_files $uri $uri/ $uri.html =404; - } - - error_page 404 /404.html; - - # Health check endpoint - location /healthz { - access_log off; - return 200 "ok\n"; - add_header Content-Type text/plain; - } -} diff --git a/containers/quartz/start.sh b/containers/quartz/start.sh deleted file mode 100644 index 778eeb1..0000000 --- a/containers/quartz/start.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/sh -set -e - -HTML_DIR="/usr/share/nginx/html" - -# Check for required environment variable -if [ -z "$DOCS_RELEASE_URL" ]; then - echo "Error: DOCS_RELEASE_URL environment variable is required" - echo "Set it to the URL of the static site tarball to serve" - exit 1 -fi - -echo "Downloading docs from: $DOCS_RELEASE_URL" - -# Download the tarball -if ! curl -fsSL "$DOCS_RELEASE_URL" -o /tmp/docs.tar.gz; then - echo "Error: Failed to download docs from $DOCS_RELEASE_URL" - exit 1 -fi - -# Clear existing content and extract -rm -rf "${HTML_DIR:?}"/* -echo "Extracting docs to $HTML_DIR" -tar -xzf /tmp/docs.tar.gz -C "$HTML_DIR" -rm /tmp/docs.tar.gz - -echo "Docs extracted successfully" -echo "Starting nginx..." - -# Start nginx in foreground -exec nginx -g "daemon off;" diff --git a/containers/runner-job-image/Dockerfile b/containers/runner-job-image/Dockerfile deleted file mode 100644 index 0018c64..0000000 --- a/containers/runner-job-image/Dockerfile +++ /dev/null @@ -1,84 +0,0 @@ -# Forgejo Actions Job Execution Image -# -# This image is used as the job execution environment for Forgejo Actions. -# The host runner daemon creates containers from this image to run workflow steps. -# -# Build logic (container images, docs site) runs inside Dagger containers, -# so this image only needs: git, Docker CLI, Dagger CLI, ArgoCD CLI, uv, yq, and basic tools. 
-# -# Usage: Configure runner with label like: -# docker:docker://registry.ops.eblu.me/blumeops/runner-job-image:latest - -ARG CONTAINER_APP_VERSION=0.20.1 - -FROM debian:bookworm-slim - -ARG TARGETARCH -ARG CONTAINER_APP_VERSION -ARG DAGGER_VERSION=${CONTAINER_APP_VERSION} - -LABEL org.opencontainers.image.title="Runner Job Image" -LABEL org.opencontainers.image.description="Forgejo Actions job execution environment" -LABEL org.opencontainers.image.version="${CONTAINER_APP_VERSION}" -LABEL org.opencontainers.image.source="https://forge.eblu.me/eblume/blumeops" -LABEL org.opencontainers.image.vendor="blumeops" - -# Install base dependencies -RUN apt-get update && apt-get install -y --no-install-recommends \ - ca-certificates \ - curl \ - git \ - gnupg \ - jq \ - tzdata \ - && rm -rf /var/lib/apt/lists/* - -# Install Node.js (required by actions/checkout and other JavaScript Actions) -RUN curl -fsSL https://deb.nodesource.com/setup_22.x | bash - \ - && apt-get install -y --no-install-recommends nodejs \ - && rm -rf /var/lib/apt/lists/* \ - && node --version - -# Install Docker CLI (Dagger shells out to `docker` to provision its engine) -RUN install -m 0755 -d /etc/apt/keyrings \ - && curl -fsSL https://download.docker.com/linux/debian/gpg -o /etc/apt/keyrings/docker.asc \ - && chmod a+r /etc/apt/keyrings/docker.asc \ - && echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/debian bookworm stable" > /etc/apt/sources.list.d/docker.list \ - && apt-get update \ - && apt-get install -y --no-install-recommends docker-ce-cli \ - && rm -rf /var/lib/apt/lists/* - -# Install uv (Python package runner for towncrier) -RUN curl -LsSf https://astral.sh/uv/install.sh | sh \ - && mv /root/.local/bin/uv /usr/local/bin/uv \ - && mv /root/.local/bin/uvx /usr/local/bin/uvx - -# Install argocd CLI (for syncing apps from workflows) -RUN ARCH="${TARGETARCH:-$(dpkg --print-architecture)}" \ - && curl -fsSL -o 
/usr/local/bin/argocd \ - "https://github.com/argoproj/argo-cd/releases/latest/download/argocd-linux-${ARCH}" \ - && chmod +x /usr/local/bin/argocd \ - && argocd version --client - -# Install Dagger CLI (for running Dagger CI pipelines) -RUN ARCH="${TARGETARCH:-$(dpkg --print-architecture)}" \ - && curl -fsSL -o /tmp/dagger.tar.gz \ - "https://dl.dagger.io/dagger/releases/${DAGGER_VERSION}/dagger_v${DAGGER_VERSION}_linux_${ARCH}.tar.gz" \ - && tar -xzf /tmp/dagger.tar.gz -C /usr/local/bin dagger \ - && rm /tmp/dagger.tar.gz \ - && dagger version - -# Install yq (for editing YAML files in workflows) -RUN ARCH="${TARGETARCH:-$(dpkg --print-architecture)}" \ - && curl -fsSL -o /usr/local/bin/yq \ - "https://github.com/mikefarah/yq/releases/latest/download/yq_linux_${ARCH}" \ - && chmod +x /usr/local/bin/yq \ - && yq --version - -# Install flyctl (for Fly.io cache purge after docs deploy) -RUN curl -L https://fly.io/install.sh | sh \ - && mv /root/.fly/bin/flyctl /usr/local/bin/fly \ - && rm -rf /root/.fly - -# Default to bash -CMD ["/bin/bash"] diff --git a/containers/runner-job-image/container.py b/containers/runner-job-image/container.py new file mode 100644 index 0000000..c5710ff --- /dev/null +++ b/containers/runner-job-image/container.py @@ -0,0 +1,79 @@ +"""Forgejo Actions job execution image — native Dagger build. + +The forgejo-runner daemon creates containers from this image to run +workflow steps. Contains the tools workflows reach for: git, Docker CLI, +Node.js (for JavaScript Actions), Dagger CLI, ArgoCD CLI, uv, yq, flyctl. + +VERSION tracks the Dagger CLI version, the primary build tool. +""" + +import dagger + +from blumeops.containers import alpine_runtime, oci_labels + +VERSION = "0.20.6" + + +async def build(src: dagger.Directory) -> dagger.Container: + # Map `uname -m` to the arch suffix each upstream uses. 
+ arch_setup = ( + 'ARCH_UNAME="$(uname -m)"; ' + 'case "$ARCH_UNAME" in ' + " x86_64) ARCH=amd64 ;; " + " aarch64) ARCH=arm64 ;; " + ' *) echo "unsupported arch: $ARCH_UNAME" >&2; exit 1 ;; ' + "esac; " + ) + + runtime = alpine_runtime( + extra_apk=[ + "bash", + "ca-certificates", + "curl", + "docker-cli", + "git", + "gnupg", + "jq", + "nodejs", + "npm", + "tzdata", + ], + create_user=False, + ) + runtime = oci_labels( + runtime, + title="Runner Job Image", + description="Forgejo Actions job execution environment", + version=VERSION, + ) + + install_tools = ( + arch_setup + + "set -eux; " + # Dagger CLI (pinned) + + f'curl -fsSL -o /tmp/dagger.tar.gz "https://dl.dagger.io/dagger/releases/{VERSION}/dagger_v{VERSION}_linux_${{ARCH}}.tar.gz"; ' + + "tar -xzf /tmp/dagger.tar.gz -C /usr/local/bin dagger; " + + "rm /tmp/dagger.tar.gz; " + + "dagger version; " + # ArgoCD CLI (latest — matches cluster server version over time) + + 'curl -fsSL -o /usr/local/bin/argocd "https://github.com/argoproj/argo-cd/releases/latest/download/argocd-linux-${ARCH}"; ' + + "chmod +x /usr/local/bin/argocd; " + + "argocd version --client; " + # yq (latest) + + 'curl -fsSL -o /usr/local/bin/yq "https://github.com/mikefarah/yq/releases/latest/download/yq_linux_${ARCH}"; ' + + "chmod +x /usr/local/bin/yq; " + + "yq --version; " + # uv / uvx (latest; musl target auto-selected by installer) + + "curl -LsSf https://astral.sh/uv/install.sh " + + '| env UV_INSTALL_DIR=/usr/local/bin UV_UNMANAGED_INSTALL="/usr/local/bin" sh; ' + + "uv --version; " + # flyctl (latest) + + "curl -L https://fly.io/install.sh | sh; " + + "mv /root/.fly/bin/flyctl /usr/local/bin/fly; " + + "rm -rf /root/.fly; " + + "fly version" + ) + + return runtime.with_exec(["sh", "-c", install_tools]).with_default_args( + args=["/bin/bash"] + ) diff --git a/containers/shower/default.nix b/containers/shower/default.nix new file mode 100644 index 0000000..c5bd41e --- /dev/null +++ b/containers/shower/default.nix @@ -0,0 +1,278 @@ +# 
Nix-built shower app container — Adelaide / Heidi / Addie baby shower. +# +# The app is published as a wheel to the Forgejo PyPI index at +# https://forge.ops.eblu.me/api/packages/eblume/pypi/ (tailnet-only — the +# public forge.eblu.me /api/packages/* surface is blocked at the Fly edge). +# We can't point pip at Forgejo's simple index even from the tailnet, +# because Forgejo's index returns absolute file URLs hardcoded to its +# public ROOT_URL (forge.eblu.me), which then 403s. So both the wheel and +# the sdist are pulled by direct `fetchurl` against forge.ops.eblu.me, and +# the wheel is then handed to `pip install` as a local path; transitive +# deps come from pypi.ops.eblu.me. Build runs on the nix-container-builder +# runner (ringtail, amd64) so the image is native. +# +# Going through pip-install-target rather than nixpkgs Python packages +# sidesteps two issues we hit going through `python.pkgs.buildPythonPackage`: +# 1. python314Packages.django still aliases to Django 4.2 LTS, which +# doesn't support Python 3.14 at all. +# 2. django-axes pulls selenium + browser fonts into its check phase +# and the nix sandbox can't provide those. +# +# To bump the version: +# 1. Update `version` below. +# 2. Set `outputHash` to `pkgs.lib.fakeHash`, run the build, copy the +# real hash out of the error, and commit it. +{ pkgs ? import { } }: + +let + version = "1.1.3"; + + python = pkgs.python314; + + # The repo's top-level static/ directory (vendored Sortable + cropper + # JS/CSS, prize placeholder SVG) isn't shipped in the wheel — hatchling + # only packages config/ and shower/, leaving the repo-root static/ + # behind. Pull the sdist (which contains the full source tree) and + # extract just the static/ subtree into the image as /app/static. + # local_settings adds it to STATICFILES_DIRS so collectstatic at boot + # picks it up alongside the Django admin's static files. 
+ # + # Fetched from forge.ops.eblu.me (tailnet) because /api/packages/* is + # blocked at the fly edge — see fly/nginx.conf forge.eblu.me block. + # Hash is the upstream sha256 from forge PyPI's simple index. + showerSdist = pkgs.fetchurl { + name = "adelaide_baby_shower_app-${version}.tar.gz"; + url = "https://forge.ops.eblu.me/api/packages/eblume/pypi/files/adelaide-baby-shower-app/${version}/adelaide_baby_shower_app-${version}.tar.gz"; + hash = "sha256-a3rCwEdOB+rnYXqsWDifyltpyKUgkOj0ikWB+WGQYKE="; + }; + + # Wheel pulled from forge.ops.eblu.me (tailnet) for the same reason the + # sdist is: Forgejo's PyPI simple index would return forge.eblu.me URLs + # that the Fly edge 403s on /api/packages/*. We hand this path to pip + # below so it never touches the forge index at all. + showerWheel = pkgs.fetchurl { + name = "adelaide_baby_shower_app-${version}-py3-none-any.whl"; + url = "https://forge.ops.eblu.me/api/packages/eblume/pypi/files/adelaide-baby-shower-app/${version}/adelaide_baby_shower_app-${version}-py3-none-any.whl"; + hash = "sha256-a6j91gBigG4IzE2DVTBntnZ46Yrx9b5PgHn+Uro98Tk="; + }; + + staticAssets = pkgs.runCommand "shower-static-assets-${version}" { } '' + ${pkgs.gnutar}/bin/tar -xzf ${showerSdist} -C $TMPDIR + cp -r $TMPDIR/adelaide_baby_shower_app-${version}/static $out + ''; + + # Fixed-output derivation: pip-installs the app wheel + every transitive + # dep into a single target dir. FODs get network access in exchange for + # a pinned output hash, which means the whole dependency closure is + # immutable across rebuilds. 
+ pyDepsFOD = pkgs.stdenv.mkDerivation { + pname = "shower-python-deps-fod"; + inherit version; + + dontUnpack = true; + + nativeBuildInputs = [ python pkgs.cacert pkgs.removeReferencesTo ]; + + buildPhase = '' + runHook preBuild + + export HOME=$TMPDIR + export SSL_CERT_FILE=${pkgs.cacert}/etc/ssl/certs/ca-bundle.crt + export PIP_DISABLE_PIP_VERSION_CHECK=1 + + ${python}/bin/python -m venv "$TMPDIR/venv" + "$TMPDIR/venv/bin/pip" install --upgrade pip + + # Nix store paths embed a 32-char hash prefix, which pip's wheel + # filename parser rejects ("Invalid wheel filename"). Copy to a + # clean filename in TMPDIR before installing. + cp ${showerWheel} "$TMPDIR/${showerWheel.name}" + + "$TMPDIR/venv/bin/pip" install \ + --no-cache-dir \ + --index-url=https://pypi.ops.eblu.me/root/pypi/+simple/ \ + "$TMPDIR/${showerWheel.name}" \ + gunicorn + + runHook postBuild + ''; + + installPhase = '' + runHook preInstall + + mkdir -p $out/lib/python3.14 $out/bin + cp -r "$TMPDIR/venv/lib/python3.14/site-packages" $out/lib/python3.14/site-packages + + for script in "$TMPDIR/venv/bin/"*; do + [ -f "$script" ] || continue + name=$(basename "$script") + case "$name" in + python*|pip*|activate*) continue ;; + esac + cp "$script" "$out/bin/$name" + chmod +x "$out/bin/$name" + done + + # --- Strip Nix store references (FOD outputs must be self-contained) --- + # The wrapper derivation below restores them via autoPatchelfHook + a + # python wrapper that points pyc-less imports at the on-image python. + + # Strip bytecode entirely — pyc files embed compile-time paths. + find $out -type f -name '*.pyc' -delete + find $out -type d -name '__pycache__' -exec rm -rf {} + 2>/dev/null || true + + # Dynamically discover all nix store references and strip them. We + # don't have a static list because pip pulls in stdenv via Python's + # build env (gcc-lib, libstdc++, etc.) and the closure is opaque. 
+ { find $out -type f -print0 \ + | xargs -0 grep -aohE '/nix/store/[a-z0-9]{32}-[^/"[:space:]]+' 2>/dev/null \ + || true; } | sort -u > $TMPDIR/store-refs.txt + echo "Found $(wc -l < $TMPDIR/store-refs.txt) unique store path references to strip" + + refs_args="" + while IFS= read -r ref; do + refs_args="$refs_args -t $ref" + done < $TMPDIR/store-refs.txt + + if [ -n "$refs_args" ]; then + find $out -type f -exec remove-references-to $refs_args {} + 2>/dev/null || true + fi + + remaining=$({ find $out -type f -print0 | xargs -0 grep -cl '/nix/store/' 2>/dev/null || true; } | wc -l) + echo "Files with remaining store references: $remaining" + + runHook postInstall + ''; + + outputHashMode = "recursive"; + outputHashAlgo = "sha256"; + # Pinned dep closure — reproducible until version bumps. To recompute, + # set to pkgs.lib.fakeHash and read the failure. + outputHash = "sha256-1xx2qWAIwherklHIPXo6IOKkKHML1KUrUx6pbkMxffc="; + + dontFixup = true; + }; + + # Non-FOD wrapper: re-applies RPATHs to pre-built .so files (pillow, + # scipy) so they find libstdc++ / libz / etc. at runtime. autoPatchelfHook + # discovers needed libraries from buildInputs. + pyDeps = pkgs.stdenv.mkDerivation { + pname = "shower-python-deps"; + inherit version; + + dontUnpack = true; + + nativeBuildInputs = [ pkgs.autoPatchelfHook ]; + + buildInputs = with pkgs; [ + python + stdenv.cc.cc.lib # libstdc++, libgcc_s + zlib + libjpeg + libwebp + libtiff + openjpeg + lcms2 + freetype + ]; + + installPhase = '' + cp -r ${pyDepsFOD} $out + chmod -R u+w $out + ''; + }; + + sitePackages = "${pyDeps}/lib/python3.14/site-packages"; + + # Settings shim — config/settings.py's `BASE_DIR = parent.parent` would + # otherwise resolve to site-packages, scattering db.sqlite3 / media / + # staticfiles into the venv. Pin them to /app/{data,media,data/staticfiles}. 
+ localSettings = pkgs.writeText "local_settings.py" '' + from pathlib import Path + + from config.settings import * # noqa: F401,F403 + + DATABASES["default"]["NAME"] = "/app/data/db.sqlite3" + MEDIA_ROOT = "/app/media" + STATIC_ROOT = "/app/data/staticfiles" + # /app/static comes from the repo-root static/ subtree of the sdist + # (see default.nix staticAssets). Added because the wheel doesn't + # ship vendored Sortable/cropper assets. + STATICFILES_DIRS = [Path("/app/static")] + ''; + + # PYTHONPATH, DJANGO_SETTINGS_MODULE, PATH, and HOME live in the image's + # `Env` block below — that way `kubectl exec deploy/shower -- python -m + # django ` Just Works without an inline `env` ceremony. + # The entrypoint just changes directory and runs the boot sequence. + entrypoint = pkgs.writeShellScript "shower-entrypoint" '' + set -eu + + cd /app + + mkdir -p /app/data /app/media + + echo "shower: running migrations" + python -m django migrate --noinput + + echo "shower: collecting static files" + python -m django collectstatic --noinput --clear + + echo "shower: starting gunicorn" + exec gunicorn \ + --bind 0.0.0.0:8000 \ + --workers 2 \ + --forwarded-allow-ips='*' \ + config.wsgi:application + ''; +in + +pkgs.dockerTools.buildLayeredImage { + name = "blumeops/shower"; + contents = [ + python + pyDeps + pkgs.cacert + pkgs.tzdata + pkgs.bashInteractive + pkgs.coreutils + ]; + + extraCommands = '' + mkdir -p app/data app/media tmp + chmod 1777 tmp + cp ${localSettings} app/local_settings.py + cp -r ${staticAssets} app/static + chmod -R u+w app/static + ''; + + fakeRootCommands = '' + chown -R 1000:1000 app + ''; + enableFakechroot = true; + + config = { + Entrypoint = [ "${entrypoint}" ]; + Env = [ + "SSL_CERT_FILE=${pkgs.cacert}/etc/ssl/certs/ca-bundle.crt" + "TZDIR=${pkgs.tzdata}/share/zoneinfo" + "TZ=America/Los_Angeles" + "TMPDIR=/tmp" + "LANG=C.UTF-8" + "LC_ALL=C.UTF-8" + "PYTHONDONTWRITEBYTECODE=1" + "HOME=/app/data" + "PATH=${pyDeps}/bin:${python}/bin:/bin" + # /app 
first so local_settings.py is importable; sitePackages second so + # django, gunicorn, etc. resolve. Inherited by entrypoint + any + # `kubectl exec` so manual django subcommands work without ceremony. + "PYTHONPATH=/app:${sitePackages}" + "DJANGO_SETTINGS_MODULE=local_settings" + ]; + ExposedPorts = { + "8000/tcp" = { }; + }; + User = "1000"; + WorkingDir = "/app"; + }; +} diff --git a/containers/tailscale/default.nix b/containers/tailscale/default.nix new file mode 100644 index 0000000..8e87f76 --- /dev/null +++ b/containers/tailscale/default.nix @@ -0,0 +1,77 @@ +# Nix-built tailscale container for ringtail's tailscale-operator ProxyClass +# Builds v1.94.2 from forge mirror; mirrors upstream Dockerfile contents. +# Built with dockerTools.buildLayeredImage on the ringtail nix-container-builder. +{ pkgs ? import { } }: + +let + version = "1.94.2"; + + src = pkgs.fetchgit { + url = "https://forge.ops.eblu.me/mirrors/tailscale.git"; + rev = "v${version}"; + hash = "sha256-qjWVB8xWVgIVUgrf27F6hwiFIE+4ERXWeHv26ugg/x4="; + }; + + tailscale = pkgs.buildGoModule { + inherit src version; + pname = "tailscale"; + vendorHash = "sha256-WeMTOkERj4hvdg4yPaZ1gRgKnhRIBXX55kUVbX/k/xM="; + + subPackages = [ + "cmd/tailscale" + "cmd/tailscaled" + "cmd/containerboot" + ]; + + ldflags = [ + "-s" + "-w" + "-X tailscale.com/version.longStamp=${version}" + "-X tailscale.com/version.shortStamp=${version}" + ]; + + doCheck = false; + + meta = with pkgs.lib; { + description = "The easiest, most secure way to use WireGuard"; + homepage = "https://tailscale.com"; + license = licenses.bsd3; + }; + }; +in + +pkgs.dockerTools.buildLayeredImage { + name = "blumeops/tailscale"; + tag = "v${version}"; + + contents = [ + tailscale + pkgs.cacert + pkgs.iptables + pkgs.iproute2 + pkgs.tzdata + pkgs.busybox + ]; + + # Match upstream Dockerfile: symlink iptables-legacy over iptables. + # Synology NAS and similar hosts don't support nftables. + # Also recreate the /tailscale/run.sh compat symlink. 
+ extraCommands = '' + rm -f usr/sbin/iptables usr/sbin/ip6tables + ln -s ${pkgs.iptables}/bin/iptables-legacy usr/sbin/iptables || true + ln -s ${pkgs.iptables}/bin/ip6tables-legacy usr/sbin/ip6tables || true + mkdir -p tailscale + ln -s /bin/containerboot tailscale/run.sh + mkdir -p tmp + chmod 1777 tmp + ''; + + config = { + Entrypoint = [ "/bin/containerboot" ]; + Env = [ + "SSL_CERT_FILE=${pkgs.cacert}/etc/ssl/certs/ca-bundle.crt" + "TZDIR=${pkgs.tzdata}/share/zoneinfo" + "PATH=/bin:/usr/bin:/usr/sbin" + ]; + }; +} diff --git a/containers/teslamate/container.py b/containers/teslamate/container.py deleted file mode 100644 index 519d77d..0000000 --- a/containers/teslamate/container.py +++ /dev/null @@ -1,104 +0,0 @@ -"""TeslaMate — Tesla data logger. - -Two-stage build: Elixir+Node (builder), Debian slim (runtime). -Source cloned from forge mirror. -""" - -import dagger -from dagger import dag - -from blumeops.containers import clone_from_forge, oci_labels - -VERSION = "v3.0.0" - - -async def build(src: dagger.Directory) -> dagger.Container: - source = clone_from_forge("teslamate", VERSION) - - # Stage 1: Build Elixir release with Node.js assets - builder = ( - dag.container() - .from_("elixir:1.19.5-otp-26") - .with_exec( - [ - "bash", - "-c", - "apt-get update" - " && apt-get install -y ca-certificates curl gnupg git zstd brotli" - " && mkdir -p /etc/apt/keyrings" - " && curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key" - " | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg" - ' && echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg]' - ' https://deb.nodesource.com/node_22.x nodistro main"' - " > /etc/apt/sources.list.d/nodesource.list" - " && apt-get update" - " && apt-get install -y nodejs" - " && apt-get clean" - " && rm -rf /var/lib/apt/lists/*", - ] - ) - .with_exec(["mix", "local.rebar", "--force"]) - .with_exec(["mix", "local.hex", "--force"]) - .with_directory("/opt/app", source) - .with_workdir("/opt/app") - 
.with_env_variable("MIX_ENV", "prod") - .with_exec(["mix", "deps.get", "--only", "prod"]) - .with_exec(["mix", "deps.compile"]) - .with_exec( - [ - "npm", - "ci", - "--prefix", - "./assets", - "--progress=false", - "--no-audit", - "--loglevel=error", - ] - ) - .with_exec(["mix", "assets.deploy"]) - .with_exec(["mix", "compile"]) - .with_exec( - ["bash", "-c", "SKIP_LOCALE_DOWNLOAD=true mix release --path /opt/built"] - ) - ) - - # Stage 2: Debian slim runtime - entrypoint = src.file("containers/teslamate/entrypoint.sh") - - runtime = ( - dag.container() - .from_("debian:trixie-slim") - .with_exec( - [ - "bash", - "-c", - "apt-get update && apt-get install -y --no-install-recommends" - " libodbc2 libsctp1 libssl3t64 libstdc++6" - " netcat-openbsd tini tzdata" - " && apt-get clean" - " && rm -rf /var/lib/apt/lists/*" - " && groupadd --gid 10001 --system nonroot" - " && useradd --uid 10000 --system --gid nonroot" - " --home-dir /home/nonroot --shell /sbin/nologin nonroot", - ] - ) - ) - runtime = oci_labels( - runtime, - title="TeslaMate", - description="Tesla data logger and visualization", - version=VERSION, - ) - return ( - runtime.with_env_variable("LANG", "C.UTF-8") - .with_env_variable("SRTM_CACHE", "/opt/app/.srtm_cache") - .with_env_variable("HOME", "/opt/app") - .with_workdir("/opt/app") - .with_directory("/opt/app", builder.directory("/opt/built"), owner="nonroot") - .with_exec(["mkdir", "-p", "/opt/app/.srtm_cache"]) - .with_file("/entrypoint.sh", entrypoint, permissions=0o555, owner="nonroot") - .with_user("nonroot") - .with_exposed_port(4000) - .with_entrypoint(["tini", "--", "/bin/dash", "/entrypoint.sh"]) - .with_default_args(args=["bin/teslamate", "start"]) - ) diff --git a/containers/teslamate/default.nix b/containers/teslamate/default.nix new file mode 100644 index 0000000..e126561 --- /dev/null +++ b/containers/teslamate/default.nix @@ -0,0 +1,122 @@ +# Nix-built TeslaMate for ringtail (amd64). 
+# +# Replaces the Dagger container.py (Elixir+Node builder -> Debian slim). +# TeslaMate is NOT in nixpkgs, so this is a from-scratch beamPackages +# mixRelease: an Elixir/Phoenix release with npm-built assets. +# +# Pinned to the same nixos-unstable rev as paperless/mealie for a +# consistent toolchain. The BEAM combo is pinned to erlang_27 + elixir_1_18 +# (teslamate requires elixir ~> 1.17; upstream's image uses OTP 26, so we +# stay off the default OTP 28 which elixir 1.18 does not target). +# +# Source comes from the forge mirror (supply-chain control), pinned by the +# v3.0.0 tag's commit so builtins.fetchGit needs no hash. +let + nixpkgs = fetchTarball { + url = "https://github.com/NixOS/nixpkgs/archive/331800de5053fcebacf6813adb5db9c9dca22a0c.tar.gz"; + sha256 = "1p54fm6dkbq62kpi55cr4wyx7b1nsajpsnjgs64cmp073fwi15f7"; + }; + pkgs = import nixpkgs { system = "x86_64-linux"; }; + lib = pkgs.lib; + + version = "3.0.0"; + + beamPackages = pkgs.beam.packages.erlang_27; + elixir = beamPackages.elixir_1_18; + + src = builtins.fetchGit { + url = "https://forge.ops.eblu.me/mirrors/teslamate.git"; + ref = "refs/tags/v${version}"; + rev = "3281154d42330786a182c1bbe094ecda0b1c5578"; + }; + + # ex_cldr downloads locale JSON from GitHub at compile time, which the + # build sandbox blocks. teslamate's cldr.ex reads the data dir from the + # LOCALES env var; point it at the pre-fetched elixir-cldr data so no + # download is attempted (with SKIP_LOCALE_DOWNLOAD=true disabling the + # forced refresh). CLDR data version matches the compile-time errors. + cldrData = pkgs.fetchFromGitHub { + owner = "elixir-cldr"; + repo = "cldr"; + rev = "v2.46.0"; + sha256 = "1iwzk9dc754l72vpf8vsisdjncnjx26pz509552b6vnm49xbxyji"; + }; + + teslamate = beamPackages.mixRelease { + pname = "teslamate"; + inherit version src elixir; + + # Keep the build-generated Erlang cookie in the release. 
mixRelease + # strips it by default (expecting RELEASE_COOKIE at runtime), but the + # start script reads releases/COOKIE. teslamate is single-node (no + # distributed Erlang exposed), so a baked-in cookie is fine. + removeCookie = false; + + mixFodDeps = beamPackages.fetchMixDeps { + pname = "mix-deps-teslamate"; + inherit src version elixir; + hash = "sha256-DDrREiM1BIMgD2qFPTK8QyjOYlnfE3XlnaH/jk7G2go="; + }; + + # Frontend assets. esbuild + sass are devDeps and the esbuild platform + # binary is an optional dep, so npm ci must include both. We run npm ci + # here (not a separate derivation) because assets/package.json has + # file:../deps/phoenix references that only resolve once mixFodDeps has + # populated deps/. npmConfigHook wires up the offline cache from npmDeps; + # then `node scripts/build.js` (custom esbuild) + `mix phx.digest`. + nativeBuildInputs = [ pkgs.nodejs pkgs.npmHooks.npmConfigHook ]; + npmDeps = pkgs.fetchNpmDeps { + name = "teslamate-npm-deps"; + src = src + "/assets"; + hash = "sha256-XyiaUkT/c4rZnNxmxhVLb+vEXnc64A1hjOrnR5fhaEk="; + }; + npmRoot = "assets"; + + preBuild = '' + export SKIP_LOCALE_DOWNLOAD=true + export LOCALES=${cldrData}/priv/cldr + ( cd assets && npm ci --include=dev --include=optional && node scripts/build.js ) + mix phx.digest --no-deps-check + ''; + }; +in + +pkgs.dockerTools.buildLayeredImage { + name = "blumeops/teslamate"; + + contents = [ + teslamate + pkgs.bashInteractive + pkgs.coreutils + pkgs.dash + pkgs.netcat-openbsd + pkgs.cacert + pkgs.tzdata + ]; + + config = { + # Mirror entrypoint.sh: wait for postgres, run migrations, then start. + Entrypoint = [ + "${pkgs.dash}/bin/dash" + "-c" + '' + : "''${DATABASE_HOST:=127.0.0.1}" + : "''${DATABASE_PORT:=5432}" + while ! 
${pkgs.netcat-openbsd}/bin/nc -z "$DATABASE_HOST" "$DATABASE_PORT" 2>/dev/null; do + echo "waiting for postgres at $DATABASE_HOST:$DATABASE_PORT"; sleep 1 + done + ${teslamate}/bin/teslamate eval "TeslaMate.Release.migrate" + exec ${teslamate}/bin/teslamate start + '' + ]; + Env = [ + "HOME=/opt/app" + "SRTM_CACHE=/opt/app/.srtm_cache" + "LANG=C.UTF-8" + "SSL_CERT_FILE=${pkgs.cacert}/etc/ssl/certs/ca-bundle.crt" + ]; + ExposedPorts = { + "4000/tcp" = { }; + }; + }; +} diff --git a/containers/teslamate/entrypoint.sh b/containers/teslamate/entrypoint.sh deleted file mode 100644 index f66117e..0000000 --- a/containers/teslamate/entrypoint.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/env dash -set -e - -: "${DATABASE_HOST:="127.0.0.1"}" -: "${DATABASE_PORT:=5432}" -: "${ULIMIT_MAX_NOFILE:=65536}" - -# prevent memory bloat in some misconfigured versions of Docker/containerd -# where the nofiles limit is very large. 0 means don't set it. -if test "${ULIMIT_MAX_NOFILE}" != 0 && test "$(ulimit -n)" -gt "${ULIMIT_MAX_NOFILE}"; then - ulimit -n "${ULIMIT_MAX_NOFILE}" -fi - -# wait until Postgres is ready -while ! 
nc -z "${DATABASE_HOST}" "${DATABASE_PORT}" 2>/dev/null; do - echo waiting for postgres at "${DATABASE_HOST}":"${DATABASE_PORT}" - sleep 1s -done - -# apply migrations -bin/teslamate eval "TeslaMate.Release.migrate" - -exec "$@" diff --git a/containers/unpoller/Dockerfile b/containers/unpoller/Dockerfile deleted file mode 100644 index 241b375..0000000 --- a/containers/unpoller/Dockerfile +++ /dev/null @@ -1,43 +0,0 @@ -# UnPoller — UniFi metrics exporter for Prometheus -# Two-stage build: Go compilation, then minimal Alpine runtime - -ARG CONTAINER_APP_VERSION=v2.34.0 - -FROM golang:alpine3.22 AS build - -ARG CONTAINER_APP_VERSION -RUN apk add --no-cache git - -RUN git clone --depth 1 --branch ${CONTAINER_APP_VERSION} \ - https://forge.ops.eblu.me/mirrors/unpoller.git /app - -WORKDIR /app - -ENV CGO_ENABLED=0 - -RUN go build -ldflags="-s -w \ - -X main.version=${CONTAINER_APP_VERSION} \ - -X main.builtBy=blumeops \ - -X golift.io/version.Version=${CONTAINER_APP_VERSION} \ - -X golift.io/version.Branch=HEAD \ - -X golift.io/version.BuildUser=blumeops \ - -X golift.io/version.Revision=blumeops-build" \ - -o /bin/unpoller . - -FROM alpine:3.22 - -ARG CONTAINER_APP_VERSION -LABEL org.opencontainers.image.title="UnPoller" -LABEL org.opencontainers.image.description="UniFi metrics exporter for Prometheus" -LABEL org.opencontainers.image.version="${CONTAINER_APP_VERSION}" -LABEL org.opencontainers.image.source="https://forge.eblu.me/eblume/blumeops" -LABEL org.opencontainers.image.vendor="blumeops" - -RUN apk add --no-cache ca-certificates tzdata - -COPY --from=build /bin/unpoller /usr/bin/unpoller - -EXPOSE 9130 -USER 65534:65534 -ENTRYPOINT ["/usr/bin/unpoller"] -CMD ["--config", "/etc/unpoller/up.conf"] diff --git a/containers/unpoller/container.py b/containers/unpoller/container.py new file mode 100644 index 0000000..bfc75ba --- /dev/null +++ b/containers/unpoller/container.py @@ -0,0 +1,53 @@ +"""UnPoller — UniFi metrics exporter for Prometheus. 
+ +Two-stage build: Go backend, Alpine runtime. +Source cloned from forge mirror. +""" + +import dagger + +from blumeops.containers import ( + alpine_runtime, + clone_from_forge, + go_build, + oci_labels, +) + +VERSION = "v3.2.0" + + +async def build(src: dagger.Directory) -> dagger.Container: + source = clone_from_forge("unpoller", VERSION) + + backend = go_build( + source, + "/unpoller", + ldflags=( + f"-s -w " + f"-X main.version={VERSION} " + f"-X main.builtBy=blumeops " + f"-X golift.io/version.Version={VERSION} " + f"-X golift.io/version.Branch=HEAD " + f"-X golift.io/version.BuildUser=blumeops " + f"-X golift.io/version.Revision=blumeops-build" + ), + ) + + runtime = alpine_runtime( + extra_apk=["ca-certificates", "tzdata"], + create_user=False, + ) + runtime = oci_labels( + runtime, + title="UnPoller", + description="UniFi metrics exporter for Prometheus", + version=VERSION, + ) + return ( + runtime.with_file("/usr/bin/unpoller", backend.file("/unpoller")) + .with_exposed_port(9130) + .with_user("65534") + .with_default_args( + args=["/usr/bin/unpoller", "--config", "/etc/unpoller/up.conf"] + ) + ) diff --git a/containers/valkey/container.py b/containers/valkey/container.py new file mode 100644 index 0000000..34e8524 --- /dev/null +++ b/containers/valkey/container.py @@ -0,0 +1,48 @@ +"""Valkey — native Dagger build (arm64, indri). + +Alpine 3.22 base with the `valkey` apk package (8.1.x — Redis-compatible). +Used by paperless (sidecar) on indri. immich on ringtail uses the +nix-built amd64 variant from `default.nix` in this directory. +""" + +import dagger +from dagger import dag + +from blumeops.containers import oci_labels + +# Alpine 3.22 currently ships valkey 8.1.7-r0. Alpine 3.23 jumps to 9.0 — +# hold on 3.22 to keep this aligned with the 8.1 line. 
+VERSION = "8.1.7" +ALPINE_PIN = "8.1.7-r0" + +ALPINE_BASE = "alpine:3.22" + + +async def build(src: dagger.Directory) -> dagger.Container: + ctr = ( + dag.container() + .from_(ALPINE_BASE) + .with_exec(["apk", "add", "--no-cache", f"valkey={ALPINE_PIN}"]) + .with_exec(["mkdir", "-p", "/data"]) + .with_exec(["chown", "valkey:valkey", "/data"]) + .with_workdir("/data") + .with_exposed_port(6379) + .with_user("valkey") + .with_default_args( + args=[ + "valkey-server", + "--bind", + "0.0.0.0", + "--protected-mode", + "no", + "--dir", + "/data", + ] + ) + ) + return oci_labels( + ctr, + title="Valkey", + description="Valkey high-performance key/value datastore (Redis-compatible)", + version=VERSION, + ) diff --git a/containers/valkey/default.nix b/containers/valkey/default.nix new file mode 100644 index 0000000..9cb1713 --- /dev/null +++ b/containers/valkey/default.nix @@ -0,0 +1,30 @@ +# Nix-built Valkey for ringtail (amd64) +# Companion to container.py (Alpine 3.22, arm64 on indri). +# Used by immich-ringtail which needs an amd64 image; paperless on indri +# continues to use the Alpine container.py build. +# +# The version assertion ensures nix-build fails if a flake.lock update +# changes the Valkey version — forcing an explicit version acknowledgment +# here and in service-versions.yaml (enforced by container-version-check). +{ pkgs ? import { } }: + +let + version = "8.1.7"; +in + +assert pkgs.valkey.version == version; + +pkgs.dockerTools.buildLayeredImage { + name = "blumeops/valkey"; + contents = [ + pkgs.valkey + ]; + + config = { + Entrypoint = [ "${pkgs.valkey}/bin/valkey-server" ]; + Cmd = [ "--bind" "0.0.0.0" "--protected-mode" "no" "--dir" "/data" ]; + ExposedPorts = { + "6379/tcp" = { }; + }; + }; +} diff --git a/dagger.json b/dagger.json index c982487..3309378 100644 --- a/dagger.json +++ b/dagger.json @@ -1,8 +1,7 @@ { "name": "blumeops", - "engineVersion": "v0.20.1", + "engineVersion": "v0.20.6", "sdk": { "source": "python" - }, - "source": "." 
+ } } diff --git a/docs/changelog.d/+agent-file-neutralization.ai.md b/docs/changelog.d/+agent-file-neutralization.ai.md deleted file mode 100644 index da16fba..0000000 --- a/docs/changelog.d/+agent-file-neutralization.ai.md +++ /dev/null @@ -1 +0,0 @@ -Adopt `AGENTS.md` as the canonical agent instruction file, keep `CLAUDE.md` as a compatibility shim, and update docs to reference the neutral file and the correct agent-change-process path. diff --git a/docs/changelog.d/+argocd-resource-limits.infra.md b/docs/changelog.d/+argocd-resource-limits.infra.md deleted file mode 100644 index ba24a5a..0000000 --- a/docs/changelog.d/+argocd-resource-limits.infra.md +++ /dev/null @@ -1 +0,0 @@ -Add resource limits to all ArgoCD pods to prevent unbounded resource consumption during node-wide pressure events. diff --git a/docs/changelog.d/+external-secrets-main-sha-rebuild.infra.md b/docs/changelog.d/+external-secrets-main-sha-rebuild.infra.md new file mode 100644 index 0000000..2e931d4 --- /dev/null +++ b/docs/changelog.d/+external-secrets-main-sha-rebuild.infra.md @@ -0,0 +1 @@ +Rebuilt the locally-built external-secrets image from the `main` branch so the deployed tag (`v2.2.0-0e70a1b`) traces to a `main` commit rather than the now-merged feature branch, giving a stable provenance reference. diff --git a/docs/changelog.d/+external-secrets-stable-main-sha.infra.md b/docs/changelog.d/+external-secrets-stable-main-sha.infra.md new file mode 100644 index 0000000..fbe3c21 --- /dev/null +++ b/docs/changelog.d/+external-secrets-stable-main-sha.infra.md @@ -0,0 +1 @@ +Rebuilt the external-secrets images off `main` and repointed both clusters to the stable main-sha tags (`v2.2.0-13895bb` arm64 / `v2.2.0-13895bb-nix` amd64), so the deployed images on indri and ringtail trace to the same `main` commit rather than earlier feature-branch builds. 
diff --git a/docs/changelog.d/+fix-forge-static-assets.bugfix.md b/docs/changelog.d/+fix-forge-static-assets.bugfix.md deleted file mode 100644 index de0517e..0000000 --- a/docs/changelog.d/+fix-forge-static-assets.bugfix.md +++ /dev/null @@ -1 +0,0 @@ -Fixed forge.eblu.me static assets (CSS, JS, images, fonts) not loading — the proxy's static asset cache block was missing the `Host` header, so Caddy couldn't route the requests. diff --git a/docs/changelog.d/+heph-hub-v1.2.1.infra.md b/docs/changelog.d/+heph-hub-v1.2.1.infra.md new file mode 100644 index 0000000..c203323 --- /dev/null +++ b/docs/changelog.d/+heph-hub-v1.2.1.infra.md @@ -0,0 +1 @@ +Bumped the indri heph hub to v1.2.1, which adds the hub `GET /config` endpoint and ships the heph-pwa **Login with Authentik** flow (Authorization Code + PKCE). Pairs with the Authentik `heph` provider redirect URIs registered earlier. diff --git a/docs/changelog.d/+review-navidrome-doc.doc.md b/docs/changelog.d/+review-navidrome-doc.doc.md deleted file mode 100644 index fbe5e79..0000000 --- a/docs/changelog.d/+review-navidrome-doc.doc.md +++ /dev/null @@ -1 +0,0 @@ -Review and refresh the Navidrome reference card: add `last-reviewed`, correct the scanner env var name, document the current image/version, and record routing and runtime details from the manifests. diff --git a/docs/changelog.d/+runner-logs-auth.feature.md b/docs/changelog.d/+runner-logs-auth.feature.md deleted file mode 100644 index 9ee6fa1..0000000 --- a/docs/changelog.d/+runner-logs-auth.feature.md +++ /dev/null @@ -1 +0,0 @@ -runner-logs now authenticates with Forgejo API token and auto-detects the repo from git remote. Job logs are fetched via SSH to indri (reading Forgejo's on-disk zstd log files) instead of the web endpoint, which doesn't support token auth for private repos. 
diff --git a/docs/changelog.d/+tailscale-operator-mirror-tailnet-url.bugfix.md b/docs/changelog.d/+tailscale-operator-mirror-tailnet-url.bugfix.md new file mode 100644 index 0000000..cc29cf7 --- /dev/null +++ b/docs/changelog.d/+tailscale-operator-mirror-tailnet-url.bugfix.md @@ -0,0 +1 @@ +Fixed the `tailscale-operator` and `tailscale-operator-ringtail` ArgoCD apps showing `Unknown` sync status. Their shared base kustomization fetched the upstream operator manifest from the public `forge.eblu.me/mirrors/...`, which the AI-scraper mitigation now black-holes (403). Pointed the remote resource at the tailnet host `forge.ops.eblu.me` instead, which the in-cluster repo-server can reach. diff --git a/docs/changelog.d/external-secrets-ringtail-nix.infra.md b/docs/changelog.d/external-secrets-ringtail-nix.infra.md new file mode 100644 index 0000000..9ce3f85 --- /dev/null +++ b/docs/changelog.d/external-secrets-ringtail-nix.infra.md @@ -0,0 +1 @@ +Completed the external-secrets localization for the ringtail (amd64) cluster. The indri Dagger build (`container.py`) only produces an arm64 image; added `containers/external-secrets/default.nix` to build the amd64 variant on ringtail's nix-container-builder, and gave `external-secrets-ringtail` a thin kustomize overlay that reuses the shared manifest and points at the `-nix` image. Both clusters now run the locally-built external-secrets binary on their native architecture. diff --git a/docs/changelog.d/heph-indri-hub.infra.md b/docs/changelog.d/heph-indri-hub.infra.md new file mode 100644 index 0000000..6761cb7 --- /dev/null +++ b/docs/changelog.d/heph-indri-hub.infra.md @@ -0,0 +1 @@ +Added the [[hephaestus]] (`heph`) sync hub to indri as a self-updating LaunchAgent managed by Ansible (`ansible/roles/heph`, tag `heph`). The hub runs `hephd --mode server` behind `heph.ops.eblu.me` (Caddy TLS), with self-update on a 10-minute interval and the heph-pwa mobile shell served from `--web-root`. 
Access is gated by a new Authentik device-code (RFC 8628) OIDC application. Indri is now the canonical hub; other devices (e.g. gilbert) attach as offline-capable spokes. The hub's store was seeded from gilbert via the data-safe Path A bring-up (copy store, reset `meta.origin`). diff --git a/docs/changelog.d/heph-offline-access.bugfix.md b/docs/changelog.d/heph-offline-access.bugfix.md new file mode 100644 index 0000000..e9721bc --- /dev/null +++ b/docs/changelog.d/heph-offline-access.bugfix.md @@ -0,0 +1 @@ +Granted the `offline_access` scope on the Authentik `heph` OAuth2 provider so hephaestus spokes receive a durable 30-day refresh token. Previously the refresh token was session-bound, so spoke sync would silently fail with a `400 Bad Request` on the `refresh_token` grant once the Authentik session lapsed. diff --git a/docs/changelog.d/heph-pwa-redirect-uris.infra.md b/docs/changelog.d/heph-pwa-redirect-uris.infra.md new file mode 100644 index 0000000..f887eed --- /dev/null +++ b/docs/changelog.d/heph-pwa-redirect-uris.infra.md @@ -0,0 +1 @@ +Registered the heph-pwa redirect URIs (`https://heph.ops.eblu.me/`, plus `http://localhost:8787/` for dev) on the Authentik `heph` OAuth2 provider, enabling the PWA's new Authorization Code + PKCE "Login with Authentik" flow (and the token-endpoint CORS it needs). Pairs with hephaestus PR #9. diff --git a/docs/changelog.d/local-external-secrets.infra.md b/docs/changelog.d/local-external-secrets.infra.md new file mode 100644 index 0000000..13cbb05 --- /dev/null +++ b/docs/changelog.d/local-external-secrets.infra.md @@ -0,0 +1 @@ +Localized the external-secrets controller image. It now builds from the forge mirror via a native Dagger `container.py` (single `all_providers` static Go binary, faithful to upstream's `make build`) and is served from `registry.ops.eblu.me/blumeops/external-secrets` instead of `ghcr.io`, bringing another platform component under local supply-chain control. 
diff --git a/docs/changelog.d/reviews-jun4.doc.md b/docs/changelog.d/reviews-jun4.doc.md new file mode 100644 index 0000000..f1aeaa8 --- /dev/null +++ b/docs/changelog.d/reviews-jun4.doc.md @@ -0,0 +1 @@ +Reviewed four never-reviewed reference cards (`cluster`, `ntfy`, `tempo`, `alloy`) and corrected drift: minikube is now Kubernetes v1.35.0; ntfy, tempo, and alloy-k8s images are now locally-built `registry.ops.eblu.me/blumeops/*` nix containers (v2.19.2, v2.10.3, v1.16.0) rather than upstream Docker Hub; the Fly.io alloy binary is v1.16.1; and the ringtail workload list reflects the in-progress minikube→k3s migration. diff --git a/docs/changelog.d/reviews-jun4.infra.md b/docs/changelog.d/reviews-jun4.infra.md new file mode 100644 index 0000000..c128e70 --- /dev/null +++ b/docs/changelog.d/reviews-jun4.infra.md @@ -0,0 +1 @@ +Upgraded the nvidia-device-plugin on ringtail from v0.19.0 to v0.19.2 (upstream patch release: CDI/Tegra fixes and dependency bumps, no breaking changes for our manifest-based CDI + RuntimeClass setup). diff --git a/docs/explanation/ai-scraper-mitigation.md b/docs/explanation/ai-scraper-mitigation.md new file mode 100644 index 0000000..fe4ba3d --- /dev/null +++ b/docs/explanation/ai-scraper-mitigation.md @@ -0,0 +1,201 @@ +--- +title: AI Scraper Mitigation +modified: 2026-06-01 +last-reviewed: 2026-06-01 +tags: + - explanation + - fly-io + - forgejo + - security + - networking +--- + +# AI Scraper Mitigation on the Public Proxy + +> **Note:** This article was drafted by AI and reviewed by Erich. I plan to rewrite all explanatory content in my own words — these serve as placeholders to establish the documentation structure. + +How BlumeOps keeps AI crawlers from running up the [[expose-service-publicly|Fly.io proxy]] egress bill and DoS-ing [[forgejo|Forgejo]] on [[indri]]. 
+ +## The incident + +A $29.60 Fly.io invoice arrived, nearly all of it a single line: + +``` +Bandwidth: Egress (iad) — 958,524,714,138 bytes — $19.17 +``` + +The `iad` (Ashburn) region is a red herring: the proxy machine runs in `sjc`, +but Fly bills egress at the edge PoP nearest the *client*, so `iad` just means +"the traffic went to clients on the US East Coast." + +Tracing it through the nginx access logs (shipped to Loki via [[alloy|Alloy]]): + +| Signal | Value | +|--------|-------| +| Total proxy egress (30d) | ~1.25 TB | +| Share that was `forge.eblu.me` | **99.95%** | +| Share of forge egress that was `/mirrors/*` | **~71%** | +| Share that was declared AI bots | **~85%+** | +| Top offenders | Meta `meta-externalagent` (66% of bytes), OpenAI `GPTBot` (16%), Amazonbot, Bytespider | +| Forgejo `5xx` (upstream timeouts) | tens of thousands/day, spiking to 112k | + +The crawlers were walking [[forgejo|Forgejo]]'s git-history browse endpoints — +`src/commit/`, `commits/`, `blame/`, `raw/commit/`, plus `.patch`/`.diff` +and `?page=N` pagination. That URL space is effectively **infinite**: every +file × every commit × every page, multiplied across every mirrored repo. A +crawler that follows links never finishes, and every page is a cache `MISS` +that both tunnels to indri *and* bills as egress. + +Two distinct harms, not one: + +1. **Cost** — ~1.25 TB/mo of egress on a free-tier-ish proxy. +2. **Availability** — the crawl alone generates ~400–530k requests/day, + enough to time out Forgejo regardless of how much RAM [[indri]] has. Moving + egress elsewhere would *not* fix this; the crawl has to be throttled at the + source. + +`robots.txt` already `Disallow`s `/mirrors/`, `/user/`, and archive/download +paths — but **`meta-externalagent` and `GPTBot` ignore it.** For these agents, +`robots.txt` is a dead letter, which is why edge enforcement is required. 
+ +## The tiered plan + +### Tier 1 — Black-hole `/mirrors/*` (shipped) + +The mirror repositories (`tailscale`, `prometheus`, `mealie`, `paperless-ngx`, +…) are mirrors of *already-public upstreams*, kept for supply-chain control +(see [[spork-strategy]] and the container/mirror story in [[why-gitops]]). They +are consumed by CI, gilbert, and other tailnet clients over +`forge.ops.eblu.me`. Their web UI on the public internet served **no +legitimate audience** — only scrapers. So the proxy now returns `403` for +anything under `/mirrors/`, pointing humans at the tailnet host: + +```nginx +location ^~ /mirrors/ { + return 403 "Mirror repositories are tailnet-only — use forge.ops.eblu.me.\n"; +} +``` + +The `^~` modifier matters: without it, the regex `location` blocks for static +assets (`*.css`, `*.js`, release downloads) would match first and leak content +under `/mirrors/`. `^~` tells nginx to stop at the prefix match and skip the +regex round. + +This is config, not bot-fighting — we simply stopped serving an infinite +tarpit to the world. It removes ~71% of forge egress and a large share of the +upstream timeouts, with zero impact on any human or tailnet consumer. It +mirrors the existing tailnet-only blocks for `/api/packages/` and `/swagger`. + +The `403` is also a small act of public shaming. Blocked requests are served a +"roll of dishonour" page (`fly/naughty.html`, status kept at `403` via +`error_page 403 /naughty.html`) that names the offending operators and their +share of the stolen bytes, and every response carries an `X-Naughty-Scrapers` +header: + +``` +X-Naughty-Scrapers: OpenAI/GPTBot, Meta/meta-externalagent, Amazonbot, ByteDance/Bytespider — robots.txt ignorers +``` + +Petty? A little. But it costs nothing, documents *why* the block exists for the +next person who hits it, and the page is a few KB versus the megabytes of git +HTML the crawlers were taking. + +**Trade-off accepted:** mirror release-artifact downloads over WAN now also +`403`. 
Legitimate consumers already pull these over the tailnet, and the public +exposure was the same crawl liability, so this is intentional. + +### Tier 2 — Defend the repos that *stay* public (planned) + +`/eblume/*` is intentionally public (a public profile is a feature). But the +same git-history endpoints are still a tarpit there, just lower-volume. Two +layers, in increasing order of effort and effectiveness: + +#### 2a. User-agent denylist (cheap, evadable) + +Block the declared AI crawlers at the edge regardless of path: + +```nginx +# Illustrative — not yet deployed. +map $http_user_agent $is_ai_bot { + default 0; + "~*meta-externalagent" 1; + "~*GPTBot" 1; + "~*ClaudeBot" 1; + "~*Amazonbot" 1; + "~*Bytespider" 1; + "~*SemrushBot" 1; +} +# in the forge.eblu.me server block: +if ($is_ai_bot) { return 403; } +``` + +This catches ~85% of *current* traffic for a few lines of config. It is +trivially evadable — a scraper need only spoof a browser UA — so it is a +speed-bump, not a wall. Keep `robots.txt` too: well-behaved crawlers +(Googlebot, Bingbot) do honor it, and it documents intent. + +#### 2b. Anubis proof-of-work gateway (the real wall) + +[Anubis](https://github.com/TecharoHQ/anubis) is a Go reverse proxy that +weighs each request with a browser-based proof-of-work challenge before passing +it upstream. It was written for *exactly this scenario* — its author built it +after Amazon's scraper took down their Git server — and is widely deployed in +front of Forgejo/Gitea (Codeberg, the UN, etc.). Headless scrapers that can't +run the challenge JS never reach the application; humans clear it once and +proceed. + +Why it fits BlumeOps better than the alternatives: + +- **It attacks cost *and* availability at once.** Bots receive a few-KB + challenge page instead of MB of git HTML (egress collapses) and never reach + Forgejo (timeouts collapse). No other single lever does both. +- **It stays in-house.** No third party terminates our TLS or sees our + traffic. 
+ +Placement options: + +| Where | Pros | Cons | +|-------|------|------| +| On [[indri]], between [[caddy|Caddy]] and Forgejo | Protects every path and every entry (WAN *and* tailnet); one config | Adds a hop and a service to the indri critical path; the challenge page still tunnels back through Fly for WAN clients (small egress) | +| On the Fly proxy machine, in front of nginx | Challenge served at the edge — bots never even tunnel to indri | Fly VM is small (512 MB); another moving part in the boot sequence alongside `tailscaled`/nginx/`fail2ban`/Alloy | + +Leaning toward Caddy-side on indri for simplicity and uniform coverage, but +this is the open design question for Tier 2. Anubis is MIT-licensed and the +author has signalled a future move to an `equi-x`-based challenge, so pin a +version and track upstream. + +### Tier 3 — Move egress off Fly entirely (rejected) + +A Cloudflare Tunnel (`cloudflared` on indri → Cloudflare +edge) would make this a non-problem on the cost axis: Cloudflare does not meter +proxied bandwidth, and it bundles free AI-bot mitigation (Bot Fight Mode, the +"block AI scrapers" toggle, Managed Challenge, AI Labyrinth). One move would +zero the egress bill and add bot defense. + +**We are not doing this, on principle.** Cloudflare is a solid platform and a +defensible engineering choice — but it already sits in front of an enormous +fraction of the modern web, and routing BlumeOps through it would add one more +site to the pile of the internet that one company can see and gate. BlumeOps +deliberately keeps its own backbone +([[expose-service-publicly|Fly + Tailscale + Caddy]], DNS at [[gandi|Gandi]] — see the "no Cloudflare dependency" line in +that doc). This is a values decision, not a technical one: we would rather pay +a few dollars and run our own mitigation than centralize on Cloudflare. 
+ +It is also worth noting that **Tier 3 would not, by itself, fix the upstream +timeouts** — free egress just means we'd stop *caring* that bots crawl, while +they continued to hammer Forgejo. Crawl mitigation (Tier 1 + Tier 2) is +required regardless of where egress is billed. + +## Summary + +| Tier | Lever | Cost | Availability | Status | +|------|-------|------|--------------|--------| +| 1 | Black-hole `/mirrors/*` at edge | −~71% | big drop | **shipped** | +| 2a | UA denylist on remaining repos | −most of the rest | further drop | planned | +| 2b | Anubis PoW gateway | −near-total | near-total | planned | +| 3 | Cloudflare Tunnel | −total | needs 2b anyway | **rejected (principle)** | + +The guiding insight: the cheapest, lowest-risk mitigation is to **not serve an +infinite-URL surface that has no human audience.** Everything past Tier 1 is +about defending the surface we *do* want public, in-house, without ceding +control of our traffic to a third party. diff --git a/docs/how-to/configuration/gandi-operations.md b/docs/how-to/configuration/gandi-operations.md deleted file mode 100644 index 0be00dc..0000000 --- a/docs/how-to/configuration/gandi-operations.md +++ /dev/null @@ -1,90 +0,0 @@ ---- -title: Gandi Operations -modified: 2026-02-17 -last-reviewed: 2026-02-17 -tags: - - how-to - - dns - - pulumi ---- - -# Gandi Operations - -How to manage DNS records and cycle the Gandi API token. - -## Prerequisites - -- Pulumi CLI installed (`brew install pulumi`) -- Access to 1Password blumeops vault (for PAT) -- On the tailnet (Pulumi resolves indri's IP via MagicDNS) - -## Preview and Apply DNS Changes - -```bash -# Preview changes (always do this first) -mise run dns-preview - -# Apply changes -mise run dns-up -``` - -Both tasks fetch the Gandi PAT from 1Password automatically. 
- -To run Pulumi directly: - -```bash -export GANDI_PERSONAL_ACCESS_TOKEN=$(op read "op://vg6xf6vvfmoh5hqjjhlhbeoaie/mco6ka3dc3rmw7zkg2dhia5d2m/pat") -cd pulumi/gandi -pulumi preview -pulumi up --yes -``` - -## Cycle the Gandi PAT - -The Gandi Personal Access Token has a maximum lifetime of 90 days. Currently set to 30 days as a security compromise, though shorter may be appropriate given infrequent use. - -### 1. Create a new PAT - -Go to the [Gandi admin console](https://admin.gandi.net/organizations/1db8d76a-f729-11ed-b8d1-00163e94b645/account/pat) and create a new token: - -- **Name:** `blumeops-pulumi` (or similar) -- **Expiration:** 30 days (max 90; shorter is fine if you run this rarely) -- **Required permission:** Manage domain name technical configurations -- **Also enable:** See and renew domain names - -Copy the new PAT to your clipboard. - -### 2. Update 1Password - -With the new PAT on your clipboard: - -```bash -op item edit mco6ka3dc3rmw7zkg2dhia5d2m pat="$(pbpaste)" --vault vg6xf6vvfmoh5hqjjhlhbeoaie -``` - -### 3. Delete the old PAT - -Return to the Gandi admin console and delete the previous token. - -### 4. Verify - -```bash -mise run dns-preview -``` - -A successful preview confirms the new PAT is working. - -## Break-Glass Override - -If MagicDNS is unavailable and Pulumi can't resolve indri's IP, set the target IP manually. 
Find indri's current Tailscale IP via `tailscale status` or the admin console: - -```bash -export BLUMEOPS_REVERSE_PROXY_IP= -mise run dns-up -``` - -## Related - -- [[gandi]] - DNS configuration reference -- [[caddy]] - Reverse proxy (also uses a Gandi token for TLS) -- [[update-tailscale-acls]] - Similar Pulumi workflow for Tailscale diff --git a/docs/how-to/configuration/manage-eblu-me-dns.md b/docs/how-to/configuration/manage-eblu-me-dns.md new file mode 100644 index 0000000..4c37d4c --- /dev/null +++ b/docs/how-to/configuration/manage-eblu-me-dns.md @@ -0,0 +1,52 @@ +--- +title: Manage eblu.me DNS Records +modified: 2026-04-27 +last-reviewed: 2026-04-27 +tags: + - how-to + - dns + - pulumi +--- + +# Manage eblu.me DNS Records + +How to add, change, and apply DNS records for `eblu.me` via [[pulumi]]. + +## Prerequisites + +- Pulumi CLI installed (`brew install pulumi`) +- 1Password access (`blumeops` vault) — Pulumi reads the Gandi PAT from there +- On the tailnet — Pulumi resolves [[indri]]'s IP via MagicDNS at apply time + +## Preview and apply + +```bash +mise run dns-preview # always do this first +mise run dns-up # apply +``` + +Both fetch the PAT from 1Password automatically. The Pulumi program is in `pulumi/gandi/`; stack is `eblu-me`. + +## Adding a record + +Edit `pulumi/gandi/__main__.py` and add a `gandi.livedns.Record(...)`. The stack config (`Pulumi.eblu-me.yaml`) only holds `domain` and `subdomain`; everything else is in the program. + +After editing, preview, then apply. + +## Break-glass: override the indri target IP + +The wildcard `*.ops.eblu.me` is computed from `indri.tail8d86e.ts.net` via MagicDNS at apply time. If MagicDNS is unavailable: + +```bash +export BLUMEOPS_REVERSE_PROXY_IP=<indri-tailscale-ip> +mise run dns-up +``` + +Find the IP via `tailscale status` or the Tailscale admin console. 
+ +## Related + +- [[gandi]] — Gandi reference card +- [[rotate-gandi-pat]] — Rotate the PAT shared with [[caddy]] +- [[pulumi]] — Pulumi tooling reference +- [[routing]] — Service URLs and routing architecture diff --git a/docs/how-to/configuration/manage-forgejo-mirrors.md b/docs/how-to/configuration/manage-forgejo-mirrors.md index 7f98549..5d150dc 100644 --- a/docs/how-to/configuration/manage-forgejo-mirrors.md +++ b/docs/how-to/configuration/manage-forgejo-mirrors.md @@ -137,13 +137,13 @@ Return to [GitHub token settings](https://github.com/settings/tokens?type=beta) Trigger a manual sync on one mirror to confirm the new PAT works: -1. Go to any mirror repo on forge (e.g., `mirrors/cloudnative-pg`) -2. Click the sync button (circular arrows icon) next to the mirror status +1. Go to any mirror repo's settings page on forge (e.g., `https://forge.eblu.me/mirrors/cloudnative-pg/settings`) +2. In the "Mirror settings" section, click "Synchronize now" 3. Confirm the sync completes without errors ## Related - [[forgejo]] — Forgejo service reference -- [[gandi-operations]] — Similar PAT rotation workflow for Gandi DNS +- [[rotate-gandi-pat]] — Similar PAT rotation workflow for Gandi DNS - [[spork-strategy]] — floating-branch soft-fork strategy explanation - [[create-a-spork]] — create a spork on top of a mirror diff --git a/docs/how-to/configuration/rotate-fly-deploy-token.md b/docs/how-to/configuration/rotate-fly-deploy-token.md new file mode 100644 index 0000000..9abe5f0 --- /dev/null +++ b/docs/how-to/configuration/rotate-fly-deploy-token.md @@ -0,0 +1,122 @@ +--- +title: Rotate the Fly.io API Token +modified: 2026-05-04 +last-reviewed: 2026-05-04 +tags: + - how-to + - fly-io + - secrets +--- + +# Rotate the Fly.io API Token + +How to rotate the Fly.io API token used to deploy [[flyio-proxy]]. 
The token lives in 1Password at `op://blumeops/fly.io admin/add more/deploy-token` and is consumed by [`mise run fly-deploy`](../../../mise-tasks/fly-deploy) and the `deploy-fly` Forgejo workflow (via the `FLY_DEPLOY_TOKEN` secret). + +## When to rotate + +- Every 75 days (heph recurring task) +- After any compromise / accidental disclosure +- If `fly deploy` starts returning auth errors + +Fly.io tokens default to a 20-year expiry, but a short rotation cadence limits the blast radius of an undetected leak. Token expiry is set to **90 days** (longer than the rotation window), leaving a 15-day buffer if a rotation is delayed. + +## Scope + +Use **`fly tokens create org`**, not `deploy`. + +| Scope | What it grants | Practical blast radius (this org) | +|-------|---------------|-----------------------------------| +| `deploy` | Manage one app and its resources | Same single-app surface as `org` for current setup | +| `org` | Manage one org and its resources | Adds: ability to create new apps (billing abuse) and read org-level metadata | +| `readonly` | Read one org | Not enough to deploy | +| Personal access token | Full account | Excessive | + +The personal Fly org currently contains a single app (`blumeops-proxy`), so the marginal blast radius of `org` over `deploy` is small. The benefit of `org` is that `fly status` works without a `Metrics token unavailable: ... context canceled` warning. That warning happens because `fly status` always tries to fetch org-level metrics-token info, and an app-scoped `deploy` token can't query the org. The warning is benign but persistent and could mask a real future failure. + +If a second Fly app is ever added to this org, reconsider — at that point the marginal scope cost of `org` grows. + +## Procedure + +### 1. Authenticate flyctl with the current token + +```fish +fly auth login +``` + +(Browser-based. Required to mint a new token, since the existing deploy token can't create tokens.) + +### 2. 
Mint the new token and store it + +The token is shown only once at creation, so combine the mint and the 1Password write into a single command. Pick the form for your shell. + +`fish`: + +```fish +op item edit on5slfaygtdjrxmdwezyhfmqsq "add more.deploy-token="(fly tokens create org --org personal --name 'blumeops-proxy deploy '(date +%Y-%m-%d) --expiry 2160h) --vault vg6xf6vvfmoh5hqjjhlhbeoaie +``` + +`bash` / `zsh`: + +```bash +op item edit on5slfaygtdjrxmdwezyhfmqsq "add more.deploy-token=$(fly tokens create org --org personal --name "blumeops-proxy deploy $(date +%Y-%m-%d)" --expiry 2160h)" --vault vg6xf6vvfmoh5hqjjhlhbeoaie +``` + +(`2160h` = 90 days, paired with the 75-day rotation cadence for a 15-day buffer.) + +If you'd rather paste manually: + +```fish +fly tokens create org --org personal --name "blumeops-proxy deploy $(date +%Y-%m-%d)" --expiry 2160h +op item edit on5slfaygtdjrxmdwezyhfmqsq 'add more.deploy-token=<paste-token-here>' --vault vg6xf6vvfmoh5hqjjhlhbeoaie +``` + +> **op validator gotcha:** If `op item edit` returns `Password item requires ps value`, the item's primary `password` field is empty. The 1Password CLI validator rejects edits to a Password-category item with no primary password, even when you're only touching a section field. Set a placeholder once and future rotations will work: +> +> ```fish +> op item edit on5slfaygtdjrxmdwezyhfmqsq 'password=unused - see deploy-token field' --vault vg6xf6vvfmoh5hqjjhlhbeoaie +> ``` + +### 3. Sync to Forgejo Actions + +The `deploy-fly` workflow reads the same token from a Forgejo Actions secret named `FLY_DEPLOY_TOKEN`, populated by the `forgejo_actions_secrets` ansible role: + +```fish +mise run provision-indri -- --tags forgejo_actions_secrets +``` + +### 4. Verify + +```fish +mise run fly-deploy +``` + +A successful deploy confirms the new token works locally. Watch for the metrics-token warning — it should be **absent** with an `org`-scoped token. 
If still present, the rotation produced a `deploy`-scoped token by mistake. + +Then trigger the CI workflow (push a no-op commit touching `fly/`, or dispatch manually) to confirm Forgejo Actions has the new secret. + +### 5. Revoke the old token + +```fish +fly tokens list +fly tokens revoke <old-token-id> +``` + +## Debugging + +### `fly deploy` returns "unauthorized" + +Token is invalid (expired, revoked, or wrong scope). Repeat the procedure. + +### `Metrics token unavailable: ... context canceled` after rotation + +The new token was created with `deploy` scope, not `org`. Either accept it (cosmetic) or re-mint with `fly tokens create org`. + +### Forgejo Actions deploy fails but local works + +The Forgejo secret wasn't synced. Re-run `mise run provision-indri -- --tags forgejo_actions_secrets` and confirm the secret value in Forgejo matches 1Password. + +## Related + +- [[flyio-proxy]] — Service reference card +- [[manage-flyio-proxy]] — Day-to-day operations and Tailscale auth-key rotation (separate 90-day rotation) +- [[expose-service-publicly]] — Full setup architecture diff --git a/docs/how-to/configuration/rotate-gandi-pat.md b/docs/how-to/configuration/rotate-gandi-pat.md new file mode 100644 index 0000000..5ce6f81 --- /dev/null +++ b/docs/how-to/configuration/rotate-gandi-pat.md @@ -0,0 +1,125 @@ +--- +title: Rotate the Gandi PAT +modified: 2026-04-27 +last-reviewed: 2026-04-27 +tags: + - how-to + - dns + - secrets +--- + +# Rotate the Gandi PAT + +How to rotate the Gandi Personal Access Token. **One PAT** is shared by [[caddy]] (TLS via ACME DNS-01) and Pulumi (DNS records). It lives in 1Password at `op://blumeops/gandi - blumeops/pat`. + +## When to rotate + +- Every 60 days (heph recurring task) +- After any compromise / accidental disclosure +- Whenever Gandi starts rejecting the PAT (see [Debugging](#debugging)) + +Gandi caps PAT lifetime at 90 days; rotating at 60 leaves a 30-day buffer. 
+ +## Prerequisites + +- Access to the [Gandi PAT admin console](https://admin.gandi.net/organizations/1db8d76a-f729-11ed-b8d1-00163e94b645/account/pat) +- 1Password (`blumeops` vault) +- Ability to run `mise run provision-indri` (ssh to [[indri]] + 1Password biometric) + +## Procedure + +### 1. Create a new PAT in Gandi + +In the [Gandi PAT console](https://admin.gandi.net/organizations/1db8d76a-f729-11ed-b8d1-00163e94b645/account/pat), create a token: + +- **Name:** `blumeops` +- **Expiration:** **90 days** (the max — paired with the 60-day rotation cadence) +- **Permissions:** + - Manage domain name technical configurations *(required — DNS records and ACME TXT writes)* + - See and renew domain names + +Other permissions are not used. + +Copy the new PAT to your clipboard. + +### 2. Update 1Password + +```bash +op item edit mco6ka3dc3rmw7zkg2dhia5d2m pat="$(pbpaste)" --vault vg6xf6vvfmoh5hqjjhlhbeoaie +``` + +### 3. Push to indri + +The PAT lives in two places: 1Password (read by Pulumi at runtime) and `~/.config/caddy/gandi-token` on indri (read by Caddy at startup). The 1Password edit only updates the first. + +```bash +mise run provision-indri --tags caddy +``` + +This re-fetches the PAT from 1Password, writes it to indri, and restarts Caddy. Caddy will renew any due certificates within minutes. + +### 4. Verify + +```bash +mise run dns-preview +``` + +A successful preview confirms Pulumi can use the PAT. + +```bash +ssh indri 'tail -50 ~/Library/Logs/mcquack.caddy.err.log' \ + | grep -E "obtained|renew|error" +``` + +Expect to see no `LiveDNS returned a 403` lines, and either no renewal activity (if no certs were due) or `certificate obtained successfully`. + +### 5. Delete the old PAT in Gandi + +Return to the Gandi PAT console and delete the previous token. + +### 6. Clean up orphan ACME records + +Each successful Caddy renewal leaves orphan `_acme-challenge.ops` TXT records in the zone (a bug in `libdns/gandi` v1.1.0 — see the script docstring). 
Cadence aligns with rotation: + +```bash +mise run dns-acme-cleanup --dry-run +mise run dns-acme-cleanup +``` + +## Debugging + +### Caddy logs `LiveDNS returned a 403` + +The PAT is invalid (expired, revoked, or insufficient scope). **Gandi returns 403 — not 401 — for an expired PAT**, which can read as a permissions issue. The most common cause is plain expiry. Rotate. + +### `mise run dns-preview` returns 403 + +Same root cause — Pulumi and Caddy share this PAT. + +### After a fresh PAT, Caddy still fails + +Check that the value on indri matches 1Password: + +```bash +diff <(ssh indri 'cat ~/.config/caddy/gandi-token') \ + <(op read 'op://blumeops/gandi - blumeops/pat') +``` + +If they differ, `mise run provision-indri --tags caddy` was skipped or failed. + +Confirm the new PAT works against Gandi directly: + +```bash +curl -s -o /dev/null -w "HTTP %{http_code}\n" \ + -H "Authorization: Bearer $(op read 'op://blumeops/gandi - blumeops/pat')" \ + https://api.gandi.net/v5/livedns/domains/eblu.me +``` + +`200` = healthy. `403` = scope or expiry. `401` = malformed token. + +## Related + +- [[gandi]] — Gandi reference card +- [[manage-eblu-me-dns]] — DNS records workflow (separate operation, same PAT) +- [[caddy]] — Reverse proxy that uses the PAT for TLS +- [[mise-tasks]] — `dns-acme-cleanup`, `provision-indri`, `dns-preview` reference diff --git a/docs/how-to/configuration/update-tooling-dependencies.md b/docs/how-to/configuration/update-tooling-dependencies.md index 8b09e6d..2bfe887 100644 --- a/docs/how-to/configuration/update-tooling-dependencies.md +++ b/docs/how-to/configuration/update-tooling-dependencies.md @@ -28,33 +28,45 @@ Out of scope: ArgoCD-deployed service images, Ansible role versions, NixOS flake ### 1. Check prek hook versions -For each repo in `prek.toml` with a `rev =` value, check the upstream GitHub releases page for a newer tag. Update each `rev` to the latest release tag. Also check `additional_dependencies` entries for PyPI version bumps. 
- -Verify after updating: +For each repo in `prek.toml` with a `rev =` value, check the upstream GitHub releases page for a newer tag. Update each `rev` to the **commit SHA** of the latest release with a trailing `# vX.Y.Z` comment (matches the `additional_dependencies` and Forgejo workflow pinning style). Also check `additional_dependencies` entries for PyPI version bumps and pin them with `==`. ```fish +git ls-remote --tags https://github.com/<owner>/<repo>.git 'refs/tags/v*' | sort -t/ -k3 -V | tail -5 +``` + +Clear the prek cache before verifying — it can grow to several GiB (one venv per hook per version) and old cached environments can mask resolution failures or stale catalogs: + +```fish +prek clean prek run --all-files ``` ### 2. Check Fly.io Dockerfile pins -Review `fly/Dockerfile` for pinned image tags: +Review `fly/Dockerfile` for pinned image digests. Each `FROM` and `COPY --from=` uses `image@sha256:...` digest pinning with a comment line above documenting the human-readable version. - **nginx** — check [Docker Hub](https://hub.docker.com/_/nginx) for latest stable alpine tag - **grafana/alloy** — check [GitHub releases](https://github.com/grafana/alloy/releases) -- **tailscale/tailscale** — uses `stable` rolling tag, no action needed +- **tailscale/tailscale** — pinned to a known-good version. Do not bump to v1.96.5 or later (MagicDNS regression breaks the proxy boot) + +To resolve a tag to a digest: + +```fish +docker buildx imagetools inspect docker.io/<image>:<tag> +# Use the top-level "Digest:" line (multi-arch index) — not the per-platform sub-digest +``` After updating, the deploy-fly workflow will build and deploy on merge to main. Verify with `fly status -a blumeops-proxy` after deploy. -### 3. Normalize mise task dependency bounds +### 3. Pin mise task dependencies -Mise tasks use `uv run --script` with inline PEP 723 dependency metadata. 
Check that lower bounds are consistent across all scripts: +Mise tasks use `uv run --script` with inline PEP 723 dependency metadata. All packages are pinned with `==` (PEP 508 doesn't support hashes inline). Check that pinned versions are consistent across all scripts: ```fish grep -r 'dependencies' mise-tasks/ | grep '# dependencies' ``` -Ensure all scripts using the same package agree on the minimum version. When a package has a new major or breaking minor release, bump the lower bound across all scripts at once. +For each package in use (`httpx`, `rich`, `typer`, `pyyaml`), pick the latest PyPI version and update every script in lockstep — divergence between scripts is the failure mode this catches. Bump everything together; don't leave one script behind. ### 4. Pin Forgejo workflow action versions diff --git a/docs/how-to/forgejo-runner/configure-k8s-runner.md b/docs/how-to/forgejo-runner/configure-k8s-runner.md new file mode 100644 index 0000000..3c095d0 --- /dev/null +++ b/docs/how-to/forgejo-runner/configure-k8s-runner.md @@ -0,0 +1,100 @@ +--- +title: Configure K8s Forgejo Runner +modified: 2026-04-20 +last-reviewed: 2026-04-20 +tags: + - how-to + - forgejo-runner + - ci +--- + +# Configure K8s Forgejo Runner + +Configure the Kubernetes Forgejo runner on [[indri]] using declarative `server.connections` config instead of first-boot `register`. + +## Why This Flow + +The older bootstrap pattern used `forgejo-runner register` on container start and persisted `/data/.runner` in an `emptyDir`. That works, but it depends on deprecated CLI flows and mutates runner identity at runtime. + +The preferred pattern is: + +- Create runner credentials once on the Forgejo host +- Store the runner UUID and token in 1Password +- Inject them into Kubernetes via [[external-secrets]] +- Render `server.connections` in `argocd/manifests/forgejo-runner/config.yaml` + +This keeps runner identity under secret management and makes pod restarts idempotent. 
+ +## Create Runner Credentials + +On [[indri]], use Forgejo's local CLI instead of the web UI: + +```bash +ssh indri 'cd ~/code/3rd/forgejo && ./forgejo forgejo-cli actions register \ + --name k8s-runner \ + --scope instance \ + --secret "$(openssl rand -hex 32)"' +``` + +This returns a runner UUID. The generated secret becomes the runner token. Store both in 1Password under the "Forgejo Secrets" item as: + +- `runner_k8s_uuid` +- `runner_k8s_token` + +## Kubernetes Secret Wiring + +Expose those fields with `argocd/manifests/forgejo-runner/external-secret.yaml` and make them available to the runner container as environment variables. + +The deployment should not carry registration-only env vars like `FORGEJO_URL`, `RUNNER_NAME`, or `RUNNER_TOKEN`. + +## Runner Config + +Keep the runner configuration in `argocd/manifests/forgejo-runner/config.yaml`. The key change is adopting `server.connections`: + +```yaml +server: + connections: + forgejo: + url: https://forge.ops.eblu.me + uuid: ${FORGEJO_RUNNER_UUID} + token: ${FORGEJO_RUNNER_TOKEN} + labels: + - k8s:docker://registry.ops.eblu.me/blumeops/runner-job-image: +``` + +Other settings that still matter for this deployment: + +- `runner.capacity: 2` +- `runner.timeout: 3h` +- `runner.shutdown_timeout: 3h` +- `container.network: host` +- `container.docker_host: tcp://127.0.0.1:2375` + +We do not currently use cache configuration, extra volume mounts, or multiple Forgejo connections. + +## Deployment Shape + +The pod still runs two containers: + +1. `runner` — Forgejo runner daemon +2. `dind` — Docker-in-Docker sidecar + +The startup script only needs to wait for DinD and then launch the daemon. It should no longer call `forgejo-runner register` or depend on `/data/.runner`. + +## Upgrade Procedure + +When bumping the runner version: + +1. Update `VERSION` in `containers/forgejo-runner/container.py` +2. Review release notes for runner breaking changes +3. 
Confirm `config.yaml` is still compatible with the current runner defaults +4. Build and release the updated `forgejo-runner` image +5. Update `argocd/manifests/forgejo-runner/kustomization.yaml` to the new image tag +6. Validate workflows with [[validate-forgejo-workflows]] +7. Sync the `forgejo-runner` ArgoCD app and trigger a test workflow + +## Related + +- [[validate-forgejo-workflows]] — Validate workflow schema against the deployed runner line +- [[forgejo-runner]] — Service reference +- [[build-container-image]] — Build and release the runner image diff --git a/docs/how-to/forgejo-runner/review-runner-config-v12.md b/docs/how-to/forgejo-runner/review-runner-config-v12.md deleted file mode 100644 index af50090..0000000 --- a/docs/how-to/forgejo-runner/review-runner-config-v12.md +++ /dev/null @@ -1,39 +0,0 @@ ---- -title: Review Runner Config for v12 -modified: 2026-02-27 -last-reviewed: 2026-02-27 -tags: - - how-to - - forgejo-runner - - ci ---- - -# Review Runner Config for v12 - -Compare the current runner ConfigMap against the v12.7.0 default config to identify new, changed, or deprecated keys. - -## Findings - -Compared `forgejo-runner generate-config` output from v6.3.1 and v12.7.0. Our config is minimal and remains valid for v12. - -### New sections in v12 (not adopted) - -- **`server.connections`** — multi-server polling. Not needed (single Forgejo instance). -- **`cache.secret_url`** — load cache secret from file URL. Not needed. -- **`runner.report_retry`** — retry config for log uploads. Defaults are fine. - -### Changed semantics - -- **`container.docker_host`** — v12 supports `unix://` and `ssh://` URLs. Our explicit `tcp://127.0.0.1:2375` still correct for DinD sidecar. -- **`cache`** section restructured with proxy/server split and better docs. We don't configure cache, so defaults apply. 
- -### Config update applied - -Added `shutdown_timeout: 3h` to allow graceful job completion on pod termination (v12 default, was missing from our v6 config). Added review date comment. - -`container.valid_volumes` and `container.options` left empty — our jobs use host networking and don't mount volumes. Can harden later if needed. - -## Related - -- [[upgrade-k8s-runner]] — Parent goal -- [[validate-workflows-against-v12]] — Sibling prerequisite diff --git a/docs/how-to/forgejo-runner/upgrade-k8s-runner.md b/docs/how-to/forgejo-runner/upgrade-k8s-runner.md deleted file mode 100644 index 3d285ac..0000000 --- a/docs/how-to/forgejo-runner/upgrade-k8s-runner.md +++ /dev/null @@ -1,52 +0,0 @@ ---- -title: Upgrade K8s Forgejo Runner to v12 -modified: 2026-02-27 -last-reviewed: 2026-02-27 -tags: - - how-to - - forgejo-runner - - ci ---- - -# Upgrade K8s Forgejo Runner to v12 - -Upgrade the k8s forgejo-runner daemon from v6.3.1 to v12.7.0 (or latest v12.x at time of execution). - -## Background - -The k8s runner on indri (minikube) uses the upstream `code.forgejo.org/forgejo/runner` image, currently pinned to v6.3.1. The latest is v12.7.0. The runner is still in alpha and uses major version bumps for each breaking change, so v6→v12 crosses six major versions. The ringtail runner is already at ~v12.6.4 via nixpkgs and needs no work. - -Blast radius is low — if the upgrade breaks CI, revert the image tag in `argocd/manifests/forgejo-runner/deployment.yaml` and sync. 
- -## Breaking Changes Crossed - -| Version | Change | Impact | -|---------|--------|--------| -| v7.0 | CLI `--gitea-instance` → `--forgejo-instance`; `FORGEJO_*` env vars | Low — our registration doesn't use the old flag | -| v8.0 | Workflow schema validation; default image → `node:22-bookworm` | Workflows must pass validation | -| v9.0 | Stricter schema + actions validation; `forgejo-runner validate` added | Same — but now we have a tool | -| v10.0 | Cache isolation; skip v10.0.0 (regression) | Low | -| v11.0 | License MIT → GPLv3 | Non-technical | -| v12.0 | Git binary required; git worktrees for remote actions | Low — OCI image includes git | - -## Execution Steps - -Once prerequisites are met: - -1. Update `argocd/manifests/forgejo-runner/deployment.yaml`: - - Change runner image from `code.forgejo.org/forgejo/runner:6.3.1` to `code.forgejo.org/forgejo/runner:12.7.0` -2. Update `argocd/manifests/forgejo-runner/config.yaml` with any config changes from [[review-runner-config-v12]] -3. Push, sync ArgoCD: `argocd app sync forgejo-runner` -4. Verify runner registers and connects: check Forgejo admin → runners -5. Trigger a test workflow (manual dispatch of `build-container.yaml` or `branch-cleanup.yaml`) -6. Update `service-versions.yaml` to note the daemon version - -## Rollback - -Revert the image tag to `6.3.1` in `deployment.yaml`, push, and sync. 
- -## Related - -- [[forgejo]] — Forgejo service reference -- [[validate-workflows-against-v12]] — Pre-upgrade workflow validation -- [[review-runner-config-v12]] — Config format review diff --git a/docs/how-to/forgejo-runner/validate-workflows-against-v12.md b/docs/how-to/forgejo-runner/validate-forgejo-workflows.md similarity index 61% rename from docs/how-to/forgejo-runner/validate-workflows-against-v12.md rename to docs/how-to/forgejo-runner/validate-forgejo-workflows.md index 5f98502..ed21de7 100644 --- a/docs/how-to/forgejo-runner/validate-workflows-against-v12.md +++ b/docs/how-to/forgejo-runner/validate-forgejo-workflows.md @@ -1,20 +1,20 @@ --- -title: Validate Workflows Against v12 +title: Validate Forgejo Workflows modified: 2026-04-11 -last-reviewed: 2026-02-27 +last-reviewed: 2026-04-20 tags: - how-to - forgejo-runner - ci --- -# Validate Workflows Against v12 +# Validate Forgejo Workflows -Run `forgejo-runner validate` (available from v9.0+) against all workflow files to catch schema issues before upgrading the k8s runner daemon. +Run `forgejo-runner validate` against all workflow files to catch schema issues before upgrading the k8s runner daemon. ## Result -All 6 workflows pass v12.7.0 schema validation with no changes needed: +All current workflows pass the validation step with no changes needed: - `branch-cleanup.yaml` — OK - `build-blumeops.yaml` — OK @@ -27,7 +27,7 @@ All 6 workflows pass v12.7.0 schema validation with no changes needed: 1. `validate_workflows` function added to `src/blumeops/main.py` (formerly `.dagger/src/blumeops_ci/main.py`) - Uses `forgejo-runner validate --directory .` inside the upstream runner container - - `runner_version` parameter (default `12.7.0`) pins to deployed version + - `runner_version` parameter pins validation to the deployed runner line 2. `mise run validate-workflows` task wired to `dagger call validate-workflows` 3. 
Pre-commit hook triggers on `.forgejo/workflows/` changes @@ -41,5 +41,4 @@ dagger call validate-workflows --src=. ## Related -- [[upgrade-k8s-runner]] — Parent goal -- [[review-runner-config-v12]] — Sibling prerequisite +- [[configure-k8s-runner]] — Runner configuration and upgrade flow diff --git a/docs/how-to/immich/cnpg-on-ringtail.md b/docs/how-to/immich/cnpg-on-ringtail.md new file mode 100644 index 0000000..153e674 --- /dev/null +++ b/docs/how-to/immich/cnpg-on-ringtail.md @@ -0,0 +1,52 @@ +--- +title: CNPG Operator on Ringtail +modified: 2026-05-13 +last-reviewed: 2026-05-13 +tags: + - how-to + - operations + - postgres + - ringtail +--- + +# CNPG Operator on Ringtail + +Bring up the `cloudnative-pg` operator on `k3s-ringtail`. Today the +operator only exists on `minikube-indri` (see +`argocd/apps/cloudnative-pg.yaml`, destination `kubernetes.default.svc`). + +Prerequisite of [[migrate-immich-to-ringtail]]; consumed by +[[immich-pg-on-ringtail]]. + +## What to do + +- Add a sibling `argocd/apps/cloudnative-pg-ringtail.yaml` pointing + at the same mirror (`mirrors/cloudnative-pg`, tag `v1.27.1`), + destination `https://ringtail.tail8d86e.ts.net:6443`, + namespace `cnpg-system`. +- Mirror the `ServerSideApply=true` and `CreateNamespace=true` sync + options (the CRDs exceed the annotation size limit). +- Sync `apps` then `cloudnative-pg-ringtail`. Verify the operator + pod is running on ringtail. + +## Verification + +```fish +kubectl --context=k3s-ringtail -n cnpg-system get pods +kubectl --context=k3s-ringtail get crd clusters.postgresql.cnpg.io +``` + +## Why a separate app + +Each ArgoCD app targets a single cluster via `destination.server`. +We could parameterize with ApplicationSets, but blumeops' convention +is to duplicate the manifest with a `-ringtail` suffix (see +`alloy-ringtail`, `external-secrets-ringtail`, etc.). Keep the +convention. + +## Out of scope + +- Postgres clusters themselves (`immich-pg`, etc.) 
— those come from + [[immich-pg-on-ringtail]]. +- Removing the minikube cnpg operator. That happens at the very end + of the indri-k8s decommission, not in this chain. diff --git a/docs/how-to/immich/immich-app-on-ringtail.md b/docs/how-to/immich/immich-app-on-ringtail.md new file mode 100644 index 0000000..51b619d --- /dev/null +++ b/docs/how-to/immich/immich-app-on-ringtail.md @@ -0,0 +1,91 @@ +--- +title: Immich App on Ringtail +modified: 2026-05-13 +last-reviewed: 2026-05-13 +tags: + - how-to + - operations + - immich +--- + +# Immich App on Ringtail + +Bring up `immich-server`, `immich-machine-learning`, and +`immich-valkey` on ringtail. This card stands the stack up against +the *new* pg cluster — it does not move user traffic. Cutover lives +in [[immich-cutover-and-decommission]]. + +## What to do + +- New manifest dir `argocd/manifests/immich-ringtail/` (the suffix + matches the `-ringtail` convention used by other apps). Port from + `argocd/manifests/immich/`: + - `deployment-server.yaml` — point `DB_HOSTNAME` at the ringtail + pg service. + - `deployment-ml.yaml` — use `runtimeClassName: nvidia` + a + `resources.limits` for `nvidia.com/gpu: 1`. Use the `-cuda` tag + of the immich-ml image (set in kustomization). Ringtail is + single-node, so no node selector needed. See + `argocd/manifests/frigate/` for the existing GPU pod pattern. + + **GPU contention discovery:** ringtail's `nvidia-device-plugin` + is configured with `timeSlicing.replicas: 2`. Frigate + Ollama + already consume both virtual slices. Adding immich-ml requires + bumping the count to >= 3. Edit + `argocd/manifests/nvidia-device-plugin/configmap.yaml` (or + wherever the device-plugin config lives) and re-sync the + `nvidia-device-plugin` ArgoCD app. The plugin pod restarts and + the new advertised count appears as the node's + `nvidia.com/gpu` allocatable. 
+ - `deployment-valkey.yaml` — straight port, BUT use the upstream + multi-arch `docker.io/valkey/valkey:` image — do NOT + use the `registry.ops.eblu.me/blumeops/valkey` rewrite in the + kustomization. That mirror was built on indri (arm64) and is + single-arch; pulling it on ringtail (amd64) gets `exec format + error` in CrashLoopBackOff. The mirror should eventually carry + a multi-arch tag, at which point the rewrite can return. + - `service*.yaml` — straight port. + - `pvc-ml-cache.yaml` — straight port (empty `local-path` PVC). + - `pv-nfs.yaml` + `pvc.yaml` — already covered by + [[sifaka-nfs-from-ringtail]] (may live in this dir or theirs). + - `ingress-tailscale.yaml` — ProxyGroup ingress, **must not** set + an explicit `host:` (or use `host: *`) per the lesson on + ProxyGroup VIP routing. + **Hostname collision warning:** the minikube ingress claims the + Tailscale device name `photos` (`tls.hosts: [photos]`). Two + devices on the tailnet cannot share that name. While the + ringtail deployment is being staged it must use a *different* + `tls.hosts` value (e.g. `photos-ringtail`) so it can coexist + with the running minikube one. The flip to `photos` happens at + cutover time, *after* the minikube ingress has been removed. + See [[immich-cutover-and-decommission#Cutover sequence]]. + - `kustomization.yaml` — same `images:` block (server, ML, valkey). +- New ArgoCD app `argocd/apps/immich-ringtail.yaml` targeting + ringtail, namespace `immich`. **Manual sync only** until the + cutover. +- Existing `argocd/apps/immich.yaml` (minikube) stays untouched + during this card — both apps exist briefly. + +## Bring it up against a copy of the DB + +Use the throwaway/test path from [[immich-pg-data-migration#Dry run +before real cutover]]: point the ringtail immich at the *test* pg +cluster first, verify the pod boots, the web UI loads (via +`kubectl port-forward`), assets list, ML embeddings query. Then +tear it down. + +## Verification + +- All three pods Ready. 
+- ML pod has a GPU attached: `nvidia-smi` inside the container shows + the 4080. +- `immich-server` connects to pg and valkey (no `ECONNREFUSED` in + logs). +- A `kubectl port-forward` to the server service shows the Immich + web UI. + +## Out of scope + +- Public/tailnet routing flip. Caddy still points at the minikube + Tailscale ingress until [[immich-cutover-and-decommission]]. +- Removing the minikube immich. Same. diff --git a/docs/how-to/immich/immich-cutover-and-decommission.md b/docs/how-to/immich/immich-cutover-and-decommission.md new file mode 100644 index 0000000..b44fddd --- /dev/null +++ b/docs/how-to/immich/immich-cutover-and-decommission.md @@ -0,0 +1,103 @@ +--- +title: Immich Cutover and Decommission +modified: 2026-05-13 +last-reviewed: 2026-05-13 +tags: + - how-to + - operations + - immich + - migration +--- + +# Immich Cutover and Decommission + +The user-visible flip. By the time this card opens, the ringtail +stack has been proven against a copy of the data. This card does the +real cutover. + +## Pre-cutover checklist + +- [[immich-pg-data-migration]] dry-run succeeded; method is chosen. +- Ringtail immich stack has been brought up against the test pg, + pods healthy, UI loaded ([[immich-app-on-ringtail#Verification]]). +- Borgmatic just ran successfully (a fresh nightly archive is a + belt-and-suspenders fallback, on top of the live source pg). +- User has been told to stop uploading from the iOS app for the + cutover window. + +## Cutover sequence + +1. **Quiesce source.** `kubectl --context=minikube-indri -n immich + scale deploy/immich-server --replicas=0` and same for ML. Leave + valkey + pg running. Confirm no client traffic on the source pg + via `pg_stat_activity`. +2. **Tear down the minikube Tailscale ingress.** The `photos` + Tailscale device name must be freed before ringtail's ingress can + claim it (Tailscale enforces uniqueness across the tailnet). 
+ `kubectl --context=minikube-indri -n immich delete ingress + immich-tailscale` and wait for the corresponding `tailscale`-LB + StatefulSet pod to terminate. Verify the `photos` device is gone: + `tailscale status | grep -i photos` from any tailnet host. +3. **Final sync.** Per chosen method in + [[immich-pg-data-migration]]: + - Option A: promote the ringtail replica. + - Option B: take final `pg_dump`, restore to ringtail + `immich-pg`. +4. **Verify.** Run the row-count and schema-diff checks from + [[immich-pg-data-migration#Verification on the real run]]. +5. **Flip the ringtail ingress to `photos`.** Update + `argocd/manifests/immich-ringtail/ingress-tailscale.yaml`: + `tls.hosts: [photos]` (was `[photos-ringtail]` during staging per + [[immich-app-on-ringtail]]). Commit, `argocd app sync + immich-ringtail`. Wait for the `photos` device to register on the + tailnet again. +6. **Bring up ringtail immich** against the now-promoted pg + (`argocd app sync immich-ringtail`). Wait for Ready. +7. **Flip routing.** Update Caddy on indri + (`ansible/roles/caddy/defaults/main.yml`): `photos.ops.eblu.me` + upstream changes to the ringtail Tailscale ingress hostname + (`photos` — same MagicDNS name, now pointing to the ringtail + proxy). `mise run provision-indri -- --tags caddy`. +8. **Smoke test.** Open `photos.ops.eblu.me` in a browser. Sign in. + Scroll the timeline. Open an album. Trigger an ML search. +9. **Update borgmatic.** If the Tailscale hostname for pg changed, + update `borgmatic.cfg` on indri to point at the ringtail + `immich-pg-tailscale` service. Run a manual backup to verify. + +## After cutover + +- `argocd app set immich --revision BRANCH` is no longer relevant; + the minikube `immich` app gets deleted entirely. +- Delete `argocd/apps/immich.yaml`, `argocd/manifests/immich/`, and + the minikube `argocd/manifests/databases/immich-pg.yaml` + + `external-secret-immich-borgmatic.yaml` + + `service-immich-pg-tailscale.yaml`.
+- Rename `immich-ringtail` back to `immich` (the `-ringtail` suffix + was scaffolding for the dual-cluster window; once minikube is + empty of immich, the unsuffixed name is clean). +- Confirm the minikube `immich-pg` PVC is no longer used, then + delete it (the PV with `Retain` policy will persist — clean that + up too). + +## Verification (definition of done) + +- `photos.ops.eblu.me` works for a real session, including ML search. +- Source minikube has no `immich` pods, no `immich-pg`, no PVCs. +- Memory pressure on minikube has dropped (≥1.5 GiB reclaimed). Check + `docker stats minikube` on indri. +- Nightly borgmatic run after the cutover completes successfully, + with the immich-pg archive showing the new source. + +## Rollback (within the cutover window) + +If smoke test fails: flip Caddy back, scale ringtail immich to 0, +scale source immich back up. Source pg was never destroyed. File a +plan reset on the relevant prerequisite card and try again next +session. + +## Out of scope + +- Decommissioning all of minikube. This chain just removes immich. + Other tenants migrate in their own chains as part of the broader + indri-k8s decommission. See [[migrate-immich-to-ringtail]] for + context. diff --git a/docs/how-to/immich/immich-pg-data-migration.md b/docs/how-to/immich/immich-pg-data-migration.md new file mode 100644 index 0000000..fb87783 --- /dev/null +++ b/docs/how-to/immich/immich-pg-data-migration.md @@ -0,0 +1,79 @@ +--- +title: Immich Postgres Data Migration +modified: 2026-05-13 +last-reviewed: 2026-05-13 +tags: + - how-to + - operations + - postgres + - immich + - critical +--- + +# Immich Postgres Data Migration + +**This is the data-loss surface of the migration.** Pick a method, +prove it on a throwaway copy first, then run the real cutover. + +## Decision: pick one + +### Option A — CNPG `externalCluster` bootstrap (preferred) + +Stand the ringtail cluster up as a streaming replica of the minikube +cluster via `bootstrap.pg_basebackup.source`. 
Replica catches up +online; when ready, promote it and point Immich at it. This is +CNPG's documented PG-to-PG migration path and gives near-zero data +loss (the WAL position at promote == the position at app stop). + +Requires: network path from ringtail to minikube's pg over the +tailnet (the existing `immich-pg-tailscale` Service works), and a +superuser secret minikube-side exposed to ringtail's basebackup. + +Pitfall to plan around: the ringtail Cluster CR will need its +`bootstrap` block rewritten *after* promotion (CNPG doesn't +gracefully drop the externalCluster reference). Account for this in +[[immich-pg-on-ringtail]] — it may force a reset of that card. + +### Option B — pg_dump / pg_restore + +Stop immich, `pg_dump -Fc` from minikube, scp to ringtail, restore. +Simpler but full downtime for the whole dump+restore window +(measure on a copy first — VectorChord indexes are slow to rebuild). +Smaller blast radius; no streaming-replication moving parts. + +Use this if Option A hits any blocker. Data loss should still be +zero if the source is stopped first. + +### Option C — leave pg on minikube + +Rejected. See goal card [[migrate-immich-to-ringtail#Why postgres on +ringtail (not cross-cluster)]]. + +## Dry run before real cutover + +Whichever option wins: + +1. Snapshot the minikube `immich-pg` PVC or take a fresh `pg_dump` + into a scratch location. +2. Restore into a *separate* ringtail CNPG cluster (different name, + e.g. `immich-pg-test`) and point a scratch immich-server pod at + it. +3. Verify: pod boots, can list assets, ML embeddings query without + error, face thumbnails render. VectorChord-backed queries should + not error. +4. Tear the scratch cluster down before doing the real one. + +## Verification on the real run + +- Row counts match for `assets`, `albums`, `users`, `face`, + `asset_face`, `smart_search` (the embedding table) — script this. 
+- `pg_dump --schema-only --no-owner` diff between source and dest + should be empty modulo CNPG-managed roles. +- Immich `/api/server-info/version` and `/api/server-info/statistics` + return sane numbers. + +## Rollback + +If the cutover fails verification: stop the ringtail immich, repoint +ArgoCD `immich.destination` back to minikube, re-sync. Source pg was +never deleted. Document what failed and reset the chain. diff --git a/docs/how-to/immich/immich-pg-on-ringtail.md b/docs/how-to/immich/immich-pg-on-ringtail.md new file mode 100644 index 0000000..10c7072 --- /dev/null +++ b/docs/how-to/immich/immich-pg-on-ringtail.md @@ -0,0 +1,69 @@ +--- +title: Immich Postgres Cluster on Ringtail +modified: 2026-05-13 +last-reviewed: 2026-05-13 +tags: + - how-to + - operations + - postgres + - immich +--- + +# Immich Postgres Cluster on Ringtail + +Stand up a fresh `immich-pg` CNPG Cluster on ringtail, ready to receive +data. **No data import yet** — that's [[immich-pg-data-migration]]. + +## What to do + +- Create `argocd/manifests/databases-ringtail/` (or pick another + namespace name — verify what other ringtail pg clusters will use; + if none yet, `databases` is fine). +- Port these from the minikube side: + - `immich-pg.yaml` — CNPG Cluster CR. Same image + (`ghcr.io/tensorchord/cloudnative-vectorchord:17-0.5.0`), same + extensions, same managed `borgmatic` role. Bump `storage.size` if + the minikube 10 GiB looks tight (check actual usage first). + `storageClass: local-path` on ringtail (default). + - `external-secret-immich-borgmatic.yaml` — same 1Password item, + same field, but referencing the ringtail `ClusterSecretStore` + (`onepassword-blumeops` already exists per the + `external-secrets-ringtail` app). + - Service for in-cluster access (the operator creates `immich-pg-rw` + etc. automatically; verify the app deployment uses those names). 
+ - A Tailscale Service if we want backups to keep working via the + same hostname during the transition — see "Borgmatic" below. +- New ArgoCD app `argocd/apps/databases-ringtail.yaml` pointing at + the new path, destination ringtail. + +## Verification + +- Cluster reaches `Ready`. +- `borgmatic` role exists, `rolcanlogin=t`, and is a member of + `pg_read_all_data` (via `managed.roles[].inRoles`). +- ExternalSecret `immich-pg-borgmatic` syncs from 1Password + (`Ready: True`) and the rendered Secret has `username=borgmatic`. +- The `vchord`, `vector`, `cube`, `earthdistance` extensions show + installed in the `postgres` database (`\dx` from + `psql -U postgres`). They are NOT installed in the `immich` + database at this point — `postInitSQL` in CNPG's `initdb` block + runs against the `postgres` superuser database. The Immich app + itself creates the extensions in its own `immich` database at + startup; do not be alarmed by their absence pre-immich-deploy. + The `vchord.so` library is preloaded via + `shared_preload_libraries` regardless, so `CREATE EXTENSION` at + app startup just registers it in the right database. + +## Borgmatic implications + +`borgmatic.cfg` on indri targets `immich-pg-tailscale` over the +tailnet. During migration both clusters will exist briefly. Decide +upfront: backup the *source* pg until cutover, then flip borgmatic +to the ringtail Tailscale service. Document the flip in +[[immich-cutover-and-decommission]]. + +## Out of scope + +- Importing data. That is [[immich-pg-data-migration]], which may + drive a reset on this card if the migration approach (e.g. CNPG + `externalCluster` bootstrap) requires changes to this Cluster CR. 
diff --git a/docs/how-to/immich/migrate-immich-to-ringtail.md b/docs/how-to/immich/migrate-immich-to-ringtail.md new file mode 100644 index 0000000..e654b62 --- /dev/null +++ b/docs/how-to/immich/migrate-immich-to-ringtail.md @@ -0,0 +1,134 @@ +--- +title: Migrate Immich to Ringtail +modified: 2026-05-13 +last-reviewed: 2026-05-13 +tags: + - how-to + - operations + - immich + - migration +--- + +# Migrate Immich to Ringtail + +Move the entire Immich stack (server, ML, valkey, postgres) off +`minikube-indri` and onto `k3s-ringtail`. This is the first concrete +chain in the broader indri-k8s decommission: minikube is +memory-saturated (97% RAM, swapping), and Immich is the single +largest tenant (~1.5 GiB resident). + +## End state + +- Immich `server`, `machine-learning`, and `valkey` Deployments run on + ringtail k3s in the `immich` namespace. +- The `immich-machine-learning` pod uses ringtail's RTX 4080 via the + `nvidia-device-plugin` (performance win — currently CPU-only on + minikube). +- A CNPG `immich-pg` Cluster (PostgreSQL 17 + VectorChord) runs in a + `databases` namespace on ringtail, owned by the `cnpg-system` + operator on ringtail. +- The photo library still lives on [[sifaka]] at `/volume1/photos`, + mounted via NFS from ringtail pods (RWX). +- Routing: `photos.ops.eblu.me` (Caddy on indri) proxies to a + Tailscale ProxyGroup ingress on ringtail. No public surface today. +- The ArgoCD `immich` app's `destination.server` points at + `https://ringtail.tail8d86e.ts.net:6443`. The old minikube + manifests are removed. + +## Non-goals + +- Public exposure via Fly. Immich stays tailnet-only. +- Changing the immich version or runtime configuration. This is a + lift-and-shift; bumps come later. +- Backing up to a different target. [[borgmatic]] keeps running on + indri (it pulls via Tailscale and uses sifaka SMB for the library). 
+ +## Critical constraint: no data loss + +Downtime is acceptable (Immich is a single-user system; we can take +it offline for the cutover). **Data loss is not.** Two surfaces matter: + +1. **Postgres** — face data, ML embeddings (vectors), album state, + sharing, etc. Re-derivable in theory; weeks of recompute in + practice. See [[immich-pg-data-migration]]. +2. **Library files** — `/volume1/photos`. Not moving, but the NFS + path must be verified accessible from ringtail before cutover. + See [[sifaka-nfs-from-ringtail]]. + +[[borgmatic]] backs both up to sifaka + BorgBase nightly; restore is +possible but slow. Treat it as a fallback, not a plan. + +## Why postgres on ringtail (not cross-cluster) + +`immich-pg` already has a Tailscale Service we could point ringtail +at, leaving the DB on minikube. We're not doing that because: + +- The whole goal is to retire minikube — keeping pg there blocks it. +- Immich is chatty against pg; tailnet round-trips would hurt. +- CNPG is the same operator on both sides — a Cluster CR on ringtail + is mechanically equivalent. + +## Approach + +This is a C2 Mikado chain. The prerequisite cards each represent a +distinct surface that has to work before cutover. See +[[agent-change-process#C2 — Mikado Chain]] for the discipline. + +## Workflow note: registering new ArgoCD apps during the chain + +This chain adds three new ArgoCD `Application` definitions in +`argocd/apps/`: `cloudnative-pg-ringtail`, `databases-ringtail`, +and (later) `immich-ringtail`. The usual C1/C2 pattern of +`argocd app set APP --revision BRANCH && argocd app sync APP` +does NOT work for the app-of-apps `apps` Application itself, because +`apps` self-manages: it re-reads `apps.yaml` (which declares +`targetRevision: main`) on every sync and reverts the override. As a +result, new app definitions added on a feature branch are never +visible to the cluster via `apps`.
+ +**Use `kubectl apply` to register each new Application directly:** + +```fish +kubectl --context=minikube-indri apply -f argocd/apps/APP.yaml +``` + +This creates the Application resource out-of-band, bypassing `apps`. + +For apps whose source lives in **this** repo (e.g. +`databases-ringtail`, `immich-ringtail` — manifest paths exist only +on the branch until merge), follow the apply with a branch override: + +```fish +argocd app set APP --revision mikado/migrate-immich-to-ringtail +argocd app sync APP +``` + +For apps whose source is an **external** repo at a pinned tag (e.g. +`cloudnative-pg-ringtail` → `mirrors/cloudnative-pg` `v1.27.1`), no +override is needed — the source revision is independent of this PR. + +After PR merge: + +```fish +argocd app set APP --revision main +argocd app sync APP +``` + +`apps` itself, on its next sync from `main`, will discover the new +Application definitions in `argocd/apps/` and adopt the already-running +resources without disruption — provided their in-cluster spec matches +the on-disk definitions (which it does because we applied the same +file).
+ +## Related + +- [[migrate-wave1-ringtail]] — the next chain in the indri-k8s + decommission: paperless, teslamate, and mealie +- [[shower-on-ringtail]] — a previous migration to ringtail (simpler: + no upstream cluster, SQLite, no GPU) +- [[connect-to-postgres]] — getting a psql session against CNPG +- [[ringtail]] — the target cluster +- [[cnpg-on-ringtail]], [[immich-pg-on-ringtail]], + [[immich-pg-data-migration]], [[sifaka-nfs-from-ringtail]], + [[immich-app-on-ringtail]], [[immich-cutover-and-decommission]] — + the prerequisite cards diff --git a/docs/how-to/immich/sifaka-nfs-from-ringtail.md b/docs/how-to/immich/sifaka-nfs-from-ringtail.md new file mode 100644 index 0000000..2c490c1 --- /dev/null +++ b/docs/how-to/immich/sifaka-nfs-from-ringtail.md @@ -0,0 +1,67 @@ +--- +title: Sifaka NFS Photos from Ringtail +modified: 2026-05-13 +last-reviewed: 2026-05-13 +tags: + - how-to + - operations + - storage + - nfs + - sifaka +--- + +# Sifaka NFS Photos from Ringtail + +The Immich library lives at `sifaka:/volume1/photos` and is mounted +into the pod via an NFS PV (see `argocd/manifests/immich/pv-nfs.yaml`). +That PV is currently scoped to indri. We need ringtail to mount the +same path with the same RWX semantics, without breaking the existing +indri mount during the transition. + +## What to verify / do + +- Check `sifaka` DSM NFS rules for the `photos` share. Per + [[shower-on-ringtail#NFS + SMB share on sifaka]] convention, rules + use `192.168.1.0/24` + `100.64.0.0/10` with + `all_squash`/`Map all users to admin`. The existing rule may + already cover ringtail (it's on `192.168.1.21` per the recent + static-IP pin). If so this card is a verification card. +- If the rule is locked to indri's IP: add an entry for ringtail + (192.168.1.21) or widen to the subnet pattern above. +- Test mount from a ringtail debug pod (busybox or alpine with + nfs-utils) against the `photos` share. Read a file. Write a temp + file. Delete it. 
+- Watch for the known sifaka NFS-over-Tailscale gotcha: sifaka's + Tailscale must be in TUN mode (not userspace) for NFS to work + reliably over the tailnet. The NFS path here goes over the LAN + (not tailnet), so this shouldn't bite, but worth confirming the + NFS traffic is on `192.168.1.x` not `100.x`. + +## PV + PVC on ringtail + +- New `pv-nfs.yaml` mirroring the minikube one (name can be shared + if the PV is cluster-scoped — but PVs are per-cluster, so just + duplicate). Same `server: sifaka`, same path, same + `accessModes: [ReadWriteMany]`, `persistentVolumeReclaimPolicy: + Retain`. +- New `pvc.yaml` in the ringtail `immich` namespace bound to it. +- The minikube PVC stays bound and active until cutover — both + clusters can have the share NFS-mounted simultaneously (NFS RWX + permits this). Immich itself must not be running on both sides + at once. + +## Verification + +- A pod on ringtail can `ls /mnt/photos/` and see the same files + as the indri pod. +- File written from ringtail pod is visible from indri pod and + vice versa (proves there's no caching surprise). + +## Out of scope + +- Migrating photo files. Nothing moves; this is just adding a second + NFS client. +- The `pvc-ml-cache.yaml` PVC (a separate ML model cache). That's + not on NFS — it's a regular PVC. Recreated empty on ringtail in + [[immich-app-on-ringtail]]; the first ML pod boot will repopulate + it. diff --git a/docs/how-to/mealie/restore-from-borg.md b/docs/how-to/mealie/restore-from-borg.md new file mode 100644 index 0000000..7ff3625 --- /dev/null +++ b/docs/how-to/mealie/restore-from-borg.md @@ -0,0 +1,157 @@ +--- +title: Restore Mealie from Borg +modified: 2026-04-24 +last-reviewed: 2026-04-24 +tags: + - how-to + - mealie + - backup +--- + +# Restore Mealie from Borg + +How to restore [[mealie]]'s SQLite database from a [[borgmatic]] archive when data has been lost (e.g. PVC wiped, accidental deletion, post-DR rebuild). 
+ +## Prerequisites + +- SSH access to [[indri]] (where borgmatic runs and stores k8s SQLite dumps) +- Mealie deployment present in the cluster (the PVC `mealie-data` exists in namespace `mealie`) +- Know which borg archive predates the data loss + +## Procedure + +### 1. Identify a Pre-Loss Archive + +List archives and pick one before the incident: + +```bash +ssh indri 'BORG_PASSCOMMAND="cat /Users/erichblume/.borg/config.yaml" \ + /opt/homebrew/bin/borg list /Volumes/backups/borg | tail -30' +``` + +Compare dump sizes across archives if you're unsure when the loss happened — the daily borgmatic run captures `/Users/erichblume/.local/share/borgmatic/k8s-dumps/mealie.db`. A sudden drop in size signals the wipe: + +```bash +ssh indri 'bash -c "BORG_PASSCOMMAND=\"cat /Users/erichblume/.borg/config.yaml\" \ + /opt/homebrew/bin/borg list /Volumes/backups/borg::<archive> \ + --pattern=+Users/erichblume/.local/share/borgmatic/k8s-dumps/mealie.db"' +``` + +### 2. Extract the Pre-Loss Dump + +```bash +ssh indri 'mkdir -p ~/tmp/mealie-restore && cd ~/tmp/mealie-restore && \ + BORG_PASSCOMMAND="cat /Users/erichblume/.borg/config.yaml" \ + /opt/homebrew/bin/borg extract /Volumes/backups/borg::<archive> \ + Users/erichblume/.local/share/borgmatic/k8s-dumps/mealie.db' +``` + +The file lands at `~/tmp/mealie-restore/Users/erichblume/.local/share/borgmatic/k8s-dumps/mealie.db` (borg preserves the full path). + +### 3. Verify the Extracted DB + +```bash +ssh indri 'sqlite3 ~/tmp/mealie-restore/Users/erichblume/.local/share/borgmatic/k8s-dumps/mealie.db \ + "PRAGMA integrity_check; SELECT COUNT(*) FROM recipes; SELECT COUNT(*) FROM users;"' +``` + +Expect `ok` and non-zero recipe/user counts. + +### 4. 
Snapshot the Current (Wiped) DB + +Belt and suspenders — keep a copy of the live DB before overwriting, in case the restore goes wrong: + +```bash +ssh indri 'bash -c "kubectl --context=minikube -n mealie exec deploy/mealie -- \ + python3 -c \"import sqlite3; sqlite3.connect(\\\"/app/data/mealie.db\\\").backup(sqlite3.connect(\\\"/tmp/wiped-mealie.db\\\"))\" && \ + POD=\$(kubectl --context=minikube -n mealie get pod -l app=mealie -o jsonpath=\"{.items[0].metadata.name}\") && \ + kubectl --context=minikube cp mealie/\$POD:/tmp/wiped-mealie.db /Users/erichblume/tmp/mealie-restore/wiped-mealie.db"' +``` + +### 5. Scale Mealie Down + +The PVC is `ReadWriteOnce`, so the helper pod can't mount it while mealie is running: + +```bash +ssh indri 'kubectl --context=minikube -n mealie scale deploy/mealie --replicas=0 && \ + kubectl --context=minikube -n mealie wait --for=delete pod -l app=mealie --timeout=60s' +``` + +### 6. Start a Helper Pod on the PVC + +```bash +ssh indri 'bash -c "cat > /tmp/mealie-helper.yaml <.tar.gz`) | + +`docs_version` in `ansible/roles/docs/defaults/main.yml` is the blumeops release tag (e.g. `v1.16.0`). The role's download/extract is gated by an on-disk sentinel. + +## Deploy + +1. Run the `Build BlumeOps` Forgejo workflow → builds the tarball, creates a release, bumps `docs_version` in the ansible role, pushes to main +2. From gilbert: `mise run provision-indri -- --tags docs` +3. From gilbert: `fly ssh console -a blumeops-proxy -C "sh -c 'rm -rf /tmp/cache && nginx -s reload'"` + +The Caddy block uses `try_files {path} {path}/ {path}.html` and a `handle_errors 404 → /404.html` rewrite, matching the original nginx behavior so Quartz's clean URLs continue to work. 
+ +## Verify + +```fish +ssh indri 'cat ~/blumeops/docs/.installed-version' +ssh indri 'ls ~/blumeops/docs/content/' +curl -fsSI https://docs.ops.eblu.me/ # private +curl -fsSI https://docs.eblu.me/ # public +curl -fsSI https://docs.eblu.me/explanation/agent-change-process # clean URL → .html fallback +curl -fsSI https://docs.eblu.me/no-such-path-exists/ # → /404.html +``` + +## Bumping the docs version + +Normally driven by the workflow. If you need to pin manually, edit `docs_version` in `ansible/roles/docs/defaults/main.yml` and re-run `mise run provision-indri -- --tags docs`. + +## Backup + +Content dir is not borgmatic-backed. Source is in this repo; release tarballs are on the forge. + +## Rollback + +Set `docs_version` back to the previous release tag in the role defaults and re-run. Older release tarballs remain available as Forgejo release assets. + +## Related + +- [[cv-on-indri]] — sibling service, simpler (no `try_html`) +- [[devpi-on-indri]] — pattern reference for indri-native services +- [[docs]] — service reference diff --git a/docs/how-to/operations/manage-flyio-proxy.md b/docs/how-to/operations/manage-flyio-proxy.md index 5cea783..d1a243d 100644 --- a/docs/how-to/operations/manage-flyio-proxy.md +++ b/docs/how-to/operations/manage-flyio-proxy.md @@ -76,6 +76,10 @@ The auth key expires every 90 days. To rotate: 2. Re-run setup to stage the new secret: `mise run fly-setup` 3. Deploy to pick up the new secret: `mise run fly-deploy` +## Rotate Fly.io API Token + +See [[rotate-fly-deploy-token]] for the full rotation procedure (75-day cadence, `org`-scoped). + ## Troubleshooting **502 Bad Gateway on fresh deploy**: MagicDNS may not be ready when nginx starts. The `start.sh` script polls `nslookup` before launching nginx, but if it still fails, check that `tailscale status` is healthy inside the container. 
diff --git a/docs/how-to/operations/read-compliance-reports.md b/docs/how-to/operations/read-compliance-reports.md index 75fd3ab..e676ad5 100644 --- a/docs/how-to/operations/read-compliance-reports.md +++ b/docs/how-to/operations/read-compliance-reports.md @@ -80,7 +80,7 @@ Not all failures require action. Common expected failures in our minikube cluste 1. **Triage** — review new failures, distinguish real issues from expected noise 2. **Remediate** — fix what you can (pod security contexts, RBAC tightening) -3. **Mutelist** — suppress expected/accepted failures via Prowler's `--mutelist-file` to reduce noise in future scans +3. **Mutelist** — suppress expected/accepted failures by adding a Resource entry under the matching Check in `argocd/manifests/prowler/mutelist/*.yaml` with a free-form `Description` explaining why 4. **Track** — compare reports over time to spot regressions ## Related diff --git a/docs/how-to/operations/rebuild-minikube-cluster.md b/docs/how-to/operations/rebuild-minikube-cluster.md index ad64c89..0d924e9 100644 --- a/docs/how-to/operations/rebuild-minikube-cluster.md +++ b/docs/how-to/operations/rebuild-minikube-cluster.md @@ -108,18 +108,13 @@ kubectl --context=minikube-indri apply -f argocd/apps/apps.yaml # 6. Login and sync apps argocd login argocd.tail8d86e.ts.net --username admin \ --password "$(kubectl --context=minikube-indri -n argocd get secret argocd-initial-admin-secret -o jsonpath='{.data.password}' | base64 -d)" \ - --grpc-web -argocd app sync apps --grpc-web -``` + argocd app sync apps``` ## Phase 4: Bootstrap 1Password Connect + External Secrets ```bash # 1. Sync foundation -argocd app sync external-secrets-crds --grpc-web -argocd app sync external-secrets --grpc-web -argocd app sync 1password-connect --grpc-web - +argocd app sync external-secrets-crdsargocd app sync external-secretsargocd app sync 1password-connect # 2. 
Create 1Password Connect secrets manually CREDS_RAW=$(op read "op://blumeops/1Password Connect/credentials-file") echo "$CREDS_RAW" | kubectl --context=minikube-indri create secret generic op-credentials -n 1password \ @@ -140,25 +135,20 @@ kubectl --context=minikube-indri get clustersecretstores ```bash # Foundation (CRDs, operators) -argocd app sync cloudnative-pg kube-state-metrics --grpc-web - +argocd app sync cloudnative-pg kube-state-metrics # Databases -argocd app sync blumeops-pg --grpc-web - +argocd app sync blumeops-pg # Observability -argocd app sync loki prometheus tempo grafana grafana-config --grpc-web - +argocd app sync loki prometheus tempo grafana grafana-config # Register ringtail cluster (for authentik, ntfy, ollama, frigate) ssh ringtail 'sudo cat /etc/rancher/k3s/k3s.yaml' | \ sed 's|127.0.0.1|ringtail.tail8d86e.ts.net|' > /tmp/k3s-ringtail.yaml KUBECONFIG=/tmp/k3s-ringtail.yaml argocd cluster add default --name k3s-ringtail --grpc-web -y # Authentik (critical — Zot OIDC depends on it, most image pulls depend on Zot) -argocd app sync authentik --grpc-web - +argocd app sync authentik # Everything else -argocd app sync tailscale-operator alloy-k8s --grpc-web -# ... remaining apps +argocd app sync tailscale-operator alloy-k8s # ... remaining apps ``` ## Phase 6: Restore Databases from Borgmatic @@ -245,25 +235,7 @@ mise run services-check ## Post-Rebuild: Cold Cache Failures -### Devpi (PyPI Cache) - -After a rebuild, devpi's package cache is empty. The first Dagger-based container build will trigger a flood of concurrent package downloads. Devpi uses lazy caching — it serves package metadata (simple index) immediately from upstream PyPI but fetches wheel files on demand. Under heavy concurrent load with a cold cache, the upstream fetch can race with the client request, causing devpi to return `no such file` (HTTP 404) for packages it knows about but hasn't finished downloading yet. 
- -**Why devpi, not PyPI?** The repo's `uv.lock` was generated with devpi as the index, so every package source URL points at `pypi.ops.eblu.me`. Dagger's Python SDK runtime does a locked install (`uv sync`), not fresh resolution — it fetches from whatever URLs are in the lockfile. This is intentional (supply chain control), but means all builds — local and CI — depend on devpi being available and warm. - -**Symptoms:** Forgejo Actions Dagger builds fail during module initialization with errors like: -``` -Failed to download `googleapis-common-protos==1.74.0` -HTTP status client error (404 Not Found) for url (https://pypi.ops.eblu.me/root/pypi/+f/...) -``` - -**Fix:** Re-run the failed build. The first attempt warms the cache; subsequent builds succeed. Alternatively, warm the cache manually before triggering CI builds: - -```bash -# From any machine that can reach pypi.ops.eblu.me, install the Dagger SDK -# to pre-populate the most common packages: -pip install --dry-run --index-url https://pypi.ops.eblu.me/root/pypi/+simple/ dagger-io -``` +Devpi runs natively on indri (see [[devpi-on-indri]]) and is unaffected by minikube rebuilds, so the historical "devpi cold cache after rebuild" failure mode no longer applies. If devpi itself goes cold (fresh server-dir), the same lazy-cache race can still cause `404` on the first Dagger build under concurrent load — re-run the build to warm the cache, or pre-warm with `uv pip install --dry-run --index-url https://pypi.ops.eblu.me/root/pypi/+simple/ dagger-io`. 
## Related diff --git a/docs/how-to/operations/record-review-evidence.md b/docs/how-to/operations/record-review-evidence.md deleted file mode 100644 index 9de4e37..0000000 --- a/docs/how-to/operations/record-review-evidence.md +++ /dev/null @@ -1,50 +0,0 @@ ---- -title: Record Review Evidence -modified: 2026-04-01 -last-reviewed: 2026-04-01 -tags: - - how-to - - security - - compliance ---- - -# Record Review Evidence - -How review evidence *would* be captured after a [[review-compensating-controls|compensating control review]], to make the review auditable under a compliance framework. - -blumeops does not currently collect review evidence. This card documents the target process for reference and practice. - -## Why Record Evidence? - -Reviewing a control and updating `last-reviewed` proves the review *happened* but not *what was checked*. Under frameworks like PCI DSS v4.0, a QSA needs to see dated, immutable evidence that the reviewer verified the control and that an appropriate party accepted the residual risk. Compliance platforms like Drata automate this collection, but the underlying artifacts are the same whether you use a platform or a directory of files. - -## What Evidence Would Be Captured - -For each control reviewed, artifacts should answer: - -1. **Who reviewed it** — reviewer name, date -2. **What was verified** — the specific checks performed (e.g., Tailscale ACL policy snapshot, `tailscale status` output, kubectl auth checks) -3. **What was found** — the outcome: control still in effect, circumstances changed, or control invalidated -4. **Residual risk** — what the control does *not* cover (the gap a QSA will ask about) -5. **Acceptance** — formal sign-off that the residual risk is accepted by an appropriate party (reviewer + approver, typically a manager or CTO) - -Supporting artifacts would include command output, policy snapshots, screenshots, or API responses — anything that demonstrates the verification was actually performed. 
- -## PCI DSS Context - -Under PCI DSS v4.0, compensating controls require a **Compensating Control Worksheet (CCW)** that maps each control to the original requirement it substitutes for. The CCW fields are: - -- **Original requirement** — the specific PCI DSS requirement not directly met -- **Constraint** — why direct compliance isn't feasible -- **Compensating control definition** — what is done instead -- **Risk addressed** — how the control mitigates the original threat -- **Residual risk** — what remains unmitigated -- **Validation procedure** — steps to verify (what `notes` captures in `compensating-controls.yaml`) - -Req 12.3.2 mandates review **at least annually** (quarterly is typical for Level 1 Service Providers). In a platform like Drata, these map to Controls with uploaded Evidence and review workflows requiring sign-off from both the reviewer and an approver. - -## Related - -- [[review-compensating-controls]] — The technical review process -- [[security]] — Security posture overview -- [[read-compliance-reports]] — Interpreting Prowler/Kingfisher reports diff --git a/docs/how-to/operations/restart-indri.md b/docs/how-to/operations/restart-indri.md index a956644..e92581e 100644 --- a/docs/how-to/operations/restart-indri.md +++ b/docs/how-to/operations/restart-indri.md @@ -41,6 +41,7 @@ Native services managed by launchd will stop automatically during macOS shutdown ssh indri 'launchctl unload ~/Library/LaunchAgents/mcquack.eblume.forgejo.plist' ssh indri 'launchctl unload ~/Library/LaunchAgents/mcquack.eblume.caddy.plist' ssh indri 'launchctl unload ~/Library/LaunchAgents/mcquack.eblume.zot.plist' +ssh indri 'launchctl unload ~/Library/LaunchAgents/mcquack.eblume.devpi.plist' # see [[devpi-on-indri]] ssh indri 'launchctl unload ~/Library/LaunchAgents/mcquack.jellyfin.plist' ssh indri 'launchctl unload ~/Library/LaunchAgents/mcquack.eblume.alloy.plist' ssh indri 'launchctl unload ~/Library/LaunchAgents/mcquack.eblume.borgmatic.plist' diff --git 
a/docs/how-to/operations/review-compensating-controls.md b/docs/how-to/operations/review-compensating-controls.md deleted file mode 100644 index b05958e..0000000 --- a/docs/how-to/operations/review-compensating-controls.md +++ /dev/null @@ -1,78 +0,0 @@ ---- -title: Review Compensating Controls -modified: 2026-03-30 -last-reviewed: 2026-03-30 -tags: - - how-to - - security - - maintenance ---- - -# Review Compensating Controls - -How to periodically review compensating controls that justify suppressed security findings. - -## Review by Staleness - -Show controls sorted by when they were last reviewed (most stale first): - -```bash -mise run review-compensating-controls -``` - -This reads `compensating-controls.yaml` (repo root), sorts by `last-reviewed`, and displays the most stale control with all codebase references. It also searches for every file that references the control ID, so you can see exactly which suppressed findings depend on it. - -To show more entries: - -```bash -mise run review-compensating-controls --limit 20 -``` - -## What is a Compensating Control? - -A compensating control is a security measure that mitigates the risk a finding was designed to detect, when the finding itself cannot be directly remediated. For example: - -- **Finding:** API server does not enable AlwaysPullImages admission plugin -- **Risk:** Untrusted users could run pods using cached images they shouldn't have access to -- **Compensating control:** `single-user-cluster` — only the operator has kubectl access; no untrusted users can create pods - -Controls are documented in `compensating-controls.yaml` and referenced from security tool configurations (Prowler mutelist files, Kingfisher config, etc.) using the format `CC: `. - -## Review Process - -For each control up for review: - -1. **Understand the risk.** Read each suppressed finding that references this control. What attack or misconfiguration does the original check guard against? - -2. 
**Verify the control is in effect.** Follow the verification steps in the control's `notes` field. For example, for `tailscale-network-isolation`, check that the cluster is not directly internet-exposed and Tailscale ACLs are enforced. - -3. **Assess whether the control actually mitigates the risk.** A compensating control should address the same threat the check was designed to catch, not just be a vaguely related security measure. If it doesn't hold up, either: - - Fix the underlying finding and remove the suppression - - Document a stronger or more specific compensating control - -4. **Check for changed circumstances.** Has the cluster gained new users? Has a service been exposed publicly? Has an operator added native support for the missing feature? Any of these could invalidate the control. - -5. **Update the review date.** Edit `compensating-controls.yaml` and set `last-reviewed` to today's date. Commit alongside any changes. - -## Adding a New Control - -When suppressing a new security finding, either map it to an existing control or add a new one: - -```yaml -- id: my-new-control - description: >- - What this control does and how it mitigates the specific risk. - created: 2026-03-30 - last-reviewed: 2026-03-30 - notes: >- - How to verify this control is still in effect. -``` - -Then reference it in the suppression configuration with `CC: my-new-control`. 
- -## Related - -- [[record-review-evidence]] — Capturing evidence artifacts for audit (aspirational) -- [[security]] — Security posture overview -- [[read-compliance-reports]] — Accessing and interpreting Prowler reports -- [[review-services]] — Periodic service version review (similar staleness pattern) diff --git a/docs/how-to/operations/run-1password-backup.md b/docs/how-to/operations/run-1password-backup.md index b0807da..0dc9ec9 100644 --- a/docs/how-to/operations/run-1password-backup.md +++ b/docs/how-to/operations/run-1password-backup.md @@ -26,20 +26,18 @@ How to export and encrypt your 1Password vaults for inclusion in [[borgmatic]] b 1. Open the 1Password desktop app 2. **File > Export > All Vaults** 3. Choose **1PUX** format -4. Save to `~/Documents/1Password-export.1pux` +4. Save to `~/Documents/` — 1Password names the file `1PasswordExport--.1pux` automatically; don't bother renaming it, pass the path to the task in the next step ### 2. Run the Backup Task -```fish -mise run op-backup -``` - -Or, if you saved the export to a non-default location: +Pass the exported file's path: ```fish -mise run op-backup ~/path/to/export.1pux +mise run op-backup ~/Documents/1PasswordExport-*.1pux ``` +(If only one export exists in `~/Documents/`, the glob expands cleanly. Otherwise, paste the full path.) + The task will: 1. Prompt for the `.1pux` path if not provided diff --git a/docs/how-to/operations/shower-on-ringtail.md b/docs/how-to/operations/shower-on-ringtail.md new file mode 100644 index 0000000..daf1046 --- /dev/null +++ b/docs/how-to/operations/shower-on-ringtail.md @@ -0,0 +1,245 @@ +--- +title: Shower App on Ringtail +modified: 2026-05-10 +last-reviewed: 2026-05-10 +tags: + - how-to + - operations + - kubernetes + - django +--- + +# Shower App on Ringtail + +How the Adelaide / Heidi / Addie baby shower app is deployed. 
The app is a +Django project ([`adelaide-baby-shower-app`](https://forge.eblu.me/eblume/adelaide-baby-shower-app)) +released as a wheel to the Forgejo Packages PyPI index and run on +[[ringtail]]'s k3s cluster. Public landing page at `shower.eblu.me`, staff +console + admin UI at `shower.ops.eblu.me` (tailnet only). + +The contract this deploy implements is defined in the app repo's +`docs/how-to/hosting.md` — read that for the env-var contract, security +model, and storage requirements before changing anything here. + +## Routing + +``` +Internet → shower.eblu.me + │ (Fly.io nginx — public) + ▼ + Caddy on indri (shower.ops.eblu.me) + │ + ▼ + Tailscale ProxyGroup ingress (shower.tail8d86e.ts.net) + │ + ▼ + Service shower:8000 → Pod (Django + gunicorn) +``` + +| Hostname | Reachable from | Notes | +|---|---|---| +| `shower.eblu.me` | Public internet | Guest surface only — splash, `/prizes//`, `/static/`, `/media/`. Everything authenticated 403s with a tailnet pointer. | +| `shower.ops.eblu.me` | Tailnet | Full app surface — `/host/`, `/admin/`, the works | +| `shower.tail8d86e.ts.net` | Tailnet | Bare ProxyGroup endpoint Caddy proxies to | + +## Defense layers (public side) + +The public surface is guest-only, so the threat model collapses: there +is no credential-accepting endpoint reachable from WAN, and nothing on +WAN that requires authentication. + +1. **edge auth lockout** — fly nginx 403s `/admin/`, `/host/`, and + anything that would redirect into them. Anyone hitting an auth URL + on WAN gets a "tailnet only" message. +2. **fly nginx `limit_req zone=general`** — 10 r/s per Fly-Client-IP + cushion for the splash form. +3. **django-axes** — 5 fails / 1 hour lockout per `(username, ip_address)`, + running on the tailnet-side login. Provides the only credential + defense, since brute-force is only reachable to tailnet members. 
+ +The QR codes that `/host/` (on tailnet) generates for guests embed +`https://shower.eblu.me/...` even though the QR view is served from +the tailnet host. The app's `PUBLIC_URL_BASE` setting (added in v1.0.1) +overrides Django's `request.build_absolute_uri()` for those URLs. + +## Persistent storage + +| Mount | PVC | Type | Why | +|---|---|---|---| +| `/app/media` | `shower-media` | NFS RWX on sifaka (`/volume1/shower`) | Prize photos survive pod rescheduling | +| `/app/data` | `shower-data` | k3s `local-path` RWO | SQLite DB; NFS file locking can't be trusted for WAL/journal | + +The container has the app + its Python deps baked in at nix build time +(`buildPythonPackage` against the wheel fetched from forge PyPI). The +entrypoint runs migrations, runs `collectstatic`, and `exec`s gunicorn — +no pip-at-boot. A `local_settings.py` shim overrides `DATABASES.NAME`, +`MEDIA_ROOT`, and `STATIC_ROOT` to absolute paths under `/app/`, +sidestepping the wheel's `BASE_DIR = parent.parent` of an +in-site-packages settings module. + +## Backups + +[[borgmatic]] (running on indri) captures both halves of the persistent +state on its daily 2 a.m. run: + +- **`/app/data/db.sqlite3`** — dumped via `kubectl exec`'s + `sqlite3.backup()` against the live pod (entry in + `borgmatic_k8s_sqlite_dumps`, context `k3s-ringtail`). The dumped + file lands in `borgmatic_k8s_dump_dir` on indri and is picked up by + the main source-directory sweep. +- **`/app/media`** — picked up via `/Volumes/shower`, the SMB mount of + `sifaka:/volume1/shower` on indri. The same Synology share is exposed + via SMB *and* NFS simultaneously; ringtail's pod uses the NFS export, + while indri reads the SMB side for the borgmatic source. + +Both archive to [[sifaka]] (`borg-backups`) and BorgBase offsite, with +retention `keep_daily=7 / keep_monthly=12 / keep_yearly=1000`. 
+ +The SMB mount on indri is set up manually once via Finder (Cmd-K → +`smb://sifaka/shower`, save credentials, "Always log in" so it +reconnects after reboot). If `/Volumes/shower` is missing at backup +time borgmatic will fail loudly — `source_directories_must_exist: true` +applies to all entries. + +## One-time setup steps + +These steps are required the first time the service is deployed and are +not encoded in the manifests. + +### 1. NFS + SMB share on sifaka + +On the Synology DSM web UI: + +1. **Control Panel → Shared Folder → Create**. Name: `shower`, + Location: Volume 1. Leave the rest at default. +2. **Control Panel → File Services → NFS → NFS Rules** (on the + `shower` row's *Permissions* tab). Add a rule mirroring the other + shares' pattern: Hostname/IP=`192.168.1.0/24` and again for + `100.64.0.0/10`, Privilege=Read/Write, Squash=`Map all users to + admin` (= `all_squash`), and tick *Allow connections from + non-privileged ports*. (See [[sifaka#NFS Exports]] — the existing + `frigate`, `paperless`, etc. shares use this exact pattern.) +3. **Control Panel → File Services → SMB**: leave SMB enabled + globally. No per-share rule required — the share inherits the + default `eblume` access. +4. The directory ownership at `/volume1/shower` will end up + `root:root`, mode `0777` (DSM default) — which is fine because + `all_squash` rewrites every NFS write to `admin:users`, and the + `0777` lets pods read what other pods wrote. No `chown` needed. + +After the share exists, mount it on indri for borgmatic: + +- In Finder, **Cmd-K → `smb://sifaka/shower`**, sign in as `eblume`, + and tick **Remember in Keychain** + **Always log in** so it + reconnects on reboot. This produces `/Volumes/shower`, which the + borgmatic source-directory list points at. + +### 2. 1Password item + +Item name: **`Shower (blumeops)`** in the `blumeops` vault. 
+Required property: + +| Field | Value | +|---|---| +| `secret-key` | Output of `openssl rand -base64 48` | + +The `ExternalSecret` `shower-app-secrets` will sync this into the +`shower` namespace as a `Secret` and `envFrom` exposes it as +`DJANGO_SECRET_KEY` to the container. + +**Never reuse a key that has ever been in git history.** Per the app's +hosting.md, an early dev key was committed before being replaced with +the `django-insecure-...` placeholder; the production key must be +freshly generated. + +### 3. Container image + +Built by the `build-container` Forgejo Actions workflow on the +`nix-container-builder` runner (ringtail, amd64). The wheel is fetched +from forge PyPI at nix build time and baked into the image — no +pip-at-runtime. To bump the version, change `version` in +`containers/shower/default.nix` and update `wheelHash` (or set it to +`pkgs.lib.fakeHash` and let the next build print the correct one). + +Trigger with: + +```fish +mise run container-build-and-release shower +``` + +After the workflow finishes, update `images[].newTag` in +`argocd/manifests/shower/kustomization.yaml` to the resulting +`vX.Y.Z--nix` tag, then commit (C0). + +### 4. DNS + +`pulumi/gandi/__main__.py` declares the `shower-public` CNAME pointing +at `blumeops-proxy.fly.dev.`. Apply with: + +```fish +mise run dns-preview +mise run dns-up +``` + +### 5. Fly.io certificate + +```fish +fly certs add shower.eblu.me -a blumeops-proxy +``` + +(Add to `mise-tasks/fly-setup` so re-runs of the one-time setup pick +it up.) + +### 6. Caddy on indri + +`shower` is in `ansible/roles/caddy/defaults/main.yml`. Push with: + +```fish +mise run provision-indri -- --tags caddy +``` + +### 7. Create the admin user + +The container's entrypoint runs `migrate --noinput` + `collectstatic +--noinput --clear` before gunicorn, so a fresh `db.sqlite3` is schema- +ready as soon as the pod boots. 
It does *not* create a Django superuser +— that has to happen once, interactively, after the first pod is up: + +```fish +kubectl --context=k3s-ringtail -n shower exec -it deploy/shower -- \ + python -m django createsuperuser +``` + +Use `erich` / your usual email. The same account doubles as the +`@staff_member_required` login for `/host/`. Subsequent staff accounts +can be created from `/admin/auth/user/` once you're signed in. + +## Deploying a new version + +1. Bump the wheel version in the app repo (`adelaide-baby-shower-app`) + and release it to Forgejo PyPI. +2. Bump `appVersion` in `containers/shower/default.nix` to match. +3. `mise run container-build-and-release shower`. Verify the build + with `mise run runner-logs`. +4. Update the `newTag` in `argocd/manifests/shower/kustomization.yaml` + to the new `[main]` SHA tag. +5. Commit (C0 after PR merge — see [[build-container-image#Squash-merge and container tags]]). +6. `argocd app sync shower`. + +## Verifying after a deploy + +```fish +kubectl --context=k3s-ringtail -n shower get pods +kubectl --context=k3s-ringtail -n shower logs deploy/shower +curl -sf https://shower.ops.eblu.me/ # tailnet +curl -sf https://shower.eblu.me/ # public +curl -I https://shower.eblu.me/admin/users/ # expect 403 (edge block) +curl -I https://shower.ops.eblu.me/admin/ # expect 200 / 302 (login) +``` + +## Related + +- [[expose-service-publicly]] — Fly.io proxy + Tailscale pattern +- [[deploy-k8s-service]] — generic ArgoCD service onboarding +- [[ringtail]] — the cluster +- [`hosting.md`](https://forge.eblu.me/eblume/adelaide-baby-shower-app/src/branch/main/docs/how-to/hosting.md) — app's deployment contract diff --git a/docs/how-to/operations/troubleshooting.md b/docs/how-to/operations/troubleshooting.md index 63dc79a..84301c3 100644 --- a/docs/how-to/operations/troubleshooting.md +++ b/docs/how-to/operations/troubleshooting.md @@ -72,6 +72,11 @@ kubectl --context=minikube-indri -n get pods --field-selector=status **ArgoCD login 
expired:** ```bash +argocd login argocd.ops.eblu.me --sso +``` + +If Authentik itself is down, fall back to admin: +```bash argocd login argocd.ops.eblu.me --username admin --password "$(op read 'op://vg6xf6vvfmoh5hqjjhlhbeoaie/srogeebssulhtb6tnqd7ls6qey/password')" ``` diff --git a/docs/how-to/ringtail/migrate-wave1-ringtail.md b/docs/how-to/ringtail/migrate-wave1-ringtail.md new file mode 100644 index 0000000..ffb8cdc --- /dev/null +++ b/docs/how-to/ringtail/migrate-wave1-ringtail.md @@ -0,0 +1,176 @@ +--- +title: Migrate Wave 1 (paperless, teslamate, mealie) to Ringtail +modified: 2026-06-03 +last-reviewed: 2026-06-03 +tags: + - how-to + - operations + - ringtail + - migration +--- + +# Migrate Wave 1 to Ringtail + +Move paperless, teslamate, and mealie off `minikube-indri` and onto +`k3s-ringtail`. This is the load-shedding response to minikube going +OOM: the kernel OOM killer was thrashing the 8 GiB node — killing +`kube-apiserver`, `dockerd`, and the argocd application-controller — +which made every minikube-hosted service probe-flap at once. These +three app pods are ~1.1 GiB resident combined and are the heaviest +non-observability tenants left on minikube. Following +[[migrate-immich-to-ringtail]], the first chain in the indri-k8s +decommission. + +## End state + +- `paperless`, `teslamate`, and `mealie` run on ringtail k3s in their + own namespaces, off minikube entirely. +- A CNPG `blumeops-pg` Cluster runs in a `databases` namespace on + ringtail (PostgreSQL, owned by ringtail's `cnpg-system` operator), + holding the `paperless` and `teslamate` databases. Apps reach it + in-cluster via `blumeops-pg-rw.databases.svc.cluster.local`. +- mealie keeps its SQLite database; its 2 GiB `mealie-data` PVC is + copied to a ringtail PVC. +- paperless media still lives on [[sifaka]] via NFS (RWX, 500 GiB), + mounted from ringtail pods. teslamate has no file state. 
+- Routing: `paperless.ops.eblu.me`, `teslamate.ops.eblu.me`, and + `mealie.ops.eblu.me` (Caddy on indri) proxy to Tailscale + ProxyGroup ingresses on ringtail. Service names are unchanged. +- The minikube manifests and the `paperless`/`teslamate`/`mealie` + databases inside indri's `blumeops-pg` are removed only after + cutover is verified. + +## Non-goals + +- Migrating the rest of `blumeops-pg` (e.g. miniflux) — that is a + later wave. This chain moves only the paperless + teslamate + databases out; the source cluster on indri stays up for the others. +- Version bumps or config changes. Lift-and-shift only. +- Public (Fly) exposure changes. These stay tailnet-only. +- The observability stack (prometheus/loki/tempo/grafana) — deferred; + it carries 50 GiB of local TSDB and is the riskiest move. + +## Critical constraint: no data loss + +**Downtime is acceptable — data loss is not.** We can take each +service fully offline for its cutover, which removes the entire +class of streaming-replication and double-writer hazards. The cold +dump is taken from a *quiesced* source, so it is internally +consistent. + +Data surfaces: + +1. **paperless postgres** — document metadata, tags, correspondents, + the search index state. The document *files* are on NFS and never + move, but losing the DB means files-without-index. This is the + surface to protect most carefully. +2. **teslamate postgres** — drive/charge history. Re-derivable only + from Tesla's API for a limited window; treat as unrecoverable. +3. **mealie SQLite** — recipes, meal plans. On the `mealie-data` PVC. + +The source databases on indri are **never dropped until the ringtail +side is verified and serving**. Rollback is "repoint and scale back +up," not "restore from backup." [[borgmatic]] remains the backstop. 
+ +## Why a fresh CNPG cluster (not cross-cluster pg) + +indri's `blumeops-pg` is already exposed tailnet-wide at +`pg.ops.eblu.me` (Caddy L4), so we *could* leave the DBs on indri and +just move the app pods. We are not, because: + +- The goal is to retire minikube — keeping pg there blocks it and + leaves a cross-host runtime dependency (ringtail apps SPOF on + indri's pg over the tailnet). +- CNPG is the same operator on both clusters; a Cluster CR on ringtail + is mechanically equivalent to the one on minikube. +- Naming the ringtail cluster `blumeops-pg` in `databases` lets apps + use the same in-cluster DNS they would on indri. + +## Cold-cutover procedure (per service) + +Do these one service at a time. paperless first (heaviest, highest +data-sensitivity), then teslamate, then mealie. + +### 0. Prerequisites (once, before any service) + +- Confirm ringtail's `cnpg-system` operator and `databases` namespace + are healthy (immich-pg already runs there). +- Confirm ringtail pods can reach indri's `pg.ops.eblu.me:5432` (used + only to pull the dump) and the sifaka NFS export for paperless + media. See [[sifaka-nfs-from-ringtail]]. +- Define the ringtail `blumeops-pg` CNPG Cluster manifest (model on + `databases-ringtail/immich-pg.yaml`) and its ExternalSecrets for + the per-app roles. Sync it; let it come up empty and healthy. + +### 1. Quiesce the source + +```fish +kubectl --context=minikube-indri -n <app> scale deploy/<app> --replicas=0 +# confirm 0 running, DB now has no writers +``` + +### 2. Dump from indri, restore to ringtail (postgres apps) + +```fish +# dump the single app DB from the quiesced source +kubectl --context=minikube-indri -n databases exec blumeops-pg-1 -- \ + pg_dump -Fc -d <app> > /tmp/<app>.dump + +# restore into the ringtail cluster +kubectl --context=k3s-ringtail -n databases exec -i blumeops-pg-1 -- \ + pg_restore --no-owner --role=<app> -d <app> < /tmp/<app>.dump +``` + +For **mealie** (SQLite) instead: copy the `mealie-data` PVC contents +to the ringtail PVC (e.g. 
a one-shot rsync pod mounting both, or +`kubectl cp` via a helper pod). Verify the `.db` file size and that +mealie boots read-only against it. + +### 3. Verify the restore (before any routing flips) + +- Row counts match source for the key tables, scripted: + - paperless: `documents_document`, `documents_tag`, + `documents_correspondent`, `auth_user`. + - teslamate: `cars`, `drives`, `charging_processes`, `positions`. +- `pg_dump --schema-only --no-owner` diff between source and dest is + empty modulo CNPG-managed roles. +- Boot the app against the ringtail DB on its tailnet name *before* + Caddy is flipped, and smoke-test (paperless: documents list + + search; teslamate: dashboard loads recent drives; mealie: recipes + list). + +### 4. Release the service name + +```fish +# delete the minikube tailscale ingress so ringtail can claim the name +kubectl --context=minikube-indri -n <app> delete ingress <app>-tailscale +``` + +### 5. Bring up on ringtail + +- Apply the ringtail manifests (new ArgoCD app `<app>-ringtail`, + `destination.server` = `https://ringtail.tail8d86e.ts.net:6443`). + App points at `blumeops-pg-rw.databases.svc.cluster.local`. +- Sync; wait for healthy + the ProxyGroup ingress to get its name. + +### 6. Flip routing + +- Repoint the Caddy `<app>.ops.eblu.me` upstream at the ringtail + ProxyGroup ingress (provision-indri, caddy role). +- `mise run services-check` — confirm the service flips from FIRING + to OK and no neighbours regressed. + +### 7. Decommission the source (only after verification) + +- Remove the minikube manifests for the app. +- Drop the app DB from indri's `blumeops-pg` (paperless/teslamate) + **last**, once the ringtail side has served real traffic. + +## Rollback + +If a cutover fails verification at any step before §7: + +- Re-create the minikube tailscale ingress (if §4 ran). +- Scale the minikube app back to `1`. +- Repoint Caddy back to the minikube ingress. +- The source DB was never modified or dropped. Document the failure. 
diff --git a/docs/index.md b/docs/index.md index 6da90a4..fb04c47 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,6 +1,7 @@ --- title: BlumeOps -modified: 2026-02-08 +modified: 2026-05-06 +last-reviewed: 2026-05-06 aliases: [] id: index tags: [] @@ -22,8 +23,9 @@ raft I built for myself as I went, and you can see it all from within your editor of choice. (I recommend vim.) These services run on my home [[hosts|infrastructure]], primarily an m1 mac -mini named [[indri]] and a Synology NAS called [[sifaka]]. The infrastructure -is networked via [[tailscale]], with the domain `eblu.me` hosted via [[gandi]], +mini named [[indri]], a NixOS GPU host called [[ringtail]] running a k3s +cluster, and a Synology NAS called [[sifaka]]. The infrastructure is networked +via [[tailscale]], with the domain `eblu.me` hosted via [[gandi]], [[caddy]] providing a private reverse proxy for tailnet devices, and [[flyio-proxy|Fly.io]] serving public-facing services like [this documentation site](https://docs.eblu.me). diff --git a/docs/reference/infrastructure/gandi.md b/docs/reference/infrastructure/gandi.md index ae1fe56..763bae3 100644 --- a/docs/reference/infrastructure/gandi.md +++ b/docs/reference/infrastructure/gandi.md @@ -1,7 +1,7 @@ --- title: Gandi -modified: 2026-04-09 -last-reviewed: 2026-04-09 +modified: 2026-04-27 +last-reviewed: 2026-04-27 tags: - infrastructure - networking @@ -20,12 +20,11 @@ DNS hosting provider for the `eblu.me` domain, managed via Pulumi IaC. | **Provider** | Gandi LiveDNS | | **IaC** | `pulumi/gandi/` | | **Stack** | `eblu-me` | +| **PAT** | `op://blumeops/gandi - blumeops/pat` | ## What It Does -Gandi hosts the DNS records that make `*.ops.eblu.me` resolve to [[indri]]'s Tailscale IP (`indri.tail8d86e.ts.net`). Since Tailscale IPs are not publicly routable, this gives services real DNS names while keeping them private to the tailnet. 
- -The target IP is resolved dynamically from `indri.tail8d86e.ts.net` at deploy time, so if indri's Tailscale IP changes, re-running the deployment is sufficient. +Gandi hosts the DNS records that make `*.ops.eblu.me` resolve to [[indri]]'s Tailscale IP. Since Tailscale IPs are not publicly routable, this gives services real DNS names while keeping them private to the tailnet. The target IP is resolved dynamically from `indri.tail8d86e.ts.net` at deploy time. ## DNS Records @@ -46,38 +45,25 @@ Both records point to [[indri]], which runs [[caddy]] as the reverse proxy for a | `cv.eblu.me` | CNAME | `blumeops-proxy.fly.dev` | 300s | | `forge.eblu.me` | CNAME | `blumeops-proxy.fly.dev` | 300s | -Public CNAMEs point to [[flyio-proxy]] on Fly.io. See [[expose-service-publicly]] for adding new public services. - -See [[routing]] for the full service URL map. - -## Pulumi Configuration - -The Pulumi program lives in `pulumi/gandi/`: - -- `__main__.py` - Creates A and CNAME records via `pulumiverse_gandi` -- `Pulumi.eblu-me.yaml` - Stack config (domain, subdomain) - -Stack config values: - -| Key | Value | -|-----|-------| -| `blumeops-dns:domain` | `eblu.me` | -| `blumeops-dns:subdomain` | `ops` | - -A break-glass override is available via the `BLUMEOPS_REVERSE_PROXY_IP` environment variable, which bypasses dynamic IP resolution. +Public CNAMEs point to [[flyio-proxy]] on Fly.io. See [[expose-service-publicly]] for adding new public services. See [[routing]] for the full service URL map. ## TLS Integration -[[caddy]] uses Gandi's API separately (via `GANDI_BEARER_TOKEN`) for ACME DNS-01 challenges to obtain a wildcard Let's Encrypt certificate for `*.ops.eblu.me`. This is a different credential from the Pulumi PAT. +[[caddy]] uses this same Gandi PAT for ACME DNS-01 challenges to obtain a wildcard Let's Encrypt certificate for `*.ops.eblu.me`. Caddy reads the PAT from `~/.config/caddy/gandi-token` on [[indri]], populated by ansible from 1Password. 
## Authentication -Gandi requires a Personal Access Token (PAT) for API access. PATs have a maximum lifetime of 90 days (currently set to 30). See [[gandi-operations]] for deployment and PAT cycling instructions. +One Gandi Personal Access Token, shared by Pulumi and Caddy. Gandi caps PATs at 90 days; rotate every 60 days via [[rotate-gandi-pat]]. + +## ACME Challenge Cleanup + +Caddy's renewal flow leaves `_acme-challenge.ops` TXT orphans in the zone — a value-comparison bug in `libdns/gandi` v1.1.0 makes the cleanup phase a no-op. Run `mise run dns-acme-cleanup` periodically (alongside PAT rotation works well). ## Related -- [[gandi-operations]] - PAT cycling and deployment how-to -- [[routing]] - Service URLs and routing architecture -- [[caddy]] - Reverse proxy using Gandi for TLS -- [[tailscale]] - Tailnet networking -- [[indri]] - Server hosting Caddy (DNS target) +- [[manage-eblu-me-dns]] — Add/change DNS records via Pulumi +- [[rotate-gandi-pat]] — Rotate the shared Gandi PAT +- [[routing]] — Service URLs and routing architecture +- [[caddy]] — Reverse proxy using this PAT for TLS +- [[tailscale]] — Tailnet networking +- [[indri]] — Server hosting Caddy (DNS target) diff --git a/docs/reference/infrastructure/indri.md b/docs/reference/infrastructure/indri.md index cbb2a0f..8364ba0 100644 --- a/docs/reference/infrastructure/indri.md +++ b/docs/reference/infrastructure/indri.md @@ -1,6 +1,7 @@ --- title: Indri -modified: 2026-02-19 +modified: 2026-05-27 +last-reviewed: 2026-05-27 tags: - infrastructure - host @@ -15,6 +16,7 @@ Primary BlumeOps server. Mac Mini M1 (2020). | Property | Value | |----------|-------| | **Model** | Mac mini M1, 2020 (Macmini9,1) | +| **CPU / RAM** | 8 cores / 16 GB | | **Storage** | 2TB internal SSD | | **macOS** | 15.7.3 (Sequoia) | | **Tailscale hostname** | `indri.tail8d86e.ts.net` | @@ -30,9 +32,13 @@ Primary BlumeOps server. Mac Mini M1 (2020). 
- [[borgmatic]] - Backup system - [[alloy|Alloy]] - Metrics/logs collector - [[caddy]] - Reverse proxy for `*.ops.eblu.me` +- [[devpi]] - PyPI mirror (LaunchAgent) +- [[hephaestus]] - heph task/context sync hub (LaunchAgent, self-updating) +- [[cv]] - Static CV site, served by Caddy +- [[docs]] - Quartz-built docs site, served by Caddy **Kubernetes (via minikube):** -- [[apps|Most k8s applications]] (Frigate, ntfy migrated to [[ringtail]] k3s) +- [[apps|Most k8s applications]]. A growing set of apps (Authentik, Frigate, ntfy, Immich, Homepage, Shower, Kingfisher, alloy-ringtail) now run on [[ringtail]]'s k3s instead. Long-term plan is to decommission indri's minikube entirely. **GUI Applications (manual start required):** - Docker Desktop - Container runtime for minikube diff --git a/docs/reference/infrastructure/ringtail.md b/docs/reference/infrastructure/ringtail.md index 8b93d4d..a4e6837 100644 --- a/docs/reference/infrastructure/ringtail.md +++ b/docs/reference/infrastructure/ringtail.md @@ -25,6 +25,19 @@ Service host and gaming PC. Custom-built PC running NixOS. | **OS** | NixOS 25.11 (Sway/Wayland) | | **Tailscale hostname** | `ringtail.tail8d86e.ts.net` | +## Networking + +| Property | Value | +|----------|-------| +| **Interface (wired)** | `enp5s0` | +| **IP** | `192.168.1.21/24` (static, set by NixOS scripted networking) | +| **Gateway** | `192.168.1.1` (UX7) | +| **DNS** | `192.168.1.1`, `1.1.1.1` (used as Tailscale's upstream resolvers; `/etc/resolv.conf` is owned by Tailscale's MagicDNS at `100.100.100.100`) | +| **DHCP reservation** | UniFi "Fixed IP" tied to ringtail's MAC; belt-and-suspenders so the UX7 won't lease `192.168.1.21` to anyone else even though ringtail no longer asks for it | +| **Wireless** | `wlp6s0` still managed by NetworkManager as a fallback path | + +NetworkManager is enabled but explicitly excluded from managing `enp5s0` via `networking.networkmanager.unmanaged = [ "interface-name:enp5s0" ]`. 
The wired address is configured by a deterministic `network-addresses-enp5s0.service` oneshot — no daemon, no lease, no renewal. + ## Software Managed declaratively via `nixos/ringtail/configuration.nix`. Home-manager handles ringtail-specific sway/waybar config; chezmoi manages cross-platform dotfiles. diff --git a/docs/reference/infrastructure/tailscale.md b/docs/reference/infrastructure/tailscale.md index 2794111..9c15d83 100644 --- a/docs/reference/infrastructure/tailscale.md +++ b/docs/reference/infrastructure/tailscale.md @@ -33,7 +33,7 @@ ACLs managed via Pulumi in `pulumi/tailscale/policy.hujson`. | `tag:loki` | indri | Loki log aggregation | | `tag:k8s-api` | indri | Kubernetes API server (minikube) | | `tag:k8s-operator` | (operator pod) | Tailscale operator for k8s — see [[tailscale-operator]] | -| `tag:k8s` | (Ingress proxy pods) | Kubernetes Tailscale Ingress nodes; each also carries a per-service tag (`tag:grafana`, `tag:kiwix`, `tag:devpi`, `tag:feed`, `tag:pg`) | +| `tag:k8s` | (Ingress proxy pods) | Kubernetes Tailscale Ingress nodes; each also carries a per-service tag (`tag:grafana`, `tag:kiwix`, `tag:feed`, `tag:pg`) | | `tag:ci-gateway` | (ephemeral CI containers) | CI containers pushing images to registry | | `tag:flyio-proxy` | (Fly.io proxy container) | Public reverse proxy | | `tag:flyio-target` | indri, designated Ingress endpoints | Endpoints reachable by the Fly.io proxy (indri for Caddy routing, Ingress pods for Alloy metrics/logs) | diff --git a/docs/reference/kubernetes/apps.md b/docs/reference/kubernetes/apps.md index 80ea72e..fd5c06f 100644 --- a/docs/reference/kubernetes/apps.md +++ b/docs/reference/kubernetes/apps.md @@ -41,6 +41,7 @@ Registry of all applications deployed via [[argocd]]. 
| `ollama` | ollama | `argocd/manifests/ollama/` | [[ollama]] | | `mealie` | mealie | `argocd/manifests/mealie/` | [[mealie]] | | `paperless` | paperless | `argocd/manifests/paperless/` | [[paperless]] | +| `shower` | shower | `argocd/manifests/shower/` | [[shower-app]] | | `prowler` | prowler | `argocd/manifests/prowler/` | [[prowler]] | ## Sync Policies diff --git a/docs/reference/kubernetes/cluster.md b/docs/reference/kubernetes/cluster.md index 9b632bd..07c14af 100644 --- a/docs/reference/kubernetes/cluster.md +++ b/docs/reference/kubernetes/cluster.md @@ -1,6 +1,7 @@ --- title: Cluster -modified: 2026-02-19 +modified: 2026-06-04 +last-reviewed: 2026-06-04 tags: - kubernetes --- @@ -15,7 +16,7 @@ BlumeOps runs two Kubernetes clusters: a Minikube cluster on [[indri]] (most ser |----------|-------| | **Driver** | docker | | **Container Runtime** | docker | -| **Kubernetes Version** | v1.34.0 | +| **Kubernetes Version** | v1.35.0 | | **CPUs** | 6 | | **Memory** | 11GB | | **Disk** | 200GB | @@ -41,7 +42,9 @@ Single-node k3s cluster for workloads requiring amd64 or GPU access. See [[ringt |----------|-------| | **Context** | `k3s-ringtail` | | **API Server** | `https://ringtail.tail8d86e.ts.net:6443` | -| **Workloads** | Frigate (GPU), ntfy, frigate-notify, nvidia-device-plugin | +| **Workloads** | GPU workloads (Frigate, Ollama), notifications (ntfy, frigate-notify), [[authentik]], and services migrated off indri minikube (Immich, Mealie, Paperless, TeslaMate). See [[ringtail]] for the authoritative list. | + +Services are being progressively migrated from indri's minikube to ringtail's k3s; the split above reflects an in-progress state, not a fixed boundary. ## Related diff --git a/docs/reference/operations/security.md b/docs/reference/operations/security.md index 18561a5..11c4df9 100644 --- a/docs/reference/operations/security.md +++ b/docs/reference/operations/security.md @@ -46,13 +46,7 @@ Security posture and compliance scanning for BlumeOps infrastructure. 
All compliance scan reports are stored on `sifaka:/volume1/reports/`. See [[read-compliance-reports]] for access and interpretation. -## Compensating controls - -Suppressed findings reference named compensating controls tracked in `compensating-controls.yaml` (repo root). Each control has a review date and verification steps. See [[review-compensating-controls]] for the review process. - -```bash -mise run review-compensating-controls -``` +Suppressed findings are kept in Prowler mutelist YAML under `argocd/manifests/prowler/mutelist/`. Each entry's `Description` field explains why the finding is muted; entries are reviewed ad-hoc rather than on a scheduled cadence. ## Known gaps diff --git a/docs/reference/services/1password.md b/docs/reference/services/1password.md index 4489194..5ad50da 100644 --- a/docs/reference/services/1password.md +++ b/docs/reference/services/1password.md @@ -1,6 +1,7 @@ --- title: 1Password -modified: 2026-02-10 +modified: 2026-05-22 +last-reviewed: 2026-05-22 tags: - service - secrets @@ -8,15 +9,22 @@ tags: # 1Password -Root credential store for all BlumeOps secrets, synced to Kubernetes via External Secrets Operator. +Root credential store for all BlumeOps secrets. Kubernetes workloads read items via [[external-secrets|External Secrets Operator]]; humans and agents read via the `op` CLI. -## Architecture +## Vaults + +| Vault | Purpose | +|-------|---------| +| `blumeops` | Infrastructure secrets — referenced by ExternalSecret manifests and scripts. | +| `Personal` | Human login credentials keyed by URL for autofill. Not consumed by infrastructure. 
| + +## Kubernetes Integration ``` 1Password Cloud | v -1Password Connect (namespace: 1password) +1Password Connect (namespace: 1password, deployed on both indri and ringtail) | v External Secrets Operator (namespace: external-secrets) @@ -25,15 +33,15 @@ External Secrets Operator (namespace: external-secrets) Native Kubernetes Secrets ``` -## Vault +**ClusterSecretStore:** `onepassword-blumeops` (same name on both clusters). -The `blumeops` vault contains all infrastructure credentials. +Services reference 1Password items via `ExternalSecret` manifests. Both `minikube-indri` and `k3s-ringtail` run their own `onepassword-connect` deployment talking to the same vault. -## Kubernetes Integration +## Direct Access -**ClusterSecretStore:** `onepassword-blumeops` +Prefer `op read "op://vault/item/field"` over `op item get --fields` in scripts and IaC — `op item get --fields` wraps multi-line values in quotes, corrupting them. `op item get` without flags is fine for exploring item metadata. -Services reference 1Password items via `ExternalSecret` manifests. +If an item name contains special characters (e.g. parentheses), use the item ID instead of the name in the `op://` path. 
## Disaster Recovery Backup @@ -41,8 +49,9 @@ The `mise run op-backup` task encrypts a `.1pux` vault export and transfers it t ## Related -- [[argocd]] - Uses secrets for git access -- [[postgresql]] - Database credentials -- [[run-1password-backup]] - Periodic backup procedure -- [[restore-1password-backup]] - Recovery from backup -- [[borgmatic]] - Backup system +- [[external-secrets]] — Kubernetes operator that consumes ClusterSecretStore +- [[argocd]] — Uses secrets for git access +- [[postgresql]] — Database credentials +- [[run-1password-backup]] — Periodic backup procedure +- [[restore-1password-backup]] — Recovery from backup +- [[borgmatic]] — Backup system diff --git a/docs/reference/services/alloy.md b/docs/reference/services/alloy.md index d781f2f..97d1e77 100644 --- a/docs/reference/services/alloy.md +++ b/docs/reference/services/alloy.md @@ -1,6 +1,7 @@ --- title: Alloy -modified: 2026-03-13 +modified: 2026-06-04 +last-reviewed: 2026-06-04 tags: - service - observability @@ -20,10 +21,10 @@ Unified observability collector for metrics and logs with three deployments: | **Indri Binary** | `~/.local/bin/alloy` | | **Indri Config** | `~/.config/grafana-alloy/config.alloy` | | **K8s Namespace** | `alloy` | -| **K8s Image** | `grafana/alloy:v1.14.0` | +| **K8s Image** | `registry.ops.eblu.me/blumeops/alloy:v1.16.0-9564435` (locally built) | | **ArgoCD App** | `alloy-k8s` | | **Fly.io Config** | `fly/alloy.river` | -| **Fly.io Image** | `grafana/alloy:v1.5.1` (binary copied into nginx container) | +| **Fly.io Image** | `grafana/alloy:v1.16.1` (binary copied into nginx container, sha-pinned) | ## Metrics Collected diff --git a/docs/reference/services/borgmatic.md b/docs/reference/services/borgmatic.md index fea4551..37f1a60 100644 --- a/docs/reference/services/borgmatic.md +++ b/docs/reference/services/borgmatic.md @@ -25,7 +25,7 @@ Daily backup system using Borg backup, running on indri. 
## What Gets Backed Up **Directories:** -- `~/code/personal/zk` - Zettelkasten +- `~/code/personal/zk` - Zettelkasten (migrating into heph docs; see [hephaestus](https://github.com/eblume/hephaestus)) - `/opt/homebrew/var/forgejo` - Git forge data - `~/.config/borgmatic` - Borgmatic config - `~/Documents` - Personal documents diff --git a/docs/reference/services/cv.md b/docs/reference/services/cv.md index 55805d6..1bc5f15 100644 --- a/docs/reference/services/cv.md +++ b/docs/reference/services/cv.md @@ -1,7 +1,7 @@ --- title: CV -modified: 2026-03-27 -last-reviewed: 2026-03-27 +modified: 2026-04-29 +last-reviewed: 2026-04-29 tags: - service - resume @@ -15,37 +15,36 @@ Personal resume/CV served as a static HTML page with PDF download, built from YA | Property | Value | |----------|-------| -| **URL** | `cv.eblu.me` (public, via [[flyio-proxy]]) | -| **Namespace** | `cv` | -| **Container** | `registry.ops.eblu.me/blumeops/cv` ([kustomization](https://forge.eblu.me/eblume/blumeops/src/branch/main/argocd/manifests/cv/kustomization.yaml)) | +| **Public URL** | `cv.eblu.me` (via [[flyio-proxy]]) | +| **Private URL** | `cv.ops.eblu.me` (Caddy on indri) | +| **Deployment** | Ansible role `cv` on indri (no daemon — Caddy serves files directly) | +| **Content dir** | `~/blumeops/cv/content/` on indri | | **Source repo** | `forge.eblu.me/eblume/cv` (private, not mirrored to GitHub) | | **Content packages** | `forge.eblu.me/eblume/-/packages` (generic package `cv`) | -| **ArgoCD App** | `cv` | + +Migrated from minikube to indri-native on 2026-04-29 (see [[cv-on-indri]]). ## Architecture 1. **Source**: `resume.yaml` (content) + `template.html` (Jinja2) + `style.css` in the cv repo 2. **Build**: `render.py` (uv script runner) generates `index.html`; WeasyPrint generates `resume.pdf` 3. **Release**: Dagger `build` function packages `index.html`, `style.css`, `resume.pdf` into a tarball, uploaded to Forgejo generic packages -4. 
**Deploy**: nginx container downloads the tarball at startup via `CV_RELEASE_URL` env var +4. **Deploy**: ansible role downloads the tarball into `~/blumeops/cv/content/` on indri; Caddy serves the directory directly ## Endpoints | Path | Description | |------|-------------| | `/` | Resume HTML page | -| `/resume.pdf` | PDF download (Content-Disposition: attachment) | -| `/healthz` | Health check (200 OK) | +| `/resume.pdf` | PDF download (Caddy adds `Content-Disposition: attachment`) | ## Configuration **Key files (blumeops):** -- `containers/cv/Dockerfile` — nginx:alpine container -- `containers/cv/start.sh` — tarball download + extraction -- `containers/cv/default.conf` — nginx config (gzip, caching, PDF headers) -- `argocd/manifests/cv/deployment.yaml` — `CV_RELEASE_URL` env var -- `argocd/apps/cv.yaml` — ArgoCD Application +- `ansible/roles/cv/defaults/main.yml` — pinned `cv_version` and tarball URL +- `ansible/roles/cv/tasks/main.yml` — sentinel-gated download + extract +- `ansible/roles/caddy/defaults/main.yml` — `cv` service entry (`kind: static`, `download_paths` for the PDF) **Key files (cv repo):** @@ -56,17 +55,15 @@ Personal resume/CV served as a static HTML page with PDF download, built from YA - `src/cv_ci/main.py` — Dagger pipeline (alpine + uv + WeasyPrint) - `.forgejo/workflows/cv-release.yaml` — Release workflow -## Secrets +## Release flow -| Secret | Repo | Source | Description | -|--------|------|--------|-------------| -| `FORGE_TOKEN` | cv | 1Password (via Ansible) | Forgejo API token for package uploads | - -Provisioned via `forgejo_actions_secrets` Ansible role. See [[create-release-artifact-workflow]]. +1. Release a new package from the cv repo (`Release CV` workflow) +2. Run the blumeops `Deploy CV` workflow → bumps `cv_version` in the ansible role and pushes +3. Run `mise run provision-indri -- --tags cv` from gilbert +4. 
Purge the Fly.io proxy cache so the new content is fetched ## Related -- [[docs]] — Similar architecture (nginx container + content tarball) +- [[cv-on-indri]] — Operations how-to +- [[docs]] — Similar architecture (Caddy serving a tarball-extracted dir) - [[flyio-proxy]] — Exposes `cv.eblu.me` publicly via Tailscale tunnel -- [[create-release-artifact-workflow]] — How to set up release artifact workflows -- [[deploy-k8s-service]] — General k8s deployment guide diff --git a/docs/reference/services/devpi.md b/docs/reference/services/devpi.md index c6493fe..589a802 100644 --- a/docs/reference/services/devpi.md +++ b/docs/reference/services/devpi.md @@ -1,7 +1,7 @@ --- title: Devpi -modified: 2026-03-23 -last-reviewed: 2026-03-23 +modified: 2026-04-29 +last-reviewed: 2026-04-29 tags: - service - python @@ -9,31 +9,37 @@ tags: # devpi (PyPI Proxy) -PyPI caching proxy and private package index. +PyPI caching proxy and private package index. Runs natively on [[indri]] as a LaunchAgent (not in-cluster). See [[devpi-on-indri]] for deploy and operations. 
## Quick Reference | Property | Value | |----------|-------| -| **URL** | https://pypi.ops.eblu.me | -| **Namespace** | `devpi` | -| **ArgoCD App** | `devpi` | -| **Storage** | 50Gi PVC | -| **Image** | `registry.ops.eblu.me/blumeops/devpi` (see `argocd/manifests/devpi/kustomization.yaml` for current tag) | +| **URL** | `https://pypi.ops.eblu.me` | +| **Listen** | `127.0.0.1:3141` (loopback only; reached via Caddy) | +| **Service** | LaunchAgent `mcquack.eblume.devpi` on indri | +| **Server-dir** | `/Users/erichblume/devpi/server-dir/` | +| **Runtime** | uv-managed venv at `/Users/erichblume/devpi/venv/` | +| **Ansible role** | `ansible/roles/devpi/` | +| **Versions** | Pinned in `ansible/roles/devpi/defaults/main.yml` (`devpi_server_version`, `devpi_web_version`) | ## Indices | Index | Purpose | |-------|---------| -| `root/pypi` | PyPI mirror/cache (auto-created) | -| `eblume/dev` | Private packages (inherits from root/pypi) | +| `root/pypi` | PyPI mirror/cache (auto-created by `devpi-init`) | +| `eblume/dev` | Private packages (inherits from `root/pypi`) | ## Credentials -Root password stored in 1Password (blumeops vault), injected via ExternalSecret. +Root password stored in 1Password (`blumeops` vault, item `devpi`, field `root password`). Fetched via `op read` in the `ansible/playbooks/indri.yml` `pre_tasks` and passed to the role on first init. + +## Backup + +The server-dir is **not** backed up. The PyPI cache (`+files/`) is re-fetchable from upstream on first request. The local `eblume/dev` index metadata is small but also not critical to retain — packages can be republished from source. If retention becomes important, add `/Users/erichblume/devpi/server-dir/` to `borgmatic_source_directories`. 
## Related -- [[use-pypi-proxy]] - Client configuration and package uploads -- [[argocd]] - Deployment -- [[1password]] - Secrets management +- [[devpi-on-indri]] — Deploy, verify, and version-bump procedures +- [[use-pypi-proxy]] — Client configuration and package uploads +- [[1password]] — Secrets management diff --git a/docs/reference/services/docs.md b/docs/reference/services/docs.md index 1361d02..8ca8310 100644 --- a/docs/reference/services/docs.md +++ b/docs/reference/services/docs.md @@ -1,7 +1,7 @@ --- title: Docs -modified: 2026-03-23 -last-reviewed: 2026-03-23 +modified: 2026-04-29 +last-reviewed: 2026-04-29 tags: - service - documentation @@ -9,44 +9,42 @@ tags: # Docs (Quartz) -Documentation site built with [Quartz](https://quartz.jzhao.xyz/) and served via nginx. +Documentation site built with [Quartz](https://quartz.jzhao.xyz/). ## Quick Reference | Property | Value | |----------|-------| -| **Public URL** | https://docs.eblu.me | -| **Private URL** | `docs.ops.eblu.me` (tailnet only, via [[caddy]]) | -| **Namespace** | `docs` | -| **Image** | `registry.ops.eblu.me/blumeops/quartz` (see `argocd/manifests/docs/kustomization.yaml` for current tag) | +| **Public URL** | https://docs.eblu.me (via [[flyio-proxy]]) | +| **Private URL** | `docs.ops.eblu.me` (Caddy on indri) | +| **Deployment** | Ansible role `docs` on indri (no daemon — Caddy serves files directly) | +| **Content dir** | `~/blumeops/docs/content/` on indri | | **Source** | `docs/` directory in blumeops repo | | **Build** | Forgejo workflow `build-blumeops.yaml` | -| **Public proxy** | [[flyio-proxy]] (Fly.io → Tailscale tunnel) | + +Migrated from minikube to indri-native on 2026-04-29 (see [[docs-on-indri]]). ## Architecture 1. **Source**: Markdown files in `docs/` with Obsidian-compatible wiki-links -2. **Build**: Forgejo workflow builds Quartz static site on push to main -3. **Release**: Built assets published as Forgejo release attachments -4. 
**Deploy**: Container downloads release bundle on startup, serves via nginx - -## Release Process - -Documentation is built and released via the `build-blumeops` Forgejo workflow (manual dispatch): - -1. Quartz builds static HTML/CSS/JS -2. Assets uploaded as Forgejo release attachment -3. Workflow updates `DOCS_RELEASE_URL` in `argocd/manifests/docs/deployment.yaml` and commits to main -4. ArgoCD syncs the updated deployment; new pod downloads the release bundle at startup +2. **Build**: `Build BlumeOps` Forgejo workflow runs towncrier + Quartz, uploads tarball as a release asset, and bumps `docs_version` in the ansible role +3. **Deploy**: ansible role downloads the tarball into `~/blumeops/docs/content/` on indri; Caddy serves the directory directly with Quartz-style `try_files` (path → path/ → path.html → 404.html) ## Configuration - **Quartz config**: `quartz.config.ts` - **Layout**: `quartz.layout.ts` -- **ArgoCD app**: `argocd/apps/docs.yaml` -- **Manifests**: `argocd/manifests/docs/` +- **Ansible role**: `ansible/roles/docs/` +- **Caddy entry**: `ansible/roles/caddy/defaults/main.yml` (`kind: static`, `try_html: true`) + +## Release flow + +1. Run the `Build BlumeOps` workflow → builds tarball, creates release, bumps `docs_version` in the ansible role and pushes +2. Run `mise run provision-indri -- --tags docs` from gilbert +3. 
Purge the Fly.io proxy cache so the new content is fetched ## Related -- [[argocd]] - Deployment management -- [[forgejo]] - Build workflows +- [[docs-on-indri]] — Operations how-to +- [[cv]] — Similar architecture +- [[forgejo]] — Build workflows diff --git a/docs/reference/services/forgejo-runner.md b/docs/reference/services/forgejo-runner.md index d61f378..612f20f 100644 --- a/docs/reference/services/forgejo-runner.md +++ b/docs/reference/services/forgejo-runner.md @@ -1,7 +1,7 @@ --- title: Forgejo Runner -modified: 2026-03-30 -last-reviewed: 2026-03-30 +modified: 2026-04-20 +last-reviewed: 2026-04-20 tags: - service - ci-cd @@ -22,21 +22,21 @@ Forgejo Actions runner daemon for CI/CD job execution. Runs as a Kubernetes pod | **Capacity** | 2 concurrent jobs | | **Timeout** | 3h | | **Forgejo Instance** | https://forge.ops.eblu.me | -| **Image** | `code.forgejo.org/forgejo/runner` (see `argocd/manifests/forgejo-runner/kustomization.yaml` for current tag) | +| **Image** | `registry.ops.eblu.me/blumeops/forgejo-runner` (see `argocd/manifests/forgejo-runner/kustomization.yaml` for current tag) | | **DinD Sidecar** | `docker:27-dind` | ## Architecture The pod runs two containers: -1. **runner** - The Forgejo runner daemon. Registers with the forge on first start, then polls for jobs. Talks to DinD via `tcp://localhost:2375`. +1. **runner** - The Forgejo runner daemon. Loads a rendered `server.connections` config at startup, then polls for jobs. Talks to DinD via `tcp://localhost:2375`. 2. **dind** - Docker-in-Docker sidecar (privileged). Provides the Docker daemon for job container execution. Uses a registry mirror at `host.minikube.internal:5050` ([[zot]]). -Runner state (`/data/.runner`) is stored in an `emptyDir` volume, so re-registration happens on pod restart. The registration token comes from 1Password via [[external-secrets]]. +The runner daemon image is built from `containers/forgejo-runner/container.py`, not pulled directly from upstream. 
Credentials come from 1Password via [[external-secrets]], and the startup script renders the final config before launching the daemon. The `/data` volume remains for the runner home directory and job scratch space, not for `.runner` registration state. ## Job Execution Image -The actual container image used to run workflow steps is set via `RUNNER_LABELS` in the deployment, not in the runner config. This image is tracked separately as `runner-job-image` in `service-versions.yaml`. See [[build-container-image]] for how it's built. +The actual container image used to run workflow steps is declared in `server.connections.labels` in the runner config. This image is tracked separately as `runner-job-image` in `service-versions.yaml`. See [[build-container-image]] for how it's built. ## Network @@ -46,7 +46,8 @@ Jobs run with `network: "host"` to share the DinD network namespace. This gives | Secret | Source | Purpose | |--------|--------|---------| -| `RUNNER_TOKEN` | 1Password ("Forgejo Secrets" → `runner_reg`) | Runner registration with forge | +| `FORGEJO_RUNNER_UUID` | 1Password ("Forgejo Secrets" → `runner_k8s_uuid`) | Static runner identity for `server.connections` | +| `FORGEJO_RUNNER_TOKEN` | 1Password ("Forgejo Secrets" → `runner_k8s_token`) | Static runner credential for `server.connections` | ## Related diff --git a/docs/reference/services/forgejo.md b/docs/reference/services/forgejo.md index 11bb9a5..5b16b0e 100644 --- a/docs/reference/services/forgejo.md +++ b/docs/reference/services/forgejo.md @@ -85,6 +85,7 @@ Both container workflows trigger on the same tag pattern (`*-v[0-9]*`). 
Each che Server configuration secrets managed via 1Password → Ansible: - `lfs-jwt-secret`, `internal-token`, `oauth2-jwt-secret` - Forgejo server tokens - `runner_reg` - Runner registration token (also in k8s via [[external-secrets]]) +- `runner_k8s_uuid`, `runner_k8s_token` - Static credentials for the k8s runner `server.connections` flow ## Forgejo Actions Secrets diff --git a/docs/reference/services/hephaestus.md b/docs/reference/services/hephaestus.md new file mode 100644 index 0000000..7abc35b --- /dev/null +++ b/docs/reference/services/hephaestus.md @@ -0,0 +1,141 @@ +--- +title: Hephaestus +modified: 2026-06-04 +last-reviewed: 2026-06-04 +tags: + - service + - hephaestus +--- + +# Hephaestus + +[hephaestus](https://github.com/eblume/hephaestus) (`heph`) is the user's +self-hosted task + context/knowledge system. It is **hub-and-spoke**: each device +runs a full local SQLite replica (`hephd --mode local`) and background-syncs +against one canonical **hub**. Indri runs that hub. + +## Quick Reference + +| Property | Value | +|----------|-------| +| **PWA URL** | https://heph.ops.eblu.me (browser PWA, Caddy TLS) | +| **Spoke sync URL** | http://indri.tail8d86e.ts.net:8787 (direct, tailnet) | +| **Local Port** | 8787 (`hephd --mode server`, bound `0.0.0.0`) | +| **Binary** | `~/.cargo/bin/hephd` (self-updating) | +| **Data** | `~/.local/share/heph/heph.db` | +| **PWA shell** | `~/.local/share/heph/web` | +| **Logs** | `~/Library/Logs/mcquack.heph.{out,err}.log` | +| **LaunchAgent** | `mcquack.eblume.heph` | +| **Ansible role** | `ansible/roles/heph` (tag `heph`) | + +## What runs on indri + +The launchagent runs the hub in server mode with three features enabled: + +``` +hephd --mode server --http-addr 0.0.0.0:8787 --db ~/.local/share/heph/heph.db + --web-root ~/.local/share/heph/web + --oidc-issuer https://authentik.ops.eblu.me/application/o/heph/ + --oidc-audience heph + --self-update --self-update-interval-secs 600 +``` + +- **Server mode** exposes the HTTP 
sync endpoint (`/rpc`, `/sync/*`) that spokes + reconcile their op-log against. +- **Self-update** (10-minute poll) rebuilds `hephd` from the forge when a newer + release tag appears (`cargo install --git https://forge.eblu.me/eblume/hephaestus.git`). + Indri's Rust toolchain (`~/.cargo/bin`) is on the agent's `PATH` for this, and + the plist pins `RUSTUP_TOOLCHAIN=stable` — the + launchagent runs without mise, so a bare `cargo` shim would otherwise fall back + to rustup's *default* toolchain, which can lag behind heph's `rust-version` floor + (1.89) and silently fail the build. +- **PWA** (`--web-root`) serves the [heph-pwa] mobile shell; Caddy terminates TLS + at `heph.ops.eblu.me` so the PWA runs in a secure context (service worker, + install-to-home-screen, voice capture). + +[heph-pwa]: https://github.com/eblume/hephaestus + +The hub binds `0.0.0.0` so tailnet spokes can also sync directly +(`http://indri.tail8d86e.ts.net:8787`); access is gated by Authentik OIDC either +way — tailnet reachability alone is not enough. + +## Authentication (Authentik OIDC, device-code) + +The hub verifies an OIDC bearer token on every sync. The `heph` application is a +**public** OAuth2 client using the **device-code flow** (RFC 8628), provisioned +in the [[authentik]] blueprint (`argocd/manifests/authentik/configmap-blueprint.yaml`): + +- Issuer: `https://authentik.ops.eblu.me/application/o/heph/` +- Audience / client id: `heph` +- Restricted to the `admins` group (single-owner, sensitive data). +- Scope mappings: `openid`, `email`, `profile`, **`offline_access`**. + +> **`offline_access` is required for durable sync.** The `heph` CLI requests +> `scope = "openid offline_access"`, and a refresh token is only issued for the +> 30-day refresh-token window when the provider actually grants `offline_access`. 
+> Without that scope mapping the refresh token is bound to the login **session**; +> once the session lapses, hephd's `refresh_token` grant returns `400 Bad +> Request`, the bearer can't be refreshed, and spoke sync silently degrades +> (`heph sync --status` → `auth_failure: true`). `heph auth login` papers over it +> until the next session expiry. Keep `offline_access` in the provider's +> `property_mappings`. + +Because no Authentik instance ships a device-code flow by default, the blueprint +also creates `default-device-code-flow` and binds it to the default brand's +`flow_device_code`. Devices obtain a token with `heph auth login`; the PWA +currently takes a pasted token (in-app device-code login is upstream follow-up). + +## Data seeding (Path A, one-time) + +The hub was seeded from the existing `gilbert` device so no task history was +lost. heph's data-safe bring-up ("Path A") has the hub **adopt the device's +identity** rather than rewriting the device: + +1. Quiesce the seed device: `heph daemon stop` (on gilbert). +2. Copy its store to indri: `scp ~/.local/share/heph/heph.db indri:~/.local/share/heph/heph.db`. +3. Give the hub its **own device origin** (keeps gilbert's `owner_id` + data; + `hephd` regenerates a fresh `origin` on next start when it is missing): + ```fish + ssh indri "sqlite3 ~/.local/share/heph/heph.db \"DELETE FROM meta WHERE key='origin';\"" + ``` +4. `mise run provision-indri -- --tags heph` (installs hephd, stages the PWA, + loads the launchagent → hub starts on the seeded store). + +Only `meta.origin` changes; `owner_id`, nodes, op-log, and links are copied +untouched. A clean `hephd --owner-id` / seed command is tracked upstream as +hephaestus follow-up — until then this manual reset is the documented path. + +## Connecting a spoke (e.g. 
gilbert) + +A device joins by running its local daemon with the hub URL + OIDC client and +logging in once: + +```bash +hephd --mode local --hub-url http://indri.tail8d86e.ts.net:8787 \ + --oidc-issuer https://authentik.ops.eblu.me/application/o/heph/ \ + --oidc-client-id heph +heph auth login --hub-url http://indri.tail8d86e.ts.net:8787 \ + --issuer https://authentik.ops.eblu.me/application/o/heph/ --client-id heph +``` + +> **Use the direct `http://…:8787` tailnet URL for sync, not the Caddy HTTPS +> URL.** hephd's sync client is plain-HTTP-only; pointing `--hub-url` at +> `https://heph.ops.eblu.me` fails with a confusing `error sending request` +> (the HTTP connector rejects the `https` scheme before connecting). Tailscale +> encrypts the transport, and the OIDC bearer token still gates every request. +> `heph.ops.eblu.me` (Caddy TLS) exists only for the browser PWA, which needs a +> secure context. The cached token is keyed by the exact `--hub-url`, so use the +> same value for `hephd` and `heph auth login`. + +> **Caveat:** `heph daemon` cannot yet bake hub/spoke flags into the generated +> launchd plist (upstream gap). On a spoke whose plist is managed by `heph +> daemon`, the hub/OIDC flags must be hand-added — and a later `heph daemon +> start/restart` will regenerate the plist and drop them. Avoid `heph daemon` +> subcommands on a configured spoke until that gap is closed; reload via +> `launchctl` instead. + +## Related + +- [[indri]] — host +- [[authentik]] — OIDC provider +- [[caddy]] — TLS termination for `heph.ops.eblu.me` diff --git a/docs/reference/services/kiwix.md b/docs/reference/services/kiwix.md index 6806a5e..04fe0f6 100644 --- a/docs/reference/services/kiwix.md +++ b/docs/reference/services/kiwix.md @@ -1,6 +1,7 @@ --- title: Kiwix -modified: 2026-03-05 +modified: 2026-05-04 +last-reviewed: 2026-05-04 tags: - service - knowledge @@ -41,7 +42,7 @@ Full list: `argocd/manifests/kiwix/torrents.txt` ## Adding Archives -1. 
Edit `configmap-zim-torrents.yaml` +1. Edit `argocd/manifests/kiwix/torrents.txt` (rendered into a ConfigMap by `configMapGenerator`) 2. Add torrent URL from https://download.kiwix.org/zim/ 3. Sync: `argocd app sync kiwix` 4. Torrent-sync adds to [[transmission]] diff --git a/docs/reference/services/mealie.md b/docs/reference/services/mealie.md index c658046..fdd0260 100644 --- a/docs/reference/services/mealie.md +++ b/docs/reference/services/mealie.md @@ -46,6 +46,8 @@ OIDC via [[authentik]] using a confidential client. Client secret stored in 1Pas SQLite database backed up via [[borgmatic]]'s `before_backup` hook. Borgmatic runs `kubectl exec` to create a safe `.backup` copy (via Python's `sqlite3` module), then `kubectl cp` to the host. The dump lands in `~/.local/share/borgmatic/k8s-dumps/mealie.db` and is included in both local (sifaka) and offsite (BorgBase) backups. +To restore from a borg archive, see [[restore-from-borg]]. + ## Networking | Endpoint | Reachable from | diff --git a/docs/reference/services/ntfy.md b/docs/reference/services/ntfy.md index b549a6d..1bf45af 100644 --- a/docs/reference/services/ntfy.md +++ b/docs/reference/services/ntfy.md @@ -1,6 +1,7 @@ --- title: Ntfy -modified: 2026-02-17 +modified: 2026-06-04 +last-reviewed: 2026-06-04 tags: - service - notifications @@ -17,7 +18,7 @@ Self-hosted push notification service. 
Ntfy receives HTTP POST messages and deli | **URL** | https://ntfy.ops.eblu.me | | **Tailscale URL** | https://ntfy.tail8d86e.ts.net | | **Namespace** | `ntfy` | -| **Image** | `binwiederhier/ntfy:v2.17.0` | +| **Image** | `registry.ops.eblu.me/blumeops/ntfy:v2.19.2-fd0bebb-nix` (locally built) | | **Upstream** | https://github.com/binwiederhier/ntfy | | **Manifests** | `argocd/manifests/ntfy/` | diff --git a/docs/reference/services/ollama.md b/docs/reference/services/ollama.md index 75480cb..b749cf2 100644 --- a/docs/reference/services/ollama.md +++ b/docs/reference/services/ollama.md @@ -1,6 +1,7 @@ --- title: Ollama -modified: 2026-03-04 +modified: 2026-05-01 +last-reviewed: 2026-05-01 tags: - service - ai @@ -18,7 +19,7 @@ LLM inference server with GPU acceleration. Runs on [[ringtail]] with declarativ | **Tailscale URL** | https://ollama.tail8d86e.ts.net | | **Namespace** | `ollama` | | **Cluster** | ringtail k3s | -| **Image** | `ollama/ollama:0.17.5` | +| **Image** | `ollama/ollama:0.20.4` | | **Upstream** | https://github.com/ollama/ollama | | **Manifests** | `argocd/manifests/ollama/` | | **API Port** | 11434 | @@ -50,6 +51,8 @@ Declared in `argocd/manifests/ollama/models.txt`. The model-sync sidecar pulls m | `deepseek-r1:14b` | 14B | | `phi4:14b` | 14B | | `gemma3:12b` | 12B | +| `qwen3.5:9b` | 9B | +| `qwen3.5:27b` | 27B | To add or remove models, edit `models.txt` and sync via ArgoCD. diff --git a/docs/reference/services/shower-app.md b/docs/reference/services/shower-app.md new file mode 100644 index 0000000..26d1764 --- /dev/null +++ b/docs/reference/services/shower-app.md @@ -0,0 +1,55 @@ +--- +title: Shower App +modified: 2026-05-10 +last-reviewed: 2026-05-10 +tags: + - service + - django +--- + +# Shower App + +Django web app for Adelaide / Heidi / Addie's baby shower — guest splash with +a "what did you bring?" 
form, raffle picker, contest-prize ranking via +QR-coded `/prizes//` URLs, and an `/host/` operator console with +drag-rank assignment solving via scipy. + +## Quick Reference + +| Property | Value | +|----------|-------| +| **Public URL** | `shower.eblu.me` (guest surface only — via [[flyio-proxy]]) | +| **Private URL** | `shower.ops.eblu.me` (admin + `/host/` console — Caddy on indri) | +| **Cluster** | [[ringtail]] k3s, namespace `shower` | +| **Container** | `registry.ops.eblu.me/blumeops/shower` (built from `containers/shower/default.nix`) | +| **App source** | `forge.eblu.me/eblume/adelaide-baby-shower-app` (wheel on Forgejo PyPI) | +| **Database** | SQLite on a local-path PVC (`shower-data`, RWO 2 Gi) | +| **Media (prize photos)** | NFS RWX PVC `shower-media` → `sifaka:/volume1/shower` | +| **Secrets** | `Shower (blumeops)` 1Password item → `DJANGO_SECRET_KEY` | + +## Routing + +``` +Internet → shower.eblu.me (Fly nginx, guest-only 403s on /admin/ /host/) + │ + ▼ + Caddy on indri (shower.ops.eblu.me — full surface) + │ + ▼ + Tailscale ProxyGroup → k3s Service → Deployment +``` + +## Backups + +- **SQLite** dumped via `kubectl exec` to indri's `borgmatic_k8s_dump_dir` on every 2 a.m. run (mealie-pattern entry in `borgmatic_k8s_sqlite_dumps`) +- **Media** picked up via `/Volumes/shower` (sifaka SMB mount on indri) in the main `borgmatic_source_directories` list + +Both archive to sifaka + BorgBase. 
+ +## Related + +- [[shower-on-ringtail]] — onboarding + day-of runbook +- [[expose-service-publicly]] — Fly proxy + tailnet pattern this rides on +- [[ringtail]] — host cluster +- [[sifaka#NFS Exports]] — NFS share table +- [[borgmatic]] — backup system diff --git a/docs/reference/services/tempo.md b/docs/reference/services/tempo.md index 771b97f..5eb5d87 100644 --- a/docs/reference/services/tempo.md +++ b/docs/reference/services/tempo.md @@ -1,6 +1,7 @@ --- title: Tempo -modified: 2026-03-05 +modified: 2026-06-04 +last-reviewed: 2026-06-04 tags: - service - observability @@ -18,7 +19,7 @@ Distributed tracing backend for BlumeOps infrastructure. Receives traces via OTL | **Tailscale URL** | https://tempo.tail8d86e.ts.net | | **OTLP Endpoint** | https://tempo-otlp.tail8d86e.ts.net | | **Namespace** | `monitoring` | -| **Image** | `grafana/tempo:2.10.1` | +| **Image** | `registry.ops.eblu.me/blumeops/tempo:v2.10.3-75f9ba4` (locally built) | | **Storage** | 10Gi PVC (local filesystem) | | **Retention** | 7 days | diff --git a/docs/reference/services/transmission.md b/docs/reference/services/transmission.md index 3676177..89904ce 100644 --- a/docs/reference/services/transmission.md +++ b/docs/reference/services/transmission.md @@ -1,6 +1,7 @@ --- title: Transmission -modified: 2026-02-07 +modified: 2026-04-29 +last-reviewed: 2026-04-29 tags: - service - torrent @@ -22,14 +23,13 @@ BitTorrent daemon, primarily for downloading ZIM archives for [[kiwix]]. 
## Storage Layout -NFS share on sifaka (`/volume1/torrents`): +| Path | Backing | Purpose | +|------|---------|---------| +| `/downloads/incomplete/` | NFS (`sifaka:/volume1/torrents`) | Active downloads | +| `/downloads/complete/` | NFS (`sifaka:/volume1/torrents`) | Completed downloads | +| `/config/` | `emptyDir` (ephemeral) | Transmission `settings.json`, regenerated on pod start | -| Path | Purpose | -|------|---------| -| `/downloads/` | Active downloads and metadata | -| `/downloads/complete/` | Completed downloads | -| `/config/` | Transmission configuration | -| `/watch/` | Watch directory for .torrent files | +The watch directory is disabled (`watch-dir-enabled: false`); torrents are added via RPC (see Kiwix integration below). [[kiwix]] reads from `/downloads/complete/` to serve ZIM archives. @@ -44,7 +44,7 @@ When downloads complete, the zim-watcher CronJob detects new ZIMs and restarts K ## Monitoring -Basic uptime via blackbox probe in [[alloy|Alloy]] k8s (Services Health dashboard). +A `transmission-exporter` sidecar (image `registry.ops.eblu.me/blumeops/transmission-exporter`) scrapes the local RPC and exposes Prometheus metrics on port 19091. Uptime is also covered by a blackbox probe in [[alloy|Alloy]] k8s (Services Health dashboard). Web UI shows: active/seeding/paused counts, speeds, disk usage. diff --git a/docs/reference/services/zot.md b/docs/reference/services/zot.md index d00a200..b01a6ce 100644 --- a/docs/reference/services/zot.md +++ b/docs/reference/services/zot.md @@ -56,8 +56,9 @@ The `zot-ci` API key expires every **90 days**. To rotate: 5. Generate a new API key, copy it to clipboard 6. Update 1Password: ```fish - pbpaste | op item edit "Forgejo Secrets" --vault blumeops "zot-ci-api[password]=-" + set -l NEWKEY (pbpaste); op item edit "Forgejo Secrets" --vault blumeops "zot-ci-api[password]=$NEWKEY"; set -e NEWKEY ``` + The value is briefly visible to other `ps`-readers on this machine (single-user mac, acceptable tradeoff). 
The older `pbpaste | op item edit ... "field[password]=-"` stdin syntax was rejected by op 2.34 as "invalid JSON" — recent op versions treat piped input as a full JSON template. 7. Sync to Forgejo: `mise run provision-indri -- --tags forgejo_actions_secrets` ## Related diff --git a/docs/reference/storage/backups.md b/docs/reference/storage/backups.md index 9ca3bcb..2dfbae4 100644 --- a/docs/reference/storage/backups.md +++ b/docs/reference/storage/backups.md @@ -22,7 +22,7 @@ Daily automated backups from [[indri]] to [[sifaka|Sifaka]] NAS. | Path | Description | Priority | |------|-------------|----------| -| `~/code/personal/zk` | Zettelkasten notes | Critical | +| `~/code/personal/zk` | Zettelkasten notes (migrating into heph docs) | Critical | | `/opt/homebrew/var/forgejo` | Git repositories | Critical | | `~/.config/borgmatic` | Backup config | High | | `~/Documents` | Personal documents (includes [[1password]] encrypted export) | High | @@ -62,7 +62,7 @@ Other data lives directly on [[sifaka]] (music via [[navidrome]], video via [[je | ZIM archives (`~/transmission/`) | Re-downloadable via torrent | | Prometheus metrics | Ephemeral, in k8s PVC | | Loki logs | Ephemeral, in k8s PVC | -| devpi cache | Re-fetchable from PyPI | +| devpi cache (`~/devpi/server-dir/` on indri) | Re-fetchable from PyPI on first request | ## Retention Policy diff --git a/docs/reference/tools/argocd-cli.md b/docs/reference/tools/argocd-cli.md index 7a60490..a2aa223 100644 --- a/docs/reference/tools/argocd-cli.md +++ b/docs/reference/tools/argocd-cli.md @@ -24,6 +24,14 @@ argocd app sync apps # Sync the app-of-apps (picks up new Application ## Login +Default (Authentik SSO, PKCE, opens browser): + +```bash +argocd login argocd.ops.eblu.me --sso +``` + +Break-glass admin login (only if Authentik is down): + ```bash argocd login argocd.ops.eblu.me \ --username admin \ diff --git a/docs/reference/tools/dagger.md b/docs/reference/tools/dagger.md index 379c10f..81c5caf 100644 --- 
a/docs/reference/tools/dagger.md +++ b/docs/reference/tools/dagger.md @@ -16,7 +16,7 @@ Build engine for BlumeOps CI/CD pipelines. Replaces shell-based build scripts wi | Property | Value | |----------|-------| | **Module** | `blumeops` | -| **Engine Version** | v0.20.1 | +| **Engine Version** | v0.20.6 | | **SDK** | Python | | **Source** | `src/blumeops/main.py` | | **Config** | `dagger.json` (source: `.`) | @@ -50,16 +50,16 @@ New containers for indri (k8s runner) should use `container.py`. Ringtail contai ```bash # Build a container -dagger call build --src=. --container-name=devpi +dagger call build --src=. --container-name=miniflux # Drop into container shell for inspection -dagger call build --src=. --container-name=devpi terminal +dagger call build --src=. --container-name=miniflux terminal # Debug a failure interactively -dagger call --interactive build --src=. --container-name=devpi +dagger call --interactive build --src=. --container-name=miniflux # Publish a container to zot -dagger call publish --src=. --container-name=devpi --version=v1.1.0 +dagger call publish --src=. --container-name=miniflux --version=v1.1.0 # Build a nix container (no local nix required) dagger call build-nix --src=. --container-name=ntfy export --path=./ntfy.tar.gz diff --git a/docs/reference/tools/mise-tasks.md b/docs/reference/tools/mise-tasks.md index fefb30f..b614cb1 100644 --- a/docs/reference/tools/mise-tasks.md +++ b/docs/reference/tools/mise-tasks.md @@ -39,6 +39,7 @@ Run `mise tasks --sort name` for the live list with descriptions. 
| `fly-shutoff` | Emergency shutoff: stop all Fly.io proxy machines | | `dns-preview` | Preview DNS changes with [[pulumi]] | | `dns-up` | Apply DNS changes with [[pulumi]] | +| `dns-acme-cleanup` | Delete orphaned `_acme-challenge.ops` TXT records (libdns/gandi v1.1.0 workaround) | | `tailnet-preview` | Preview Tailscale ACL changes with [[pulumi]] | | `tailnet-up` | Apply Tailscale ACL changes with [[pulumi]] | @@ -68,7 +69,6 @@ Run `mise tasks --sort name` for the live list with descriptions. |------|-------------| | `services-check` | Check all services are online and responding | | `service-review` | Review the most stale service for version freshness | -| `blumeops-tasks` | List tasks from Todoist sorted by priority | | `op-backup` | Encrypt 1Password export and send to indri for borgmatic | ## Infrastructure Setup diff --git a/docs/reference/tools/pulumi.md b/docs/reference/tools/pulumi.md index bdc7e8f..a716bb9 100644 --- a/docs/reference/tools/pulumi.md +++ b/docs/reference/tools/pulumi.md @@ -49,7 +49,8 @@ mise run tailnet-up # Apply ACL/tag changes ## Related -- [[gandi-operations]] — DNS PAT rotation and Pulumi workflow +- [[manage-eblu-me-dns]] — DNS records workflow +- [[rotate-gandi-pat]] — Rotate the Gandi PAT - [[update-tailscale-acls]] — ACL editing and Pulumi workflow - [[gandi]] — DNS hosting - [[tailscale]] — Tailnet configuration diff --git a/docs/tutorials/ai-assistance-guide.md b/docs/tutorials/ai-assistance-guide.md index 3ee1ffa..4f0c595 100644 --- a/docs/tutorials/ai-assistance-guide.md +++ b/docs/tutorials/ai-assistance-guide.md @@ -98,7 +98,6 @@ BlumeOps operations are driven by mise tasks. 
Run `mise tasks` to list all avail | `provision-indri` | Deploy changes to [[indri]]-hosted services via Ansible | | `services-check` | After deployments - verify all services are healthy | | `pr-comments` | Check unresolved PR comments during review | -| `blumeops-tasks` | Find pending tasks from Todoist | | `container-list` | View available container images and tags | | `container-build-and-release` | Trigger container build workflows | | `dns-preview` | Preview DNS changes before applying | @@ -111,6 +110,8 @@ BlumeOps operations are driven by mise tasks. Run `mise tasks` to list all avail | `docs-review` | Review the most stale doc by last-reviewed date | | `runner-logs` | View Forgejo workflow logs (indri or ringtail runner) | +For task discovery, BlumeOps tasks live in [hephaestus](https://github.com/eblume/hephaestus) (`heph`), not Todoist. List outstanding work with `heph list --project Blumeops --json`. + For ArgoCD operations, use the `argocd` CLI directly: - `argocd app diff ` - Preview changes - `argocd app sync ` - Deploy changes diff --git a/docs/tutorials/contributing.md b/docs/tutorials/contributing.md index cddafea..0d48e8f 100644 --- a/docs/tutorials/contributing.md +++ b/docs/tutorials/contributing.md @@ -1,6 +1,7 @@ --- title: Contributing -modified: 2026-02-07 +modified: 2026-04-21 +last-reviewed: 2026-04-21 tags: - tutorials - contributing @@ -10,7 +11,7 @@ tags: > **Audiences:** Contributor -This tutorial walks through making your first contribution to BluemeOps - from understanding the codebase to submitting a pull request. +This tutorial walks through making your first contribution to BlumeOps - from understanding the codebase to submitting a pull request. ## Prerequisites @@ -37,14 +38,14 @@ brew bundle # installs tea, argocd, mise, etc. ### Using Mise (Optional) -Mise manages language toolchains and runs tasks: +Mise manages language toolchains, runs tasks, and pins tools like `prek`: ```bash -mise install # installs Python, Node.js, etc. 
from mise.toml +mise install # installs Python, Node.js, prek, etc. from mise.toml ``` ### Git Hooks (prek) -Git hooks validate changes on `git commit`: +Git hooks validate changes on `git commit` (prek is pinned in `mise.toml`): ```bash prek install prek run --all-files # verify setup @@ -104,6 +105,7 @@ Fragment types (file suffix): - `.bugfix.md` - Bug fixes - `.infra.md` - Infrastructure changes - `.doc.md` - Documentation +- `.ai.md` - AI-assisted changes - `.misc.md` - Other ### 4. Test Your Changes diff --git a/docs/tutorials/expose-service-publicly.md b/docs/tutorials/expose-service-publicly.md index 6bc8fae..65af611 100644 --- a/docs/tutorials/expose-service-publicly.md +++ b/docs/tutorials/expose-service-publicly.md @@ -176,17 +176,39 @@ Indri carries `tag:flyio-target` so the Fly proxy can reach Caddy. No per-servic Deploy: `mise run tailnet-preview` then `mise run tailnet-up`. -After deploying, extract the auth key and set it as a Fly.io secret: +After deploying, push the auth key to Fly.io. The simplest path is +`mise run fly-setup`, which reads the current value from Pulumi state +and stages it as a Fly.io secret: ```bash -# Get the key from Pulumi state -cd pulumi/tailscale && pulumi stack output flyio_authkey --show-secrets - -# Set it in Fly.io -fly secrets set TS_AUTHKEY="tskey-auth-..." -a blumeops-proxy +mise run fly-setup ``` -Store the auth key in 1Password as well for the `fly-setup` mise task. +Manual equivalent for reference: + +```bash +cd pulumi/tailscale && pulumi stack output flyio_authkey --show-secrets +# then in fly/: +fly secrets set TS_AUTHKEY="tskey-auth-..." -a blumeops-proxy --stage +``` + +**Pulumi state is the only source of truth for this key.** No other +process (mise tasks, ansible, scripts) reads it from anywhere else — +in particular, the key is not stored in 1Password. 
To rotate +(every 90 days, or after a compromise), force-replace the resource +and re-run `fly-setup`: + +```bash +mise run tailnet-up -- \ + --replace='urn:pulumi:tail8d86e::blumeops-tailnet::tailscale:index/tailnetKey:TailnetKey::flyio-proxy-key' +mise run fly-setup +mise run fly-deploy +``` + +Pulumi destroys the old key and mints a new 90-day one in a single +operation. Older fly machines that already authed against the old key +are unaffected (they don't need it after the initial join); only +*new* machine starts read the rotated value. ### Step 4: Mise tasks @@ -354,6 +376,13 @@ Mitigations for dynamic services: - fail2ban on indri (see below) can block IPs showing abuse patterns - The break-glass shutoff remains the last resort +The most acute version of this in practice has been **AI scrapers**, which +ignore `robots.txt` and crawl dynamic services (notably [[forgejo|Forgejo]]'s +infinite git-history URL space) into both a surprise egress bill and an +effective L7 DoS. See [[ai-scraper-mitigation]] for the incident, the tiered +defense (mirror black-hole, user-agent denylist, Anubis proof-of-work), and +why a Cloudflare Tunnel is *not* the chosen answer here. + If a publicly exposed dynamic service attracts targeted attacks or the home network bandwidth is impacted, consider migrating to Cloudflare Tunnel for enterprise-grade DDoS protection (requires DNS migration; diff --git a/docs/tutorials/replicating-blumeops.md b/docs/tutorials/replicating-blumeops.md index f2ed8ca..e54ecb2 100644 --- a/docs/tutorials/replicating-blumeops.md +++ b/docs/tutorials/replicating-blumeops.md @@ -1,6 +1,7 @@ --- title: Replicating BlumeOps -modified: 2026-02-07 +modified: 2026-05-11 +last-reviewed: 2026-05-11 tags: - tutorials - replication @@ -10,7 +11,7 @@ tags: > **Audiences:** Replicator -This tutorial provides a roadmap for building your own homelab GitOps environment inspired by BluemeOps. It links to detailed component tutorials for each major piece. 
+This tutorial provides a roadmap for building your own homelab GitOps environment inspired by BlumeOps. It links to detailed component tutorials for each major piece. ## What You'll Build @@ -23,7 +24,7 @@ By following this guide, you'll have: ## Hardware Requirements -BluemeOps runs on modest hardware. At minimum: +BlumeOps runs on modest hardware. At minimum: | Component | BlumeOps Uses | Minimum Alternative | |-----------|---------------|---------------------| @@ -94,7 +95,7 @@ Without observability, you're flying blind. ### Phase 6: Your First Services -With the foundation in place, deploy actual workloads. BluemeOps runs: +With the foundation in place, deploy actual workloads. BlumeOps runs: - [[miniflux]] - RSS reader - [[jellyfin]] - Media server - [[immich]] - Photo management @@ -118,7 +119,7 @@ Protect your data. ## Alternative Approaches -BluemeOps makes specific choices that may not suit everyone: +BlumeOps makes specific choices that may not suit everyone: | BlumeOps Choice | Alternative | |-----------------|-------------| diff --git a/fly/Dockerfile b/fly/Dockerfile index 8a6df31..406c849 100644 --- a/fly/Dockerfile +++ b/fly/Dockerfile @@ -1,9 +1,10 @@ -FROM nginx:1.29.6-alpine +# nginx 1.30.1-alpine +FROM nginx@sha256:c819f83c54b0361f5557601bf5eb4943d09360e7a7fdf426afc466570f45874d -# Copy tailscale binaries from official image -COPY --from=docker.io/tailscale/tailscale:v1.94.1 \ +# Copy tailscale binaries from official image (v1.94.2) +COPY --from=docker.io/tailscale/tailscale@sha256:95e528798bebe75f39b10e74e7051cf51188ee615934f232ba7ad06a3390ffa1 \ /usr/local/bin/tailscaled /usr/local/bin/tailscaled -COPY --from=docker.io/tailscale/tailscale:v1.94.1 \ +COPY --from=docker.io/tailscale/tailscale@sha256:95e528798bebe75f39b10e74e7051cf51188ee615934f232ba7ad06a3390ffa1 \ /usr/local/bin/tailscale /usr/local/bin/tailscale RUN mkdir -p /var/run/tailscale /var/lib/tailscale \ @@ -12,8 +13,8 @@ RUN mkdir -p /var/run/tailscale /var/lib/tailscale \ && apk 
add --no-cache fail2ban \ && rm -f /etc/fail2ban/jail.d/alpine-ssh.conf -# Copy Alloy binary from official image (Ubuntu-based, needs libc6-compat) -COPY --from=docker.io/grafana/alloy:v1.14.1 \ +# Copy Alloy binary from official image (v1.16.1, Ubuntu-based, needs libc6-compat) +COPY --from=docker.io/grafana/alloy@sha256:51aeb9d829239345070619dad3edd6873186f913c84f45b365b74574fcb38ec0 \ /bin/alloy /usr/local/bin/alloy RUN mkdir -p /var/log/nginx /etc/alloy /tmp/alloy-data @@ -24,6 +25,7 @@ COPY fail2ban/action.d/nginx-deny.conf /etc/fail2ban/action.d/nginx-deny.conf COPY nginx.conf /etc/nginx/nginx.conf COPY error.html /usr/share/nginx/html/error.html +COPY naughty.html /usr/share/nginx/html/naughty.html COPY alloy.river /etc/alloy/config.alloy COPY start.sh /start.sh RUN chmod +x /start.sh diff --git a/fly/fail2ban/action.d/nginx-deny.conf b/fly/fail2ban/action.d/nginx-deny.conf index 1d3737b..bab8abb 100644 --- a/fly/fail2ban/action.d/nginx-deny.conf +++ b/fly/fail2ban/action.d/nginx-deny.conf @@ -2,13 +2,22 @@ # Standard iptables banning won't work in Fly.io because $remote_addr # is Fly's internal proxy IP. Instead, we write banned IPs to a file # that nginx checks via a geo directive keyed on $http_fly_client_ip. +# +# The deny file is per-service: each jail sets `nginx_deny_file = ...` +# (see jail.d/*.conf) and a matching `geo $http_fly_client_ip $..._banned` +# block in nginx.conf includes the same path. [Definition] -actionban = echo "<ip> 1;" >> /etc/nginx/forge-deny.conf && nginx -s reload +actionban = echo "<ip> 1;" >> <nginx_deny_file> && nginx -s reload -actionunban = sed -i '/<ip> 1;/d' /etc/nginx/forge-deny.conf && nginx -s reload +actionunban = sed -i '/<ip> 1;/d' <nginx_deny_file> && nginx -s reload actionstart = actionstop = actioncheck = + +[Init] + +# Default for jails that don't override (preserves forge behaviour). 
+nginx_deny_file = /etc/nginx/forge-deny.conf diff --git a/fly/fly.toml b/fly/fly.toml index 11aac9c..6ccf29d 100644 --- a/fly/fly.toml +++ b/fly/fly.toml @@ -7,7 +7,7 @@ primary_region = "sjc" memory = "512mb" [deploy] -strategy = "bluegreen" +strategy = "immediate" [http_service] internal_port = 8080 diff --git a/fly/naughty.html b/fly/naughty.html new file mode 100644 index 0000000..b6eada8 --- /dev/null +++ b/fly/naughty.html @@ -0,0 +1,61 @@ + + + + + + + 403 · Roll of Dishonour + + + +
+

🪤 403 — you walked into the scraper trap

+

These are mirror repositories. They are tailnet-only.

+ +

+ This path used to serve the web UI for mirrors of public upstream + projects. It exists for supply-chain control, not for crawling. A + robots.txt politely disallowed /mirrors/. + A pack of AI scrapers ignored it, walked the infinite git-history URL + space, and ran up ~1.25 TB of egress and a real + money bill in a single month — while timing out the server for everyone + else. +

+ +

So /mirrors/ is closed at the edge now. Roll of dishonour, + by share of the bytes they stole:

+ + + + + + + + + +
Operator | User-Agent
Meta | meta-externalagent
OpenAI | GPTBot
Amazon | Amazonbot
ByteDance | Bytespider
+ +

+ If you are a human who actually wanted these mirrors, they are reachable + from the tailnet at forge.ops.eblu.me. If you are a crawler: + read the robots.txt next time. We left you a header, too. +

+
+ + diff --git a/fly/nginx.conf b/fly/nginx.conf index 5e49d88..ec35774 100644 --- a/fly/nginx.conf +++ b/fly/nginx.conf @@ -34,6 +34,15 @@ http { # bucket. $http_fly_client_ip has the actual client IP. limit_req_zone $http_fly_client_ip zone=forge_auth:10m rate=3r/s; + # Shower-specific zone: loose enough that ~30 guests sharing a single + # venue-wifi NAT'd public IP can all scan the QR and load the splash + # (HTML + a handful of static asset hits each) without anyone tripping + # the limit. 50r/s + burst=200 covers the simultaneous-load spike; + # exploit scanners still trip it (e.g. the .env-sweeping bot we saw + # fired ~30 req in 2s — that pattern stays caught). See the + # shower.eblu.me server block for the matching `limit_req`. + limit_req_zone $http_fly_client_ip zone=shower_general:10m rate=50r/s; + # fail2ban deny list — banned IPs are written here by fail2ban and # checked via the $forge_banned variable. The file is touched at # container start to ensure it exists. @@ -184,11 +193,55 @@ http { return 200 "User-agent: *\nDisallow: /mirrors/\nDisallow: /user/\nDisallow: /users/\nDisallow: /*/archive/\nDisallow: /*/releases/download/\n"; } + # Block the package registry at the public edge. Forgejo's per-user + # visibility model treats packages as world-readable when the owner + # has Visibility=Public — which means anyone on the internet can + # enumerate and download every wheel/sdist/generic artifact, even + # for private-repo releases (the sdist contains full source). We + # like keeping eblume's profile public, so we close the hole here + # at the proxy instead: WAN sees 403, tailnet (forge.ops.eblu.me) + # stays open for legitimate consumers (CI workflows, gilbert). + # See docs/tutorials/expose-service-publicly.md for the broader + # threat model on this proxy. 
+ location /api/packages/ { + return 403 "Package downloads are tailnet-only — use forge.ops.eblu.me.\n"; + } + location /api/v1/packages { + return 403 "Package enumeration is tailnet-only — use forge.ops.eblu.me.\n"; + } + # Block swagger API docs — use forge.ops.eblu.me from tailnet location /swagger { return 403 "API documentation is only available at forge.ops.eblu.me (tailnet).\n"; } + # Black-hole the mirror repositories on WAN. These are mirrors of + # already-public upstreams (tailscale, prometheus, mealie, …) kept + # for supply-chain control; CI, gilbert, and tailnet clients consume + # them via forge.ops.eblu.me. Their web UI served no public purpose + # but AI scrapers, which crawled the near-infinite git-history URL + # space (src/commit, commits, blame, raw) and drove ~70% of Fly + # egress (1.24 TB/30d → a surprise bill) plus enough upstream load to + # time out Forgejo. robots.txt already Disallows /mirrors/, but + # meta-externalagent and GPTBot ignore it — so enforce at the edge. + # `^~` makes this win over the regex locations below (e.g. *.css), so + # static assets under /mirrors/ can't leak through. We also name and + # shame: blocked requests get a "roll of dishonour" page (403 status + # preserved) and an X-Naughty-Scrapers header. See + # docs/explanation/ai-scraper-mitigation.md. + location ^~ /mirrors/ { + error_page 403 /naughty.html; + return 403; + } + + # Roll of dishonour — served on the /mirrors/ 403, status kept at 403. + location = /naughty.html { + internal; + root /usr/share/nginx/html; + add_header X-Naughty-Scrapers "OpenAI/GPTBot, Meta/meta-externalagent, Amazonbot, ByteDance/Bytespider — robots.txt ignorers" always; + add_header X-Clacks-Overhead "GNU Terry Pratchett" always; + } + # Redirect archive endpoints to tailnet — archive requests generate full # git bundles on demand. Unauthenticated crawlers hitting unique commit # SHAs cause unbounded CPU and disk usage (DoS vector). 
Legitimate users @@ -288,6 +341,140 @@ http { } } + # --- shower.eblu.me (Adelaide baby shower — guest-only public surface) --- + # Only the guest paths (`/`, `/prizes//`, /static/, /media/) are + # exposed on WAN. /host/, /admin/, and Django's login views are blocked + # at the edge with a 403 pointing at the tailnet hostname — staff sign + # in on shower.ops.eblu.me, which is reachable from any device with + # Tailscale installed. Defense layers reduce to: general per-IP rate + # limit + django-axes (5 fails / 1h) on the tailnet-side login. No + # fail2ban needed here because the public surface no longer takes + # credentials of any kind. + server { + listen 8080; + server_name shower.eblu.me; + + # Per-IP rate limit. shower_general (50r/s, burst=200) instead of + # the global `general` zone because at the party, guests on the + # venue's wifi all NAT through a single Fly-Client-IP — 30 guests + # scanning the QR at once would each fetch HTML + a few static + # assets, easily clearing 20 burst on `general`. Exploit scanners + # still trip it (sustained ≫ 50r/s patterns). + limit_req zone=shower_general burst=200 nodelay; + + # Image uploads from /host/'s prize cropper are ~150-300 KiB JPEGs. + # The host page itself isn't reachable here, but /media/ reads can + # be larger than 1 MiB so set the cap to 5 MiB to match Django. + client_max_body_size 5m; + + # Security headers — HSTS matches Django's SECURE_HSTS_SECONDS. + add_header X-Frame-Options "DENY" always; + add_header X-Content-Type-Options "nosniff" always; + add_header Strict-Transport-Security "max-age=31536000; includeSubDomains" always; + add_header Referrer-Policy "same-origin" always; + # GNU Terry Pratchett — keep the name moving. + add_header X-Clacks-Overhead "GNU Terry Pratchett" always; + + error_page 502 503 504 /error.html; + location = /error.html { + root /usr/share/nginx/html; + internal; + } + + # Reject indexers — there's nothing here we want crawled. 
+ location = /robots.txt { + default_type text/plain; + return 200 "User-agent: *\nDisallow: /\n"; + } + + # Admin surface: tailnet-only. Anything under /admin/ — login, + # logout, CRUD UI, password reset — returns 403 with a pointer to + # the tailnet host. Django's `staff_member_required` will redirect + # /host/ to /admin/login/, which lands on this 403 if a guest + # device wanders into it. Staff hit the tailnet host directly. + location /admin/ { + return 403 "Authentication is tailnet-only — visit shower.ops.eblu.me.\n"; + } + + # Operator console: tailnet-only. Same rationale as /admin/. + location /host/ { + return 403 "The host console is tailnet-only — visit shower.ops.eblu.me.\n"; + } + + # Static assets — WhiteNoise + CompressedManifestStaticFilesStorage + # gives content-hashed filenames, so cache aggressively. Hashed + # names make cache invalidation automatic on app upgrades. + location /static/ { + proxy_pass https://indri_backend$request_uri; + proxy_ssl_verify off; + proxy_ssl_server_name on; + proxy_ssl_name shower.ops.eblu.me; + + proxy_http_version 1.1; + proxy_set_header Connection $connection_upgrade; + proxy_set_header Host shower.ops.eblu.me; + proxy_set_header X-Real-IP $http_fly_client_ip; + proxy_set_header X-Forwarded-For $http_fly_client_ip; + proxy_set_header X-Forwarded-Proto $scheme; + + proxy_cache services; + proxy_cache_valid 200 1y; + proxy_cache_valid 404 1m; + proxy_cache_use_stale error timeout updating; + proxy_cache_lock on; + proxy_cache_key $host$uri; + proxy_ignore_headers Cache-Control Set-Cookie; + + add_header X-Cache-Status $upstream_cache_status; + } + + # Prize photo uploads. Shorter TTL than /static/ because filenames + # aren't content-hashed — operators can re-upload a prize photo + # and we want guests to see the new image within a day. 
+ location /media/ { + proxy_pass https://indri_backend$request_uri; + proxy_ssl_verify off; + proxy_ssl_server_name on; + proxy_ssl_name shower.ops.eblu.me; + + proxy_http_version 1.1; + proxy_set_header Connection $connection_upgrade; + proxy_set_header Host shower.ops.eblu.me; + proxy_set_header X-Real-IP $http_fly_client_ip; + proxy_set_header X-Forwarded-For $http_fly_client_ip; + proxy_set_header X-Forwarded-Proto $scheme; + + proxy_cache services; + proxy_cache_valid 200 1d; + proxy_cache_valid 404 1m; + proxy_cache_use_stale error timeout updating; + proxy_cache_lock on; + proxy_cache_key $host$uri; + proxy_ignore_headers Cache-Control Set-Cookie; + + add_header X-Cache-Status $upstream_cache_status; + } + + location / { + proxy_pass https://indri_backend$request_uri; + proxy_ssl_verify off; + proxy_ssl_server_name on; + proxy_ssl_name shower.ops.eblu.me; + proxy_intercept_errors on; + + # No proxy_cache — dynamic content with sessions and CSRF. + + proxy_set_header Host shower.ops.eblu.me; + proxy_set_header X-Real-IP $http_fly_client_ip; + proxy_set_header X-Forwarded-For $http_fly_client_ip; + proxy_set_header X-Forwarded-Proto $scheme; + + proxy_http_version 1.1; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection $connection_upgrade; + } + } + # Catch-all: reject unknown hosts, but serve health check server { listen 8080 default_server; diff --git a/fly/start.sh b/fly/start.sh index 1f2acaa..a924849 100644 --- a/fly/start.sh +++ b/fly/start.sh @@ -20,6 +20,7 @@ done echo "MagicDNS ready" # Ensure fail2ban deny file exists before nginx starts +# (the geo directive's `include` fails if the file is missing). touch /etc/nginx/forge-deny.conf # Start nginx — MagicDNS is available, upstreams resolved. 
diff --git a/mise-tasks/blumeops-tasks b/mise-tasks/blumeops-tasks deleted file mode 100755 index 333178e..0000000 --- a/mise-tasks/blumeops-tasks +++ /dev/null @@ -1,165 +0,0 @@ -#!/usr/bin/env -S uv run --script -# /// script -# requires-python = ">=3.12" -# dependencies = ["httpx>=0.28.1", "rich>=14.0.0"] -# /// -#MISE description="List Blumeops tasks from Todoist sorted by priority" -"""Fetch and display Blumeops tasks from Todoist, sorted by priority. - -This script is specific to Erich Blume's personal development workflow and -is not intended for general use. It requires: - - - A 1Password CLI (`op`) configured with access to the author's vault - - A Todoist account with a project named "Blumeops" - -The script fetches tasks and displays them sorted by a custom priority order: -p1 (urgent), p2 (high), p4 (normal/default), p3 (backlog). The p3-last ordering -reflects a deliberate choice to treat p3 as "backlog" rather than moderate -priority. - -Usage: mise run blumeops-tasks -""" - -import subprocess -import sys -from datetime import date - -import httpx -from rich.console import Console -from rich.markup import escape -from rich.text import Text - -TODOIST_API_BASE = "https://api.todoist.com/api/v1" -PROJECT_NAME = "Blumeops" - -# Priority mapping: Todoist API uses 1=normal(p4), 2=moderate(p3), 3=high(p2), 4=urgent(p1) -# User wants order: p1, p2, p4, p3 (p3 is backlog, goes last) -PRIORITY_LABELS = {4: "p1", 3: "p2", 1: "p4", 2: "p3"} -PRIORITY_SORT_ORDER = {4: 1, 3: 2, 1: 3, 2: 4} # Lower = earlier - - -def get_todoist_token() -> str: - """Retrieve Todoist API token from 1Password.""" - result = subprocess.run( - ["op", "read", "op://vg6xf6vvfmoh5hqjjhlhbeoaie/c53h3xnmswhvexa5mntoyvhgpm/credential"], - capture_output=True, - text=True, - ) - if result.returncode != 0: - raise RuntimeError(f"Failed to get Todoist token from 1Password: {result.stderr}") - return result.stdout.strip() - - -def get_project_id(client: httpx.Client, project_name: str) -> str: - 
"""Find project ID by name.""" - cursor = None - while True: - params = {} - if cursor: - params["cursor"] = cursor - response = client.get(f"{TODOIST_API_BASE}/projects", params=params) - response.raise_for_status() - data = response.json() - for project in data.get("results", data if isinstance(data, list) else []): - if project["name"] == project_name: - return project["id"] - cursor = data.get("next_cursor") if isinstance(data, dict) else None - if not cursor: - break - - raise RuntimeError(f"Project '{project_name}' not found in Todoist") - - -def get_tasks(client: httpx.Client, project_id: str) -> list[dict]: - """Get all tasks for a project.""" - tasks = [] - cursor = None - while True: - params = {"project_id": project_id} - if cursor: - params["cursor"] = cursor - response = client.get(f"{TODOIST_API_BASE}/tasks", params=params) - response.raise_for_status() - data = response.json() - tasks.extend(data.get("results", data if isinstance(data, list) else [])) - cursor = data.get("next_cursor") if isinstance(data, dict) else None - if not cursor: - break - return tasks - - -def is_due(task: dict) -> bool: - """Check if a task should be displayed based on its due date. - - Tasks without a due date are always shown. Tasks with a due date - are only shown when the date is today or in the past. 
- """ - due = task.get("due") - if due is None: - return True - due_date = date.fromisoformat(due["date"][:10]) - return due_date <= date.today() - - -def sort_tasks(tasks: list[dict]) -> list[dict]: - """Sort tasks by custom priority order: p1, p2, p4, p3.""" - return sorted(tasks, key=lambda t: PRIORITY_SORT_ORDER.get(t["priority"], 5)) - - -def main() -> int: - console = Console() - - # Get API token - try: - token = get_todoist_token() - except RuntimeError as e: - console.print(f"[red]Error:[/red] {e}") - return 1 - - # Create HTTP client with auth header - with httpx.Client(headers={"Authorization": f"Bearer {token}"}) as client: - # Find project - try: - project_id = get_project_id(client, PROJECT_NAME) - except RuntimeError as e: - console.print(f"[red]Error:[/red] {e}") - return 1 - - # Get, filter, and sort tasks - tasks = get_tasks(client, project_id) - tasks = [t for t in tasks if is_due(t)] - sorted_tasks = sort_tasks(tasks) - - if not sorted_tasks: - console.print("No tasks found in Blumeops project") - return 0 - - # Display tasks - console.print(f"[bold]Blumeops Tasks[/bold] ({len(sorted_tasks)} tasks)") - console.print("=" * 40) - console.print() - - for task in sorted_tasks: - priority = task["priority"] - label = PRIORITY_LABELS.get(priority, "p?") - content = task["content"] - description = task.get("description", "") - - # Header line with priority and content - header = Text() - header.append(f"[{label}]", style="bold") - header.append(f" {content}") - console.print(header) - - # Description indented (escape rich markup to preserve brackets) - if description: - for line in description.split("\n"): - console.print(f" {escape(line)}", style="dim") - - console.print() - - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/mise-tasks/branch-cleanup b/mise-tasks/branch-cleanup index bd5ac66..a538880 100755 --- a/mise-tasks/branch-cleanup +++ b/mise-tasks/branch-cleanup @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// 
script # requires-python = ">=3.12" -# dependencies = ["httpx>=0.28.1", "rich>=14.0.0", "typer>=0.24.0"] +# dependencies = ["httpx==0.28.1", "rich==15.0.0", "typer==0.26.2"] # /// #MISE description="Delete branches that have been merged into main (local and remote)" #MISE alias="bc" diff --git a/mise-tasks/container-build-and-release b/mise-tasks/container-build-and-release index 2e1be27..85e6cb8 100755 --- a/mise-tasks/container-build-and-release +++ b/mise-tasks/container-build-and-release @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["typer>=0.24.0", "httpx>=0.28.1"] +# dependencies = ["typer==0.26.2", "httpx==0.28.1"] # /// #MISE description="Trigger container build workflows via Forgejo API" #USAGE arg "" help="Container name (directory under containers/)" @@ -15,6 +15,7 @@ Dockerfile and Nix builds in a single workflow. import subprocess import sys +import time from pathlib import Path import httpx @@ -48,6 +49,52 @@ def get_forge_token() -> str: return result.stdout.strip() +def max_run_number(headers: dict[str, str]) -> int: + """Return the highest current run_number for WORKFLOW, or 0 if none.""" + resp = httpx.get( + f"{FORGE_API}/repos/{REPO}/actions/tasks", + params={"limit": 50}, + headers=headers, + timeout=15, + ) + if resp.status_code != 200: + return 0 + runs = [ + t["run_number"] + for t in resp.json().get("workflow_runs", []) + if t.get("workflow_id") == WORKFLOW + ] + return max(runs, default=0) + + +def find_dispatched_run( + ref: str, floor: int, headers: dict[str, str], timeout_s: int = 20 +) -> int | None: + """Poll the tasks endpoint for the run triggered by our dispatch. + + Matches by head_sha + workflow + run_number > floor so we don't pick up + an older build of the same commit or a concurrent unrelated dispatch. 
+ """ + deadline = time.monotonic() + timeout_s + while time.monotonic() < deadline: + resp = httpx.get( + f"{FORGE_API}/repos/{REPO}/actions/tasks", + params={"limit": 20}, + headers=headers, + timeout=15, + ) + if resp.status_code == 200: + for task in resp.json().get("workflow_runs", []): + if ( + task.get("head_sha") == ref + and task.get("workflow_id") == WORKFLOW + and task.get("run_number", 0) > floor + ): + return task["run_number"] + time.sleep(1) + return None + + def list_containers() -> None: typer.echo("Available containers:") for d in sorted(Path("containers").iterdir()): @@ -112,7 +159,8 @@ def main( if dry_run: typer.echo(f"[dry-run] Would dispatch {WORKFLOW}") typer.echo() - typer.echo(f"Monitor builds at: {FORGE_ACTIONS}") + typer.echo("Monitor builds with: mise run runner-logs") + typer.echo(f" or visit: {FORGE_ACTIONS}") return token = get_forge_token() @@ -132,6 +180,10 @@ def main( typer.echo("Push your changes before triggering a build: git push origin main") raise typer.Exit(1) + # Snapshot the highest existing run_number so we can identify the one + # our dispatch creates. 
+ floor = max_run_number(headers) + url = f"{FORGE_API}/repos/{REPO}/actions/workflows/{WORKFLOW}/dispatches" payload = { "ref": "main", @@ -148,7 +200,12 @@ def main( raise typer.Exit(1) typer.echo() - typer.echo(f"Monitor builds at: {FORGE_ACTIONS}") + run_number = find_dispatched_run(ref, floor, headers) + if run_number is not None: + typer.echo(f"Monitor builds with: mise run runner-logs {run_number}") + else: + typer.echo("Monitor builds with: mise run runner-logs") + typer.echo(f" or visit: {FORGE_ACTIONS}") if __name__ == "__main__": diff --git a/mise-tasks/container-list b/mise-tasks/container-list index b1bd433..7dad346 100755 --- a/mise-tasks/container-list +++ b/mise-tasks/container-list @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["httpx>=0.28.1", "rich>=14.0.0", "typer>=0.24.0"] +# dependencies = ["httpx==0.28.1", "rich==15.0.0", "typer==0.26.2"] # /// #MISE description="List available containers and their recent tags" #USAGE arg "[name]" help="Optional container name to filter output" diff --git a/mise-tasks/container-version-check b/mise-tasks/container-version-check index 6270ae1..06f96ae 100755 --- a/mise-tasks/container-version-check +++ b/mise-tasks/container-version-check @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["pyyaml>=6.0.2", "rich>=14.0.0", "typer>=0.24.0"] +# dependencies = ["pyyaml==6.0.3", "rich==15.0.0", "typer==0.26.2"] # /// #MISE description="Validate container version consistency across container.py, Dockerfiles, nix derivations, and service-versions.yaml" #USAGE flag "--all-files" help="Check all containers, not just changed ones" @@ -42,7 +42,6 @@ BLACKLIST = {"kubectl"} # Container dir name → service-versions.yaml name (when they differ) CONTAINER_TO_SERVICE = { - "quartz": "docs", "kiwix-serve": "kiwix", } diff --git a/mise-tasks/dns-acme-cleanup b/mise-tasks/dns-acme-cleanup new file mode 100755 
index 0000000..3a53b11 --- /dev/null +++ b/mise-tasks/dns-acme-cleanup @@ -0,0 +1,112 @@ +#!/usr/bin/env -S uv run --script +# /// script +# requires-python = ">=3.12" +# dependencies = ["httpx==0.28.1", "rich==15.0.0", "typer==0.26.2"] +# /// +#MISE description="Delete orphaned ACME challenge TXT records in eblu.me" +#USAGE flag "--dry-run" help="List orphans without deleting" +"""Clean up orphaned _acme-challenge TXT records in the eblu.me zone. + +Workaround for libdns/gandi v1.1.0: its DeleteRecords compares unquoted +certmagic values to Gandi-quoted stored values, so cleanup is a silent +no-op. Without this script, the rrset grows by ~2 values per successful +Caddy renewal cycle. + +In healthy steady state these records should be absent. Run alongside +PAT rotation, or any time after Caddy ACME activity. +""" + +import os +import subprocess +from typing import Annotated + +import httpx +import typer +from rich.console import Console +from rich.table import Table + +DOMAIN = "eblu.me" +RRSET = "_acme-challenge.ops" +GANDI_API = "https://api.gandi.net/v5/livedns" +OP_PAT_REF = "op://blumeops/gandi - blumeops/pat" + + +def resolve_token(console: Console) -> str: + env_token = os.environ.get("GANDI_PERSONAL_ACCESS_TOKEN", "").strip() + if env_token: + return env_token + console.print("[dim]Reading Gandi PAT from 1Password...[/dim]") + try: + result = subprocess.run( + ["op", "read", OP_PAT_REF], + capture_output=True, + text=True, + check=True, + ) + return result.stdout.strip() + except (subprocess.CalledProcessError, FileNotFoundError) as e: + console.print(f"[red]Failed to read PAT from 1Password:[/red] {e}") + raise typer.Exit(1) + + +app = typer.Typer(add_completion=False) + + +@app.command() +def main( + dry_run: Annotated[ + bool, + typer.Option("--dry-run", help="List orphans without deleting"), + ] = False, +) -> None: + """Delete orphan _acme-challenge TXT records in eblu.me.""" + console = Console() + token = resolve_token(console) + + url = 
f"{GANDI_API}/domains/{DOMAIN}/records/{RRSET}/TXT" + headers = {"Authorization": f"Bearer {token}"} + + with httpx.Client(timeout=15, headers=headers) as client: + resp = client.get(url) + if resp.status_code == 404: + console.print( + f"[green]Clean — {RRSET}.{DOMAIN} TXT rrset is absent.[/green]" + ) + raise typer.Exit(0) + resp.raise_for_status() + values = resp.json().get("rrset_values", []) + + if not values: + console.print( + f"[green]Clean — {RRSET}.{DOMAIN} TXT rrset is empty.[/green]" + ) + raise typer.Exit(0) + + table = Table(title=f"Orphan ACME challenge values: {RRSET}.{DOMAIN}") + table.add_column("#", justify="right") + table.add_column("Value") + for i, v in enumerate(values, 1): + table.add_row(str(i), v) + console.print(table) + console.print(f"\n[bold]{len(values)}[/bold] orphan(s).") + + if dry_run: + console.print("\n[dim]Dry run — no records deleted.[/dim]") + raise typer.Exit(0) + + del_resp = client.delete(url) + if del_resp.status_code == 204: + console.print( + f"[green]Deleted {RRSET}.{DOMAIN} TXT " + f"({len(values)} values).[/green]" + ) + else: + console.print( + f"[red]Delete failed: HTTP {del_resp.status_code}[/red]\n" + f"{del_resp.text[:300]}" + ) + raise typer.Exit(1) + + +if __name__ == "__main__": + app() diff --git a/mise-tasks/docs-check-frontmatter b/mise-tasks/docs-check-frontmatter index 11d1a49..35e1879 100755 --- a/mise-tasks/docs-check-frontmatter +++ b/mise-tasks/docs-check-frontmatter @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["rich>=14.0.0"] +# dependencies = ["rich==15.0.0"] # /// #MISE description="Check that all docs have required frontmatter fields" """Validate that all documentation files have required YAML frontmatter. 
diff --git a/mise-tasks/docs-check-links b/mise-tasks/docs-check-links index 78e871a..9974fc7 100755 --- a/mise-tasks/docs-check-links +++ b/mise-tasks/docs-check-links @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["rich>=14.0.0"] +# dependencies = ["rich==15.0.0"] # /// #MISE description="Validate all wiki-links point to existing doc files" """Validate that all wiki-links in documentation point to existing files. diff --git a/mise-tasks/docs-mikado b/mise-tasks/docs-mikado index 0b37f51..c632e46 100755 --- a/mise-tasks/docs-mikado +++ b/mise-tasks/docs-mikado @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["httpx>=0.28.1", "pyyaml>=6.0.2", "rich>=14.0.0", "typer>=0.24.0"] +# dependencies = ["httpx==0.28.1", "pyyaml==6.0.3", "rich==15.0.0", "typer==0.26.2"] # /// #MISE description="View active Mikado dependency chains for C2 changes" #USAGE arg "[card]" help="Card stem to show chain for" diff --git a/mise-tasks/docs-preview b/mise-tasks/docs-preview index f63b1d1..9e0bd16 100755 --- a/mise-tasks/docs-preview +++ b/mise-tasks/docs-preview @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["pyyaml>=6.0.2", "rich>=14.0.0", "typer>=0.24.0"] +# dependencies = ["pyyaml==6.0.3", "rich==15.0.0", "typer==0.26.2"] # /// #MISE description="Build docs with Dagger and serve locally, opening to a specific card" #USAGE arg "" help="Card path relative to docs/, e.g. 
how-to/knowledgebase/review-documentation" diff --git a/mise-tasks/docs-review b/mise-tasks/docs-review index 49cf4d0..12e301f 100755 --- a/mise-tasks/docs-review +++ b/mise-tasks/docs-review @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["pyyaml>=6.0.2", "rich>=14.0.0", "typer>=0.24.0"] +# dependencies = ["pyyaml==6.0.3", "rich==15.0.0", "typer==0.26.2"] # /// #MISE description="Review the most stale documentation card by last-reviewed date" #USAGE flag "--limit " default="15" help="Number of docs to show in the table" diff --git a/mise-tasks/docs-review-stale b/mise-tasks/docs-review-stale index facbf6b..0c5490e 100755 --- a/mise-tasks/docs-review-stale +++ b/mise-tasks/docs-review-stale @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["rich>=14.0.0", "typer>=0.24.0"] +# dependencies = ["rich==15.0.0", "typer==0.26.2"] # /// #MISE description="Report docs by git-last-modified date, highlighting stale ones" #USAGE flag "--threshold " default="180" help="Days before a doc is considered stale" diff --git a/mise-tasks/docs-review-tags b/mise-tasks/docs-review-tags index 0e7f1d4..869e2f2 100755 --- a/mise-tasks/docs-review-tags +++ b/mise-tasks/docs-review-tags @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["pyyaml>=6.0.2", "rich>=14.0.0"] +# dependencies = ["pyyaml==6.0.3", "rich==15.0.0"] # /// #MISE description="Print frontmatter tag inventory across all docs" """Print every frontmatter tag with usage count and file list. 
diff --git a/mise-tasks/fly-setup b/mise-tasks/fly-setup index 0c5cb56..be797e5 100755 --- a/mise-tasks/fly-setup +++ b/mise-tasks/fly-setup @@ -23,6 +23,7 @@ echo "IPs allocated" fly certs add docs.eblu.me -a "$APP" 2>/dev/null || true fly certs add cv.eblu.me -a "$APP" 2>/dev/null || true fly certs add forge.eblu.me -a "$APP" 2>/dev/null || true +fly certs add shower.eblu.me -a "$APP" 2>/dev/null || true echo "Certificates configured" echo "Done. Run 'mise run fly-deploy' to deploy." diff --git a/mise-tasks/mikado-branch-invariant-check b/mise-tasks/mikado-branch-invariant-check index ca9f79a..3135bf2 100755 --- a/mise-tasks/mikado-branch-invariant-check +++ b/mise-tasks/mikado-branch-invariant-check @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["rich>=14.0.0", "typer>=0.24.0"] +# dependencies = ["rich==15.0.0", "typer==0.26.2"] # /// #MISE description="Validate Mikado Branch Invariant on mikado/* branches" #USAGE arg "[commit_msg_file]" help="Commit message file (passed by commit-msg hook)" diff --git a/mise-tasks/op-backup b/mise-tasks/op-backup index 6ffef14..7db033b 100755 --- a/mise-tasks/op-backup +++ b/mise-tasks/op-backup @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["rich>=14.0.0", "typer>=0.24.0"] +# dependencies = ["rich==15.0.0", "typer==0.26.2"] # /// #MISE description="Encrypt a 1Password .1pux export and send to indri for borgmatic" #USAGE arg "[export_path]" help="Path to .1pux export file (prompted if omitted)" diff --git a/mise-tasks/pr-comments b/mise-tasks/pr-comments index a44a430..39d7c9a 100755 --- a/mise-tasks/pr-comments +++ b/mise-tasks/pr-comments @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["httpx>=0.28.1", "rich>=14.0.0", "typer>=0.24.0"] +# dependencies = ["httpx==0.28.1", "rich==15.0.0", "typer==0.26.2"] # /// #MISE description="List 
unresolved comments on a PR" #USAGE arg "" help="Pull request number" diff --git a/mise-tasks/prune-ringtail-generations b/mise-tasks/prune-ringtail-generations index 8066f8b..2ad8dc8 100755 --- a/mise-tasks/prune-ringtail-generations +++ b/mise-tasks/prune-ringtail-generations @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["rich>=14.0.0", "typer>=0.24.0"] +# dependencies = ["rich==15.0.0", "typer==0.26.2"] # /// #MISE description="Prune old NixOS generations on ringtail, preserving rollback safety" #MISE alias="prg" diff --git a/mise-tasks/review-compensating-controls b/mise-tasks/review-compensating-controls deleted file mode 100755 index 09e2d16..0000000 --- a/mise-tasks/review-compensating-controls +++ /dev/null @@ -1,229 +0,0 @@ -#!/usr/bin/env -S uv run --script -# /// script -# requires-python = ">=3.12" -# dependencies = ["pyyaml>=6.0.2", "rich>=14.0.0", "typer>=0.24.0"] -# /// -#MISE description="Review the most stale compensating control" -#USAGE flag "--limit " default="10" help="Number of controls to show in the table" -"""Review compensating controls by staleness. - -Reads ``compensating-controls.yaml`` and sorts by ``last-reviewed``. -Shows a staleness table, then displays the most stale control with all -references found in the codebase. 
- -After reviewing, update the control entry: - - last-reviewed: YYYY-MM-DD - -Usage: mise run review-compensating-controls [--limit 10] -""" - -import subprocess -import sys -from datetime import date -from pathlib import Path -from typing import Annotated - -import typer -import yaml -from rich.console import Console -from rich.panel import Panel -from rich.table import Table - -CONTROLS_FILE = Path(__file__).parent.parent / "compensating-controls.yaml" -REPO_ROOT = Path(__file__).parent.parent - - -def load_controls(path: Path) -> list[dict]: - data = yaml.safe_load(path.read_text()) - return data.get("controls", []) - - -def parse_date(raw) -> date | None: - if raw is None: - return None - if isinstance(raw, date): - return raw - try: - return date.fromisoformat(str(raw)) - except ValueError: - return None - - -def find_references(control_id: str) -> list[str]: - """Find all files referencing a control ID using ripgrep.""" - try: - result = subprocess.run( - ["rg", "--no-heading", "-n", control_id, str(REPO_ROOT)], - capture_output=True, - text=True, - timeout=10, - ) - lines = result.stdout.strip().splitlines() - # Exclude the controls file itself and this script - return [ - ln - for ln in lines - if "compensating-controls.yaml" not in ln - and "review-compensating-controls" not in ln - ] - except (FileNotFoundError, subprocess.TimeoutExpired): - return [] - - -def main( - limit: Annotated[ - int, typer.Option(help="Number of controls to show in the table") - ] = 10, -) -> None: - console = Console() - today = date.today() - - if not CONTROLS_FILE.exists(): - console.print( - f"[bold red]Controls file not found:[/bold red] {CONTROLS_FILE}" - ) - raise typer.Exit(code=1) - - controls = load_controls(CONTROLS_FILE) - - # Parse dates and build sortable entries - entries: list[tuple[dict, date | None]] = [] - for ctrl in controls: - reviewed = parse_date(ctrl.get("last-reviewed")) - entries.append((ctrl, reviewed)) - - # Sort: never-reviewed first, then oldest - 
entries.sort(key=lambda e: (e[1] is not None, e[1] or date.min)) - - never_reviewed = sum(1 for _, r in entries if r is None) - - # --- Summary panel --- - console.print() - console.print( - Panel( - f"[bold]{len(entries)}[/bold] compensating controls, " - f"[bold red]{never_reviewed}[/bold red] never reviewed", - title="[bold]Compensating Control Review Queue[/bold]", - border_style="cyan", - ) - ) - console.print() - - # --- Staleness table --- - table = Table(show_header=True, header_style="bold") - table.add_column("#", justify="right") - table.add_column("Control ID") - table.add_column("Last Reviewed", justify="right") - table.add_column("Age (days)", justify="right") - table.add_column("Refs", justify="right") - - for i, (ctrl, reviewed) in enumerate(entries[:limit], 1): - control_id = ctrl["id"] - refs = len(find_references(control_id)) - - if reviewed is None: - table.add_row( - str(i), - f"[red]{control_id}[/red]", - "[red]never[/red]", - "[red]—[/red]", - str(refs), - ) - else: - age = (today - reviewed).days - style = "yellow" if age > 90 else "" - id_str = f"[{style}]{control_id}[/{style}]" if style else control_id - date_str = f"[{style}]{reviewed}[/{style}]" if style else str(reviewed) - age_str = f"[{style}]{age}[/{style}]" if style else str(age) - table.add_row(str(i), id_str, date_str, age_str, str(refs)) - - remaining = len(entries) - limit - if remaining > 0: - table.add_row("", f"[dim]… {remaining} more[/dim]", "", "", "") - - console.print(table) - console.print() - - # --- Most stale control detail --- - if not entries: - console.print("[bold red]No controls found![/bold red]") - raise typer.Exit(code=1) - - top_ctrl, top_reviewed = entries[0] - control_id = top_ctrl["id"] - refs = find_references(control_id) - - detail_lines = [ - f"[bold cyan]{control_id}[/bold cyan]", - f"[dim]Last reviewed: {top_reviewed or 'never'}[/dim]", - "", - f"[bold]Description:[/bold] {top_ctrl.get('description', '').strip()}", - ] - notes = top_ctrl.get("notes", 
"").strip() - if notes: - detail_lines.append(f"[bold]Notes:[/bold] {notes}") - - console.print( - Panel( - "\n".join(detail_lines), - title="[bold]Up For Review[/bold]", - border_style="green", - ) - ) - console.print() - - # --- References --- - if refs: - ref_table = Table( - show_header=True, header_style="bold", title="References in codebase" - ) - ref_table.add_column("File", style="cyan") - ref_table.add_column("Line") - - for ref in refs: - # rg output: file:line:content - parts = ref.split(":", 2) - if len(parts) >= 3: - filepath = parts[0].replace(str(REPO_ROOT) + "/", "") - line_no = parts[1] - content = parts[2].strip() - ref_table.add_row(f"{filepath}:{line_no}", content) - else: - ref_table.add_row(ref, "") - - console.print(ref_table) - else: - console.print( - f"[yellow]No references to '{control_id}' found in the codebase.[/yellow]" - ) - console.print() - - # --- Review checklist --- - checklist = [ - "[bold]Verification:[/bold]\n", - f"• {notes}\n" if notes else "", - "\n[bold]Review each reference:[/bold]\n", - "• For each muted finding referencing this control, confirm:\n", - " 1. The risk the original check guards against\n", - " 2. That this control actually mitigates that risk\n", - " 3. 
That the control is still in effect (not degraded or bypassed)\n", - "\n[bold]After review:[/bold]\n", - f"• Update compensating-controls.yaml: [cyan]last-reviewed: {today}[/cyan]\n", - "• If the control is no longer valid, either:\n", - " - Fix the underlying finding and remove the mute, or\n", - " - Document a new/updated compensating control\n", - "• Commit the change", - ] - - console.print( - Panel( - "".join(checklist), - title="[bold yellow]Review Guidance[/bold yellow]", - border_style="yellow", - ) - ) - - -if __name__ == "__main__": - typer.run(main) diff --git a/mise-tasks/review-compliance-reports b/mise-tasks/review-compliance-reports index 080271c..24d2afc 100755 --- a/mise-tasks/review-compliance-reports +++ b/mise-tasks/review-compliance-reports @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["rich>=14.0.0", "typer>=0.24.0", "pyyaml>=6.0"] +# dependencies = ["rich==15.0.0", "typer==0.26.2", "pyyaml==6.0.3"] # /// #MISE description="Summarize the latest Prowler and Kingfisher compliance reports from sifaka" #USAGE flag "--full" help="Show all unmuted failures, not just new ones" @@ -9,23 +9,26 @@ """Fetch and summarize compliance reports from sifaka. Covers: - - Prowler K8s CIS: CSV-based, full analysis with delta tracking + - Prowler K8s CIS (in-cluster): per-finding detail + - Prowler container image scans: grouped by check + resource + - Prowler IaC manifest scans: grouped by check + resource - Kingfisher secret scanning: TODO — pending upstream JSON/CSV output support (currently HTML-only; contribute from spork) -For Prowler, copies the two most recent K8s CIS reports, parses them, -and displays: +For each Prowler scan, copies the two most recent CSV reports, parses +them, and displays: 1. Overall status (pass/fail/manual/muted counts) 2. Unmuted failures by severity 3. Delta from the previous report (new vs resolved) - 4. Actionable unmuted failures with details + 4. 
Actionable unmuted failures (per-finding for in-cluster; grouped + by check ID and resource for image/IaC because they have far too + many findings to list individually) This is the primary tool for the weekly compliance report review. """ import csv import subprocess -import sys import tempfile from collections import Counter from pathlib import Path @@ -36,7 +39,12 @@ from rich.console import Console from rich.panel import Panel from rich.table import Table -REPORT_BASE = "sifaka:/volume1/reports/prowler" +PROWLER_SCANS: list[tuple[str, str, bool]] = [ + # (label, sifaka base path, group_findings) + ("K8s CIS (In-Cluster)", "/volume1/reports/prowler", False), + ("Container Images", "/volume1/reports/prowler-images", True), + ("IaC (manifests)", "/volume1/reports/prowler-iac", True), +] console = Console() @@ -52,18 +60,18 @@ def scp(remote: str, local: str) -> bool: return result.returncode == 0 -def list_reports() -> list[str]: - """List Prowler CSV reports on sifaka, sorted by embedded timestamp.""" +def list_reports(base: str) -> list[str]: + """List Prowler CSV reports under `base` on sifaka, sorted by timestamp.""" result = subprocess.run( - ["ssh", "sifaka", "find /volume1/reports/prowler/ -name '*.csv' " + ["ssh", "sifaka", f"find {base}/ -name '*.csv' " "-not -path '*/compliance/*' -not -name '@*'"], capture_output=True, text=True, timeout=15, ) if result.returncode != 0: - console.print("[bold red]Failed to list reports on sifaka[/bold red]") - raise typer.Exit(code=1) + console.print(f"[bold red]Failed to list reports under {base}[/bold red]") + return [] csvs = [p.strip() for p in result.stdout.strip().splitlines() if p.strip()] # Sort by the timestamp embedded in the filename (e.g. 20260405030007) @@ -135,7 +143,10 @@ def _kubectl(args: str, timeout: int = 15) -> subprocess.CompletedProcess: def run_node_verification(console: Console) -> None: """Verify node-level conditions that Prowler reports as MANUAL. 
- Compensating control: node-config-automated-verification + Prowler runs inside a pod and can't evaluate kubelet file permissions, + kubelet config arguments, etcd CA separation, or cluster-admin RBAC + bindings. We SSH into the minikube node and check each condition here, + failing loudly if any deviates from expected values. """ checks: list[tuple[str, str, bool]] = [] # (name, detail, passed) @@ -270,7 +281,7 @@ def run_node_verification(console: Console) -> None: table = Table( show_header=True, header_style="bold", - title="Node Verification (CC: node-config-automated-verification)", + title="Node Verification (out-of-band checks for MANUAL findings)", ) table.add_column("Check") table.add_column("Detail") @@ -306,136 +317,151 @@ def run_node_verification(console: Console) -> None: console.print() -def main( - full: Annotated[ - bool, typer.Option(help="Show all unmuted failures, not just new ones") - ] = False, - show_muted: Annotated[ - bool, typer.Option(help="Also show muted failures") - ] = False, +SEVERITY_STYLE = { + "critical": "bold red", + "high": "red", + "medium": "yellow", +} + + +def _sev_style(sev: str) -> str: + return SEVERITY_STYLE.get(sev.lower(), "") + + +def summarize_report( + label: str, + base: str, + tmpdir: str, + *, + show_muted: bool = False, + group_findings: bool = False, ) -> None: - csvs = list_reports() + """Fetch and summarize the latest Prowler report under `base`. + + When `group_findings` is True, top-N CHECK_ID and RESOURCE_NAME tables + are shown instead of a per-finding detail table — appropriate for + image and IaC scans that produce thousands of findings. 
+ """ + console.rule(f"[bold]{label}[/bold]") + csvs = list_reports(base) if not csvs: - console.print("[bold red]No Prowler CSV reports found on sifaka[/bold red]") - raise typer.Exit(code=1) - - with tempfile.TemporaryDirectory() as tmpdir: - # Fetch the two most recent reports - latest_remote = csvs[-1] - latest_local = Path(tmpdir) / "latest.csv" - - console.print(f"[dim]Fetching {latest_remote}...[/dim]") - if not scp(f"sifaka:{latest_remote}", str(latest_local)): - console.print("[bold red]Failed to copy latest report[/bold red]") - raise typer.Exit(code=1) - - prev_local = None - if len(csvs) >= 2: - prev_remote = csvs[-2] - prev_local = Path(tmpdir) / "prev.csv" - console.print(f"[dim]Fetching {prev_remote}...[/dim]") - if not scp(f"sifaka:{prev_remote}", str(prev_local)): - prev_local = None - - latest = parse_findings(load_csv(str(latest_local))) - - # Extract report date from filename - report_name = Path(latest_remote).stem - console.print() - - # --- Overall status --- - status_table = Table( - show_header=True, header_style="bold", title=f"Report: {report_name}" + console.print( + f"[bold yellow]{label}: no Prowler CSV reports found " + f"under {base}[/bold yellow]" ) - status_table.add_column("Status") - status_table.add_column("Count", justify="right") + console.print() + return - for status in ["PASS", "FAIL", "MANUAL"]: - count = latest["statuses"].get(status, 0) - style = "red" if status == "FAIL" and count > 0 else "" - status_table.add_row( - f"[{style}]{status}[/{style}]" if style else status, + safe = "".join(c if c.isalnum() else "_" for c in label.lower()) + latest_remote = csvs[-1] + latest_local = Path(tmpdir) / f"{safe}_latest.csv" + + console.print(f"[dim]Fetching {latest_remote}...[/dim]") + if not scp(f"sifaka:{latest_remote}", str(latest_local)): + console.print(f"[bold red]Failed to copy {latest_remote}[/bold red]") + return + + prev_local: Path | None = None + if len(csvs) >= 2: + prev_remote = csvs[-2] + prev_path = Path(tmpdir) / 
f"{safe}_prev.csv" + console.print(f"[dim]Fetching {prev_remote}...[/dim]") + if scp(f"sifaka:{prev_remote}", str(prev_path)): + prev_local = prev_path + + latest = parse_findings(load_csv(str(latest_local))) + report_name = Path(latest_remote).stem + console.print() + + # --- Overall status --- + status_table = Table( + show_header=True, header_style="bold", title=f"Report: {report_name}" + ) + status_table.add_column("Status") + status_table.add_column("Count", justify="right") + + for status in ["PASS", "FAIL", "MANUAL"]: + count = latest["statuses"].get(status, 0) + style = "red" if status == "FAIL" and count > 0 else "" + status_table.add_row( + f"[{style}]{status}[/{style}]" if style else status, + f"[{style}]{count}[/{style}]" if style else str(count), + ) + + muted_count = len(latest["muted"]) + unmuted_count = len(latest["unmuted"]) + status_table.add_row("", "") + status_table.add_row("[dim]↳ muted[/dim]", f"[dim]{muted_count}[/dim]") + status_table.add_row( + "[bold]↳ unmuted (action needed)[/bold]", + f"[bold red]{unmuted_count}[/bold red]" + if unmuted_count > 0 + else "[bold green]0[/bold green]", + ) + status_table.add_row("", "") + status_table.add_row("[bold]Total[/bold]", f"[bold]{latest['total']}[/bold]") + + console.print(status_table) + console.print() + + # --- Unmuted failures by severity --- + if latest["unmuted"]: + sev_table = Table( + show_header=True, + header_style="bold", + title="Unmuted Failures by Severity", + ) + sev_table.add_column("Severity") + sev_table.add_column("Count", justify="right") + + for sev, count in sorted( + Counter(r["SEVERITY"] for r in latest["unmuted"]).items(), + key=lambda kv: severity_sort({"SEVERITY": kv[0]}), + ): + style = _sev_style(sev) + sev_table.add_row( + f"[{style}]{sev}[/{style}]" if style else sev, f"[{style}]{count}[/{style}]" if style else str(count), ) - fail_count = len(latest["fails"]) - muted_count = len(latest["muted"]) - unmuted_count = len(latest["unmuted"]) - status_table.add_row("", 
"") - status_table.add_row("[dim]↳ muted[/dim]", f"[dim]{muted_count}[/dim]") - status_table.add_row( - "[bold]↳ unmuted (action needed)[/bold]", - f"[bold red]{unmuted_count}[/bold red]" - if unmuted_count > 0 - else "[bold green]0[/bold green]", - ) - status_table.add_row("", "") - status_table.add_row("[bold]Total[/bold]", f"[bold]{latest['total']}[/bold]") - - console.print(status_table) + console.print(sev_table) console.print() - # --- Unmuted failures by severity --- - if latest["unmuted"]: - sev_table = Table( - show_header=True, - header_style="bold", - title="Unmuted Failures by Severity", + # --- Delta from previous report --- + if prev_local: + prev = parse_findings(load_csv(str(prev_local))) + + prev_keys = {finding_key(r): r for r in prev["unmuted"]} + curr_keys = {finding_key(r): r for r in latest["unmuted"]} + + new_keys = set(curr_keys.keys()) - set(prev_keys.keys()) + resolved_keys = set(prev_keys.keys()) - set(curr_keys.keys()) + + prev_name = Path(csvs[-2]).stem + delta_lines = [ + f"Compared against: [dim]{prev_name}[/dim]", + "", + f"Previous unmuted FAILs: {len(prev['unmuted'])}", + f"Current unmuted FAILs: {len(latest['unmuted'])}", + f"[green]Resolved: {len(resolved_keys)}[/green]", + f"[red]New: {len(new_keys)}[/red]" + if new_keys + else "[green]New: 0[/green]", + ] + + console.print( + Panel( + "\n".join(delta_lines), + title="[bold]Week-over-Week Delta (unmuted only)[/bold]", + border_style="cyan", ) - sev_table.add_column("Severity") - sev_table.add_column("Count", justify="right") - - for sev, count in Counter( - r["SEVERITY"] for r in latest["unmuted"] - ).most_common(): - style = ( - "bold red" - if sev == "critical" - else "red" - if sev == "high" - else "yellow" - if sev == "medium" - else "" - ) - sev_table.add_row( - f"[{style}]{sev}[/{style}]" if style else sev, - f"[{style}]{count}[/{style}]" if style else str(count), - ) - - console.print(sev_table) - console.print() - - # --- Delta from previous report --- - if prev_local: - 
prev = parse_findings(load_csv(str(prev_local))) - - prev_keys = {finding_key(r): r for r in prev["unmuted"]} - curr_keys = {finding_key(r): r for r in latest["unmuted"]} - - new_keys = set(curr_keys.keys()) - set(prev_keys.keys()) - resolved_keys = set(prev_keys.keys()) - set(curr_keys.keys()) - - prev_name = Path(csvs[-2]).stem - delta_lines = [ - f"Compared against: [dim]{prev_name}[/dim]", - "", - f"Previous unmuted FAILs: {len(prev['unmuted'])}", - f"Current unmuted FAILs: {len(latest['unmuted'])}", - f"[green]Resolved: {len(resolved_keys)}[/green]", - f"[red]New: {len(new_keys)}[/red]" - if new_keys - else f"[green]New: 0[/green]", - ] - - console.print( - Panel( - "\n".join(delta_lines), - title="[bold]Week-over-Week Delta (unmuted only)[/bold]", - border_style="cyan", - ) - ) - console.print() + ) + console.print() + # For grouped scans the new/resolved listings are too noisy + # (potentially thousands of lines). Skip the listings; the count + # is in the panel above and detail is in the grouped tables. 
+ if not group_findings: if new_keys: console.print("[bold red]New Unmuted Failures:[/bold red]") for k in sorted(new_keys): @@ -456,89 +482,180 @@ def main( ) console.print() - # --- Unmuted failure details --- - findings_to_show = latest["unmuted"] if full else [] - if not full and latest["unmuted"]: - findings_to_show = latest["unmuted"] - - if findings_to_show: - detail_table = Table( - show_header=True, - header_style="bold", - title="Unmuted Failures — Action Needed", - ) - detail_table.add_column("Severity") - detail_table.add_column("Check") - detail_table.add_column("Resource") - detail_table.add_column("Detail", max_width=60) - - for r in sorted(findings_to_show, key=severity_sort): - sev = r["SEVERITY"] - style = ( - "bold red" - if sev == "critical" - else "red" - if sev == "high" - else "yellow" - if sev == "medium" - else "" - ) - detail_table.add_row( - f"[{style}]{sev}[/{style}]" if style else sev, - r["CHECK_ID"], - r.get("RESOURCE_NAME", ""), - r["STATUS_EXTENDED"][:60], - ) - - console.print(detail_table) - console.print() - - # --- Muted findings summary --- - if show_muted and latest["muted"]: - muted_table = Table( - show_header=True, - header_style="bold", - title="Muted Failures (for reference)", - ) - muted_table.add_column("Severity") - muted_table.add_column("Check") - muted_table.add_column("Count", justify="right") - - muted_groups: dict[tuple[str, str], int] = Counter() - for r in latest["muted"]: - muted_groups[(r["SEVERITY"], r["CHECK_ID"])] += 1 - - for (sev, check), count in sorted( - muted_groups.items(), key=lambda x: severity_sort({"SEVERITY": x[0][0]}) - ): - muted_table.add_row(f"[dim]{sev}[/dim]", f"[dim]{check}[/dim]", f"[dim]{count}[/dim]") - - console.print(muted_table) - console.print() - - # --- Verdict --- - if not latest["unmuted"]: - console.print( - Panel( - "[bold green]All clear.[/bold green] No unmuted failures.", - title="Prowler Verdict", - border_style="green", - ) - ) + # --- Unmuted failure details (grouped 
or per-finding) --- + if latest["unmuted"]: + if group_findings: + _print_grouped_findings(latest["unmuted"]) else: - console.print( - Panel( - f"[bold yellow]{len(latest['unmuted'])} unmuted failure(s) " - f"need triage.[/bold yellow]\n\n" - "For each: remediate (fix the pod spec) or mute " - "(add to mutelist + compensating control).", - title="Prowler Verdict", - border_style="yellow", - ) + _print_findings_detail(latest["unmuted"]) + + # --- Muted findings summary --- + if show_muted and latest["muted"]: + muted_table = Table( + show_header=True, + header_style="bold", + title="Muted Failures (for reference)", + ) + muted_table.add_column("Severity") + muted_table.add_column("Check") + muted_table.add_column("Count", justify="right") + + muted_groups: dict[tuple[str, str], int] = Counter() + for r in latest["muted"]: + muted_groups[(r["SEVERITY"], r["CHECK_ID"])] += 1 + + for (sev, check), count in sorted( + muted_groups.items(), + key=lambda x: severity_sort({"SEVERITY": x[0][0]}), + ): + muted_table.add_row( + f"[dim]{sev}[/dim]", + f"[dim]{check}[/dim]", + f"[dim]{count}[/dim]", + ) + + console.print(muted_table) + console.print() + + # --- Verdict --- + if not latest["unmuted"]: + console.print( + Panel( + "[bold green]All clear.[/bold green] No unmuted failures.", + title=f"{label} Verdict", + border_style="green", + ) + ) + else: + console.print( + Panel( + f"[bold yellow]{len(latest['unmuted'])} unmuted failure(s) " + f"need triage.[/bold yellow]\n\n" + "For each: remediate, or add a Resource entry to the " + "matching check in argocd/manifests/prowler/mutelist/.", + title=f"{label} Verdict", + border_style="yellow", + ) + ) + console.print() + + +def _print_findings_detail(unmuted: list[dict]) -> None: + """Per-finding detail table — appropriate when finding count is small.""" + detail_table = Table( + show_header=True, + header_style="bold", + title="Unmuted Failures — Action Needed", + ) + detail_table.add_column("Severity") + 
detail_table.add_column("Check") + detail_table.add_column("Resource") + detail_table.add_column("Detail", max_width=60) + + for r in sorted(unmuted, key=severity_sort): + sev = r["SEVERITY"] + style = _sev_style(sev) + detail_table.add_row( + f"[{style}]{sev}[/{style}]" if style else sev, + r["CHECK_ID"], + r.get("RESOURCE_NAME", ""), + r["STATUS_EXTENDED"][:60], + ) + + console.print(detail_table) + console.print() + + +def _worst_severity(rows: list[dict]) -> str: + """Return the most severe severity label across `rows`.""" + if not rows: + return "" + return min( + (r["SEVERITY"] for r in rows), + key=lambda s: severity_sort({"SEVERITY": s}), + ) + + +def _print_grouped_findings(unmuted: list[dict], top_n: int = 15) -> None: + """Top-N tables grouped by CHECK_ID and RESOURCE_NAME. + + Used for image and IaC scans where per-finding tables would be too + large to be useful. Shows count and worst severity for each group. + """ + by_check: dict[str, list[dict]] = {} + by_resource: dict[str, list[dict]] = {} + for r in unmuted: + by_check.setdefault(r["CHECK_ID"], []).append(r) + by_resource.setdefault(r.get("RESOURCE_NAME", "") or "(no resource)", []).append(r) + + check_table = Table( + show_header=True, + header_style="bold", + title=f"Top {top_n} Checks by Unmuted Finding Count", + ) + check_table.add_column("Worst Sev") + check_table.add_column("Check ID") + check_table.add_column("Count", justify="right") + + for check, rows in sorted( + by_check.items(), key=lambda kv: -len(kv[1]) + )[:top_n]: + worst = _worst_severity(rows) + style = _sev_style(worst) + check_table.add_row( + f"[{style}]{worst}[/{style}]" if style else worst, + check, + str(len(rows)), + ) + + console.print(check_table) + console.print() + + res_table = Table( + show_header=True, + header_style="bold", + title=f"Top {top_n} Resources by Unmuted Finding Count", + ) + res_table.add_column("Worst Sev") + res_table.add_column("Resource") + res_table.add_column("Count", justify="right") + + for 
resource, rows in sorted( + by_resource.items(), key=lambda kv: -len(kv[1]) + )[:top_n]: + worst = _worst_severity(rows) + style = _sev_style(worst) + res_table.add_row( + f"[{style}]{worst}[/{style}]" if style else worst, + resource[:80], + str(len(rows)), + ) + + console.print(res_table) + console.print() + + +def main( + full: Annotated[ + bool, typer.Option(help="(reserved) currently a no-op; all unmuted failures already shown") + ] = False, + show_muted: Annotated[ + bool, typer.Option(help="Also show muted failures") + ] = False, +) -> None: + del full # historical flag, kept for backwards compatibility + + with tempfile.TemporaryDirectory() as tmpdir: + for label, base, group in PROWLER_SCANS: + summarize_report( + label, + base, + tmpdir, + show_muted=show_muted, + group_findings=group, ) # --- Node-level MANUAL check verification --- - # Compensating control: node-config-automated-verification # These checks verify conditions Prowler reports as MANUAL because it # runs inside a pod and cannot evaluate them directly. run_node_verification(console) diff --git a/mise-tasks/runner-logs b/mise-tasks/runner-logs index 579a5fd..0d3028b 100755 --- a/mise-tasks/runner-logs +++ b/mise-tasks/runner-logs @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["httpx>=0.28.1", "rich>=14.0.0", "typer>=0.24.0"] +# dependencies = ["httpx==0.28.1", "rich==15.0.0", "typer==0.26.2"] # /// #MISE description="List recent Forgejo Actions runs or fetch logs for a specific job" #USAGE arg "[run_number]" help="Run number to show jobs for (omit to list recent runs)" @@ -229,12 +229,35 @@ def fetch_log(run_number: int, job_index: int, repo: str, token: str) -> None: hex_prefix = f"{task_id & 0xff:02x}" log_path = f"~/forgejo/data/actions_log/{repo}/{hex_prefix}/{task_id}.log.zst" + # indri's login shell (fish) silently swallows SSH exit codes, so we can't + # rely on returncode. zstdcat itself also exits 0 with a "can't stat ... 
+ # -- ignored" stderr message when the file is missing. Detect missing logs + # by running `test -f` over SSH and parsing the marker line from stdout. + probe = subprocess.run( + ["ssh", "indri", f"test -f {log_path} && echo EXISTS || echo MISSING"], + capture_output=True, + text=True, + ) + marker = probe.stdout.strip().splitlines()[-1] if probe.stdout.strip() else "" + if marker != "EXISTS": + typer.echo( + f"Error: log not found for run #{run_number} job {job_index} (task {task_id})", + err=True, + ) + typer.echo(f"Path: indri:{log_path}", err=True) + typer.echo( + "The runner may have crashed before uploading its log buffer " + "(action_task.log_in_storage = 0).", + err=True, + ) + raise typer.Exit(1) + result = subprocess.run( ["ssh", "indri", f"zstdcat {log_path}"], capture_output=True, text=True, ) - if result.returncode != 0: + if result.returncode != 0 or not result.stdout: typer.echo( f"Error: could not read log for run #{run_number} job {job_index} (task {task_id})", err=True, diff --git a/mise-tasks/service-review b/mise-tasks/service-review index 01c4ce0..f83b104 100755 --- a/mise-tasks/service-review +++ b/mise-tasks/service-review @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["pyyaml>=6.0.2", "rich>=14.0.0", "typer>=0.24.0"] +# dependencies = ["pyyaml==6.0.3", "rich==15.0.0", "typer==0.26.2"] # /// #MISE description="Review the most stale service for version freshness" #USAGE flag "--limit " default="15" help="Number of services to show in the table" diff --git a/mise-tasks/spork-create b/mise-tasks/spork-create index 84d2999..3f18563 100755 --- a/mise-tasks/spork-create +++ b/mise-tasks/spork-create @@ -1,7 +1,7 @@ #!/usr/bin/env -S uv run --script # /// script # requires-python = ">=3.12" -# dependencies = ["httpx>=0.28.1", "rich>=14.0.0", "typer>=0.24.0"] +# dependencies = ["httpx==0.28.1", "rich==15.0.0", "typer==0.26.2"] # /// #MISE description="Create a spork (floating-branch 
soft-fork) of a mirrored upstream project" #USAGE arg "" help="Repository name in the mirrors/ org on forge (e.g. kingfisher)" diff --git a/mise.toml b/mise.toml index 12c92df..286c4e0 100644 --- a/mise.toml +++ b/mise.toml @@ -8,5 +8,5 @@ "pipx:borgmatic" = "2.1.4" prek = "0.3.4" pulumi = "3.215.0" -dagger = "0.20.1" -ty = "0.0.29" +dagger = "0.20.6" +"pipx:ty" = "0.0.29" diff --git a/nixos/ringtail/configuration.nix b/nixos/ringtail/configuration.nix index 052f38d..bc893d5 100644 --- a/nixos/ringtail/configuration.nix +++ b/nixos/ringtail/configuration.nix @@ -16,8 +16,26 @@ in systemd.tpm2.enable = false; # Networking + # Wired interface (enp5s0) uses a static IP configured by NixOS scripted + # networking; NetworkManager is left enabled for the wireless fallback only. networking.hostName = "ringtail"; - networking.networkmanager.enable = true; + networking.networkmanager = { + enable = true; + unmanaged = [ "interface-name:enp5s0" ]; + }; + networking.useDHCP = false; + networking.interfaces.enp5s0.ipv4.addresses = [{ + address = "192.168.1.21"; + prefixLength = 24; + }]; + networking.defaultGateway = "192.168.1.1"; + networking.nameservers = [ "192.168.1.1" "1.1.1.1" ]; + + # K3s pod networking and Tailscale tunnel routing require IP forwarding. + # NixOS leaves this off by default; previously it was being enabled + # implicitly by NM/scripted-DHCP setup, but with static networking we + # have to set it explicitly. + boot.kernel.sysctl."net.ipv4.ip_forward" = 1; # Time zone time.timeZone = "America/Los_Angeles"; @@ -319,17 +337,25 @@ in output = { "DP-1" = { mode = "2560x1440@165Hz"; - adaptive_sync = "on"; + # VRR off: the OMEN 27i IPS pumps gamma/brightness when the panel + # refresh swings into its low VRR range (e.g. low-fps game + # cutscenes), producing a ~20Hz flicker that compounds over a long + # session until a reboot. Fixed refresh at 165Hz eliminates it. + # If you want VRR back, cap in-game fps so refresh never dips low. 
+ adaptive_sync = "off"; bg = "~/.config/sway/wallpaper.jpg fill"; }; }; - keybindings = let mod = "Mod4"; in { - "${mod}+Return" = "exec wezterm"; - "${mod}+Shift+q" = "kill"; - "${mod}+d" = "exec wmenu-run"; - "${mod}+space" = "exec fuzzel"; - "${mod}+Shift+c" = "reload"; - "${mod}+l" = "exec swaylock -f"; + # Extend (not replace) the home-manager default sway keybindings. + # lib.mkForce is needed on keys whose defaults we want to override + # (same priority otherwise conflicts). Audio keys and Mod+d (wmenu-run + # vs the default menu binding) don't collide with defaults. + keybindings = let mod = "Mod4"; in lib.mkOptionDefault { + "${mod}+Return" = lib.mkForce "exec wezterm"; + "${mod}+d" = lib.mkForce "exec wmenu-run"; + "${mod}+space" = lib.mkForce "exec fuzzel"; + "${mod}+l" = lib.mkForce "exec swaylock -f"; + "${mod}+F1" = "exec grep '^bindsym' ~/.config/sway/config | fuzzel --dmenu"; "--locked XF86AudioMute" = "exec pactl set-sink-mute @DEFAULT_SINK@ toggle"; "--locked XF86AudioLowerVolume" = "exec pactl set-sink-volume @DEFAULT_SINK@ -5%"; "--locked XF86AudioRaiseVolume" = "exec pactl set-sink-volume @DEFAULT_SINK@ +5%"; @@ -401,8 +427,10 @@ in width = 40; horizontal-pad = 16; vertical-pad = 8; - border-radius = 8; - border-width = 2; + }; + border = { + radius = 8; + width = 2; }; colors = { background = "24273add"; @@ -586,6 +614,22 @@ in AllowSuspendThenHibernate=no ''; + # Cap systemd-coredump. Wine/Proton games (Diablo IV, etc.) segfault + # regularly and dump multi-GB cores; with the stock (effectively unbounded) + # limits, systemd-coredump then spends minutes streaming and compressing the + # dump to disk — e.g. a single D4 crash produced a 4.6G core, read 13.7G and + # wrote 17.4G, pinning the CPU and locking up the desktop for ~3.5 minutes. + # Those cores are useless anyway: Nix .so files carry no build-id, so no + # backtrace can be generated. 
Capping uncompressed size at 1G makes oversized + # cores get logged-but-skipped (the kernel stops dumping once we stop reading) + # while real service cores (well under 1G) are still captured. MaxUse bounds + # the on-disk store so frequent game crashes can't accumulate (was at 8.6G). + systemd.coredump.extraConfig = '' + ProcessSizeMax=1G + ExternalSizeMax=1G + MaxUse=2G + ''; + # NixOS release system.stateVersion = "25.11"; } diff --git a/nixos/ringtail/flake.lock b/nixos/ringtail/flake.lock index 86c20af..bb60501 100644 --- a/nixos/ringtail/flake.lock +++ b/nixos/ringtail/flake.lock @@ -7,11 +7,11 @@ ] }, "locked": { - "lastModified": 1773889306, - "narHash": "sha256-PAqwnsBSI9SVC2QugvQ3xeYCB0otOwCacB1ueQj2tgw=", + "lastModified": 1780290312, + "narHash": "sha256-eTAlX0CwgB84Ts3GaBd944A3DRXVMzgA0EqroZBISUo=", "owner": "nix-community", "repo": "disko", - "rev": "5ad85c82cc52264f4beddc934ba57f3789f28347", + "rev": "115e5211780054d8a890b41f0b7734cafad54dfe", "type": "github" }, "original": { @@ -27,11 +27,11 @@ ] }, "locked": { - "lastModified": 1775425411, - "narHash": "sha256-KY6HsebJHEe5nHOWP7ur09mb0drGxYSzE3rQxy62rJo=", + "lastModified": 1779506708, + "narHash": "sha256-QOD/CNm196nCJRheux/URi4/HE66fthdOMqCJoPP1Y0=", "owner": "nix-community", "repo": "home-manager", - "rev": "0d02ec1d0a05f88ef9e74b516842900c41f0f2fe", + "rev": "3ee51fbdac8c8bdfe1e7e1fcaba6520a563f394f", "type": "github" }, "original": { @@ -43,11 +43,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1775811116, - "narHash": "sha256-t+HZK42pB6N+i5RGbuy7Xluez/VvWbembBdvzsc23Ss=", + "lastModified": 1779796641, + "narHash": "sha256-ZsIrKmhp4vbBXoXXmR/tBXA/UCsAQiJL9vsgZEduhVY=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "54170c54449ea4d6725efd30d719c5e505f1c10e", + "rev": "25f538306313eae3927264466c70d7001dcea1df", "type": "github" }, "original": { diff --git a/nixos/ringtail/gaming.nix b/nixos/ringtail/gaming.nix index 2b361b3..7c00378 100644 --- a/nixos/ringtail/gaming.nix +++ 
b/nixos/ringtail/gaming.nix @@ -5,8 +5,31 @@ programs.steam = { enable = true; dedicatedServer.openFirewall = true; + extraCompatPackages = [ pkgs.proton-ge-bin ]; }; + # Proton Experimental ships an accessibility bridge (xalia) that hangs during + # game launch when AT-SPI is not running on the host. This host has no AT-SPI, + # so disable xalia globally to avoid wedging iscriptevaluator.exe. + environment.sessionVariables.PROTON_USE_XALIA = "0"; + + # Subnautica 2 pre-launch wrapper. SN2 (UE5) writes Saved/running.dat as a + # "currently running" lockfile. If the prior session exited uncleanly (SIGKILL + # via Steam's Stop button, crash, etc.), the file persists and on next launch + # SN2 pops up an invisible (0x0-sized) Error dialog ("Your game might not have + # exited correctly last time...") that the GameThread blocks on forever — + # observable only as a black screen with a spinning loader. This wrapper + # removes the stale lockfiles before exec'ing the actual game command. + # Use as Steam launch option for Subnautica 2: + # sn2-prelaunch %command% + environment.systemPackages = [ + (pkgs.writeShellScriptBin "sn2-prelaunch" '' + saved="/mnt/games/SteamLibrary/steamapps/compatdata/1962700/pfx/drive_c/users/steamuser/AppData/Local/Subnautica2/Saved" + rm -f "$saved/running.dat" "$saved/beforelobby.dat" + exec "$@" + '') + ]; + # Gamescope — micro-compositor for game fullscreen/resolution management. 
# Use as Steam launch option: gamescope -W 2560 -H 1440 -f -- %command% programs.gamescope = { diff --git a/prek.toml b/prek.toml index 28776c5..2c66b82 100644 --- a/prek.toml +++ b/prek.toml @@ -22,13 +22,13 @@ hooks = [ # check-yaml with --unsafe (builtin fast path doesn't support --unsafe yet) [[repos]] repo = "https://github.com/pre-commit/pre-commit-hooks" -rev = "v6.0.0" +rev = "3e8a8703264a2f4a69428a0aa4dcb512790b2c8c" # v6.0.0 hooks = [{ id = "check-yaml", args = ["--unsafe"] }] # Secret detection (running both tools in parallel to compare coverage) [[repos]] repo = "https://github.com/trufflesecurity/trufflehog" -rev = "v3.94.0" +rev = "37b77001d0174ebec2fcca2bd83ff83a6d45a3ab" # v3.95.3 hooks = [ { id = "trufflehog", entry = "trufflehog git file://. --since-commit HEAD --no-verification --fail", stages = [ "pre-commit", @@ -38,7 +38,7 @@ hooks = [ [[repos]] repo = "https://github.com/mongodb/kingfisher" -rev = "v1.91.0" +rev = "6f560103cc6ea082ef4b80a9098e3f3111afb8bc" # v1.101.0 hooks = [ { id = "kingfisher", args = [ "scan", @@ -56,7 +56,7 @@ hooks = [ # YAML linting [[repos]] repo = "https://github.com/adrienverge/yamllint" -rev = "v1.38.0" +rev = "cba56bcde1fdd01c1deb3f945e69764c291a6530" # v1.38.0 hooks = [{ id = "yamllint", args = ["-c", ".yamllint.yaml"] }] # Ansible linting @@ -69,12 +69,12 @@ name = "ansible-lint" entry = "env ANSIBLE_ROLES_PATH=ansible/roles ansible-lint" language = "python" files = "^ansible/" -additional_dependencies = ["ansible-lint>=26.3.0", "ansible-core>=2.18"] +additional_dependencies = ["ansible-lint==26.4.0", "ansible-core==2.21.0"] # Python - ruff for linting and formatting [[repos]] repo = "https://github.com/astral-sh/ruff-pre-commit" -rev = "v0.15.7" +rev = "0c7b6c989466a93942def1f84baf36ddfcd60c83" # v0.15.14 hooks = [{ id = "ruff", args = ["--fix"] }, { id = "ruff-format" }] # Python - ty type checker @@ -92,30 +92,30 @@ pass_filenames = false # Shell scripts - shellcheck and shfmt [[repos]] repo = 
"https://github.com/shellcheck-py/shellcheck-py" -rev = "v0.11.0.1" +rev = "745eface02aef23e168a8afb6b5737818efbea95" # v0.11.0.1 hooks = [{ id = "shellcheck", args = ["--severity=warning"] }] [[repos]] repo = "https://github.com/scop/pre-commit-shfmt" -rev = "v3.13.0-1" +rev = "05c1426671b9237fb5e1444dd63aa5731bec0dfb" # v3.13.1-1 hooks = [{ id = "shfmt", args = ["-i", "2", "-ci", "-bn"] }] # TOML - taplo [[repos]] repo = "https://github.com/ComPWA/taplo-pre-commit" -rev = "v0.9.3" -hooks = [{ id = "taplo-format" }, { id = "taplo-lint" }] +rev = "23eab0f0eedcbedebff420f5fdfb284744adc7b3" # v0.9.3 +hooks = [{ id = "taplo-format" }, { id = "taplo-lint", args = ["--no-schema"] }] # JSON formatting (prettier for consistent style) [[repos]] repo = "https://github.com/rbubley/mirrors-prettier" -rev = "v3.8.1" +rev = "515f543f5718ebfd6ce22e16708bb32c68ff96e1" # v3.8.3 hooks = [{ id = "prettier", types_or = ["json"], args = ["--tab-width", "2"] }] # GitHub/Forgejo Actions workflow linting [[repos]] repo = "https://github.com/rhysd/actionlint" -rev = "v1.7.11" +rev = "914e7df21a07ef503a81201c76d2b11c789d3fca" # v1.7.12 hooks = [ { id = "actionlint-system", args = [ "-config-file", diff --git a/pulumi/gandi/README.md b/pulumi/gandi/README.md index 9d7b7aa..70d2821 100644 --- a/pulumi/gandi/README.md +++ b/pulumi/gandi/README.md @@ -27,50 +27,19 @@ pulumi stack select eblu-me # or: pulumi stack init eblu-me ## Authentication -This project requires a Gandi Personal Access Token (PAT) with LiveDNS permissions. +This project uses a Gandi Personal Access Token (PAT) shared with Caddy. See the [Gandi reference card](../../docs/reference/infrastructure/gandi.md) and [Rotate the Gandi PAT](../../docs/how-to/configuration/rotate-gandi-pat.md). -**The PAT expires every 30 days and must be cycled manually.** - -### Cycling the PAT - -1. Go to [Gandi PAT Management](https://admin.gandi.net/organizations/1db8d76a-f729-11ed-b8d1-00163e94b645/account/pat) - -2. 
Create a new PAT: - - Name: `blumeops-pulumi` (or similar) - - Expiration: 30 days (maximum is 90; shorter is fine if used rarely) - - Permissions required: - - **Manage domain name technical configurations** (required for DNS records) - - See and renew domain names - - Optional permissions (enabled but not strictly required): - - See & download SSL certificates - - Manage Cloud resources - - See Cloud resources - - View Organization - - Deploy Web Hosting instances - - Manage Web Hosting instances - - See and renew Web Hosting instances - -3. Update 1Password: - ```bash - # Update the existing item with the new PAT value - op item edit mco6ka3dc3rmw7zkg2dhia5d2m pat="" --vault vg6xf6vvfmoh5hqjjhlhbeoaie - ``` - -4. Delete the old PAT from Gandi admin console - -### Running with Authentication - -The mise task handles fetching the PAT from 1Password: +The mise tasks handle fetching the PAT from 1Password: ```bash -mise run dns-up # Preview and apply changes mise run dns-preview # Preview only +mise run dns-up # Preview and apply ``` Or manually: ```bash -export GANDI_PERSONAL_ACCESS_TOKEN=$(op read "op://vg6xf6vvfmoh5hqjjhlhbeoaie/mco6ka3dc3rmw7zkg2dhia5d2m/pat") +export GANDI_PERSONAL_ACCESS_TOKEN=$(op read "op://blumeops/gandi - blumeops/pat") pulumi up ``` diff --git a/pulumi/gandi/__main__.py b/pulumi/gandi/__main__.py index e448ed2..25fd0f7 100644 --- a/pulumi/gandi/__main__.py +++ b/pulumi/gandi/__main__.py @@ -8,7 +8,7 @@ This program manages DNS records for blumeops infrastructure: Authentication: Set GANDI_PERSONAL_ACCESS_TOKEN environment variable. - See docs/how-to/gandi-operations.md for PAT management instructions. + See docs/how-to/configuration/rotate-gandi-pat.md for PAT management. 
""" import os @@ -85,6 +85,15 @@ forge_public = gandi.livedns.Record( values=["blumeops-proxy.fly.dev."], ) +shower_public = gandi.livedns.Record( + "shower-public", + zone=domain, + name="shower", + type="CNAME", + ttl=300, + values=["blumeops-proxy.fly.dev."], +) + # ============== Exports ============== pulumi.export("domain", domain) pulumi.export("wildcard_fqdn", f"*.{subdomain}.{domain}") @@ -93,3 +102,4 @@ pulumi.export("target_ip", tailscale_ip) pulumi.export("docs_public_fqdn", f"docs.{domain}") pulumi.export("cv_public_fqdn", f"cv.{domain}") pulumi.export("forge_public_fqdn", f"forge.{domain}") +pulumi.export("shower_public_fqdn", f"shower.{domain}") diff --git a/pulumi/tailscale/__main__.py b/pulumi/tailscale/__main__.py index 2f5262b..3acbb62 100644 --- a/pulumi/tailscale/__main__.py +++ b/pulumi/tailscale/__main__.py @@ -37,7 +37,7 @@ acl = tailscale.Acl( # indri - Mac Mini M1, primary homelab server # Hosts forge, loki, zot registry, and the k8s control plane. -# Other services (grafana, kiwix, devpi, etc.) run in k8s with their own Tailscale devices. +# Other services (grafana, kiwix, etc.) run in k8s with their own Tailscale devices. 
indri = tailscale.get_device(name="indri.tail8d86e.ts.net") indri_tags = tailscale.DeviceTags( "indri-tags", diff --git a/pulumi/tailscale/policy.hujson b/pulumi/tailscale/policy.hujson index 84f1f17..88408ef 100644 --- a/pulumi/tailscale/policy.hujson +++ b/pulumi/tailscale/policy.hujson @@ -20,7 +20,8 @@ }, // --- Members: user-facing services only --- - // Kiwix, Forge, devpi, Miniflux, PostgreSQL + // Kiwix, Forge, Miniflux, PostgreSQL + // (devpi moved off-cluster to indri; reachable via Caddy on tag:flyio-target) { "src": ["autogroup:member"], "dst": ["tag:kiwix"], @@ -31,11 +32,6 @@ "dst": ["tag:forge"], "ip": ["tcp:443", "tcp:22"], }, - { - "src": ["autogroup:member"], - "dst": ["tag:devpi"], - "ip": ["tcp:443"], - }, { "src": ["autogroup:member"], "dst": ["tag:feed"], @@ -152,7 +148,6 @@ "tag:grafana": ["autogroup:admin", "tag:blumeops"], "tag:kiwix": ["autogroup:admin", "tag:blumeops"], "tag:forge": ["autogroup:admin", "tag:blumeops"], - "tag:devpi": ["autogroup:admin", "tag:blumeops"], "tag:loki": ["autogroup:admin", "tag:blumeops"], "tag:pg": ["autogroup:admin", "tag:blumeops"], "tag:feed": ["autogroup:admin", "tag:blumeops"], diff --git a/service-versions.yaml b/service-versions.yaml index 761aa8d..866c687 100644 --- a/service-versions.yaml +++ b/service-versions.yaml @@ -44,10 +44,20 @@ services: upstream-source: https://github.com/gethomepage/homepage/releases notes: Custom container, kustomize manifests + - name: shower + type: argocd + last-reviewed: 2026-05-15 + current-version: "1.1.3" + upstream-source: https://forge.eblu.me/eblume/adelaide-baby-shower-app + notes: | + Django app for Adelaide / Heidi / Addie's baby shower. Wheel + published to Forgejo Packages PyPI; runs on ringtail k3s. Public + at shower.eblu.me (fly proxy), tailnet admin at shower.ops.eblu.me. 
+ - name: nvidia-device-plugin type: argocd - last-reviewed: 2026-03-27 - current-version: "v0.19.0" + last-reviewed: 2026-06-04 + current-version: "v0.19.2" upstream-source: https://github.com/NVIDIA/k8s-device-plugin/releases notes: DaemonSet + RuntimeClass on ringtail for GPU workloads @@ -72,22 +82,22 @@ services: - name: alloy-tracing-ringtail type: argocd - last-reviewed: 2026-03-13 - current-version: "v1.14.0" + last-reviewed: 2026-04-30 + current-version: "v1.16.0" upstream-source: https://github.com/grafana/alloy/releases notes: Privileged DaemonSet with Beyla eBPF for HTTP tracing on ringtail - name: alloy-ringtail type: argocd - last-reviewed: 2026-03-13 - current-version: "v1.14.0" + last-reviewed: 2026-04-30 + current-version: "v1.16.0" upstream-source: https://github.com/grafana/alloy/releases notes: DaemonSet on ringtail for host metrics and pod logs - name: alloy-k8s type: argocd - last-reviewed: 2026-03-13 - current-version: "v1.14.0" + last-reviewed: 2026-04-30 + current-version: "v1.16.0" upstream-source: https://github.com/grafana/alloy/releases - name: tailscale-operator @@ -96,6 +106,15 @@ services: current-version: "v1.94.2" upstream-source: https://github.com/tailscale/tailscale/releases + - name: tailscale + type: container + last-reviewed: 2026-05-10 + current-version: "1.94.2" + upstream-source: https://github.com/tailscale/tailscale/releases + notes: | + Locally mirrored tailscale image used by ringtail's tailscale-operator + ProxyClass. Built via containers/tailscale/default.nix. 
+ - name: grafana type: argocd last-reviewed: 2026-04-02 @@ -125,12 +144,28 @@ services: upstream-source: https://github.com/immich-app/immich/releases notes: Kustomize manifests with upstream images + - name: valkey + type: argocd + last-reviewed: 2026-05-28 + current-version: "8.1.7" + upstream-source: https://github.com/valkey-io/valkey/releases + notes: >- + Dual-build valkey image: container.py builds Alpine 3.22 + apk valkey + (arm64, indri) for paperless; default.nix builds via nixpkgs (amd64, + ringtail) for immich-ringtail. Both track upstream valkey 8.1.x; Alpine + 3.22 currently ships 8.1.7-r0 and nixpkgs valkey is 8.1.7. Alpine 3.23 + jumps to 9.0. Distinct from authentik-redis (nix-built Redis + 8.x) which has its own entry. + - name: external-secrets type: argocd - last-reviewed: 2026-03-25 + last-reviewed: 2026-06-04 current-version: "v2.2.0" upstream-source: https://github.com/external-secrets/external-secrets/releases - notes: Static kustomize manifests rendered from upstream Helm chart + notes: >- + Static kustomize manifests rendered from upstream Helm chart. Controller + image is locally built from the forge mirror via containers/external-secrets/container.py + (single all_providers static Go binary). - name: 1password-connect type: argocd @@ -190,9 +225,17 @@ services: - name: teslamate type: argocd - last-reviewed: 2026-04-14 + last-reviewed: "2026-06-03" current-version: "v3.0.0" upstream-source: https://github.com/teslamate-org/teslamate/releases + notes: >- + Tesla data logger. Container ported from Dagger (container.py) to Nix + (containers/teslamate/default.nix) — a from-scratch beamPackages + mixRelease (Elixir/Phoenix release with npm-built assets), since + teslamate is not in nixpkgs. Pins erlang_27 + elixir_1_18 from the + shared nixos-unstable rev; assets via in-release npm ci + esbuild; + ex_cldr locale data pre-fetched (LOCALES env) to avoid sandbox + downloads. Version unchanged (v3.0.0). Build verified on ringtail. 
- name: transmission type: argocd @@ -214,29 +257,36 @@ services: upstream-source: https://github.com/kiwix/kiwix-tools/releases - name: devpi - type: argocd - last-reviewed: 2026-04-18 + type: ansible + last-reviewed: 2026-04-29 current-version: "6.19.3" upstream-source: https://github.com/devpi/devpi/releases + notes: Installed via uv into a venv on indri; version pinned in ansible/roles/devpi/defaults/main.yml - name: cv - type: argocd - last-reviewed: 2026-03-07 + type: ansible + last-reviewed: 2026-04-29 current-version: "1.0.3" upstream-source: https://forge.eblu.me/eblume/cv - notes: Personal static site; review build deps (WeasyPrint, Jinja2) in source repo + notes: >- + Static tarball downloaded by ansible/roles/cv into ~/blumeops/cv/content on indri; + served directly by Caddy (kind=static). Migrated from minikube 2026-04-29. + Review build deps (WeasyPrint, Jinja2) in source repo on upstream review. - name: docs - type: argocd - last-reviewed: 2026-03-07 - current-version: "1.28.2" - upstream-source: https://github.com/jackyzha0/quartz/releases - notes: Quartz static site generator; container version tracks nginx base + type: ansible + last-reviewed: 2026-04-29 + current-version: "v1.16.0" + upstream-source: https://forge.eblu.me/eblume/blumeops/releases + notes: >- + Quartz-built tarball downloaded by ansible/roles/docs into ~/blumeops/docs/content + on indri; served directly by Caddy (kind=static, try_html). current-version + tracks the blumeops docs release tag. - name: forgejo-runner type: argocd - last-reviewed: 2026-03-30 - current-version: "12.7.3" + last-reviewed: 2026-04-20 + current-version: "12.8.2" upstream-source: https://code.forgejo.org/forgejo/runner/releases notes: >- Runner daemon version (code.forgejo.org/forgejo/runner). 
Job execution @@ -244,8 +294,8 @@ services: - name: runner-job-image type: argocd - last-reviewed: 2026-03-06 - current-version: "0.20.1" + last-reviewed: 2026-04-21 + current-version: "0.20.6" upstream-source: https://github.com/dagger/dagger/releases notes: >- Forgejo Actions job execution image. CONTAINER_APP_VERSION tracks the @@ -289,22 +339,36 @@ services: - name: mealie type: argocd - last-reviewed: 2026-03-16 - current-version: "v3.12.0" + last-reviewed: "2026-06-03" + current-version: "v3.16.0" upstream-source: https://github.com/mealie-recipes/mealie/releases - notes: Recipe manager; built from source via forge mirror + notes: >- + Recipe manager. Container ported from Dockerfile to Nix + (containers/mealie/default.nix wraps nixpkgs mealie from a pinned + nixos-unstable; single gunicorn process, SQLite on the mealie-data + PVC). Bumped v3.12.0 -> v3.16.0 as part of the port (the deferred + upgrade). Breaking-change review v3.13-v3.16: no schema breaking + changes, SQLite auto-migrates forward via init_db; notable items are + minor (OIDC missing-claims log -> DEBUG, NLP parser uses user-defined + units, Nuxt 3->4 frontend, new Announcements feature, path-traversal + patches). Source PVC retained for rollback. Build verified on ringtail. - name: paperless type: argocd - last-reviewed: "2026-04-08" - current-version: "v2.20.13" + last-reviewed: "2026-06-03" + current-version: "v2.20.15" upstream-source: https://github.com/paperless-ngx/paperless-ngx/releases - notes: Document management; built from source via forge mirror + notes: >- + Document management. Container ported from Dockerfile to Nix + (containers/paperless/default.nix wraps nixpkgs paperless-ngx from a + pinned nixos-unstable). Runs as web/worker/beat/consumer containers on + ringtail (multi-process; no s6). Bumped v2.20.13 -> v2.20.15 (the + unstable package version, same-minor patch) as part of the port. 
- name: unpoller type: argocd - last-reviewed: 2026-03-16 - current-version: "v2.34.0" + last-reviewed: 2026-05-28 + current-version: "v3.2.0" upstream-source: https://github.com/unpoller/unpoller/releases notes: UniFi metrics exporter for Prometheus @@ -331,25 +395,42 @@ services: - name: alloy type: ansible - last-reviewed: 2026-03-13 - current-version: "v1.14.0" + last-reviewed: 2026-04-30 + current-version: "v1.16.0" upstream-source: https://github.com/grafana/alloy/releases notes: Built from source on indri - name: zot type: ansible - last-reviewed: 2026-03-14 - current-version: "v2.1.15" + last-reviewed: 2026-05-04 + current-version: "v2.1.16" upstream-source: https://github.com/project-zot/zot/releases notes: Built from source on indri - name: caddy type: ansible - last-reviewed: 2026-03-15 + last-reviewed: 2026-05-06 current-version: "v2.11.2" upstream-source: https://github.com/caddyserver/caddy/releases notes: Built from source with Gandi DNS and Layer 4 plugins + - name: heph + type: ansible + last-reviewed: 2026-06-05 + current-version: "v1.2.1" + upstream-source: https://forge.eblu.me/eblume/hephaestus/releases + notes: >- + hephaestus task/context sync hub on indri (server-mode launchagent, + ansible/roles/heph; cargo-built from the forge). SELF-UPDATING: hephd + polls the forge for newer releases every 10 min and rebuilds + restarts + itself, so the running version drifts AHEAD of the ansible heph_version + pin. current-version here is the last observed/deployed tag, not a hard + pin — verify the live version via `curl https://heph.ops.eblu.me/config` + is served (hub up) and the hub log's `current=` line. Reconciling this + self-update vs IaC-pin drift is tracked in the heph "Hephaestus" project: + "Reconcile hephd self-update with ansible-pinned version (drift on indri + hub)" (node 01KTBXWT6XTHNDH92CVJY88E5K). 
+ - name: borgmatic type: ansible last-reviewed: 2026-04-15 @@ -396,8 +477,8 @@ services: - name: dagger type: mise - last-reviewed: 2026-04-12 - current-version: "0.20.1" + last-reviewed: 2026-04-21 + current-version: "0.20.6" upstream-source: https://github.com/dagger/dagger/releases notes: Dagger CI/CD engine; pinned in mise.toml diff --git a/src/blumeops/main.py b/src/blumeops/main.py index 94b932b..9bbd12f 100644 --- a/src/blumeops/main.py +++ b/src/blumeops/main.py @@ -80,6 +80,10 @@ class Blumeops: "git", "clone", "--depth=1", + # Pin to last v4 release. v5.0.0 restructured config + # layout (.quartz/plugins, ../quartz imports) and breaks + # our quartz.config.ts/quartz.layout.ts. See changelog. + "--branch=v4.5.2", "https://github.com/jackyzha0/quartz.git", "/tmp/quartz", ]