From 4e16116c4fc1a782fb991424cd122c42420484bf Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 19 Feb 2026 11:41:47 -0800 Subject: [PATCH 01/23] Port Frigate NVR to ringtail k3s with GPU acceleration Migrate Frigate from indri's minikube (arm64, ZMQ detector) to ringtail's k3s cluster to leverage the RTX 4080 for TensorRT-accelerated ONNX inference. - Enable nvidia-container-toolkit and configure k3s containerd nvidia runtime - Add NVIDIA device plugin ArgoCD app (RuntimeClass + DaemonSet) - Re-target Frigate ArgoCD app to ringtail k3s cluster - Switch image to x86_64 tensorrt variant with runtimeClassName: nvidia - Add GPU resource limit (nvidia.com/gpu: 1) and increase shm to 512Mi - Replace ZMQ detector with ONNX (auto-selects TensorRT execution provider) - Update NFS PV and database PVC comments for ringtail Co-Authored-By: Claude Opus 4.6 --- argocd/apps/frigate.yaml | 2 +- argocd/apps/nvidia-device-plugin.yaml | 18 ++++++++++ argocd/manifests/frigate/configmap.yaml | 5 ++- argocd/manifests/frigate/deployment.yaml | 6 ++-- argocd/manifests/frigate/pv-nfs.yaml | 4 +-- argocd/manifests/frigate/pvc-database.yaml | 2 +- .../nvidia-device-plugin/daemonset.yaml | 36 +++++++++++++++++++ .../nvidia-device-plugin/runtime-class.yaml | 6 ++++ .../feature-frigate-ringtail-gpu.infra.md | 1 + nixos/ringtail/configuration.nix | 12 +++++++ 10 files changed, 83 insertions(+), 9 deletions(-) create mode 100644 argocd/apps/nvidia-device-plugin.yaml create mode 100644 argocd/manifests/nvidia-device-plugin/daemonset.yaml create mode 100644 argocd/manifests/nvidia-device-plugin/runtime-class.yaml create mode 100644 docs/changelog.d/feature-frigate-ringtail-gpu.infra.md diff --git a/argocd/apps/frigate.yaml b/argocd/apps/frigate.yaml index a90f412..c443774 100644 --- a/argocd/apps/frigate.yaml +++ b/argocd/apps/frigate.yaml @@ -11,7 +11,7 @@ spec: targetRevision: main path: argocd/manifests/frigate destination: - server: https://kubernetes.default.svc + server: https://ringtail.tail8d86e.ts.net:6443 namespace: frigate syncPolicy: syncOptions: diff --git a/argocd/apps/nvidia-device-plugin.yaml b/argocd/apps/nvidia-device-plugin.yaml new file mode 100644 index 0000000..af8395f --- /dev/null +++ b/argocd/apps/nvidia-device-plugin.yaml @@ -0,0 +1,18 @@ +--- +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: nvidia-device-plugin + namespace: argocd +spec: + project: default + source: + repoURL: ssh://forgejo@forge.ops.eblu.me:2222/eblume/blumeops.git + targetRevision: main + path: argocd/manifests/nvidia-device-plugin + destination: + server: https://ringtail.tail8d86e.ts.net:6443 + namespace: nvidia-device-plugin + syncPolicy: + syncOptions: + - CreateNamespace=true diff --git a/argocd/manifests/frigate/configmap.yaml b/argocd/manifests/frigate/configmap.yaml index 9c96008..df7c70c 100644 --- a/argocd/manifests/frigate/configmap.yaml +++ b/argocd/manifests/frigate/configmap.yaml @@ -56,9 +56,8 @@ data: track: [person, car, dog, cat, bird] detectors: - apple_silicon: - type: zmq - endpoint: tcp://host.minikube.internal:5555 + onnx: + type: onnx model: model_type: yolo-generic diff --git a/argocd/manifests/frigate/deployment.yaml b/argocd/manifests/frigate/deployment.yaml index afe11fb..91c9a77 100644 --- a/argocd/manifests/frigate/deployment.yaml +++ b/argocd/manifests/frigate/deployment.yaml @@ -23,9 +23,10 @@ spec: mountPath: /config-ro - name: config mountPath: /config + runtimeClassName: nvidia containers: - name: frigate - image: ghcr.io/blakeblackshear/frigate:0.17.0-rc2-standard-arm64 + image: ghcr.io/blakeblackshear/frigate:0.17.0-rc2-tensorrt ports: - containerPort: 5000 name: http @@ -60,6 +61,7 @@ spec: limits: memory: "2Gi" cpu: "2000m" + nvidia.com/gpu: "1" livenessProbe: httpGet: path: /api/version @@ -87,4 +89,4 @@ spec: - name: shm emptyDir: medium: Memory - sizeLimit: 256Mi + sizeLimit: 512Mi diff --git a/argocd/manifests/frigate/pv-nfs.yaml b/argocd/manifests/frigate/pv-nfs.yaml index d3a592b..c7197ab 100644 --- a/argocd/manifests/frigate/pv-nfs.yaml +++ b/argocd/manifests/frigate/pv-nfs.yaml @@ -1,11 +1,11 @@ # NFS PersistentVolume for Frigate recordings -# Requires: NFS share on sifaka at /volume1/frigate with NFS permissions for indri +# Requires: NFS share on sifaka at /volume1/frigate with NFS permissions for ringtail # # To create on Synology: # 1. Control Panel > Shared Folder > Create # 2. Name: frigate, Location: Volume 1 # 3. Control Panel > File Services > NFS > NFS Rules -# 4. Add rule for "frigate" share: Hostname=indri, Privilege=Read/Write, Squash=No mapping +# 4. Add rule for "frigate" share: Hostname=ringtail, Privilege=Read/Write, Squash=No mapping apiVersion: v1 kind: PersistentVolume metadata: diff --git a/argocd/manifests/frigate/pvc-database.yaml b/argocd/manifests/frigate/pvc-database.yaml index 040bda3..1eacb1d 100644 --- a/argocd/manifests/frigate/pvc-database.yaml +++ b/argocd/manifests/frigate/pvc-database.yaml @@ -1,5 +1,5 @@ # PersistentVolumeClaim for Frigate SQLite database -# Uses minikube's default storage class for local provisioning +# Uses k3s local-path storage class for local provisioning apiVersion: v1 kind: PersistentVolumeClaim metadata: diff --git a/argocd/manifests/nvidia-device-plugin/daemonset.yaml b/argocd/manifests/nvidia-device-plugin/daemonset.yaml new file mode 100644 index 0000000..50eb94e --- /dev/null +++ b/argocd/manifests/nvidia-device-plugin/daemonset.yaml @@ -0,0 +1,36 @@ +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: nvidia-device-plugin + namespace: nvidia-device-plugin + labels: + app: nvidia-device-plugin +spec: + selector: + matchLabels: + app: nvidia-device-plugin + template: + metadata: + labels: + app: nvidia-device-plugin + spec: + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + priorityClassName: system-node-critical + containers: + - name: nvidia-device-plugin + image: nvcr.io/nvidia/k8s-device-plugin:v0.17.0 + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + volumeMounts: + - name: device-plugins + mountPath: /var/lib/kubelet/device-plugins + volumes: + - name: device-plugins + hostPath: + path: /var/lib/kubelet/device-plugins diff --git a/argocd/manifests/nvidia-device-plugin/runtime-class.yaml b/argocd/manifests/nvidia-device-plugin/runtime-class.yaml new file mode 100644 index 0000000..7ba6add --- /dev/null +++ b/argocd/manifests/nvidia-device-plugin/runtime-class.yaml @@ -0,0 +1,6 @@ +--- +apiVersion: node.k8s.io/v1 +kind: RuntimeClass +metadata: + name: nvidia +handler: nvidia diff --git a/docs/changelog.d/feature-frigate-ringtail-gpu.infra.md b/docs/changelog.d/feature-frigate-ringtail-gpu.infra.md new file mode 100644 index 0000000..d204a4f --- /dev/null +++ b/docs/changelog.d/feature-frigate-ringtail-gpu.infra.md @@ -0,0 +1 @@ +Port Frigate NVR to ringtail k3s with RTX 4080 GPU acceleration (TensorRT/ONNX), replacing the ZMQ-based Apple Silicon detector on indri. diff --git a/nixos/ringtail/configuration.nix b/nixos/ringtail/configuration.nix index 6cb0581..0882fe5 100644 --- a/nixos/ringtail/configuration.nix +++ b/nixos/ringtail/configuration.nix @@ -35,6 +35,9 @@ in package = config.boot.kernelPackages.nvidiaPackages.stable; }; + # NVIDIA container toolkit (CDI specs + runtime for containerd/k3s GPU pods) + hardware.nvidia-container-toolkit.enable = true; + # Wayland / Sway programs.sway = { enable = true; @@ -109,6 +112,15 @@ in "--write-kubeconfig-mode=644" "--tls-san=ringtail.tail8d86e.ts.net" ]; + containerdConfigTemplate = '' + {{ template "base" . }} + + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia] + privileged_without_host_devices = false + runtime_type = "io.containerd.runc.v2" + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options] + BinaryName = "${pkgs.nvidia-container-toolkit}/bin/nvidia-container-runtime" + ''; }; # K3s containerd registry mirrors (pull through Zot on indri) -- 2.50.1 (Apple Git-155) From 3e6d997c29664e8a3328140cdfcf6635323f4f68 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 19 Feb 2026 11:44:10 -0800 Subject: [PATCH 02/23] Bump NVIDIA k8s-device-plugin to v0.18.2 Co-Authored-By: Claude Opus 4.6 --- argocd/manifests/nvidia-device-plugin/daemonset.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/argocd/manifests/nvidia-device-plugin/daemonset.yaml b/argocd/manifests/nvidia-device-plugin/daemonset.yaml index 50eb94e..3d6780b 100644 --- a/argocd/manifests/nvidia-device-plugin/daemonset.yaml +++ b/argocd/manifests/nvidia-device-plugin/daemonset.yaml @@ -22,7 +22,7 @@ spec: priorityClassName: system-node-critical containers: - name: nvidia-device-plugin - image: nvcr.io/nvidia/k8s-device-plugin:v0.17.0 + image: nvcr.io/nvidia/k8s-device-plugin:v0.18.2 securityContext: allowPrivilegeEscalation: false capabilities: -- 2.50.1 (Apple Git-155) From cf5194c13882f1a4df23778a9f96feed978b5138 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 19 Feb 2026 11:45:53 -0800 Subject: [PATCH 03/23] Add nvidia-device-plugin to service version tracking Co-Authored-By: Claude Opus 4.6 --- service-versions.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/service-versions.yaml b/service-versions.yaml index dc102e9..1a435f5 100644 --- a/service-versions.yaml +++ b/service-versions.yaml @@ -51,6 +51,13 @@ services: upstream-source: https://github.com/gethomepage/homepage/releases notes: Deployed via Helm chart + - name: nvidia-device-plugin + type: argocd + last-reviewed: 2026-02-19 + current-version: "v0.18.2" + upstream-source: https://github.com/NVIDIA/k8s-device-plugin/releases + notes: DaemonSet + RuntimeClass on ringtail for GPU workloads + - name: frigate type: argocd last-reviewed: 2026-02-17 -- 2.50.1 (Apple Git-155) From 986505c7efb05b9ff131c923982f379e41911f59 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 19 Feb 2026 11:50:50 -0800 Subject: [PATCH 04/23] Enable NFS client support on ringtail for k3s NFS volumes mount.nfs was missing, preventing NFS PersistentVolume mounts. Co-Authored-By: Claude Opus 4.6 --- nixos/ringtail/configuration.nix | 3 +++ 1 file changed, 3 insertions(+) diff --git a/nixos/ringtail/configuration.nix b/nixos/ringtail/configuration.nix index 0882fe5..bba3957 100644 --- a/nixos/ringtail/configuration.nix +++ b/nixos/ringtail/configuration.nix @@ -38,6 +38,9 @@ in # NVIDIA container toolkit (CDI specs + runtime for containerd/k3s GPU pods) hardware.nvidia-container-toolkit.enable = true; + # NFS client support (required for k3s to mount NFS PersistentVolumes) + boot.supportedFilesystems = [ "nfs" ]; + # Wayland / Sway programs.sway = { enable = true; -- 2.50.1 (Apple Git-155) From 57e5aeccc2a736387525584a4e0d8040d855e6ef Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 19 Feb 2026 12:05:46 -0800 Subject: [PATCH 05/23] Fix containerd nvidia runtime config for v3 format K3s ships containerd 2.0+ which uses config v3 format. The plugin key path is 'io.containerd.cri.v1.runtime' not 'io.containerd.grpc.v1.cri'. Co-Authored-By: Claude Opus 4.6 --- nixos/ringtail/configuration.nix | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nixos/ringtail/configuration.nix b/nixos/ringtail/configuration.nix index bba3957..9fde856 100644 --- a/nixos/ringtail/configuration.nix +++ b/nixos/ringtail/configuration.nix @@ -118,10 +118,10 @@ in containerdConfigTemplate = '' {{ template "base" . }} - [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia] + [plugins.'io.containerd.cri.v1.runtime'.containerd.runtimes.nvidia] privileged_without_host_devices = false runtime_type = "io.containerd.runc.v2" - [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options] + [plugins.'io.containerd.cri.v1.runtime'.containerd.runtimes.nvidia.options] BinaryName = "${pkgs.nvidia-container-toolkit}/bin/nvidia-container-runtime" ''; }; -- 2.50.1 (Apple Git-155) From 7e498c5a34f1fa14f75fb8fe280b7c2e5183ed33 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 19 Feb 2026 12:23:18 -0800 Subject: [PATCH 06/23] Add nvidia runtimeClass to device plugin DaemonSet The device plugin needs access to NVIDIA libraries (NVML) to discover GPUs. Running with the nvidia runtime makes device files visible. Co-Authored-By: Claude Opus 4.6 --- argocd/manifests/nvidia-device-plugin/daemonset.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/argocd/manifests/nvidia-device-plugin/daemonset.yaml b/argocd/manifests/nvidia-device-plugin/daemonset.yaml index 3d6780b..a0f0b11 100644 --- a/argocd/manifests/nvidia-device-plugin/daemonset.yaml +++ b/argocd/manifests/nvidia-device-plugin/daemonset.yaml @@ -19,6 +19,7 @@ spec: - key: nvidia.com/gpu operator: Exists effect: NoSchedule + runtimeClassName: nvidia priorityClassName: system-node-critical containers: - name: nvidia-device-plugin -- 2.50.1 (Apple Git-155) From 912dfcab10ec96374d8949a5c7fd0812b4654ea3 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 19 Feb 2026 12:28:16 -0800 Subject: [PATCH 07/23] Switch to CDI for GPU device injection instead of nvidia-container-runtime NixOS splits nvidia-container-toolkit into separate derivations, making the nvidia-container-runtime binary path unreliable in containerd config. CDI (Container Device Interface) is the modern approach: - Enable CDI in k3s containerd config (cdi_spec_dirs: /var/run/cdi) - Device plugin uses CDI annotations to inject GPU devices - Remove RuntimeClass (not needed with CDI) - Remove runtimeClassName from Frigate deployment - Mount CDI specs into device plugin pod Co-Authored-By: Claude Opus 4.6 --- argocd/manifests/frigate/deployment.yaml | 1 - argocd/manifests/nvidia-device-plugin/daemonset.yaml | 12 +++++++++++- .../nvidia-device-plugin/runtime-class.yaml | 6 ------ nixos/ringtail/configuration.nix | 8 +++----- 4 files changed, 14 insertions(+), 13 deletions(-) delete mode 100644 argocd/manifests/nvidia-device-plugin/runtime-class.yaml diff --git a/argocd/manifests/frigate/deployment.yaml b/argocd/manifests/frigate/deployment.yaml index 91c9a77..45bc9e1 100644 --- a/argocd/manifests/frigate/deployment.yaml +++ b/argocd/manifests/frigate/deployment.yaml @@ -23,7 +23,6 @@ spec: mountPath: /config-ro - name: config mountPath: /config - runtimeClassName: nvidia containers: - name: frigate image: ghcr.io/blakeblackshear/frigate:0.17.0-rc2-tensorrt diff --git a/argocd/manifests/nvidia-device-plugin/daemonset.yaml b/argocd/manifests/nvidia-device-plugin/daemonset.yaml index a0f0b11..e3c9081 100644 --- a/argocd/manifests/nvidia-device-plugin/daemonset.yaml +++ b/argocd/manifests/nvidia-device-plugin/daemonset.yaml @@ -19,11 +19,15 @@ spec: - key: nvidia.com/gpu operator: Exists effect: NoSchedule - runtimeClassName: nvidia priorityClassName: system-node-critical containers: - name: nvidia-device-plugin image: nvcr.io/nvidia/k8s-device-plugin:v0.18.2 + env: + - name: DEVICE_LIST_STRATEGY + value: cdi-annotations + - name: CDI_ROOT + value: /var/run/cdi securityContext: allowPrivilegeEscalation: false capabilities: @@ -31,7 +35,13 @@ spec: volumeMounts: - name: device-plugins mountPath: /var/lib/kubelet/device-plugins + - name: cdi-specs + mountPath: /var/run/cdi + readOnly: true volumes: - name: device-plugins hostPath: path: /var/lib/kubelet/device-plugins + - name: cdi-specs + hostPath: + path: /var/run/cdi diff --git a/argocd/manifests/nvidia-device-plugin/runtime-class.yaml b/argocd/manifests/nvidia-device-plugin/runtime-class.yaml deleted file mode 100644 index 7ba6add..0000000 --- a/argocd/manifests/nvidia-device-plugin/runtime-class.yaml +++ /dev/null @@ -1,6 +0,0 @@ ---- -apiVersion: node.k8s.io/v1 -kind: RuntimeClass -metadata: - name: nvidia -handler: nvidia diff --git a/nixos/ringtail/configuration.nix b/nixos/ringtail/configuration.nix index 9fde856..1137a9a 100644 --- a/nixos/ringtail/configuration.nix +++ b/nixos/ringtail/configuration.nix @@ -118,11 +118,9 @@ in containerdConfigTemplate = '' {{ template "base" . }} - [plugins.'io.containerd.cri.v1.runtime'.containerd.runtimes.nvidia] - privileged_without_host_devices = false - runtime_type = "io.containerd.runc.v2" - [plugins.'io.containerd.cri.v1.runtime'.containerd.runtimes.nvidia.options] - BinaryName = "${pkgs.nvidia-container-toolkit}/bin/nvidia-container-runtime" + [plugins.'io.containerd.cri.v1.runtime'] + enable_cdi = true + cdi_spec_dirs = ["/var/run/cdi", "/etc/cdi"] ''; }; -- 2.50.1 (Apple Git-155) From 5194de13b9add1b89ea513de00d071336da2d7c4 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 19 Feb 2026 12:34:20 -0800 Subject: [PATCH 08/23] Mount host NVIDIA libraries into device plugin for NVML access The device plugin needs libnvidia-ml.so to discover GPUs even when using CDI annotations. Mount /run/opengl-driver/lib (NixOS NVIDIA lib path) into the pod and set LD_LIBRARY_PATH. Co-Authored-By: Claude Opus 4.6 --- argocd/manifests/nvidia-device-plugin/daemonset.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/argocd/manifests/nvidia-device-plugin/daemonset.yaml b/argocd/manifests/nvidia-device-plugin/daemonset.yaml index e3c9081..b30d0cb 100644 --- a/argocd/manifests/nvidia-device-plugin/daemonset.yaml +++ b/argocd/manifests/nvidia-device-plugin/daemonset.yaml @@ -28,6 +28,8 @@ spec: value: cdi-annotations - name: CDI_ROOT value: /var/run/cdi + - name: LD_LIBRARY_PATH + value: /run/nvidia/lib securityContext: allowPrivilegeEscalation: false capabilities: @@ -38,6 +40,9 @@ spec: - name: cdi-specs mountPath: /var/run/cdi readOnly: true + - name: nvidia-libs + mountPath: /run/nvidia/lib + readOnly: true volumes: - name: device-plugins hostPath: @@ -45,3 +50,6 @@ spec: - name: cdi-specs hostPath: path: /var/run/cdi + - name: nvidia-libs + hostPath: + path: /run/opengl-driver/lib -- 2.50.1 (Apple Git-155) From 4427eb77f277e173ac0b947b7e6fbc1ba9eab43f Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 19 Feb 2026 12:36:15 -0800 Subject: [PATCH 09/23] Mount NVIDIA libs to standard lib path for NVML discovery go-nvml uses dl.Open which looks in standard library paths. Mount to /usr/lib/x86_64-linux-gnu for reliable discovery. Co-Authored-By: Claude Opus 4.6 --- argocd/manifests/nvidia-device-plugin/daemonset.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/argocd/manifests/nvidia-device-plugin/daemonset.yaml b/argocd/manifests/nvidia-device-plugin/daemonset.yaml index b30d0cb..f3f9545 100644 --- a/argocd/manifests/nvidia-device-plugin/daemonset.yaml +++ b/argocd/manifests/nvidia-device-plugin/daemonset.yaml @@ -29,7 +29,7 @@ spec: - name: CDI_ROOT value: /var/run/cdi - name: LD_LIBRARY_PATH - value: /run/nvidia/lib + value: /usr/lib/x86_64-linux-gnu securityContext: allowPrivilegeEscalation: false capabilities: @@ -41,7 +41,7 @@ spec: mountPath: /var/run/cdi readOnly: true - name: nvidia-libs - mountPath: /run/nvidia/lib + mountPath: /usr/lib/x86_64-linux-gnu readOnly: true volumes: - name: device-plugins -- 2.50.1 (Apple Git-155) From 2cd32108bdafe63f4243c76d1db1b2ce283fee72 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 19 Feb 2026 12:38:11 -0800 Subject: [PATCH 10/23] Run device plugin as privileged for GPU device node access NVML needs both libnvidia-ml.so and /dev/nvidia* device nodes. Mount libs to a non-clobbering path and run privileged (matching NVIDIA's official deployment) for device file access. Co-Authored-By: Claude Opus 4.6 --- argocd/manifests/nvidia-device-plugin/daemonset.yaml | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/argocd/manifests/nvidia-device-plugin/daemonset.yaml b/argocd/manifests/nvidia-device-plugin/daemonset.yaml index f3f9545..4ded1bd 100644 --- a/argocd/manifests/nvidia-device-plugin/daemonset.yaml +++ b/argocd/manifests/nvidia-device-plugin/daemonset.yaml @@ -29,11 +29,9 @@ spec: - name: CDI_ROOT value: /var/run/cdi - name: LD_LIBRARY_PATH - value: /usr/lib/x86_64-linux-gnu + value: /run/nvidia/lib securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: ["ALL"] + privileged: true volumeMounts: - name: device-plugins mountPath: /var/lib/kubelet/device-plugins @@ -41,7 +39,7 @@ spec: mountPath: /var/run/cdi readOnly: true - name: nvidia-libs - mountPath: /usr/lib/x86_64-linux-gnu + mountPath: /run/nvidia/lib readOnly: true volumes: - name: device-plugins -- 2.50.1 (Apple Git-155) From 7b7358225c85bcabe4efe64bdc1a3db426707640 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 19 Feb 2026 12:39:36 -0800 Subject: [PATCH 11/23] Remove CDI device-list-strategy from device plugin MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CDI annotations require NVML validation that fails on NixOS. Use the default envvar strategy for the device plugin — CDI device injection still works at the containerd level via enable_cdi=true. Co-Authored-By: Claude Opus 4.6 --- argocd/manifests/nvidia-device-plugin/daemonset.yaml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/argocd/manifests/nvidia-device-plugin/daemonset.yaml b/argocd/manifests/nvidia-device-plugin/daemonset.yaml index 4ded1bd..819a6e4 100644 --- a/argocd/manifests/nvidia-device-plugin/daemonset.yaml +++ b/argocd/manifests/nvidia-device-plugin/daemonset.yaml @@ -24,10 +24,6 @@ spec: - name: nvidia-device-plugin image: nvcr.io/nvidia/k8s-device-plugin:v0.18.2 env: - - name: DEVICE_LIST_STRATEGY - value: cdi-annotations - - name: CDI_ROOT - value: /var/run/cdi - name: LD_LIBRARY_PATH value: /run/nvidia/lib securityContext: -- 2.50.1 (Apple Git-155) From 1556eaa5e4d1adde4b5c6a568215e5e05688876e Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 19 Feb 2026 12:42:46 -0800 Subject: [PATCH 12/23] Mount /nix/store to resolve NVIDIA library symlinks in device plugin NixOS /run/opengl-driver/lib contains symlinks to /nix/store paths. Without mounting the nix store, the symlinks are dangling inside the container and libnvidia-ml.so can't be loaded. Co-Authored-By: Claude Opus 4.6 --- argocd/manifests/nvidia-device-plugin/daemonset.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/argocd/manifests/nvidia-device-plugin/daemonset.yaml b/argocd/manifests/nvidia-device-plugin/daemonset.yaml index 819a6e4..98349fa 100644 --- a/argocd/manifests/nvidia-device-plugin/daemonset.yaml +++ b/argocd/manifests/nvidia-device-plugin/daemonset.yaml @@ -37,6 +37,9 @@ spec: - name: nvidia-libs mountPath: /run/nvidia/lib readOnly: true + - name: nix-store + mountPath: /nix/store + readOnly: true volumes: - name: device-plugins hostPath: @@ -47,3 +50,6 @@ spec: - name: nvidia-libs hostPath: path: /run/opengl-driver/lib + - name: nix-store + hostPath: + path: /nix/store -- 2.50.1 (Apple Git-155) From 37f625b1fa39e961959b84e793a664d80592bdca Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 19 Feb 2026 13:04:38 -0800 Subject: [PATCH 13/23] Switch nvidia device plugin to CDI device list strategy Use CDI-based device injection instead of nvidia-container-runtime. The NixOS nvidia-container-toolkit module generates CDI specs with all the correct nix store paths, so containerd's native CDI support handles GPU device and library injection without a custom runtime. Co-Authored-By: Claude Opus 4.6 --- argocd/manifests/nvidia-device-plugin/daemonset.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/argocd/manifests/nvidia-device-plugin/daemonset.yaml b/argocd/manifests/nvidia-device-plugin/daemonset.yaml index 98349fa..73bd17e 100644 --- a/argocd/manifests/nvidia-device-plugin/daemonset.yaml +++ b/argocd/manifests/nvidia-device-plugin/daemonset.yaml @@ -23,6 +23,9 @@ spec: containers: - name: nvidia-device-plugin image: nvcr.io/nvidia/k8s-device-plugin:v0.18.2 + args: + - --device-list-strategy=cdi-cri + - --cdi-annotation-prefix=cdi.k8s.io/ env: - name: LD_LIBRARY_PATH value: /run/nvidia/lib -- 2.50.1 (Apple Git-155) From 9192a3120434e97c9e3d66c02f754a7692637ddd Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 19 Feb 2026 13:20:01 -0800 Subject: [PATCH 14/23] Use nvidia-container-runtime.cdi for GPU workload injection Replace the CDI device-list-strategy approach (which fails because the device plugin generates its own CDI specs and can't find libs on NixOS) with the nvidia-container-runtime.cdi runtime handler approach: - Add wrapper script at /etc/nvidia-container-runtime/ that provides runc in PATH for nvidia-container-runtime.cdi - Register nvidia runtime handler in k3s containerd config - Create RuntimeClass for GPU workloads - Revert device plugin to default envvar strategy (already working) - Add runtimeClassName: nvidia to Frigate deployment The nvidia-container-runtime.cdi binary reads the NixOS-generated CDI specs from /var/run/cdi/ and injects GPU devices and driver libraries into containers at create time. Co-Authored-By: Claude Opus 4.6 --- argocd/manifests/frigate/deployment.yaml | 1 + .../nvidia-device-plugin/daemonset.yaml | 3 --- .../nvidia-device-plugin/runtime-class.yaml | 6 ++++++ nixos/ringtail/configuration.nix | 18 ++++++++++++++++++ 4 files changed, 25 insertions(+), 3 deletions(-) create mode 100644 argocd/manifests/nvidia-device-plugin/runtime-class.yaml diff --git a/argocd/manifests/frigate/deployment.yaml b/argocd/manifests/frigate/deployment.yaml index 45bc9e1..50d9d00 100644 --- a/argocd/manifests/frigate/deployment.yaml +++ b/argocd/manifests/frigate/deployment.yaml @@ -14,6 +14,7 @@ spec: labels: app: frigate spec: + runtimeClassName: nvidia initContainers: - name: copy-config image: busybox:1.37 diff --git a/argocd/manifests/nvidia-device-plugin/daemonset.yaml b/argocd/manifests/nvidia-device-plugin/daemonset.yaml index 73bd17e..98349fa 100644 --- a/argocd/manifests/nvidia-device-plugin/daemonset.yaml +++ b/argocd/manifests/nvidia-device-plugin/daemonset.yaml @@ -23,9 +23,6 @@ spec: containers: - name: nvidia-device-plugin image: nvcr.io/nvidia/k8s-device-plugin:v0.18.2 - args: - - --device-list-strategy=cdi-cri - - --cdi-annotation-prefix=cdi.k8s.io/ env: - name: LD_LIBRARY_PATH value: /run/nvidia/lib diff --git a/argocd/manifests/nvidia-device-plugin/runtime-class.yaml b/argocd/manifests/nvidia-device-plugin/runtime-class.yaml new file mode 100644 index 0000000..7ba6add --- /dev/null +++ b/argocd/manifests/nvidia-device-plugin/runtime-class.yaml @@ -0,0 +1,6 @@ +--- +apiVersion: node.k8s.io/v1 +kind: RuntimeClass +metadata: + name: nvidia +handler: nvidia diff --git a/nixos/ringtail/configuration.nix b/nixos/ringtail/configuration.nix index 1137a9a..185dc75 100644 --- a/nixos/ringtail/configuration.nix +++ b/nixos/ringtail/configuration.nix @@ -38,6 +38,18 @@ in # NVIDIA container toolkit (CDI specs + runtime for containerd/k3s GPU pods) hardware.nvidia-container-toolkit.enable = true; + # Stable-path wrapper for nvidia-container-runtime.cdi (the CDI-based OCI + # runtime that injects GPU devices/libs from NixOS-generated CDI specs). + # The wrapper adds runc to PATH since k3s doesn't ship a standalone runc binary. + environment.etc."nvidia-container-runtime/nvidia-runtime-cdi-wrapper" = { + mode = "0755"; + text = '' + #!/bin/sh + export PATH="${pkgs.runc}/bin:$PATH" + exec ${pkgs.nvidia-container-toolkit.tools}/bin/nvidia-container-runtime.cdi "$@" + ''; + }; + # NFS client support (required for k3s to mount NFS PersistentVolumes) boot.supportedFilesystems = [ "nfs" ]; @@ -121,6 +133,12 @@ in [plugins.'io.containerd.cri.v1.runtime'] enable_cdi = true cdi_spec_dirs = ["/var/run/cdi", "/etc/cdi"] + + [plugins.'io.containerd.cri.v1.runtime'.containerd.runtimes.nvidia] + privileged_without_host_devices = false + runtime_type = "io.containerd.runc.v2" + [plugins.'io.containerd.cri.v1.runtime'.containerd.runtimes.nvidia.options] + BinaryName = "/etc/nvidia-container-runtime/nvidia-runtime-cdi-wrapper" ''; }; -- 2.50.1 (Apple Git-155) From bb1e1e5af9cb050b5f88bbf73dbd95b5ebb3f52b Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 19 Feb 2026 13:24:31 -0800 Subject: [PATCH 15/23] Use index-based device IDs in nvidia device plugin The CDI spec generated by NixOS uses index-based device names (0, all) not UUIDs. The device plugin must match by using --device-id-strategy=index, otherwise nvidia-container-runtime.cdi fails to resolve CDI devices. Co-Authored-By: Claude Opus 4.6 --- argocd/manifests/nvidia-device-plugin/daemonset.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/argocd/manifests/nvidia-device-plugin/daemonset.yaml b/argocd/manifests/nvidia-device-plugin/daemonset.yaml index 98349fa..0bdb66c 100644 --- a/argocd/manifests/nvidia-device-plugin/daemonset.yaml +++ b/argocd/manifests/nvidia-device-plugin/daemonset.yaml @@ -23,6 +23,8 @@ spec: containers: - name: nvidia-device-plugin image: nvcr.io/nvidia/k8s-device-plugin:v0.18.2 + args: + - --device-id-strategy=index env: - name: LD_LIBRARY_PATH value: /run/nvidia/lib -- 2.50.1 (Apple Git-155) From 27353792edd97add9c882c6688b5700b7a674e75 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 19 Feb 2026 13:26:11 -0800 Subject: [PATCH 16/23] Use Recreate strategy for Frigate deployment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GPU resources can't be shared during rolling updates — the old pod holds nvidia.com/gpu preventing the new pod from scheduling. Recreate strategy ensures the old pod is terminated before the new one starts. Co-Authored-By: Claude Opus 4.6 --- argocd/manifests/frigate/deployment.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/argocd/manifests/frigate/deployment.yaml b/argocd/manifests/frigate/deployment.yaml index 50d9d00..1460bb3 100644 --- a/argocd/manifests/frigate/deployment.yaml +++ b/argocd/manifests/frigate/deployment.yaml @@ -6,6 +6,8 @@ metadata: namespace: frigate spec: replicas: 1 + strategy: + type: Recreate selector: matchLabels: app: frigate -- 2.50.1 (Apple Git-155) From 870d60201967b041ee1fe0a75cb9e2506e8bb115 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 19 Feb 2026 13:28:35 -0800 Subject: [PATCH 17/23] Use Frigate default model instead of custom YOLOv9m The YOLOv9m ONNX model has ops not fully partitionable to CUDA EP, causing CUDA graph capture to fail on the -tensorrt image. Use the default model that ships with the image and is tested for GPU inference. Co-Authored-By: Claude Opus 4.6 --- argocd/manifests/frigate/configmap.yaml | 9 --------- 1 file changed, 9 deletions(-) diff --git a/argocd/manifests/frigate/configmap.yaml b/argocd/manifests/frigate/configmap.yaml index df7c70c..b0d83ea 100644 --- a/argocd/manifests/frigate/configmap.yaml +++ b/argocd/manifests/frigate/configmap.yaml @@ -59,15 +59,6 @@ data: onnx: type: onnx - model: - model_type: yolo-generic - width: 320 - height: 320 - input_tensor: nchw - input_dtype: float - path: /media/frigate/models/yolov9m.onnx - labelmap_path: /labelmap/coco-80.txt - record: enabled: true continuous: -- 2.50.1 (Apple Git-155) From 4b12e7f7fa636774490e588dca29fb2a11f2430e Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 19 Feb 2026 13:31:35 -0800 Subject: [PATCH 18/23] Use YOLO-NAS model for TensorRT-compatible ONNX inference The YOLOv9m model fails with CUDA graph capture on the tensorrt image. Try YOLO-NAS-S which has a different architecture that may be fully partitionable to the CUDA execution provider. Co-Authored-By: Claude Opus 4.6 --- argocd/manifests/frigate/configmap.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/argocd/manifests/frigate/configmap.yaml b/argocd/manifests/frigate/configmap.yaml index b0d83ea..42a57d4 100644 --- a/argocd/manifests/frigate/configmap.yaml +++ b/argocd/manifests/frigate/configmap.yaml @@ -59,6 +59,15 @@ data: onnx: type: onnx + model: + model_type: yolonas + width: 320 + height: 320 + input_tensor: nchw + input_dtype: float + path: /media/frigate/models/yolo_nas_s.onnx + labelmap_path: /labelmap/coco-80.txt + record: enabled: true continuous: -- 2.50.1 (Apple Git-155) From 95873bcca2bd89496eff6be4486a53685c86b2dc Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 19 Feb 2026 13:34:13 -0800 Subject: [PATCH 19/23] Fix YOLO-NAS input dtype: use int (uint8) not float YOLO-NAS expects uint8 input tensors, not float32. Co-Authored-By: Claude Opus 4.6 --- argocd/manifests/frigate/configmap.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/argocd/manifests/frigate/configmap.yaml b/argocd/manifests/frigate/configmap.yaml index 42a57d4..8dd0aba 100644 --- a/argocd/manifests/frigate/configmap.yaml +++ b/argocd/manifests/frigate/configmap.yaml @@ -64,7 +64,7 @@ data: width: 320 height: 320 input_tensor: nchw - input_dtype: float + input_dtype: int path: /media/frigate/models/yolo_nas_s.onnx labelmap_path: /labelmap/coco-80.txt -- 2.50.1 (Apple Git-155) From a5949f228d25ff0c0956f0545a4b22d90f65738a Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 19 Feb 2026 13:39:34 -0800 Subject: [PATCH 20/23] Add Frigate and Ntfy as static homepage services These services moved to ringtail k3s and are no longer autodiscovered by homepage (which runs on indri's minikube). Add them as static service entries in the Infrastructure group. Co-Authored-By: Claude Opus 4.6 --- argocd/manifests/homepage/values.yaml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/argocd/manifests/homepage/values.yaml b/argocd/manifests/homepage/values.yaml index 73d4252..151c46e 100644 --- a/argocd/manifests/homepage/values.yaml +++ b/argocd/manifests/homepage/values.yaml @@ -135,6 +135,17 @@ config: # type: caddy # url: http://indri.tail8d86e.ts.net:2019 + # Services on ringtail k3s (not autodiscovered — different cluster) + - Infrastructure: + - NVR: + href: https://nvr.ops.eblu.me + icon: frigate.png + description: Network video recorder + - Ntfy: + href: https://ntfy.ops.eblu.me + icon: ntfy.png + description: Push notifications + # External bookmarks bookmarks: - Admin: -- 2.50.1 (Apple Git-155) From 333950d3ba2d7baa8b1ad6dd7c966382abec17cf Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 19 Feb 2026 13:47:23 -0800 Subject: [PATCH 21/23] Replace /nix/store mount with targeted nvidia driver lib path Create a stable symlink at /etc/nvidia-driver/lib pointing to the nvidia driver package's lib directory. The device plugin now mounts only the driver libs it needs instead of the entire nix store. Co-Authored-By: Claude Opus 4.6 --- argocd/manifests/nvidia-device-plugin/daemonset.yaml | 8 +------- nixos/ringtail/configuration.nix | 4 ++++ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/argocd/manifests/nvidia-device-plugin/daemonset.yaml b/argocd/manifests/nvidia-device-plugin/daemonset.yaml index 0bdb66c..479d6e9 100644 --- a/argocd/manifests/nvidia-device-plugin/daemonset.yaml +++ b/argocd/manifests/nvidia-device-plugin/daemonset.yaml @@ -39,9 +39,6 @@ spec: - name: nvidia-libs mountPath: /run/nvidia/lib readOnly: true - - name: nix-store - mountPath: /nix/store - readOnly: true volumes: - name: device-plugins hostPath: @@ -51,7 +48,4 @@ spec: path: /var/run/cdi - name: nvidia-libs hostPath: - path: /run/opengl-driver/lib - - name: nix-store - hostPath: - path: /nix/store + path: /etc/nvidia-driver/lib diff --git a/nixos/ringtail/configuration.nix b/nixos/ringtail/configuration.nix index 185dc75..5a0035b 100644 --- a/nixos/ringtail/configuration.nix +++ b/nixos/ringtail/configuration.nix @@ -38,6 +38,10 @@ in # NVIDIA container toolkit (CDI specs + runtime for containerd/k3s GPU pods) hardware.nvidia-container-toolkit.enable = true; + # Stable path to NVIDIA driver libraries for k3s device plugin pod mounts. + # Avoids mounting all of /nix/store — only the driver derivation is needed. + environment.etc."nvidia-driver/lib".source = "${config.hardware.nvidia.package}/lib"; + # Stable-path wrapper for nvidia-container-runtime.cdi (the CDI-based OCI # runtime that injects GPU devices/libs from NixOS-generated CDI specs). # The wrapper adds runc to PATH since k3s doesn't ship a standalone runc binary. -- 2.50.1 (Apple Git-155) From 5d44213017e1fe15ce3af551ef1bdb65274f69ca Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 19 Feb 2026 13:59:29 -0800 Subject: [PATCH 22/23] Filter frigate-notify alerts by zone and label frigate-notify was firing on every MQTT detection event regardless of zone, causing notification spam. Add filters to match the Frigate review config: only alert for person/car in the driveway_entrance zone, and drop all unzoned events. Co-Authored-By: Claude Opus 4.6 --- argocd/manifests/frigate/configmap-notify.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/argocd/manifests/frigate/configmap-notify.yaml b/argocd/manifests/frigate/configmap-notify.yaml index ed357ad..890557a 100644 --- a/argocd/manifests/frigate/configmap-notify.yaml +++ b/argocd/manifests/frigate/configmap-notify.yaml @@ -23,6 +23,16 @@ data: general: title: "Frigate Alert" + zones: + unzoned: drop + allow: + - driveway_entrance + + labels: + allow: + - person + - car + ntfy: enabled: true server: http://ntfy.ntfy.svc.cluster.local:80 -- 2.50.1 (Apple Git-155) From 03dc4a5235c43d46faa5a3fbaa8c945e831571fa Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 19 Feb 2026 14:03:07 -0800 Subject: [PATCH 23/23] Keep MQTT for real-time alerts, add Pacific timezone MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Revert webapi change — polling latency is too high for alerts. MQTT with zone/label filters gives sub-second delivery. Add TZ=America/Los_Angeles to frigate-notify for local timestamps. Co-Authored-By: Claude Opus 4.6 --- argocd/manifests/frigate/deployment-notify.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/argocd/manifests/frigate/deployment-notify.yaml b/argocd/manifests/frigate/deployment-notify.yaml index 6273d71..4083d4d 100644 --- a/argocd/manifests/frigate/deployment-notify.yaml +++ b/argocd/manifests/frigate/deployment-notify.yaml @@ -17,6 +17,9 @@ spec: containers: - name: frigate-notify image: ghcr.io/0x2142/frigate-notify:v0.3.5 + env: + - name: TZ + value: America/Los_Angeles volumeMounts: - name: config mountPath: /app/config.yml -- 2.50.1 (Apple Git-155)