From d5d32fe91fe0f1f847581d970d9b41038dba3e94 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 19 Feb 2026 14:27:04 -0800 Subject: [PATCH] Port Frigate NVR to ringtail k3s with GPU acceleration (#217) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary - Enable NVIDIA container toolkit on ringtail NixOS and configure k3s containerd with nvidia runtime - Add NVIDIA device plugin ArgoCD app (RuntimeClass + DaemonSet) to expose `nvidia.com/gpu` resources - Re-target Frigate from indri minikube (arm64, ZMQ detector) to ringtail k3s (x86_64, TensorRT/ONNX) - Switch Frigate image to `-tensorrt` variant with GPU resource limits and increased shared memory ## Manual Prerequisites 1. **NFS access**: Verify ringtail can mount `sifaka:/volume1/frigate` ```fish ssh ringtail 'sudo mount -t nfs sifaka:/volume1/frigate /mnt/storage1 && ls /mnt/storage1 && sudo umount /mnt/storage1' ``` 2. **YOLO model**: Verify `/volume1/frigate/models/yolo_nas_s.onnx` exists on sifaka ## Deployment Steps 1. Provision ringtail: `mise run provision-ringtail` 2. Sync ArgoCD apps: `argocd app sync apps --prune` 3. Deploy NVIDIA device plugin: `argocd app sync nvidia-device-plugin` 4. Verify GPU: `kubectl --context=k3s-ringtail get nodes -o json | jq '.items[].status.capacity'` 5. 
Deploy Frigate: `argocd app sync frigate` ## Verification - [ ] `nvidia.com/gpu: 1` visible in node capacity - [ ] Frigate pod running with GPU allocated - [ ] Frigate UI loads at `https://nvr.ops.eblu.me` - [ ] Detector shows ONNX/TensorRT on System page - [ ] Camera feed with bounding boxes in live view - [ ] TensorRT engine build completes (watch logs on first start) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Reviewed-on: https://forge.ops.eblu.me/eblume/blumeops/pulls/217 --- argocd/apps/frigate.yaml | 2 +- argocd/apps/nvidia-device-plugin.yaml | 18 +++++++ .../manifests/frigate/configmap-notify.yaml | 10 ++++ argocd/manifests/frigate/configmap.yaml | 12 ++-- .../manifests/frigate/deployment-notify.yaml | 3 ++ argocd/manifests/frigate/deployment.yaml | 8 ++- argocd/manifests/frigate/pv-nfs.yaml | 4 +- argocd/manifests/frigate/pvc-database.yaml | 2 +- argocd/manifests/homepage/values.yaml | 11 ++++ .../nvidia-device-plugin/daemonset.yaml | 51 +++++++++++++++++++ .../nvidia-device-plugin/runtime-class.yaml | 6 +++ .../feature-frigate-ringtail-gpu.infra.md | 1 + nixos/ringtail/configuration.nix | 35 +++++++++++++ service-versions.yaml | 7 +++ 14 files changed, 158 insertions(+), 12 deletions(-) create mode 100644 argocd/apps/nvidia-device-plugin.yaml create mode 100644 argocd/manifests/nvidia-device-plugin/daemonset.yaml create mode 100644 argocd/manifests/nvidia-device-plugin/runtime-class.yaml create mode 100644 docs/changelog.d/feature-frigate-ringtail-gpu.infra.md diff --git a/argocd/apps/frigate.yaml b/argocd/apps/frigate.yaml index a90f412..c443774 100644 --- a/argocd/apps/frigate.yaml +++ b/argocd/apps/frigate.yaml @@ -11,7 +11,7 @@ spec: targetRevision: main path: argocd/manifests/frigate destination: - server: https://kubernetes.default.svc + server: https://ringtail.tail8d86e.ts.net:6443 namespace: frigate syncPolicy: syncOptions: diff --git a/argocd/apps/nvidia-device-plugin.yaml b/argocd/apps/nvidia-device-plugin.yaml new file mode 
100644 index 0000000..af8395f --- /dev/null +++ b/argocd/apps/nvidia-device-plugin.yaml @@ -0,0 +1,18 @@ +--- +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: nvidia-device-plugin + namespace: argocd +spec: + project: default + source: + repoURL: ssh://forgejo@forge.ops.eblu.me:2222/eblume/blumeops.git + targetRevision: main + path: argocd/manifests/nvidia-device-plugin + destination: + server: https://ringtail.tail8d86e.ts.net:6443 + namespace: nvidia-device-plugin + syncPolicy: + syncOptions: + - CreateNamespace=true diff --git a/argocd/manifests/frigate/configmap-notify.yaml b/argocd/manifests/frigate/configmap-notify.yaml index ed357ad..890557a 100644 --- a/argocd/manifests/frigate/configmap-notify.yaml +++ b/argocd/manifests/frigate/configmap-notify.yaml @@ -23,6 +23,16 @@ data: general: title: "Frigate Alert" + zones: + unzoned: drop + allow: + - driveway_entrance + + labels: + allow: + - person + - car + ntfy: enabled: true server: http://ntfy.ntfy.svc.cluster.local:80 diff --git a/argocd/manifests/frigate/configmap.yaml b/argocd/manifests/frigate/configmap.yaml index 9c96008..8dd0aba 100644 --- a/argocd/manifests/frigate/configmap.yaml +++ b/argocd/manifests/frigate/configmap.yaml @@ -56,17 +56,17 @@ data: track: [person, car, dog, cat, bird] detectors: - apple_silicon: - type: zmq - endpoint: tcp://host.minikube.internal:5555 + onnx: + type: onnx model: - model_type: yolo-generic + model_type: yolonas width: 320 height: 320 input_tensor: nchw - input_dtype: float - path: /media/frigate/models/yolov9m.onnx + input_dtype: int + input_pixel_format: bgr + path: /media/frigate/models/yolo_nas_s.onnx labelmap_path: /labelmap/coco-80.txt record: diff --git a/argocd/manifests/frigate/deployment-notify.yaml b/argocd/manifests/frigate/deployment-notify.yaml index 6273d71..4083d4d 100644 --- a/argocd/manifests/frigate/deployment-notify.yaml +++ b/argocd/manifests/frigate/deployment-notify.yaml @@ -17,6 +17,9 @@ spec: containers: - name: frigate-notify image: 
ghcr.io/0x2142/frigate-notify:v0.3.5 + env: + - name: TZ + value: America/Los_Angeles volumeMounts: - name: config mountPath: /app/config.yml diff --git a/argocd/manifests/frigate/deployment.yaml b/argocd/manifests/frigate/deployment.yaml index afe11fb..1460bb3 100644 --- a/argocd/manifests/frigate/deployment.yaml +++ b/argocd/manifests/frigate/deployment.yaml @@ -6,6 +6,8 @@ metadata: namespace: frigate spec: replicas: 1 + strategy: + type: Recreate selector: matchLabels: app: frigate @@ -14,6 +16,7 @@ spec: labels: app: frigate spec: + runtimeClassName: nvidia initContainers: - name: copy-config image: busybox:1.37 @@ -25,7 +28,7 @@ spec: mountPath: /config containers: - name: frigate - image: ghcr.io/blakeblackshear/frigate:0.17.0-rc2-standard-arm64 + image: ghcr.io/blakeblackshear/frigate:0.17.0-rc2-tensorrt ports: - containerPort: 5000 name: http @@ -60,6 +63,7 @@ spec: limits: memory: "2Gi" cpu: "2000m" + nvidia.com/gpu: "1" livenessProbe: httpGet: path: /api/version @@ -87,4 +91,4 @@ spec: - name: shm emptyDir: medium: Memory - sizeLimit: 256Mi + sizeLimit: 512Mi diff --git a/argocd/manifests/frigate/pv-nfs.yaml b/argocd/manifests/frigate/pv-nfs.yaml index d3a592b..c7197ab 100644 --- a/argocd/manifests/frigate/pv-nfs.yaml +++ b/argocd/manifests/frigate/pv-nfs.yaml @@ -1,11 +1,11 @@ # NFS PersistentVolume for Frigate recordings -# Requires: NFS share on sifaka at /volume1/frigate with NFS permissions for indri +# Requires: NFS share on sifaka at /volume1/frigate with NFS permissions for ringtail # # To create on Synology: # 1. Control Panel > Shared Folder > Create # 2. Name: frigate, Location: Volume 1 # 3. Control Panel > File Services > NFS > NFS Rules -# 4. Add rule for "frigate" share: Hostname=indri, Privilege=Read/Write, Squash=No mapping +# 4. 
Add rule for "frigate" share: Hostname=ringtail, Privilege=Read/Write, Squash=No mapping apiVersion: v1 kind: PersistentVolume metadata: diff --git a/argocd/manifests/frigate/pvc-database.yaml b/argocd/manifests/frigate/pvc-database.yaml index 040bda3..1eacb1d 100644 --- a/argocd/manifests/frigate/pvc-database.yaml +++ b/argocd/manifests/frigate/pvc-database.yaml @@ -1,5 +1,5 @@ # PersistentVolumeClaim for Frigate SQLite database -# Uses minikube's default storage class for local provisioning +# Uses k3s local-path storage class for local provisioning apiVersion: v1 kind: PersistentVolumeClaim metadata: diff --git a/argocd/manifests/homepage/values.yaml b/argocd/manifests/homepage/values.yaml index 73d4252..151c46e 100644 --- a/argocd/manifests/homepage/values.yaml +++ b/argocd/manifests/homepage/values.yaml @@ -135,6 +135,17 @@ config: # type: caddy # url: http://indri.tail8d86e.ts.net:2019 + # Services on ringtail k3s (not autodiscovered — different cluster) + - Infrastructure: + - NVR: + href: https://nvr.ops.eblu.me + icon: frigate.png + description: Network video recorder + - Ntfy: + href: https://ntfy.ops.eblu.me + icon: ntfy.png + description: Push notifications + # External bookmarks bookmarks: - Admin: diff --git a/argocd/manifests/nvidia-device-plugin/daemonset.yaml b/argocd/manifests/nvidia-device-plugin/daemonset.yaml new file mode 100644 index 0000000..479d6e9 --- /dev/null +++ b/argocd/manifests/nvidia-device-plugin/daemonset.yaml @@ -0,0 +1,51 @@ +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: nvidia-device-plugin + namespace: nvidia-device-plugin + labels: + app: nvidia-device-plugin +spec: + selector: + matchLabels: + app: nvidia-device-plugin + template: + metadata: + labels: + app: nvidia-device-plugin + spec: + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + priorityClassName: system-node-critical + containers: + - name: nvidia-device-plugin + image: nvcr.io/nvidia/k8s-device-plugin:v0.18.2 + args: + 
- --device-id-strategy=index + env: + - name: LD_LIBRARY_PATH + value: /run/nvidia/lib + securityContext: + privileged: true + volumeMounts: + - name: device-plugins + mountPath: /var/lib/kubelet/device-plugins + - name: cdi-specs + mountPath: /var/run/cdi + readOnly: true + - name: nvidia-libs + mountPath: /run/nvidia/lib + readOnly: true + volumes: + - name: device-plugins + hostPath: + path: /var/lib/kubelet/device-plugins + - name: cdi-specs + hostPath: + path: /var/run/cdi + - name: nvidia-libs + hostPath: + path: /etc/nvidia-driver/lib diff --git a/argocd/manifests/nvidia-device-plugin/runtime-class.yaml b/argocd/manifests/nvidia-device-plugin/runtime-class.yaml new file mode 100644 index 0000000..7ba6add --- /dev/null +++ b/argocd/manifests/nvidia-device-plugin/runtime-class.yaml @@ -0,0 +1,6 @@ +--- +apiVersion: node.k8s.io/v1 +kind: RuntimeClass +metadata: + name: nvidia +handler: nvidia diff --git a/docs/changelog.d/feature-frigate-ringtail-gpu.infra.md b/docs/changelog.d/feature-frigate-ringtail-gpu.infra.md new file mode 100644 index 0000000..d204a4f --- /dev/null +++ b/docs/changelog.d/feature-frigate-ringtail-gpu.infra.md @@ -0,0 +1 @@ +Port Frigate NVR to ringtail k3s with RTX 4080 GPU acceleration (TensorRT/ONNX), replacing the ZMQ-based Apple Silicon detector on indri. diff --git a/nixos/ringtail/configuration.nix b/nixos/ringtail/configuration.nix index 6cb0581..5a0035b 100644 --- a/nixos/ringtail/configuration.nix +++ b/nixos/ringtail/configuration.nix @@ -35,6 +35,28 @@ in package = config.boot.kernelPackages.nvidiaPackages.stable; }; + # NVIDIA container toolkit (CDI specs + runtime for containerd/k3s GPU pods) + hardware.nvidia-container-toolkit.enable = true; + + # Stable path to NVIDIA driver libraries for k3s device plugin pod mounts. + # Avoids mounting all of /nix/store — only the driver derivation is needed. 
+ environment.etc."nvidia-driver/lib".source = "${config.hardware.nvidia.package}/lib"; + + # Stable-path wrapper for nvidia-container-runtime.cdi (the CDI-based OCI + # runtime that injects GPU devices/libs from NixOS-generated CDI specs). + # The wrapper adds runc to PATH since k3s doesn't ship a standalone runc binary. + environment.etc."nvidia-container-runtime/nvidia-runtime-cdi-wrapper" = { + mode = "0755"; + text = '' + #!/bin/sh + export PATH="${pkgs.runc}/bin:$PATH" + exec ${pkgs.nvidia-container-toolkit.tools}/bin/nvidia-container-runtime.cdi "$@" + ''; + }; + + # NFS client support (required for k3s to mount NFS PersistentVolumes) + boot.supportedFilesystems = [ "nfs" ]; + # Wayland / Sway programs.sway = { enable = true; @@ -109,6 +131,19 @@ in "--write-kubeconfig-mode=644" "--tls-san=ringtail.tail8d86e.ts.net" ]; + containerdConfigTemplate = '' + {{ template "base" . }} + + [plugins.'io.containerd.cri.v1.runtime'] + enable_cdi = true + cdi_spec_dirs = ["/var/run/cdi", "/etc/cdi"] + + [plugins.'io.containerd.cri.v1.runtime'.containerd.runtimes.nvidia] + privileged_without_host_devices = false + runtime_type = "io.containerd.runc.v2" + [plugins.'io.containerd.cri.v1.runtime'.containerd.runtimes.nvidia.options] + BinaryName = "/etc/nvidia-container-runtime/nvidia-runtime-cdi-wrapper" + ''; }; # K3s containerd registry mirrors (pull through Zot on indri) diff --git a/service-versions.yaml b/service-versions.yaml index dc102e9..1a435f5 100644 --- a/service-versions.yaml +++ b/service-versions.yaml @@ -51,6 +51,13 @@ services: upstream-source: https://github.com/gethomepage/homepage/releases notes: Deployed via Helm chart + - name: nvidia-device-plugin + type: argocd + last-reviewed: 2026-02-19 + current-version: "v0.18.2" + upstream-source: https://github.com/NVIDIA/k8s-device-plugin/releases + notes: DaemonSet + RuntimeClass on ringtail for GPU workloads + - name: frigate type: argocd last-reviewed: 2026-02-17