From 483db74a3ccc544373a2eb8c820e3e742c9191ee Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Mon, 9 Feb 2026 16:03:05 -0800 Subject: [PATCH 1/6] Add SMART disk health monitoring and Ansible provisioning for sifaka NAS Adds smartctl_exporter alongside the existing node_exporter on sifaka, routed through Caddy L4 TCP proxy at nas.ops.eblu.me, with a Grafana dashboard for disk health visibility. Introduces the first Ansible playbook for sifaka (mise run provision-sifaka) and shared exporter port variables in group_vars/all.yml. Co-Authored-By: Claude Opus 4.6 --- ansible/group_vars/all.yml | 4 + ansible/inventory/host_vars/sifaka.yml | 3 + ansible/playbooks/sifaka.yml | 7 + ansible/roles/caddy/defaults/main.yml | 4 + .../roles/sifaka_exporters/defaults/main.yml | 5 + .../roles/sifaka_exporters/handlers/main.yml | 10 + ansible/roles/sifaka_exporters/tasks/main.yml | 81 +++++ .../dashboards/configmap-sifaka-disks.yaml | 314 ++++++++++++++++++ .../grafana-config/kustomization.yaml | 1 + argocd/manifests/prometheus/configmap.yaml | 11 +- ...eature-sifaka-ops-observability.feature.md | 1 + docs/reference/infrastructure/routing.md | 2 + docs/reference/storage/sifaka.md | 9 +- mise-tasks/provision-sifaka | 9 + 14 files changed, 456 insertions(+), 5 deletions(-) create mode 100644 ansible/inventory/host_vars/sifaka.yml create mode 100644 ansible/playbooks/sifaka.yml create mode 100644 ansible/roles/sifaka_exporters/defaults/main.yml create mode 100644 ansible/roles/sifaka_exporters/handlers/main.yml create mode 100644 ansible/roles/sifaka_exporters/tasks/main.yml create mode 100644 argocd/manifests/grafana-config/dashboards/configmap-sifaka-disks.yaml create mode 100644 docs/changelog.d/feature-sifaka-ops-observability.feature.md create mode 100755 mise-tasks/provision-sifaka diff --git a/ansible/group_vars/all.yml b/ansible/group_vars/all.yml index a9f303d..342a493 100644 --- a/ansible/group_vars/all.yml +++ b/ansible/group_vars/all.yml @@ -1,2 +1,6 @@ --- ansible_managed: 
"Managed by ansible - do not edit. Source: ssh://forgejo@forge.ops.eblu.me:2222/eblume/blumeops.git" + +# Sifaka NAS exporter ports — shared by caddy (indri) and sifaka_exporters roles +sifaka_node_exporter_port: 9100 +sifaka_smartctl_exporter_port: 9633 diff --git a/ansible/inventory/host_vars/sifaka.yml b/ansible/inventory/host_vars/sifaka.yml new file mode 100644 index 0000000..1afd4d8 --- /dev/null +++ b/ansible/inventory/host_vars/sifaka.yml @@ -0,0 +1,3 @@ +--- +ansible_user: eblume +ansible_python_interpreter: /usr/bin/python3 diff --git a/ansible/playbooks/sifaka.yml b/ansible/playbooks/sifaka.yml new file mode 100644 index 0000000..511a358 --- /dev/null +++ b/ansible/playbooks/sifaka.yml @@ -0,0 +1,7 @@ +--- +- name: Configure sifaka + hosts: nas + + roles: + - role: sifaka_exporters + tags: sifaka_exporters diff --git a/ansible/roles/caddy/defaults/main.yml b/ansible/roles/caddy/defaults/main.yml index c35ef76..5d88fbf 100644 --- a/ansible/roles/caddy/defaults/main.yml +++ b/ansible/roles/caddy/defaults/main.yml @@ -84,3 +84,7 @@ caddy_tcp_services: backend: "localhost:2200" # Forgejo SSH - port: 5432 backend: "pg.tail8d86e.ts.net:5432" # PostgreSQL + - port: "{{ sifaka_node_exporter_port }}" + backend: "sifaka:{{ sifaka_node_exporter_port }}" # Sifaka node_exporter + - port: "{{ sifaka_smartctl_exporter_port }}" + backend: "sifaka:{{ sifaka_smartctl_exporter_port }}" # Sifaka smartctl_exporter diff --git a/ansible/roles/sifaka_exporters/defaults/main.yml b/ansible/roles/sifaka_exporters/defaults/main.yml new file mode 100644 index 0000000..2e1b782 --- /dev/null +++ b/ansible/roles/sifaka_exporters/defaults/main.yml @@ -0,0 +1,5 @@ +--- +# Docker images for Prometheus exporters on sifaka NAS +# Ports are defined in group_vars/all.yml (shared with caddy role) +sifaka_exporters_node_exporter_image: "prom/node-exporter:latest" +sifaka_exporters_smartctl_exporter_image: "prometheuscommunity/smartctl-exporter:latest" diff --git 
a/ansible/roles/sifaka_exporters/handlers/main.yml b/ansible/roles/sifaka_exporters/handlers/main.yml new file mode 100644 index 0000000..c29f536 --- /dev/null +++ b/ansible/roles/sifaka_exporters/handlers/main.yml @@ -0,0 +1,10 @@ +--- +- name: Restart node_exporter + ansible.builtin.command: docker restart node_exporter + listen: Restart node_exporter + changed_when: true + +- name: Restart smartctl_exporter + ansible.builtin.command: docker restart smartctl_exporter + listen: Restart smartctl_exporter + changed_when: true diff --git a/ansible/roles/sifaka_exporters/tasks/main.yml b/ansible/roles/sifaka_exporters/tasks/main.yml new file mode 100644 index 0000000..36d441b --- /dev/null +++ b/ansible/roles/sifaka_exporters/tasks/main.yml @@ -0,0 +1,81 @@ +--- +# Manage Prometheus exporter containers on sifaka NAS +# Uses command module to avoid requiring docker Python SDK on Synology + +# --- node_exporter --- + +- name: Pull node_exporter image + ansible.builtin.command: docker pull {{ sifaka_exporters_node_exporter_image }} + register: sifaka_exporters_node_pull + changed_when: "'Downloaded newer image' in sifaka_exporters_node_pull.stdout" + +- name: Check if node_exporter container exists + ansible.builtin.command: docker inspect node_exporter --format {% raw %}'{{.Config.Image}}'{% endraw %} + register: sifaka_exporters_node_inspect + changed_when: false + failed_when: false + +- name: Remove node_exporter container if image changed + ansible.builtin.command: docker rm -f node_exporter + when: + - sifaka_exporters_node_inspect.rc == 0 + - sifaka_exporters_node_inspect.stdout != sifaka_exporters_node_exporter_image + changed_when: true + +- name: Start node_exporter container + ansible.builtin.command: + argv: + - docker + - run + - -d + - --name=node_exporter + - --restart=always + - --net=host + - --pid=host + - -v + - /:/host:ro,rslave + - "{{ sifaka_exporters_node_exporter_image }}" + - --path.rootfs=/host + register: sifaka_exporters_node_start + when: > + 
sifaka_exporters_node_inspect.rc != 0 or + sifaka_exporters_node_inspect.stdout != sifaka_exporters_node_exporter_image + changed_when: sifaka_exporters_node_start.rc == 0 + +# --- smartctl_exporter --- + +- name: Pull smartctl_exporter image + ansible.builtin.command: docker pull {{ sifaka_exporters_smartctl_exporter_image }} + register: sifaka_exporters_smartctl_pull + changed_when: "'Downloaded newer image' in sifaka_exporters_smartctl_pull.stdout" + +- name: Check if smartctl_exporter container exists + ansible.builtin.command: docker inspect smartctl_exporter --format {% raw %}'{{.Config.Image}}'{% endraw %} + register: sifaka_exporters_smartctl_inspect + changed_when: false + failed_when: false + +- name: Remove smartctl_exporter container if image changed + ansible.builtin.command: docker rm -f smartctl_exporter + when: + - sifaka_exporters_smartctl_inspect.rc == 0 + - sifaka_exporters_smartctl_inspect.stdout != sifaka_exporters_smartctl_exporter_image + changed_when: true + +- name: Start smartctl_exporter container + ansible.builtin.command: + argv: + - docker + - run + - -d + - --name=smartctl_exporter + - --restart=always + - --privileged + - -p + - "{{ sifaka_smartctl_exporter_port }}:{{ sifaka_smartctl_exporter_port }}" + - "{{ sifaka_exporters_smartctl_exporter_image }}" + register: sifaka_exporters_smartctl_start + when: > + sifaka_exporters_smartctl_inspect.rc != 0 or + sifaka_exporters_smartctl_inspect.stdout != sifaka_exporters_smartctl_exporter_image + changed_when: sifaka_exporters_smartctl_start.rc == 0 diff --git a/argocd/manifests/grafana-config/dashboards/configmap-sifaka-disks.yaml b/argocd/manifests/grafana-config/dashboards/configmap-sifaka-disks.yaml new file mode 100644 index 0000000..5d994ee --- /dev/null +++ b/argocd/manifests/grafana-config/dashboards/configmap-sifaka-disks.yaml @@ -0,0 +1,314 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-sifaka-disks + namespace: monitoring + labels: + grafana_dashboard: 
"1" +data: + sifaka-disks.json: | + { + "annotations": { "list": [] }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "id": 100, + "panels": [], + "title": "Health Overview", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [ + { "options": { "0": { "color": "red", "text": "FAILING" }, "1": { "color": "green", "text": "HEALTHY" } }, "type": "value" } + ], + "thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] } + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 24, "x": 0, "y": 1 }, + "id": 1, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "orientation": "horizontal", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "value_and_name" + }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "smartctl_device_smart_healthy{job=\"smartctl-sifaka\"}", "legendFormat": "{{device}} ({{model_name}})", "refId": "A" } + ], + "title": "SMART Health Status", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 }, + "id": 101, + "panels": [], + "title": "Temperature", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 40 }, { "color": "red", "value": 50 }] }, + "unit": "celsius" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 24, "x": 0, "y": 6 }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": 
"center", + "orientation": "horizontal", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "value_and_name" + }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "smartctl_device_temperature{job=\"smartctl-sifaka\"}", "legendFormat": "{{device}}", "refId": "A" } + ], + "title": "Current Temperature", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisLabel": "", + "axisPlacement": "auto", + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "line+area" } + }, + "mappings": [], + "thresholds": { "mode": "absolute", "steps": [{ "color": "transparent", "value": null }, { "color": "red", "value": 50 }] }, + "unit": "celsius" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 10 }, + "id": 3, + "options": { + "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "right", "showLegend": true }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "smartctl_device_temperature{job=\"smartctl-sifaka\"}", "legendFormat": "{{device}}", "refId": "A" } + ], + "title": "Temperature Over Time", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 18 }, + "id": 102, + "panels": [], + "title": "Wear Indicators", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": 
"thresholds" }, + "mappings": [], + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }, { "color": "red", "value": 10 }] } + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 12, "x": 0, "y": 19 }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "center", + "orientation": "horizontal", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "value_and_name" + }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "smartctl_device_attribute{job=\"smartctl-sifaka\", attribute_name=\"Reallocated_Sector_Ct\", attribute_value_type=\"raw\"}", "legendFormat": "{{device}}", "refId": "A" } + ], + "title": "Reallocated Sectors", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }, { "color": "red", "value": 10 }] } + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 12, "x": 12, "y": 19 }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "center", + "orientation": "horizontal", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "value_and_name" + }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "smartctl_device_attribute{job=\"smartctl-sifaka\", attribute_name=\"Current_Pending_Sector\", attribute_value_type=\"raw\"}", "legendFormat": "{{device}}", "refId": "A" } + ], + "title": "Pending Sectors", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { "mode": "absolute", "steps": [{ 
"color": "green", "value": null }, { "color": "yellow", "value": 1 }, { "color": "red", "value": 100 }] } + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 12, "x": 0, "y": 23 }, + "id": 6, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "center", + "orientation": "horizontal", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "value_and_name" + }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "smartctl_device_attribute{job=\"smartctl-sifaka\", attribute_name=\"UDMA_CRC_Error_Count\", attribute_value_type=\"raw\"}", "legendFormat": "{{device}}", "refId": "A" } + ], + "title": "UDMA CRC Errors", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }, { "color": "red", "value": 10 }] } + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 12, "x": 12, "y": 23 }, + "id": 7, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "center", + "orientation": "horizontal", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "value_and_name" + }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "smartctl_device_attribute{job=\"smartctl-sifaka\", attribute_name=\"Offline_Uncorrectable\", attribute_value_type=\"raw\"}", "legendFormat": "{{device}}", "refId": "A" } + ], + "title": "Offline Uncorrectable Sectors", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 27 }, + "id": 103, + "panels": [], + "title": "Lifetime", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": 
"thresholds" }, + "mappings": [], + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }, + "unit": "h" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 12, "x": 0, "y": 28 }, + "id": 8, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "center", + "orientation": "horizontal", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "value_and_name" + }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "smartctl_device_power_on_seconds{job=\"smartctl-sifaka\"} / 3600", "legendFormat": "{{device}}", "refId": "A" } + ], + "title": "Power-On Hours", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] } + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 12, "x": 12, "y": 28 }, + "id": 9, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "center", + "orientation": "horizontal", + "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, + "textMode": "value_and_name" + }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "smartctl_device_power_cycle_count{job=\"smartctl-sifaka\"}", "legendFormat": "{{device}}", "refId": "A" } + ], + "title": "Power Cycle Count", + "type": "stat" + } + ], + "refresh": "1m", + "schemaVersion": 38, + "tags": ["sifaka", "storage", "smart"], + "templating": { "list": [] }, + "time": { "from": "now-24h", "to": "now" }, + "timepicker": {}, + "timezone": "browser", + "title": "Sifaka Disk Health", + "uid": "sifaka-disk-health", + "version": 1, + "weekStart": "" + } diff --git a/argocd/manifests/grafana-config/kustomization.yaml b/argocd/manifests/grafana-config/kustomization.yaml index 00dc5c6..a0a2356 
100644 --- a/argocd/manifests/grafana-config/kustomization.yaml +++ b/argocd/manifests/grafana-config/kustomization.yaml @@ -19,6 +19,7 @@ resources: - dashboards/configmap-zot.yaml - dashboards/configmap-docs-apm.yaml - dashboards/configmap-flyio.yaml + - dashboards/configmap-sifaka-disks.yaml # TeslaMate dashboards - dashboards/configmap-teslamate-overview.yaml - dashboards/configmap-teslamate-charges.yaml diff --git a/argocd/manifests/prometheus/configmap.yaml b/argocd/manifests/prometheus/configmap.yaml index cc43999..0881d2e 100644 --- a/argocd/manifests/prometheus/configmap.yaml +++ b/argocd/manifests/prometheus/configmap.yaml @@ -13,12 +13,15 @@ data: # K8s services are scraped directly scrape_configs: - # Sifaka NAS node-exporter (via LAN - Docker NATs through indri) - # Using LAN IP since k8s pods can reach LAN via Docker NAT (same as NFS mounts) - # If IP changes, fallback: create Tailscale egress in tailscale-operator/egress-sifaka.yaml + # Sifaka NAS exporters (via Caddy L4 TCP proxy on indri) - job_name: "node-exporter-sifaka" static_configs: - - targets: ["192.168.1.203:9100"] + - targets: ["nas.ops.eblu.me:9100"] + + - job_name: "smartctl-sifaka" + scrape_interval: 60s + static_configs: + - targets: ["nas.ops.eblu.me:9633"] # CNPG PostgreSQL metrics (k8s internal) - job_name: "cnpg-postgres" diff --git a/docs/changelog.d/feature-sifaka-ops-observability.feature.md b/docs/changelog.d/feature-sifaka-ops-observability.feature.md new file mode 100644 index 0000000..156e253 --- /dev/null +++ b/docs/changelog.d/feature-sifaka-ops-observability.feature.md @@ -0,0 +1 @@ +Add SMART disk health monitoring for sifaka NAS with smartctl_exporter, Grafana dashboard, Ansible playbook, and Caddy L4 routing via ops.eblu.me. 
diff --git a/docs/reference/infrastructure/routing.md b/docs/reference/infrastructure/routing.md index cf8e115..9270909 100644 --- a/docs/reference/infrastructure/routing.md +++ b/docs/reference/infrastructure/routing.md @@ -62,6 +62,8 @@ DNS CNAMEs point to `blumeops-proxy.fly.dev`. TLS via Fly.io-managed Let's Encry | 443 | Caddy | HTTPS | 0.0.0.0 | Reverse proxy | | 2222 | Caddy L4 | TCP | 0.0.0.0 | SSH proxy to Forgejo | | 5432 | Caddy L4 | TCP | 0.0.0.0 | PostgreSQL proxy | +| 9100 | Caddy L4 | TCP | 0.0.0.0 | Sifaka node_exporter proxy | +| 9633 | Caddy L4 | TCP | 0.0.0.0 | Sifaka smartctl_exporter proxy | | 2200 | Forgejo SSH | TCP | localhost | Built-in SSH server | | 3001 | Forgejo | HTTP | localhost | Web UI | | 5050 | Zot | HTTP | localhost | Registry API | diff --git a/docs/reference/storage/sifaka.md b/docs/reference/storage/sifaka.md index caad5c5..33c2d7e 100644 --- a/docs/reference/storage/sifaka.md +++ b/docs/reference/storage/sifaka.md @@ -37,7 +37,14 @@ Synology NAS providing network storage and backup target. ## Monitoring -Node exporter running in Docker container, scraped by [[prometheus]] at `sifaka:9100`. +Prometheus exporters run as Docker containers, managed by Ansible (`mise run provision-sifaka`). + +| Exporter | Port | Purpose | +|----------|------|---------| +| node_exporter | 9100 | System metrics (CPU, memory, disk I/O) | +| smartctl_exporter | 9633 | SMART disk health data | + +Scraped by [[prometheus]] via Caddy L4 TCP proxy at `nas.ops.eblu.me:9100` and `nas.ops.eblu.me:9633`. Dashboard: [[grafana]] > Sifaka Disk Health. 
## Tailscale diff --git a/mise-tasks/provision-sifaka b/mise-tasks/provision-sifaka new file mode 100755 index 0000000..8ef0631 --- /dev/null +++ b/mise-tasks/provision-sifaka @@ -0,0 +1,9 @@ +#!/usr/bin/env bash +#MISE description="Run ansible playbook to provision sifaka" + +set -euo pipefail + +export MISE_TASK_OUTPUT=interleave + +cd ansible +ansible-playbook playbooks/sifaka.yml "$@" -- 2.50.1 (Apple Git-155) From 4ed2e3bb5e13a6f898a679ba7f1fa4627033d599 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Mon, 9 Feb 2026 17:12:32 -0800 Subject: [PATCH 2/6] Fix sifaka_exporters role for Synology environment - Use full docker path (/volume1/@appstore/ContainerManager/usr/bin/docker) - Match existing container name (prom-node-exporter-1) - Remove unnecessary node_exporter flags (--pid=host, volume mounts) - Add become: true for all docker tasks (requires sudo on Synology) - Run smartctl_exporter as --user=root (image drops to nobody internally) - Explicitly specify /dev/sata* devices (Synology uses non-standard paths) Co-Authored-By: Claude Opus 4.6 --- .../roles/sifaka_exporters/defaults/main.yml | 10 ++++ .../roles/sifaka_exporters/handlers/main.yml | 6 ++- ansible/roles/sifaka_exporters/tasks/main.yml | 54 +++++++++++-------- 3 files changed, 46 insertions(+), 24 deletions(-) diff --git a/ansible/roles/sifaka_exporters/defaults/main.yml b/ansible/roles/sifaka_exporters/defaults/main.yml index 2e1b782..a7acd4e 100644 --- a/ansible/roles/sifaka_exporters/defaults/main.yml +++ b/ansible/roles/sifaka_exporters/defaults/main.yml @@ -1,5 +1,15 @@ --- # Docker images for Prometheus exporters on sifaka NAS # Ports are defined in group_vars/all.yml (shared with caddy role) +sifaka_exporters_docker: /volume1/@appstore/ContainerManager/usr/bin/docker sifaka_exporters_node_exporter_image: "prom/node-exporter:latest" +sifaka_exporters_node_exporter_name: "prom-node-exporter-1" sifaka_exporters_smartctl_exporter_image: "prometheuscommunity/smartctl-exporter:latest" 
+sifaka_exporters_smartctl_exporter_name: "smartctl-exporter" + +# Synology uses /dev/sata* instead of /dev/sd* — smartctl can't auto-detect them +sifaka_exporters_smartctl_devices: + - /dev/sata1 + - /dev/sata2 + - /dev/sata3 + - /dev/sata4 diff --git a/ansible/roles/sifaka_exporters/handlers/main.yml b/ansible/roles/sifaka_exporters/handlers/main.yml index c29f536..f4c6355 100644 --- a/ansible/roles/sifaka_exporters/handlers/main.yml +++ b/ansible/roles/sifaka_exporters/handlers/main.yml @@ -1,10 +1,12 @@ --- - name: Restart node_exporter - ansible.builtin.command: docker restart node_exporter + ansible.builtin.command: "{{ sifaka_exporters_docker }} restart {{ sifaka_exporters_node_exporter_name }}" + become: true listen: Restart node_exporter changed_when: true - name: Restart smartctl_exporter - ansible.builtin.command: docker restart smartctl_exporter + ansible.builtin.command: "{{ sifaka_exporters_docker }} restart {{ sifaka_exporters_smartctl_exporter_name }}" + become: true listen: Restart smartctl_exporter changed_when: true diff --git a/ansible/roles/sifaka_exporters/tasks/main.yml b/ansible/roles/sifaka_exporters/tasks/main.yml index 36d441b..5d3a77c 100644 --- a/ansible/roles/sifaka_exporters/tasks/main.yml +++ b/ansible/roles/sifaka_exporters/tasks/main.yml @@ -1,22 +1,26 @@ --- # Manage Prometheus exporter containers on sifaka NAS # Uses command module to avoid requiring docker Python SDK on Synology +# Requires passwordless sudo for docker — see docs/reference/storage/sifaka.md # --- node_exporter --- - name: Pull node_exporter image - ansible.builtin.command: docker pull {{ sifaka_exporters_node_exporter_image }} + ansible.builtin.command: "{{ sifaka_exporters_docker }} pull {{ sifaka_exporters_node_exporter_image }}" + become: true register: sifaka_exporters_node_pull changed_when: "'Downloaded newer image' in sifaka_exporters_node_pull.stdout" - name: Check if node_exporter container exists - ansible.builtin.command: docker inspect node_exporter 
--format {% raw %}'{{.Config.Image}}'{% endraw %} + ansible.builtin.command: "{{ sifaka_exporters_docker }} inspect {{ sifaka_exporters_node_exporter_name }} --format {% raw %}'{{.Config.Image}}'{% endraw %}" + become: true register: sifaka_exporters_node_inspect changed_when: false failed_when: false - name: Remove node_exporter container if image changed - ansible.builtin.command: docker rm -f node_exporter + ansible.builtin.command: "{{ sifaka_exporters_docker }} rm -f {{ sifaka_exporters_node_exporter_name }}" + become: true when: - sifaka_exporters_node_inspect.rc == 0 - sifaka_exporters_node_inspect.stdout != sifaka_exporters_node_exporter_image @@ -25,17 +29,14 @@ - name: Start node_exporter container ansible.builtin.command: argv: - - docker + - "{{ sifaka_exporters_docker }}" - run - -d - - --name=node_exporter + - "--name={{ sifaka_exporters_node_exporter_name }}" - --restart=always - --net=host - - --pid=host - - -v - - /:/host:ro,rslave - "{{ sifaka_exporters_node_exporter_image }}" - - --path.rootfs=/host + become: true register: sifaka_exporters_node_start when: > sifaka_exporters_node_inspect.rc != 0 or @@ -45,35 +46,44 @@ # --- smartctl_exporter --- - name: Pull smartctl_exporter image - ansible.builtin.command: docker pull {{ sifaka_exporters_smartctl_exporter_image }} + ansible.builtin.command: "{{ sifaka_exporters_docker }} pull {{ sifaka_exporters_smartctl_exporter_image }}" + become: true register: sifaka_exporters_smartctl_pull changed_when: "'Downloaded newer image' in sifaka_exporters_smartctl_pull.stdout" - name: Check if smartctl_exporter container exists - ansible.builtin.command: docker inspect smartctl_exporter --format {% raw %}'{{.Config.Image}}'{% endraw %} + ansible.builtin.command: "{{ sifaka_exporters_docker }} inspect {{ sifaka_exporters_smartctl_exporter_name }} --format {% raw %}'{{.Config.Image}}'{% endraw %}" + become: true register: sifaka_exporters_smartctl_inspect changed_when: false failed_when: false - name: Remove 
smartctl_exporter container if image changed - ansible.builtin.command: docker rm -f smartctl_exporter + ansible.builtin.command: "{{ sifaka_exporters_docker }} rm -f {{ sifaka_exporters_smartctl_exporter_name }}" + become: true when: - sifaka_exporters_smartctl_inspect.rc == 0 - sifaka_exporters_smartctl_inspect.stdout != sifaka_exporters_smartctl_exporter_image changed_when: true +- name: Build smartctl_exporter device arguments + ansible.builtin.set_fact: + sifaka_exporters_smartctl_device_args: >- + {{ sifaka_exporters_smartctl_devices | map('regex_replace', '^(.*)$', '--smartctl.device=\1') | list }} + - name: Start smartctl_exporter container ansible.builtin.command: - argv: - - docker - - run - - -d - - --name=smartctl_exporter - - --restart=always - - --privileged - - -p - - "{{ sifaka_smartctl_exporter_port }}:{{ sifaka_smartctl_exporter_port }}" - - "{{ sifaka_exporters_smartctl_exporter_image }}" + argv: >- + {{ [ + sifaka_exporters_docker, 'run', '-d', + '--name=' + sifaka_exporters_smartctl_exporter_name, + '--restart=always', + '--privileged', + '--user=root', + '-p', sifaka_smartctl_exporter_port | string + ':' + sifaka_smartctl_exporter_port | string, + sifaka_exporters_smartctl_exporter_image + ] + sifaka_exporters_smartctl_device_args }} + become: true register: sifaka_exporters_smartctl_start when: > sifaka_exporters_smartctl_inspect.rc != 0 or -- 2.50.1 (Apple Git-155) From 316c213aae09ae6a0f56c5fac20e89731f19e650 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Mon, 9 Feb 2026 17:22:45 -0800 Subject: [PATCH 3/6] Document sifaka first-time setup and hardware details Adds one-time setup steps (SSH, sudoers, Docker path, device naming) to the sifaka reference card for reproducibility if the NAS is replaced. 
Co-Authored-By: Claude Opus 4.6 --- docs/reference/storage/sifaka.md | 60 ++++++++++++++++++++++++++++++-- 1 file changed, 58 insertions(+), 2 deletions(-) diff --git a/docs/reference/storage/sifaka.md b/docs/reference/storage/sifaka.md index 33c2d7e..cd751cd 100644 --- a/docs/reference/storage/sifaka.md +++ b/docs/reference/storage/sifaka.md @@ -13,8 +13,8 @@ Synology NAS providing network storage and backup target. | Property | Value | |----------|-------| | **Dashboard** | https://nas.ops.eblu.me | -| **Model** | Synology | -| **Storage** | 10.9TB RAID 5 | +| **Model** | Synology DS423+ (DSM 7) | +| **Storage** | 10.9TB RAID 5 (4x Seagate IronWolf 4TB, ST4000VN006) | | **Role** | Backup target, media storage | ## Network Shares @@ -46,6 +46,62 @@ Prometheus exporters run as Docker containers, managed by Ansible (`mise run pro Scraped by [[prometheus]] via Caddy L4 TCP proxy at `nas.ops.eblu.me:9100` and `nas.ops.eblu.me:9633`. Dashboard: [[grafana]] > Sifaka Disk Health. +## First-Time Setup + +These steps were performed once to enable Ansible provisioning. They are documented here for reference if sifaka is ever replaced or reset. + +### 1. Enable SSH + +DSM Control Panel > Terminal & SNMP > Enable SSH service (port 22). + +### 2. SSH Key Authentication + +From a tailnet client with an existing SSH key: + +```bash +ssh-copy-id eblume@sifaka # uses password auth initially +``` + +Synology requires strict permissions on the home directory. On sifaka: + +```bash +chmod 755 ~ # DSM defaults to 777; SSH refuses keys otherwise +chmod 700 ~/.ssh +chmod 600 ~/.ssh/authorized_keys +``` + +Home directory path: `/var/services/homes/eblume`. + +### 3. Passwordless Sudo for Docker + +Ansible needs `become: true` for Docker commands. 
Create a sudoers drop-in: + +```bash +sudo vi /etc/sudoers.d/docker-ansible +``` + +Contents: + +``` +eblume ALL=(ALL) NOPASSWD: /volume1/@appstore/ContainerManager/usr/bin/docker +``` + +This limits passwordless sudo to the Docker binary only. Note that Docker access is effectively root-equivalent (the `sifaka_exporters` role itself starts a `--privileged` container as root), so treat this account as privileged. + +### 4. Docker Path + +Synology installs Docker via Container Manager at a non-standard path: + +``` +/volume1/@appstore/ContainerManager/usr/bin/docker +``` + +This is configured in the `sifaka_exporters` role defaults. + +### 5. Synology Device Naming + +Synology uses `/dev/sata*` (e.g., `/dev/sata1` through `/dev/sata4`) instead of the standard `/dev/sd*` naming. The `smartctl_exporter` cannot auto-detect these devices, so they are passed explicitly via `--smartctl.device=` flags in the Ansible role. + ## Tailscale - Tag: `tag:nas` -- 2.50.1 (Apple Git-155) From 97420f302babc2979f48e5de8aef36c520503440 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Mon, 9 Feb 2026 17:28:34 -0800 Subject: [PATCH 4/6] Move group_vars into inventory dir and document sifaka setup Ansible searches for group_vars/ relative to the inventory directory, not the project root. Also adds first-time setup docs and hardware details to the sifaka reference card. 
Co-Authored-By: Claude Opus 4.6 --- ansible/{ => inventory}/group_vars/all.yml | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename ansible/{ => inventory}/group_vars/all.yml (100%) diff --git a/ansible/group_vars/all.yml b/ansible/inventory/group_vars/all.yml similarity index 100% rename from ansible/group_vars/all.yml rename to ansible/inventory/group_vars/all.yml -- 2.50.1 (Apple Git-155) From 663c2d92b0cede723bb6048538317663236d9b3b Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Mon, 9 Feb 2026 17:39:19 -0800 Subject: [PATCH 5/6] Fix dashboard: correct metric name and stat panel layout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - smartctl_device_smart_healthy → smartctl_device_smart_status - Make all stat panels full-width with auto orientation so 4 device values display side-by-side instead of stacked vertically Co-Authored-By: Claude Opus 4.6 --- .../dashboards/configmap-sifaka-disks.yaml | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/argocd/manifests/grafana-config/dashboards/configmap-sifaka-disks.yaml b/argocd/manifests/grafana-config/dashboards/configmap-sifaka-disks.yaml index 5d994ee..8dd86f3 100644 --- a/argocd/manifests/grafana-config/dashboards/configmap-sifaka-disks.yaml +++ b/argocd/manifests/grafana-config/dashboards/configmap-sifaka-disks.yaml @@ -46,7 +46,7 @@ data: "textMode": "value_and_name" }, "targets": [ - { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "smartctl_device_smart_healthy{job=\"smartctl-sifaka\"}", "legendFormat": "{{device}} ({{model_name}})", "refId": "A" } + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "smartctl_device_smart_status{job=\"smartctl-sifaka\"}", "legendFormat": "{{device}} ({{model_name}})", "refId": "A" } ], "title": "SMART Health Status", "type": "stat" @@ -76,7 +76,7 @@ data: "colorMode": "value", "graphMode": "none", "justifyMode": "center", - "orientation": 
"horizontal", + "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value_and_name" }, @@ -144,13 +144,13 @@ data: }, "overrides": [] }, - "gridPos": { "h": 4, "w": 12, "x": 0, "y": 19 }, + "gridPos": { "h": 4, "w": 24, "x": 0, "y": 19 }, "id": 4, "options": { "colorMode": "value", "graphMode": "none", "justifyMode": "center", - "orientation": "horizontal", + "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value_and_name" }, @@ -170,13 +170,13 @@ data: }, "overrides": [] }, - "gridPos": { "h": 4, "w": 12, "x": 12, "y": 19 }, + "gridPos": { "h": 4, "w": 24, "x": 0, "y": 23 }, "id": 5, "options": { "colorMode": "value", "graphMode": "none", "justifyMode": "center", - "orientation": "horizontal", + "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value_and_name" }, @@ -196,13 +196,13 @@ data: }, "overrides": [] }, - "gridPos": { "h": 4, "w": 12, "x": 0, "y": 23 }, + "gridPos": { "h": 4, "w": 24, "x": 0, "y": 27 }, "id": 6, "options": { "colorMode": "value", "graphMode": "none", "justifyMode": "center", - "orientation": "horizontal", + "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value_and_name" }, @@ -222,13 +222,13 @@ data: }, "overrides": [] }, - "gridPos": { "h": 4, "w": 12, "x": 12, "y": 23 }, + "gridPos": { "h": 4, "w": 24, "x": 0, "y": 31 }, "id": 7, "options": { "colorMode": "value", "graphMode": "none", "justifyMode": "center", - "orientation": "horizontal", + "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value_and_name" }, @@ -240,7 +240,7 @@ data: }, { "collapsed": false, - "gridPos": { "h": 1, "w": 24, "x": 0, "y": 27 }, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 35 }, "id": 103, "panels": [], "title": "Lifetime", @@ -257,13 
+257,13 @@ data: }, "overrides": [] }, - "gridPos": { "h": 4, "w": 12, "x": 0, "y": 28 }, + "gridPos": { "h": 4, "w": 24, "x": 0, "y": 36 }, "id": 8, "options": { "colorMode": "value", "graphMode": "none", "justifyMode": "center", - "orientation": "horizontal", + "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value_and_name" }, @@ -283,13 +283,13 @@ data: }, "overrides": [] }, - "gridPos": { "h": 4, "w": 12, "x": 12, "y": 28 }, + "gridPos": { "h": 4, "w": 24, "x": 0, "y": 40 }, "id": 9, "options": { "colorMode": "value", "graphMode": "none", "justifyMode": "center", - "orientation": "horizontal", + "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value_and_name" }, -- 2.50.1 (Apple Git-155) From c3d9a2478dc3dd9bef54b8cf2a122fac81717d0f Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Mon, 9 Feb 2026 17:42:37 -0800 Subject: [PATCH 6/6] Fix SMART Health Status panel layout Switch to auto orientation and increase height so the 4 device status blocks display as horizontal squares instead of vertical strips. 
Co-Authored-By: Claude Opus 4.6 --- .../dashboards/configmap-sifaka-disks.yaml | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/argocd/manifests/grafana-config/dashboards/configmap-sifaka-disks.yaml b/argocd/manifests/grafana-config/dashboards/configmap-sifaka-disks.yaml index 8dd86f3..a92ec23 100644 --- a/argocd/manifests/grafana-config/dashboards/configmap-sifaka-disks.yaml +++ b/argocd/manifests/grafana-config/dashboards/configmap-sifaka-disks.yaml @@ -35,13 +35,13 @@ data: }, "overrides": [] }, - "gridPos": { "h": 4, "w": 24, "x": 0, "y": 1 }, + "gridPos": { "h": 6, "w": 24, "x": 0, "y": 1 }, "id": 1, "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", - "orientation": "horizontal", + "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value_and_name" }, @@ -53,7 +53,7 @@ data: }, { "collapsed": false, - "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 }, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 7 }, "id": 101, "panels": [], "title": "Temperature", @@ -70,7 +70,7 @@ data: }, "overrides": [] }, - "gridPos": { "h": 4, "w": 24, "x": 0, "y": 6 }, + "gridPos": { "h": 4, "w": 24, "x": 0, "y": 8 }, "id": 2, "options": { "colorMode": "value", @@ -114,7 +114,7 @@ data: }, "overrides": [] }, - "gridPos": { "h": 8, "w": 24, "x": 0, "y": 10 }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 12 }, "id": 3, "options": { "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "right", "showLegend": true }, @@ -128,7 +128,7 @@ data: }, { "collapsed": false, - "gridPos": { "h": 1, "w": 24, "x": 0, "y": 18 }, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 20 }, "id": 102, "panels": [], "title": "Wear Indicators", @@ -144,7 +144,7 @@ data: }, "overrides": [] }, - "gridPos": { "h": 4, "w": 24, "x": 0, "y": 19 }, + "gridPos": { "h": 4, "w": 24, "x": 0, "y": 21 }, "id": 4, "options": { "colorMode": "value", @@ -170,7 +170,7 @@ data: }, 
"overrides": [] }, - "gridPos": { "h": 4, "w": 24, "x": 0, "y": 23 }, + "gridPos": { "h": 4, "w": 24, "x": 0, "y": 25 }, "id": 5, "options": { "colorMode": "value", @@ -196,7 +196,7 @@ data: }, "overrides": [] }, - "gridPos": { "h": 4, "w": 24, "x": 0, "y": 27 }, + "gridPos": { "h": 4, "w": 24, "x": 0, "y": 29 }, "id": 6, "options": { "colorMode": "value", @@ -222,7 +222,7 @@ data: }, "overrides": [] }, - "gridPos": { "h": 4, "w": 24, "x": 0, "y": 31 }, + "gridPos": { "h": 4, "w": 24, "x": 0, "y": 33 }, "id": 7, "options": { "colorMode": "value", @@ -240,7 +240,7 @@ data: }, { "collapsed": false, - "gridPos": { "h": 1, "w": 24, "x": 0, "y": 35 }, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 37 }, "id": 103, "panels": [], "title": "Lifetime", @@ -257,7 +257,7 @@ data: }, "overrides": [] }, - "gridPos": { "h": 4, "w": 24, "x": 0, "y": 36 }, + "gridPos": { "h": 4, "w": 24, "x": 0, "y": 38 }, "id": 8, "options": { "colorMode": "value", @@ -283,7 +283,7 @@ data: }, "overrides": [] }, - "gridPos": { "h": 4, "w": 24, "x": 0, "y": 40 }, + "gridPos": { "h": 4, "w": 24, "x": 0, "y": 42 }, "id": 9, "options": { "colorMode": "value", -- 2.50.1 (Apple Git-155)