Operations and observability for sifaka NAS #135

Merged
eblume merged 6 commits from feature/sifaka-ops-observability into main 2026-02-09 17:44:06 -08:00
15 changed files with 538 additions and 9 deletions

View file

@ -1,2 +0,0 @@
---
ansible_managed: "Managed by ansible - do not edit. Source: ssh://forgejo@forge.ops.eblu.me:2222/eblume/blumeops.git"

View file

@ -0,0 +1,6 @@
---
# Sifaka NAS exporter ports — shared by caddy (indri) and sifaka_exporters roles.
# Kept as integers; consumers stringify them where a string is required.
sifaka_node_exporter_port: 9100
sifaka_smartctl_exporter_port: 9633

# Banner text marking a file as Ansible-managed and naming the source repo.
# Folded scalar: the two lines below join with a single space.
ansible_managed: >-
  Managed by ansible - do not edit.
  Source: ssh://forgejo@forge.ops.eblu.me:2222/eblume/blumeops.git

View file

@ -0,0 +1,3 @@
---
# Connection variables: which Python Ansible modules run under on the target,
# and which account Ansible logs in as.
ansible_python_interpreter: /usr/bin/python3
ansible_user: eblume

View file

@ -0,0 +1,7 @@
---
# Single play: apply the sifaka_exporters role to the hosts matched by `nas`.
# The role carries a tag of the same name so it can be selected with --tags.
- name: Configure sifaka
  hosts: nas
  roles:
    - role: sifaka_exporters
      tags: [sifaka_exporters]

View file

@ -84,3 +84,7 @@ caddy_tcp_services:
backend: "localhost:2200" # Forgejo SSH backend: "localhost:2200" # Forgejo SSH
- port: 5432 - port: 5432
backend: "pg.tail8d86e.ts.net:5432" # PostgreSQL backend: "pg.tail8d86e.ts.net:5432" # PostgreSQL
- port: "{{ sifaka_node_exporter_port }}"
backend: "sifaka:{{ sifaka_node_exporter_port }}" # Sifaka node_exporter
- port: "{{ sifaka_smartctl_exporter_port }}"
backend: "sifaka:{{ sifaka_smartctl_exporter_port }}" # Sifaka smartctl_exporter

View file

@ -0,0 +1,15 @@
---
# Defaults for the Prometheus exporter containers on the sifaka NAS.
# Ports are defined in group_vars/all.yml (shared with caddy role).

# Absolute path of the docker CLI on the NAS.
sifaka_exporters_docker: /volume1/@appstore/ContainerManager/usr/bin/docker

# node_exporter container: name and image reference.
sifaka_exporters_node_exporter_name: 'prom-node-exporter-1'
sifaka_exporters_node_exporter_image: 'prom/node-exporter:latest'

# smartctl_exporter container: name and image reference.
sifaka_exporters_smartctl_exporter_name: 'smartctl-exporter'
sifaka_exporters_smartctl_exporter_image: 'prometheuscommunity/smartctl-exporter:latest'

# Synology uses /dev/sata* instead of /dev/sd* — smartctl can't auto-detect
# them, so every disk to monitor is listed explicitly.
sifaka_exporters_smartctl_devices:
  - /dev/sata1
  - /dev/sata2
  - /dev/sata3
  - /dev/sata4

View file

@ -0,0 +1,12 @@
---
# Handlers that restart the exporter containers through the docker CLI.
# Each handler listens on a topic equal to its own name and always reports
# "changed", since a restart has no idempotent no-op outcome.
# NOTE(review): no task visible in this role's tasks file notifies these
# handlers — confirm whether they are triggered from elsewhere.
- name: Restart node_exporter
  listen: Restart node_exporter
  become: true
  changed_when: true
  ansible.builtin.command: >-
    {{ sifaka_exporters_docker }} restart {{ sifaka_exporters_node_exporter_name }}

- name: Restart smartctl_exporter
  listen: Restart smartctl_exporter
  become: true
  changed_when: true
  ansible.builtin.command: >-
    {{ sifaka_exporters_docker }} restart {{ sifaka_exporters_smartctl_exporter_name }}

View file

@ -0,0 +1,91 @@
---
# Manage Prometheus exporter containers on sifaka NAS.
# Uses the command module to avoid requiring the docker Python SDK on Synology.
# Requires passwordless sudo for docker — see docs/reference/storage/sifaka.md
#
# Flow per exporter:
#   pull image -> inspect existing container -> remove it if stale ->
#   start a container if none exists or the old one was just removed.
#
# A container is considered stale when EITHER
#   * it was created from a different image reference than the configured one, OR
#   * the pull just downloaded a newer image for the same reference.
# The second condition is required for mutable tags such as ":latest": the
# reference string stored in the container never changes, so comparing
# references alone would leave the old image running forever.

# --- node_exporter ---
- name: Pull node_exporter image
  ansible.builtin.command: "{{ sifaka_exporters_docker }} pull {{ sifaka_exporters_node_exporter_image }}"
  become: true
  register: sifaka_exporters_node_pull
  # "changed" only when docker reports that it fetched a newer image
  changed_when: "'Downloaded newer image' in sifaka_exporters_node_pull.stdout"

- name: Check if node_exporter container exists
  ansible.builtin.command: "{{ sifaka_exporters_docker }} inspect {{ sifaka_exporters_node_exporter_name }} --format {% raw %}'{{.Config.Image}}'{% endraw %}"
  become: true
  register: sifaka_exporters_node_inspect
  changed_when: false
  # A non-zero rc is interpreted below as "container does not exist"
  failed_when: false

- name: Remove node_exporter container if image changed
  ansible.builtin.command: "{{ sifaka_exporters_docker }} rm -f {{ sifaka_exporters_node_exporter_name }}"
  become: true
  when:
    - sifaka_exporters_node_inspect.rc == 0
    - >-
      sifaka_exporters_node_inspect.stdout != sifaka_exporters_node_exporter_image
      or sifaka_exporters_node_pull is changed
  changed_when: true

- name: Start node_exporter container
  ansible.builtin.command:
    argv:
      - "{{ sifaka_exporters_docker }}"
      - run
      - -d
      - "--name={{ sifaka_exporters_node_exporter_name }}"
      - --restart=always
      - --net=host
      - "{{ sifaka_exporters_node_exporter_image }}"
      # Arguments after the image go to the exporter itself. Bind to the shared
      # port variable so the exporter and the Caddy proxy cannot drift apart.
      - "--web.listen-address=:{{ sifaka_node_exporter_port }}"
  become: true
  register: sifaka_exporters_node_start
  # Same staleness test as the removal task, plus the "no container" case
  when: >-
    sifaka_exporters_node_inspect.rc != 0
    or sifaka_exporters_node_inspect.stdout != sifaka_exporters_node_exporter_image
    or sifaka_exporters_node_pull is changed
  changed_when: sifaka_exporters_node_start.rc == 0

# --- smartctl_exporter ---
- name: Pull smartctl_exporter image
  ansible.builtin.command: "{{ sifaka_exporters_docker }} pull {{ sifaka_exporters_smartctl_exporter_image }}"
  become: true
  register: sifaka_exporters_smartctl_pull
  changed_when: "'Downloaded newer image' in sifaka_exporters_smartctl_pull.stdout"

- name: Check if smartctl_exporter container exists
  ansible.builtin.command: "{{ sifaka_exporters_docker }} inspect {{ sifaka_exporters_smartctl_exporter_name }} --format {% raw %}'{{.Config.Image}}'{% endraw %}"
  become: true
  register: sifaka_exporters_smartctl_inspect
  changed_when: false
  # A non-zero rc is interpreted below as "container does not exist"
  failed_when: false

- name: Remove smartctl_exporter container if image changed
  ansible.builtin.command: "{{ sifaka_exporters_docker }} rm -f {{ sifaka_exporters_smartctl_exporter_name }}"
  become: true
  when:
    - sifaka_exporters_smartctl_inspect.rc == 0
    - >-
      sifaka_exporters_smartctl_inspect.stdout != sifaka_exporters_smartctl_exporter_image
      or sifaka_exporters_smartctl_pull is changed
  changed_when: true

# Turns each configured device path into one "--smartctl.device=<path>" flag.
- name: Build smartctl_exporter device arguments
  ansible.builtin.set_fact:
    sifaka_exporters_smartctl_device_args: >-
      {{ sifaka_exporters_smartctl_devices | map('regex_replace', '^(.*)$', '--smartctl.device=\1') | list }}

- name: Start smartctl_exporter container
  ansible.builtin.command:
    # argv is assembled as one list expression so the per-device flags can be
    # appended. Everything after the image name is passed to the exporter.
    # The listen address matches the published container port, so changing
    # sifaka_smartctl_exporter_port keeps mapping and listener in sync.
    argv: >-
      {{ [
        sifaka_exporters_docker, 'run', '-d',
        '--name=' + sifaka_exporters_smartctl_exporter_name,
        '--restart=always',
        '--privileged',
        '--user=root',
        '-p', (sifaka_smartctl_exporter_port | string) + ':' + (sifaka_smartctl_exporter_port | string),
        sifaka_exporters_smartctl_exporter_image,
        '--web.listen-address=:' + (sifaka_smartctl_exporter_port | string)
      ] + sifaka_exporters_smartctl_device_args }}
  become: true
  register: sifaka_exporters_smartctl_start
  when: >-
    sifaka_exporters_smartctl_inspect.rc != 0
    or sifaka_exporters_smartctl_inspect.stdout != sifaka_exporters_smartctl_exporter_image
    or sifaka_exporters_smartctl_pull is changed
  changed_when: sifaka_exporters_smartctl_start.rc == 0

View file

@ -0,0 +1,314 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-dashboard-sifaka-disks
namespace: monitoring
labels:
grafana_dashboard: "1"
data:
sifaka-disks.json: |
{
"annotations": { "list": [] },
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"id": null,
"links": [],
"panels": [
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
"id": 100,
"panels": [],
"title": "Health Overview",
"type": "row"
},
{
  "datasource": { "type": "prometheus", "uid": "prometheus" },
  "description": "SMART overall-health result per disk (1 = healthy, 0 = failing). The model name shown in the legend is joined in from the smartctl_device info metric, because the status metric itself only carries the device label.",
  "fieldConfig": {
    "defaults": {
      "color": { "mode": "thresholds" },
      "mappings": [
        { "options": { "0": { "color": "red", "text": "FAILING" }, "1": { "color": "green", "text": "HEALTHY" } }, "type": "value" }
      ],
      "thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] }
    },
    "overrides": []
  },
  "gridPos": { "h": 6, "w": 24, "x": 0, "y": 1 },
  "id": 1,
  "options": {
    "colorMode": "background",
    "graphMode": "none",
    "justifyMode": "center",
    "orientation": "auto",
    "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
    "textMode": "value_and_name"
  },
  "targets": [
    { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "smartctl_device_smart_status{job=\"smartctl-sifaka\"} * on (instance, device) group_left (model_name) smartctl_device{job=\"smartctl-sifaka\"}", "legendFormat": "{{device}} ({{model_name}})", "refId": "A" }
  ],
  "title": "SMART Health Status",
  "type": "stat"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 7 },
"id": 101,
"panels": [],
"title": "Temperature",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 40 }, { "color": "red", "value": 50 }] },
"unit": "celsius"
},
"overrides": []
},
"gridPos": { "h": 4, "w": 24, "x": 0, "y": 8 },
"id": 2,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "center",
"orientation": "auto",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "value_and_name"
},
"targets": [
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "smartctl_device_temperature{job=\"smartctl-sifaka\"}", "legendFormat": "{{device}}", "refId": "A" }
],
"title": "Current Temperature",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisLabel": "",
"axisPlacement": "auto",
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"showPoints": "never",
"spanNulls": false,
"stacking": { "group": "A", "mode": "none" },
"thresholdsStyle": { "mode": "line+area" }
},
"mappings": [],
"thresholds": { "mode": "absolute", "steps": [{ "color": "transparent", "value": null }, { "color": "red", "value": 50 }] },
"unit": "celsius"
},
"overrides": []
},
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 12 },
"id": 3,
"options": {
"legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "right", "showLegend": true },
"tooltip": { "mode": "multi", "sort": "desc" }
},
"targets": [
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "smartctl_device_temperature{job=\"smartctl-sifaka\"}", "legendFormat": "{{device}}", "refId": "A" }
],
"title": "Temperature Over Time",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 20 },
"id": 102,
"panels": [],
"title": "Wear Indicators",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }, { "color": "red", "value": 10 }] }
},
"overrides": []
},
"gridPos": { "h": 4, "w": 24, "x": 0, "y": 21 },
"id": 4,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "center",
"orientation": "auto",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "value_and_name"
},
"targets": [
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "smartctl_device_attribute{job=\"smartctl-sifaka\", attribute_name=\"Reallocated_Sector_Ct\", attribute_value_type=\"raw\"}", "legendFormat": "{{device}}", "refId": "A" }
],
"title": "Reallocated Sectors",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }, { "color": "red", "value": 10 }] }
},
"overrides": []
},
"gridPos": { "h": 4, "w": 24, "x": 0, "y": 25 },
"id": 5,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "center",
"orientation": "auto",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "value_and_name"
},
"targets": [
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "smartctl_device_attribute{job=\"smartctl-sifaka\", attribute_name=\"Current_Pending_Sector\", attribute_value_type=\"raw\"}", "legendFormat": "{{device}}", "refId": "A" }
],
"title": "Pending Sectors",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }, { "color": "red", "value": 100 }] }
},
"overrides": []
},
"gridPos": { "h": 4, "w": 24, "x": 0, "y": 29 },
"id": 6,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "center",
"orientation": "auto",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "value_and_name"
},
"targets": [
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "smartctl_device_attribute{job=\"smartctl-sifaka\", attribute_name=\"UDMA_CRC_Error_Count\", attribute_value_type=\"raw\"}", "legendFormat": "{{device}}", "refId": "A" }
],
"title": "UDMA CRC Errors",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }, { "color": "red", "value": 10 }] }
},
"overrides": []
},
"gridPos": { "h": 4, "w": 24, "x": 0, "y": 33 },
"id": 7,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "center",
"orientation": "auto",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "value_and_name"
},
"targets": [
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "smartctl_device_attribute{job=\"smartctl-sifaka\", attribute_name=\"Offline_Uncorrectable\", attribute_value_type=\"raw\"}", "legendFormat": "{{device}}", "refId": "A" }
],
"title": "Offline Uncorrectable Sectors",
"type": "stat"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 37 },
"id": 103,
"panels": [],
"title": "Lifetime",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] },
"unit": "h"
},
"overrides": []
},
"gridPos": { "h": 4, "w": 24, "x": 0, "y": 38 },
"id": 8,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "center",
"orientation": "auto",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "value_and_name"
},
"targets": [
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "smartctl_device_power_on_seconds{job=\"smartctl-sifaka\"} / 3600", "legendFormat": "{{device}}", "refId": "A" }
],
"title": "Power-On Hours",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }
},
"overrides": []
},
"gridPos": { "h": 4, "w": 24, "x": 0, "y": 42 },
"id": 9,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "center",
"orientation": "auto",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "value_and_name"
},
"targets": [
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "smartctl_device_power_cycle_count{job=\"smartctl-sifaka\"}", "legendFormat": "{{device}}", "refId": "A" }
],
"title": "Power Cycle Count",
"type": "stat"
}
],
"refresh": "1m",
"schemaVersion": 38,
"tags": ["sifaka", "storage", "smart"],
"templating": { "list": [] },
"time": { "from": "now-24h", "to": "now" },
"timepicker": {},
"timezone": "browser",
"title": "Sifaka Disk Health",
"uid": "sifaka-disk-health",
"version": 1,
"weekStart": ""
}

View file

@ -19,6 +19,7 @@ resources:
- dashboards/configmap-zot.yaml - dashboards/configmap-zot.yaml
- dashboards/configmap-docs-apm.yaml - dashboards/configmap-docs-apm.yaml
- dashboards/configmap-flyio.yaml - dashboards/configmap-flyio.yaml
- dashboards/configmap-sifaka-disks.yaml
# TeslaMate dashboards # TeslaMate dashboards
- dashboards/configmap-teslamate-overview.yaml - dashboards/configmap-teslamate-overview.yaml
- dashboards/configmap-teslamate-charges.yaml - dashboards/configmap-teslamate-charges.yaml

View file

@ -13,12 +13,15 @@ data:
# K8s services are scraped directly # K8s services are scraped directly
scrape_configs: scrape_configs:
# Sifaka NAS node-exporter (via LAN - Docker NATs through indri) # Sifaka NAS exporters (via Caddy L4 TCP proxy on indri)
# Using LAN IP since k8s pods can reach LAN via Docker NAT (same as NFS mounts)
# If IP changes, fallback: create Tailscale egress in tailscale-operator/egress-sifaka.yaml
- job_name: "node-exporter-sifaka" - job_name: "node-exporter-sifaka"
static_configs: static_configs:
- targets: ["192.168.1.203:9100"] - targets: ["nas.ops.eblu.me:9100"]
- job_name: "smartctl-sifaka"
scrape_interval: 60s
static_configs:
- targets: ["nas.ops.eblu.me:9633"]
# CNPG PostgreSQL metrics (k8s internal) # CNPG PostgreSQL metrics (k8s internal)
- job_name: "cnpg-postgres" - job_name: "cnpg-postgres"

View file

@ -0,0 +1 @@
Add SMART disk health monitoring for sifaka NAS with smartctl_exporter, Grafana dashboard, Ansible playbook, and Caddy L4 routing via ops.eblu.me.

View file

@ -62,6 +62,8 @@ DNS CNAMEs point to `blumeops-proxy.fly.dev`. TLS via Fly.io-managed Let's Encry
| 443 | Caddy | HTTPS | 0.0.0.0 | Reverse proxy | | 443 | Caddy | HTTPS | 0.0.0.0 | Reverse proxy |
| 2222 | Caddy L4 | TCP | 0.0.0.0 | SSH proxy to Forgejo | | 2222 | Caddy L4 | TCP | 0.0.0.0 | SSH proxy to Forgejo |
| 5432 | Caddy L4 | TCP | 0.0.0.0 | PostgreSQL proxy | | 5432 | Caddy L4 | TCP | 0.0.0.0 | PostgreSQL proxy |
| 9100 | Caddy L4 | TCP | 0.0.0.0 | Sifaka node_exporter proxy |
| 9633 | Caddy L4 | TCP | 0.0.0.0 | Sifaka smartctl_exporter proxy |
| 2200 | Forgejo SSH | TCP | localhost | Built-in SSH server | | 2200 | Forgejo SSH | TCP | localhost | Built-in SSH server |
| 3001 | Forgejo | HTTP | localhost | Web UI | | 3001 | Forgejo | HTTP | localhost | Web UI |
| 5050 | Zot | HTTP | localhost | Registry API | | 5050 | Zot | HTTP | localhost | Registry API |

View file

@ -13,8 +13,8 @@ Synology NAS providing network storage and backup target.
| Property | Value | | Property | Value |
|----------|-------| |----------|-------|
| **Dashboard** | https://nas.ops.eblu.me | | **Dashboard** | https://nas.ops.eblu.me |
| **Model** | Synology | | **Model** | Synology DS423+ (DSM 7) |
| **Storage** | 10.9TB RAID 5 | | **Storage** | 10.9TB RAID 5 (4x Seagate IronWolf 4TB, ST4000VN006) |
| **Role** | Backup target, media storage | | **Role** | Backup target, media storage |
## Network Shares ## Network Shares
@ -37,7 +37,70 @@ Synology NAS providing network storage and backup target.
## Monitoring ## Monitoring
Node exporter running in Docker container, scraped by [[prometheus]] at `sifaka:9100`. Prometheus exporters run as Docker containers, managed by Ansible (`mise run provision-sifaka`).
| Exporter | Port | Purpose |
|----------|------|---------|
| node_exporter | 9100 | System metrics (CPU, memory, disk I/O) |
| smartctl_exporter | 9633 | SMART disk health data |
Scraped by [[prometheus]] via Caddy L4 TCP proxy at `nas.ops.eblu.me:9100` and `nas.ops.eblu.me:9633`. Dashboard: [[grafana]] > Sifaka Disk Health.
## First-Time Setup
These steps were performed once to enable Ansible provisioning. They are documented here for reference if sifaka is ever replaced or reset.
### 1. Enable SSH
DSM Control Panel > Terminal & SNMP > Enable SSH service (port 22).
### 2. SSH Key Authentication
From a tailnet client with an existing SSH key:
```bash
ssh-copy-id eblume@sifaka # uses password auth initially
```
Synology requires strict permissions on the home directory. On sifaka:
```bash
chmod 755 ~ # DSM defaults to 777; SSH refuses keys otherwise
chmod 700 ~/.ssh
chmod 600 ~/.ssh/authorized_keys
```
Home directory path: `/var/services/homes/eblume`.
### 3. Passwordless Sudo for Docker
Ansible needs `become: true` for Docker commands. Create a sudoers drop-in:
```bash
sudo vi /etc/sudoers.d/docker-ansible
```
Contents:
```
eblume ALL=(ALL) NOPASSWD: /volume1/@appstore/ContainerManager/usr/bin/docker
```
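This limits passwordless sudo to the Docker binary rather than all commands. Note, however, that the ability to run arbitrary `docker` commands as root is effectively root-equivalent (for example via `--privileged` containers or bind-mounting `/`), so this account and its SSH key should be protected as if they had full root access.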
### 4. Docker Path
Synology installs Docker via Container Manager at a non-standard path:
```
/volume1/@appstore/ContainerManager/usr/bin/docker
```
This is configured in the `sifaka_exporters` role defaults.
### 5. Synology Device Naming
Synology uses `/dev/sata*` (e.g., `/dev/sata1` through `/dev/sata4`) instead of the standard `/dev/sd*` naming. The `smartctl_exporter` cannot auto-detect these devices, so they are passed explicitly via `--smartctl.device=` flags in the Ansible role.
## Tailscale ## Tailscale

9
mise-tasks/provision-sifaka Executable file
View file

@ -0,0 +1,9 @@
#!/usr/bin/env bash
#MISE description="Run ansible playbook to provision sifaka"
# Runs the sifaka playbook; any extra command-line arguments are forwarded
# unchanged to ansible-playbook.

# Abort on command failure, on use of unset variables, and on a failure in
# any stage of a pipeline.
set -euo pipefail
# Exported so child processes inherit it.
# NOTE(review): presumably selects mise's interleaved task output — confirm.
export MISE_TASK_OUTPUT=interleave
# Relative path: assumes the script starts in the directory that contains
# ansible/ — TODO confirm this holds for every way the task is invoked.
cd ansible
ansible-playbook playbooks/sifaka.yml "$@"