Operations and observability for sifaka NAS #135

Merged
eblume merged 6 commits from feature/sifaka-ops-observability into main 2026-02-09 17:44:06 -08:00
14 changed files with 456 additions and 5 deletions
Showing only changes of commit 483db74a3c - Show all commits

Add SMART disk health monitoring and Ansible provisioning for sifaka NAS

Adds smartctl_exporter alongside the existing node_exporter on sifaka,
routed through Caddy L4 TCP proxy at nas.ops.eblu.me, with a Grafana
dashboard for disk health visibility. Introduces the first Ansible
playbook for sifaka (mise run provision-sifaka) and shared exporter
port variables in group_vars/all.yml.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Erich Blume 2026-02-09 16:03:05 -08:00

View file

@ -1,2 +1,6 @@
---
# Banner text identifying Ansible-managed content and the repository it comes from.
# NOTE(review): presumably rendered into the header of templated files — confirm against the templates.
ansible_managed: "Managed by ansible - do not edit. Source: ssh://forgejo@forge.ops.eblu.me:2222/eblume/blumeops.git"
# Sifaka NAS exporter ports — shared by caddy (indri) and sifaka_exporters roles
# The caddy TCP proxy uses each value as both its own listen port and the
# backend port on sifaka, so the exporter on the NAS must be reachable on the
# same port number that is declared here.
sifaka_node_exporter_port: 9100
sifaka_smartctl_exporter_port: 9633

View file

@ -0,0 +1,3 @@
---
# Connection settings Ansible uses when managing this host.
# NOTE(review): this hunk does not show the file path; presumably host/group vars for the `nas` target — confirm.
# Remote login user for the SSH connection.
ansible_user: eblume
# Explicit interpreter path so Ansible does not have to auto-discover Python on the remote.
ansible_python_interpreter: /usr/bin/python3

View file

@ -0,0 +1,7 @@
---
# Playbook for the sifaka NAS: applies the exporter role to every host matched
# by the `nas` pattern. The role carries a tag of the same name so it can be
# selected with `--tags sifaka_exporters`.
- name: Configure sifaka
  hosts: nas
  roles:
    - role: sifaka_exporters
      tags:
        - sifaka_exporters

View file

@ -84,3 +84,7 @@ caddy_tcp_services:
backend: "localhost:2200" # Forgejo SSH
- port: 5432
backend: "pg.tail8d86e.ts.net:5432" # PostgreSQL
- port: "{{ sifaka_node_exporter_port }}"
backend: "sifaka:{{ sifaka_node_exporter_port }}" # Sifaka node_exporter
- port: "{{ sifaka_smartctl_exporter_port }}"
backend: "sifaka:{{ sifaka_smartctl_exporter_port }}" # Sifaka smartctl_exporter

View file

@ -0,0 +1,5 @@
---
# Docker images for Prometheus exporters on sifaka NAS
# Ports are defined in group_vars/all.yml (shared with caddy role)
# Both images track the floating `latest` tag rather than a pinned release.
# NOTE(review): consider pinning explicit versions so provisioning is reproducible.
sifaka_exporters_node_exporter_image: "prom/node-exporter:latest"
sifaka_exporters_smartctl_exporter_image: "prometheuscommunity/smartctl-exporter:latest"

View file

@ -0,0 +1,10 @@
---
# Handlers for the exporter containers on sifaka. Each one restarts an
# existing container in place through the docker CLI. `changed_when: true`
# is forced because a restart always alters runtime state.
- name: Restart node_exporter
  listen: Restart node_exporter
  ansible.builtin.command:
    argv:
      - docker
      - restart
      - node_exporter
  changed_when: true

- name: Restart smartctl_exporter
  listen: Restart smartctl_exporter
  ansible.builtin.command:
    argv:
      - docker
      - restart
      - smartctl_exporter
  changed_when: true

View file

@ -0,0 +1,81 @@
---
# Manage Prometheus exporter containers on sifaka NAS
# Uses the command module with the docker CLI (instead of Docker-specific modules) to avoid requiring the Docker Python SDK on Synology
# --- node_exporter ---
# Lifecycle: pull the image, look at the existing container (if any), remove
# it when it is out of date, then start a fresh one.
#
# "Out of date" means either the configured image reference changed OR the
# pull fetched a newer image for the same reference. The second condition
# matters for floating tags such as `latest`: the container's .Config.Image
# is the reference string it was started with, so it keeps matching the
# configured value even after the tag has moved to a newer image.
- name: Pull node_exporter image
  ansible.builtin.command:
    argv:
      - docker
      - pull
      - "{{ sifaka_exporters_node_exporter_image }}"
  register: sifaka_exporters_node_pull
  # docker reports "Downloaded newer image" only when the local copy was replaced.
  changed_when: "'Downloaded newer image' in sifaka_exporters_node_pull.stdout"

- name: Check if node_exporter container exists
  ansible.builtin.command:
    argv:
      - docker
      - inspect
      - node_exporter
      - --format
      - "{% raw %}{{.Config.Image}}{% endraw %}"
  register: sifaka_exporters_node_inspect
  changed_when: false
  # A non-zero rc is treated as "container does not exist", not as a failure;
  # the tasks below branch on rc.
  failed_when: false

- name: Remove node_exporter container if image changed
  ansible.builtin.command:
    argv:
      - docker
      - rm
      - -f
      - node_exporter
  when:
    - sifaka_exporters_node_inspect.rc == 0
    - >-
      sifaka_exporters_node_inspect.stdout != sifaka_exporters_node_exporter_image
      or sifaka_exporters_node_pull is changed
  changed_when: true

- name: Start node_exporter container
  ansible.builtin.command:
    argv:
      - docker
      - run
      - -d
      - --name=node_exporter
      - --restart=always
      - --net=host
      - --pid=host
      - -v
      - /:/host:ro,rslave
      - "{{ sifaka_exporters_node_exporter_image }}"
      - --path.rootfs=/host
      # With host networking there is no port mapping, so the shared port
      # variable has to be passed to the exporter itself to take effect.
      - "--web.listen-address=:{{ sifaka_node_exporter_port }}"
  register: sifaka_exporters_node_start
  # Runs when no container existed or when the one that existed was just
  # removed above (same staleness conditions).
  when: >-
    sifaka_exporters_node_inspect.rc != 0
    or sifaka_exporters_node_inspect.stdout != sifaka_exporters_node_exporter_image
    or sifaka_exporters_node_pull is changed
  changed_when: sifaka_exporters_node_start.rc == 0
# --- smartctl_exporter ---
# Lifecycle: pull the image, look at the existing container (if any), remove
# it when it is out of date, then start a fresh one.
#
# "Out of date" means either the configured image reference changed OR the
# pull fetched a newer image for the same reference. The second condition
# matters for floating tags such as `latest`: the container's .Config.Image
# is the reference string it was started with, so it keeps matching the
# configured value even after the tag has moved to a newer image.
- name: Pull smartctl_exporter image
  ansible.builtin.command:
    argv:
      - docker
      - pull
      - "{{ sifaka_exporters_smartctl_exporter_image }}"
  register: sifaka_exporters_smartctl_pull
  # docker reports "Downloaded newer image" only when the local copy was replaced.
  changed_when: "'Downloaded newer image' in sifaka_exporters_smartctl_pull.stdout"

- name: Check if smartctl_exporter container exists
  ansible.builtin.command:
    argv:
      - docker
      - inspect
      - smartctl_exporter
      - --format
      - "{% raw %}{{.Config.Image}}{% endraw %}"
  register: sifaka_exporters_smartctl_inspect
  changed_when: false
  # A non-zero rc is treated as "container does not exist", not as a failure;
  # the tasks below branch on rc.
  failed_when: false

- name: Remove smartctl_exporter container if image changed
  ansible.builtin.command:
    argv:
      - docker
      - rm
      - -f
      - smartctl_exporter
  when:
    - sifaka_exporters_smartctl_inspect.rc == 0
    - >-
      sifaka_exporters_smartctl_inspect.stdout != sifaka_exporters_smartctl_exporter_image
      or sifaka_exporters_smartctl_pull is changed
  changed_when: true

- name: Start smartctl_exporter container
  ansible.builtin.command:
    argv:
      - docker
      - run
      - -d
      - --name=smartctl_exporter
      - --restart=always
      # Privileged so the exporter can query the host's disks.
      - --privileged
      - -p
      # Host side follows the shared variable; container side is the
      # exporter's default listen port, which does not change when the
      # variable is overridden.
      - "{{ sifaka_smartctl_exporter_port }}:9633"
      - "{{ sifaka_exporters_smartctl_exporter_image }}"
  register: sifaka_exporters_smartctl_start
  # Runs when no container existed or when the one that existed was just
  # removed above (same staleness conditions).
  when: >-
    sifaka_exporters_smartctl_inspect.rc != 0
    or sifaka_exporters_smartctl_inspect.stdout != sifaka_exporters_smartctl_exporter_image
    or sifaka_exporters_smartctl_pull is changed
  changed_when: sifaka_exporters_smartctl_start.rc == 0

View file

@ -0,0 +1,314 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-dashboard-sifaka-disks
namespace: monitoring
labels:
grafana_dashboard: "1"
data:
sifaka-disks.json: |
{
"annotations": { "list": [] },
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"id": null,
"links": [],
"panels": [
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
"id": 100,
"panels": [],
"title": "Health Overview",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [
{ "options": { "0": { "color": "red", "text": "FAILING" }, "1": { "color": "green", "text": "HEALTHY" } }, "type": "value" }
],
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] }
},
"overrides": []
},
"gridPos": { "h": 4, "w": 24, "x": 0, "y": 1 },
"id": 1,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "center",
"orientation": "horizontal",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "value_and_name"
},
"targets": [
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "smartctl_device_smart_healthy{job=\"smartctl-sifaka\"}", "legendFormat": "{{device}} ({{model_name}})", "refId": "A" }
],
"title": "SMART Health Status",
"type": "stat"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 },
"id": 101,
"panels": [],
"title": "Temperature",
"type": "row"
},
{
  "datasource": { "type": "prometheus", "uid": "prometheus" },
  "fieldConfig": {
    "defaults": {
      "color": { "mode": "thresholds" },
      "mappings": [],
      "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 40 }, { "color": "red", "value": 50 }] },
      "unit": "celsius"
    },
    "overrides": []
  },
  "gridPos": { "h": 4, "w": 24, "x": 0, "y": 6 },
  "id": 2,
  "options": {
    "colorMode": "value",
    "graphMode": "none",
    "justifyMode": "center",
    "orientation": "horizontal",
    "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
    "textMode": "value_and_name"
  },
  "targets": [
    { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "smartctl_device_temperature{job=\"smartctl-sifaka\", temperature_type=\"current\"}", "legendFormat": "{{device}}", "refId": "A" }
  ],
  "title": "Current Temperature",
  "type": "stat"
},
{
  "datasource": { "type": "prometheus", "uid": "prometheus" },
  "fieldConfig": {
    "defaults": {
      "color": { "mode": "palette-classic" },
      "custom": {
        "axisBorderShow": false,
        "axisCenteredZero": false,
        "axisLabel": "",
        "axisPlacement": "auto",
        "drawStyle": "line",
        "fillOpacity": 10,
        "gradientMode": "none",
        "hideFrom": { "legend": false, "tooltip": false, "viz": false },
        "lineInterpolation": "linear",
        "lineWidth": 1,
        "pointSize": 5,
        "showPoints": "never",
        "spanNulls": false,
        "stacking": { "group": "A", "mode": "none" },
        "thresholdsStyle": { "mode": "line+area" }
      },
      "mappings": [],
      "thresholds": { "mode": "absolute", "steps": [{ "color": "transparent", "value": null }, { "color": "red", "value": 50 }] },
      "unit": "celsius"
    },
    "overrides": []
  },
  "gridPos": { "h": 8, "w": 24, "x": 0, "y": 10 },
  "id": 3,
  "options": {
    "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "right", "showLegend": true },
    "tooltip": { "mode": "multi", "sort": "desc" }
  },
  "targets": [
    { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "smartctl_device_temperature{job=\"smartctl-sifaka\", temperature_type=\"current\"}", "legendFormat": "{{device}}", "refId": "A" }
  ],
  "title": "Temperature Over Time",
  "type": "timeseries"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 18 },
"id": 102,
"panels": [],
"title": "Wear Indicators",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }, { "color": "red", "value": 10 }] }
},
"overrides": []
},
"gridPos": { "h": 4, "w": 12, "x": 0, "y": 19 },
"id": 4,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "center",
"orientation": "horizontal",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "value_and_name"
},
"targets": [
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "smartctl_device_attribute{job=\"smartctl-sifaka\", attribute_name=\"Reallocated_Sector_Ct\", attribute_value_type=\"raw\"}", "legendFormat": "{{device}}", "refId": "A" }
],
"title": "Reallocated Sectors",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }, { "color": "red", "value": 10 }] }
},
"overrides": []
},
"gridPos": { "h": 4, "w": 12, "x": 12, "y": 19 },
"id": 5,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "center",
"orientation": "horizontal",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "value_and_name"
},
"targets": [
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "smartctl_device_attribute{job=\"smartctl-sifaka\", attribute_name=\"Current_Pending_Sector\", attribute_value_type=\"raw\"}", "legendFormat": "{{device}}", "refId": "A" }
],
"title": "Pending Sectors",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }, { "color": "red", "value": 100 }] }
},
"overrides": []
},
"gridPos": { "h": 4, "w": 12, "x": 0, "y": 23 },
"id": 6,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "center",
"orientation": "horizontal",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "value_and_name"
},
"targets": [
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "smartctl_device_attribute{job=\"smartctl-sifaka\", attribute_name=\"UDMA_CRC_Error_Count\", attribute_value_type=\"raw\"}", "legendFormat": "{{device}}", "refId": "A" }
],
"title": "UDMA CRC Errors",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }, { "color": "red", "value": 10 }] }
},
"overrides": []
},
"gridPos": { "h": 4, "w": 12, "x": 12, "y": 23 },
"id": 7,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "center",
"orientation": "horizontal",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "value_and_name"
},
"targets": [
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "smartctl_device_attribute{job=\"smartctl-sifaka\", attribute_name=\"Offline_Uncorrectable\", attribute_value_type=\"raw\"}", "legendFormat": "{{device}}", "refId": "A" }
],
"title": "Offline Uncorrectable Sectors",
"type": "stat"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 27 },
"id": 103,
"panels": [],
"title": "Lifetime",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] },
"unit": "h"
},
"overrides": []
},
"gridPos": { "h": 4, "w": 12, "x": 0, "y": 28 },
"id": 8,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "center",
"orientation": "horizontal",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "value_and_name"
},
"targets": [
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "smartctl_device_power_on_seconds{job=\"smartctl-sifaka\"} / 3600", "legendFormat": "{{device}}", "refId": "A" }
],
"title": "Power-On Hours",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }
},
"overrides": []
},
"gridPos": { "h": 4, "w": 12, "x": 12, "y": 28 },
"id": 9,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "center",
"orientation": "horizontal",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "value_and_name"
},
"targets": [
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "smartctl_device_power_cycle_count{job=\"smartctl-sifaka\"}", "legendFormat": "{{device}}", "refId": "A" }
],
"title": "Power Cycle Count",
"type": "stat"
}
],
"refresh": "1m",
"schemaVersion": 38,
"tags": ["sifaka", "storage", "smart"],
"templating": { "list": [] },
"time": { "from": "now-24h", "to": "now" },
"timepicker": {},
"timezone": "browser",
"title": "Sifaka Disk Health",
"uid": "sifaka-disk-health",
"version": 1,
"weekStart": ""
}

View file

@ -19,6 +19,7 @@ resources:
- dashboards/configmap-zot.yaml
- dashboards/configmap-docs-apm.yaml
- dashboards/configmap-flyio.yaml
- dashboards/configmap-sifaka-disks.yaml
# TeslaMate dashboards
- dashboards/configmap-teslamate-overview.yaml
- dashboards/configmap-teslamate-charges.yaml

View file

@ -13,12 +13,15 @@ data:
# K8s services are scraped directly
scrape_configs:
# Sifaka NAS node-exporter (via LAN - Docker NATs through indri)
# Using LAN IP since k8s pods can reach LAN via Docker NAT (same as NFS mounts)
# If IP changes, fallback: create Tailscale egress in tailscale-operator/egress-sifaka.yaml
# Sifaka NAS exporters (via Caddy L4 TCP proxy on indri)
- job_name: "node-exporter-sifaka"
static_configs:
- targets: ["192.168.1.203:9100"]
- targets: ["nas.ops.eblu.me:9100"]
- job_name: "smartctl-sifaka"
scrape_interval: 60s
static_configs:
- targets: ["nas.ops.eblu.me:9633"]
# CNPG PostgreSQL metrics (k8s internal)
- job_name: "cnpg-postgres"

View file

@ -0,0 +1 @@
Add SMART disk health monitoring for sifaka NAS with smartctl_exporter, Grafana dashboard, Ansible playbook, and Caddy L4 routing via ops.eblu.me.

View file

@ -62,6 +62,8 @@ DNS CNAMEs point to `blumeops-proxy.fly.dev`. TLS via Fly.io-managed Let's Encry
| 443 | Caddy | HTTPS | 0.0.0.0 | Reverse proxy |
| 2222 | Caddy L4 | TCP | 0.0.0.0 | SSH proxy to Forgejo |
| 5432 | Caddy L4 | TCP | 0.0.0.0 | PostgreSQL proxy |
| 9100 | Caddy L4 | TCP | 0.0.0.0 | Sifaka node_exporter proxy |
| 9633 | Caddy L4 | TCP | 0.0.0.0 | Sifaka smartctl_exporter proxy |
| 2200 | Forgejo SSH | TCP | localhost | Built-in SSH server |
| 3001 | Forgejo | HTTP | localhost | Web UI |
| 5050 | Zot | HTTP | localhost | Registry API |

View file

@ -37,7 +37,14 @@ Synology NAS providing network storage and backup target.
## Monitoring
Node exporter running in Docker container, scraped by [[prometheus]] at `sifaka:9100`.
Prometheus exporters run as Docker containers, managed by Ansible (`mise run provision-sifaka`).
| Exporter | Port | Purpose |
|----------|------|---------|
| node_exporter | 9100 | System metrics (CPU, memory, disk I/O) |
| smartctl_exporter | 9633 | SMART disk health data |
Scraped by [[prometheus]] via Caddy L4 TCP proxy at `nas.ops.eblu.me:9100` and `nas.ops.eblu.me:9633`. Dashboard: [[grafana]] > Sifaka Disk Health.
## Tailscale

9
mise-tasks/provision-sifaka Executable file
View file

@ -0,0 +1,9 @@
#!/usr/bin/env bash
#MISE description="Run ansible playbook to provision sifaka"
# Runs the sifaka playbook from inside the ansible/ directory, forwarding any
# extra command-line arguments straight to ansible-playbook.

# Abort on command failure, on use of an unset variable, and on a failure
# anywhere in a pipeline.
set -o errexit -o nounset -o pipefail

# Exported so it is visible to child processes.
# NOTE(review): presumably read by mise to interleave task output — confirm.
MISE_TASK_OUTPUT=interleave
export MISE_TASK_OUTPUT

# The playbook path below is relative to ansible/, which is itself resolved
# against the directory this task is started from.
cd ansible
ansible-playbook playbooks/sifaka.yml "$@"