blumeops/argocd/manifests/grafana-config/dashboards/configmap-sifaka-disks.yaml
Erich Blume 85e36cd807 Operations and observability for sifaka NAS (#135)
## Summary
- Add `smartctl_exporter` Docker container to sifaka for SMART disk health monitoring
- Formalize existing `node_exporter` container under Ansible management
- Route both exporters through Caddy L4 TCP proxy (`nas.ops.eblu.me:9100`, `nas.ops.eblu.me:9633`), replacing the hardcoded LAN IP in Prometheus
- Create "Sifaka Disk Health" Grafana dashboard (health status, temperature, wear indicators, lifetime)
- Introduce `ansible/playbooks/sifaka.yml` and `mise run provision-sifaka` — first Ansible playbook for the NAS
- Shared exporter port variables in `group_vars/all.yml` to avoid duplication between Caddy and sifaka roles

## Prerequisites before deploy
- [ ] Enable SSH on sifaka (DSM Control Panel > Terminal & SNMP)
- [ ] Verify `ssh eblume@sifaka 'docker ps'` works
- [ ] Run `mise run provision-sifaka` to deploy containers
- [ ] Run `mise run provision-indri -- --tags caddy` to add L4 routes
- [ ] `argocd app sync prometheus` + `argocd app sync grafana-config`

## Test plan
- [ ] Verify smartctl_exporter metrics: `curl http://nas.ops.eblu.me:9633/metrics`
- [ ] Verify Prometheus targets page shows both sifaka jobs as UP
- [ ] Verify Grafana "Sifaka Disk Health" dashboard loads with data

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Reviewed-on: https://forge.ops.eblu.me/eblume/blumeops/pulls/135
2026-02-09 17:44:05 -08:00

314 lines
12 KiB
YAML

# ConfigMap that carries the "Sifaka Disk Health" Grafana dashboard as an
# embedded JSON document under the data key sifaka-disks.json.
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-dashboard-sifaka-disks
namespace: monitoring
labels:
# NOTE(review): presumably the label the Grafana dashboard sidecar/provisioner
# selects on to pick up this ConfigMap — confirm against the Grafana deployment.
grafana_dashboard: "1"
data:
# The value below is a YAML literal block scalar ("|") holding JSON.
# JSON has no comment syntax, so keep any notes out of the scalar body:
# a "#" line in there becomes part of the string rather than a YAML comment.
sifaka-disks.json: |
{
"annotations": { "list": [] },
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"id": null,
"links": [],
"panels": [
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
"id": 100,
"panels": [],
"title": "Health Overview",
"type": "row"
},
{
  "datasource": { "type": "prometheus", "uid": "prometheus" },
  "description": "Per-disk SMART status: a value of 1 is shown as HEALTHY and 0 as FAILING. The model name in the legend is joined in from the smartctl_device info metric, because smartctl_device_smart_status itself only carries the device label.",
  "fieldConfig": {
    "defaults": {
      "color": { "mode": "thresholds" },
      "mappings": [
        { "options": { "0": { "color": "red", "text": "FAILING" }, "1": { "color": "green", "text": "HEALTHY" } }, "type": "value" }
      ],
      "thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] }
    },
    "overrides": []
  },
  "gridPos": { "h": 6, "w": 24, "x": 0, "y": 1 },
  "id": 1,
  "options": {
    "colorMode": "background",
    "graphMode": "none",
    "justifyMode": "center",
    "orientation": "auto",
    "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
    "textMode": "value_and_name"
  },
  "targets": [
    {
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "expr": "smartctl_device_smart_status{job=\"smartctl-sifaka\"} * on (instance, device) group_left (model_name) smartctl_device{job=\"smartctl-sifaka\"}",
      "legendFormat": "{{device}} ({{model_name}})",
      "refId": "A"
    }
  ],
  "title": "SMART Health Status",
  "type": "stat"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 7 },
"id": 101,
"panels": [],
"title": "Temperature",
"type": "row"
},
{
  "datasource": { "type": "prometheus", "uid": "prometheus" },
  "description": "Most recent temperature reading per disk. The query keeps only series whose temperature_type label is current, since smartctl_device_temperature can expose other temperature kinds (such as trip or limit values) for the same device, which would otherwise appear as duplicate tiles.",
  "fieldConfig": {
    "defaults": {
      "color": { "mode": "thresholds" },
      "mappings": [],
      "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 40 }, { "color": "red", "value": 50 }] },
      "unit": "celsius"
    },
    "overrides": []
  },
  "gridPos": { "h": 4, "w": 24, "x": 0, "y": 8 },
  "id": 2,
  "options": {
    "colorMode": "value",
    "graphMode": "none",
    "justifyMode": "center",
    "orientation": "auto",
    "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
    "textMode": "value_and_name"
  },
  "targets": [
    {
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "expr": "smartctl_device_temperature{job=\"smartctl-sifaka\", temperature_type=\"current\"}",
      "legendFormat": "{{device}}",
      "refId": "A"
    }
  ],
  "title": "Current Temperature",
  "type": "stat"
},
{
  "datasource": { "type": "prometheus", "uid": "prometheus" },
  "description": "Temperature history per disk, with the 50 degree threshold drawn as a red line and area. Only series whose temperature_type label is current are plotted, so that static trip or limit values reported for the same device do not show up as extra lines or distort the legend mean and max.",
  "fieldConfig": {
    "defaults": {
      "color": { "mode": "palette-classic" },
      "custom": {
        "axisBorderShow": false,
        "axisCenteredZero": false,
        "axisLabel": "",
        "axisPlacement": "auto",
        "drawStyle": "line",
        "fillOpacity": 10,
        "gradientMode": "none",
        "hideFrom": { "legend": false, "tooltip": false, "viz": false },
        "lineInterpolation": "linear",
        "lineWidth": 1,
        "pointSize": 5,
        "showPoints": "never",
        "spanNulls": false,
        "stacking": { "group": "A", "mode": "none" },
        "thresholdsStyle": { "mode": "line+area" }
      },
      "mappings": [],
      "thresholds": { "mode": "absolute", "steps": [{ "color": "transparent", "value": null }, { "color": "red", "value": 50 }] },
      "unit": "celsius"
    },
    "overrides": []
  },
  "gridPos": { "h": 8, "w": 24, "x": 0, "y": 12 },
  "id": 3,
  "options": {
    "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "right", "showLegend": true },
    "tooltip": { "mode": "multi", "sort": "desc" }
  },
  "targets": [
    {
      "datasource": { "type": "prometheus", "uid": "prometheus" },
      "expr": "smartctl_device_temperature{job=\"smartctl-sifaka\", temperature_type=\"current\"}",
      "legendFormat": "{{device}}",
      "refId": "A"
    }
  ],
  "title": "Temperature Over Time",
  "type": "timeseries"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 20 },
"id": 102,
"panels": [],
"title": "Wear Indicators",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }, { "color": "red", "value": 10 }] }
},
"overrides": []
},
"gridPos": { "h": 4, "w": 24, "x": 0, "y": 21 },
"id": 4,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "center",
"orientation": "auto",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "value_and_name"
},
"targets": [
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "smartctl_device_attribute{job=\"smartctl-sifaka\", attribute_name=\"Reallocated_Sector_Ct\", attribute_value_type=\"raw\"}", "legendFormat": "{{device}}", "refId": "A" }
],
"title": "Reallocated Sectors",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }, { "color": "red", "value": 10 }] }
},
"overrides": []
},
"gridPos": { "h": 4, "w": 24, "x": 0, "y": 25 },
"id": 5,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "center",
"orientation": "auto",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "value_and_name"
},
"targets": [
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "smartctl_device_attribute{job=\"smartctl-sifaka\", attribute_name=\"Current_Pending_Sector\", attribute_value_type=\"raw\"}", "legendFormat": "{{device}}", "refId": "A" }
],
"title": "Pending Sectors",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }, { "color": "red", "value": 100 }] }
},
"overrides": []
},
"gridPos": { "h": 4, "w": 24, "x": 0, "y": 29 },
"id": 6,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "center",
"orientation": "auto",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "value_and_name"
},
"targets": [
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "smartctl_device_attribute{job=\"smartctl-sifaka\", attribute_name=\"UDMA_CRC_Error_Count\", attribute_value_type=\"raw\"}", "legendFormat": "{{device}}", "refId": "A" }
],
"title": "UDMA CRC Errors",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }, { "color": "red", "value": 10 }] }
},
"overrides": []
},
"gridPos": { "h": 4, "w": 24, "x": 0, "y": 33 },
"id": 7,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "center",
"orientation": "auto",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "value_and_name"
},
"targets": [
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "smartctl_device_attribute{job=\"smartctl-sifaka\", attribute_name=\"Offline_Uncorrectable\", attribute_value_type=\"raw\"}", "legendFormat": "{{device}}", "refId": "A" }
],
"title": "Offline Uncorrectable Sectors",
"type": "stat"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 37 },
"id": 103,
"panels": [],
"title": "Lifetime",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] },
"unit": "h"
},
"overrides": []
},
"gridPos": { "h": 4, "w": 24, "x": 0, "y": 38 },
"id": 8,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "center",
"orientation": "auto",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "value_and_name"
},
"targets": [
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "smartctl_device_power_on_seconds{job=\"smartctl-sifaka\"} / 3600", "legendFormat": "{{device}}", "refId": "A" }
],
"title": "Power-On Hours",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }
},
"overrides": []
},
"gridPos": { "h": 4, "w": 24, "x": 0, "y": 42 },
"id": 9,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "center",
"orientation": "auto",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "value_and_name"
},
"targets": [
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "smartctl_device_power_cycle_count{job=\"smartctl-sifaka\"}", "legendFormat": "{{device}}", "refId": "A" }
],
"title": "Power Cycle Count",
"type": "stat"
}
],
"refresh": "1m",
"schemaVersion": 38,
"tags": ["sifaka", "storage", "smart"],
"templating": { "list": [] },
"time": { "from": "now-24h", "to": "now" },
"timepicker": {},
"timezone": "browser",
"title": "Sifaka Disk Health",
"uid": "sifaka-disk-health",
"version": 1,
"weekStart": ""
}