blumeops/argocd/manifests/grafana-config/dashboards/configmap-sifaka-disks.yaml
Erich Blume 483db74a3c Add SMART disk health monitoring and Ansible provisioning for sifaka NAS
Adds smartctl_exporter alongside the existing node_exporter on sifaka,
routed through Caddy L4 TCP proxy at nas.ops.eblu.me, with a Grafana
dashboard for disk health visibility. Introduces the first Ansible
playbook for sifaka (mise run provision-sifaka) and shared exporter
port variables in group_vars/all.yml.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-09 16:03:05 -08:00

314 lines
12 KiB
YAML

apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-dashboard-sifaka-disks
namespace: monitoring
labels:
grafana_dashboard: "1"
data:
sifaka-disks.json: |
{
"annotations": { "list": [] },
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"id": null,
"links": [],
"panels": [
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
"id": 100,
"panels": [],
"title": "Health Overview",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [
{ "options": { "0": { "color": "red", "text": "FAILING" }, "1": { "color": "green", "text": "HEALTHY" } }, "type": "value" }
],
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] }
},
"overrides": []
},
"gridPos": { "h": 4, "w": 24, "x": 0, "y": 1 },
"id": 1,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "center",
"orientation": "horizontal",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "value_and_name"
},
"targets": [
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "smartctl_device_smart_healthy{job=\"smartctl-sifaka\"}", "legendFormat": "{{device}} ({{model_name}})", "refId": "A" }
],
"title": "SMART Health Status",
"type": "stat"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 },
"id": 101,
"panels": [],
"title": "Temperature",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 40 }, { "color": "red", "value": 50 }] },
"unit": "celsius"
},
"overrides": []
},
"gridPos": { "h": 4, "w": 24, "x": 0, "y": 6 },
"id": 2,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "center",
"orientation": "horizontal",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "value_and_name"
},
"targets": [
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "smartctl_device_temperature{job=\"smartctl-sifaka\"}", "legendFormat": "{{device}}", "refId": "A" }
],
"title": "Current Temperature",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisLabel": "",
"axisPlacement": "auto",
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"showPoints": "never",
"spanNulls": false,
"stacking": { "group": "A", "mode": "none" },
"thresholdsStyle": { "mode": "line+area" }
},
"mappings": [],
"thresholds": { "mode": "absolute", "steps": [{ "color": "transparent", "value": null }, { "color": "red", "value": 50 }] },
"unit": "celsius"
},
"overrides": []
},
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 10 },
"id": 3,
"options": {
"legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "right", "showLegend": true },
"tooltip": { "mode": "multi", "sort": "desc" }
},
"targets": [
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "smartctl_device_temperature{job=\"smartctl-sifaka\"}", "legendFormat": "{{device}}", "refId": "A" }
],
"title": "Temperature Over Time",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 18 },
"id": 102,
"panels": [],
"title": "Wear Indicators",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }, { "color": "red", "value": 10 }] }
},
"overrides": []
},
"gridPos": { "h": 4, "w": 12, "x": 0, "y": 19 },
"id": 4,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "center",
"orientation": "horizontal",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "value_and_name"
},
"targets": [
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "smartctl_device_attribute{job=\"smartctl-sifaka\", attribute_name=\"Reallocated_Sector_Ct\", attribute_value_type=\"raw\"}", "legendFormat": "{{device}}", "refId": "A" }
],
"title": "Reallocated Sectors",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }, { "color": "red", "value": 10 }] }
},
"overrides": []
},
"gridPos": { "h": 4, "w": 12, "x": 12, "y": 19 },
"id": 5,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "center",
"orientation": "horizontal",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "value_and_name"
},
"targets": [
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "smartctl_device_attribute{job=\"smartctl-sifaka\", attribute_name=\"Current_Pending_Sector\", attribute_value_type=\"raw\"}", "legendFormat": "{{device}}", "refId": "A" }
],
"title": "Pending Sectors",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }, { "color": "red", "value": 100 }] }
},
"overrides": []
},
"gridPos": { "h": 4, "w": 12, "x": 0, "y": 23 },
"id": 6,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "center",
"orientation": "horizontal",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "value_and_name"
},
"targets": [
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "smartctl_device_attribute{job=\"smartctl-sifaka\", attribute_name=\"UDMA_CRC_Error_Count\", attribute_value_type=\"raw\"}", "legendFormat": "{{device}}", "refId": "A" }
],
"title": "UDMA CRC Errors",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 1 }, { "color": "red", "value": 10 }] }
},
"overrides": []
},
"gridPos": { "h": 4, "w": 12, "x": 12, "y": 23 },
"id": 7,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "center",
"orientation": "horizontal",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "value_and_name"
},
"targets": [
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "smartctl_device_attribute{job=\"smartctl-sifaka\", attribute_name=\"Offline_Uncorrectable\", attribute_value_type=\"raw\"}", "legendFormat": "{{device}}", "refId": "A" }
],
"title": "Offline Uncorrectable Sectors",
"type": "stat"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 27 },
"id": 103,
"panels": [],
"title": "Lifetime",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] },
"unit": "h"
},
"overrides": []
},
"gridPos": { "h": 4, "w": 12, "x": 0, "y": 28 },
"id": 8,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "center",
"orientation": "horizontal",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "value_and_name"
},
"targets": [
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "smartctl_device_power_on_seconds{job=\"smartctl-sifaka\"} / 3600", "legendFormat": "{{device}}", "refId": "A" }
],
"title": "Power-On Hours",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }
},
"overrides": []
},
"gridPos": { "h": 4, "w": 12, "x": 12, "y": 28 },
"id": 9,
"options": {
"colorMode": "value",
"graphMode": "none",
"justifyMode": "center",
"orientation": "horizontal",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "value_and_name"
},
"targets": [
{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "smartctl_device_power_cycle_count{job=\"smartctl-sifaka\"}", "legendFormat": "{{device}}", "refId": "A" }
],
"title": "Power Cycle Count",
"type": "stat"
}
],
"refresh": "1m",
"schemaVersion": 38,
"tags": ["sifaka", "storage", "smart"],
"templating": { "list": [] },
"time": { "from": "now-24h", "to": "now" },
"timepicker": {},
"timezone": "browser",
"title": "Sifaka Disk Health",
"uid": "sifaka-disk-health",
"version": 1,
"weekStart": ""
}