Log filtering cleanup and observability improvements #45

Merged
eblume merged 7 commits from feature/log-filtering-cleanup into main 2026-01-22 17:30:08 -08:00
6 changed files with 352 additions and 16 deletions
Showing only changes of commit 358bbcdffb - Show all commits

Add macOS power/thermal metrics collection and dashboard

- Add powermetrics collector to Alloy role (via LaunchDaemon, requires root)
- Collect CPU, GPU, ANE power (watts) and thermal pressure level
- Add "Power & Thermal" section to macOS Grafana dashboard with:
  - Total power stat
  - Thermal pressure indicator (Nominal/Moderate/Heavy/Critical)
  - Stacked power consumption graph (CPU/GPU/ANE)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Erich Blume 2026-01-22 16:59:07 -08:00

View file

@ -96,3 +96,8 @@ alloy_postgres_database: postgres
alloy_op_vault: vg6xf6vvfmoh5hqjjhlhbeoaie alloy_op_vault: vg6xf6vvfmoh5hqjjhlhbeoaie
alloy_op_postgres_item: guxu3j7ajhjyey6xxl2ovsl2ui alloy_op_postgres_item: guxu3j7ajhjyey6xxl2ovsl2ui
alloy_op_postgres_field: alloy-user-pw alloy_op_postgres_field: alloy-user-pw
# macOS power metrics collection (via powermetrics, requires root)
# Master switch: the power-metrics deploy/check/load tasks only run when this
# evaluates to true.
alloy_collect_power_metrics: true
# Path the collector script template is installed to; the LaunchDaemon plist
# uses the same path as the program to execute.
alloy_power_metrics_script: /usr/local/bin/macos-power-metrics
# Seconds between collections; rendered into the plist's StartInterval.
alloy_power_metrics_interval: 30 # seconds between collection

View file

@ -4,3 +4,10 @@
launchctl unload ~/Library/LaunchAgents/mcquack.eblume.alloy.plist 2>/dev/null || true launchctl unload ~/Library/LaunchAgents/mcquack.eblume.alloy.plist 2>/dev/null || true
launchctl load ~/Library/LaunchAgents/mcquack.eblume.alloy.plist launchctl load ~/Library/LaunchAgents/mcquack.eblume.alloy.plist
changed_when: true changed_when: true
# Handler: restart the power-metrics LaunchDaemon so that a freshly deployed
# script or plist takes effect. Notified by the deploy tasks.
- name: Reload macos-power-metrics
  # The plist lives in /Library/LaunchDaemons, so launchctl must run as root.
  become: true
  # A reload always counts as a change; the shell module cannot tell otherwise.
  changed_when: true
  # Unload is best-effort (the job may not be loaded yet); load must succeed.
  ansible.builtin.shell: |
    launchctl unload /Library/LaunchDaemons/mcquack.eblume.macos-power-metrics.plist 2>/dev/null || true
    launchctl load /Library/LaunchDaemons/mcquack.eblume.macos-power-metrics.plist

View file

@ -93,3 +93,39 @@
when: alloy_launchctl_check.rc != 0 when: alloy_launchctl_check.rc != 0
changed_when: true changed_when: true
failed_when: false failed_when: false
# === macOS Power Metrics (requires root) ===
# Installs the powermetrics collector script and its LaunchDaemon, then makes
# sure the daemon is loaded. Every task is gated on alloy_collect_power_metrics.

# The script is executed as root by launchd, so pin ownership to root to keep
# it from being editable by an unprivileged user.
- name: Deploy macos-power-metrics script
  ansible.builtin.template:
    src: macos-power-metrics.sh.j2
    dest: "{{ alloy_power_metrics_script }}"
    owner: root
    group: wheel
    mode: '0755'
  become: true
  notify: Reload macos-power-metrics
  when: alloy_collect_power_metrics | default(false)

# NOTE(review): launchd is expected to reject daemon plists that are not
# root-owned, hence the explicit owner/group - confirm against launchd.plist(5).
- name: Deploy macos-power-metrics LaunchDaemon plist
  ansible.builtin.template:
    src: macos-power-metrics.plist.j2
    dest: /Library/LaunchDaemons/mcquack.eblume.macos-power-metrics.plist
    owner: root
    group: wheel
    mode: '0644'
  become: true
  notify: Reload macos-power-metrics
  when: alloy_collect_power_metrics | default(false)

# Must run as root: without privilege escalation `launchctl list` inspects the
# invoking user's launchd domain, where a system LaunchDaemon never appears.
# The check would then always return non-zero and the load task below would
# re-run (and report "changed") on every play.
- name: Check if macos-power-metrics LaunchDaemon is loaded
  ansible.builtin.command: launchctl list mcquack.eblume.macos-power-metrics
  register: alloy_power_metrics_launchctl_check
  become: true
  changed_when: false
  failed_when: false
  when: alloy_collect_power_metrics | default(false)

# Only runs when the check above did not find the job. The feature flag is
# listed first so the registered result is not dereferenced when the check
# task was skipped.
- name: Load macos-power-metrics LaunchDaemon if not loaded
  ansible.builtin.command: launchctl load /Library/LaunchDaemons/mcquack.eblume.macos-power-metrics.plist
  become: true
  when:
    - alloy_collect_power_metrics | default(false)
    - alloy_power_metrics_launchctl_check.rc != 0
  changed_when: true
  # Best-effort, matching the Alloy LaunchAgent load task earlier in this file.
  failed_when: false

View file

@ -0,0 +1,21 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- {{ ansible_managed }} -->
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<!-- launchd job that periodically runs the macOS power metrics collector script. -->
<!-- Job label; the Ansible check task looks the job up by this name. -->
<key>Label</key>
<string>mcquack.eblume.macos-power-metrics</string>
<!-- Program to execute: the collector script path, with no extra arguments. -->
<key>ProgramArguments</key>
<array>
<string>{{ alloy_power_metrics_script }}</string>
</array>
<!-- Seconds between runs, taken from alloy_power_metrics_interval. -->
<key>StartInterval</key>
<integer>{{ alloy_power_metrics_interval }}</integer>
<!-- Also run once as soon as the job is loaded, instead of waiting a full interval. -->
<key>RunAtLoad</key>
<true/>
<!-- The script's stderr and stdout are captured in these log files. -->
<key>StandardErrorPath</key>
<string>/var/log/mcquack.macos-power-metrics.err.log</string>
<key>StandardOutPath</key>
<string>/var/log/mcquack.macos-power-metrics.out.log</string>
</dict>
</plist>

View file

@ -0,0 +1,79 @@
#!/bin/bash
# {{ ansible_managed }}
#
# Collects macOS power and thermal metrics for node_exporter textfile collector.
# Requires root to run powermetrics.
#
# Takes one powermetrics sample, extracts the CPU / GPU / ANE / combined power
# readings (reported in mW, exported in W) and the thermal pressure level, and
# writes them to macos_power.prom through a temp file plus rename so a reader
# never sees a half-written file. If powermetrics fails or prints nothing, every
# metric is written as 0.
set -euo pipefail

OUTPUT_FILE="{{ alloy_textfile_dir }}/macos_power.prom"
TEMP_FILE="${OUTPUT_FILE}.tmp"

# Never leave a stale temp file behind if the script aborts part-way through.
trap 'rm -f "$TEMP_FILE"' EXIT

# write_metrics CPU_W GPU_W ANE_W COMBINED_W THERMAL_VALUE
# Renders the metrics file and moves it into place.
write_metrics() {
    local cpu_w="$1" gpu_w="$2" ane_w="$3" combined_w="$4" thermal="$5"

    cat > "$TEMP_FILE" << EOF
# HELP macos_cpu_power_watts CPU power consumption in watts
# TYPE macos_cpu_power_watts gauge
macos_cpu_power_watts $cpu_w
# HELP macos_gpu_power_watts GPU power consumption in watts
# TYPE macos_gpu_power_watts gauge
macos_gpu_power_watts $gpu_w
# HELP macos_ane_power_watts Apple Neural Engine power consumption in watts
# TYPE macos_ane_power_watts gauge
macos_ane_power_watts $ane_w
# HELP macos_combined_power_watts Combined CPU+GPU+ANE power consumption in watts
# TYPE macos_combined_power_watts gauge
macos_combined_power_watts $combined_w
# HELP macos_thermal_pressure Current thermal pressure level (0=Nominal, 1=Moderate, 2=Heavy, 3=Critical)
# TYPE macos_thermal_pressure gauge
macos_thermal_pressure $thermal
EOF
    # Atomic move
    mv "$TEMP_FILE" "$OUTPUT_FILE"
}

# power_watts LABEL
# Prints, in watts with three decimals, the reading from the first line of
# POWER_OUTPUT that starts with LABEL. The reading is located as the numeric
# field directly before the "mW" unit rather than by a fixed column, because a
# label such as "Combined Power (CPU + GPU + ANE):" itself contains spaces and
# shifts the column. Prints 0.000 when no valid reading is found, so a format
# change can never feed a non-numeric value into the arithmetic.
power_watts() {
    awk -v label="$1" '
        index($0, label) == 1 {
            for (i = 2; i <= NF; i++) {
                if ($i == "mW" && $(i - 1) ~ /^[0-9]+(\.[0-9]+)?$/) {
                    mw = $(i - 1)
                    exit
                }
            }
        }
        END { printf "%.3f", mw / 1000 }
    ' <<< "$POWER_OUTPUT"
}

# Run powermetrics for one sample.
# NOTE(review): -i is the sample interval in milliseconds, so "-i 1" is a very
# short window and readings may be noisy - confirm this is intended.
POWER_OUTPUT=$(/usr/bin/powermetrics --samplers cpu_power,thermal -n 1 -i 1 2>/dev/null || true)

if [ -z "$POWER_OUTPUT" ]; then
    # powermetrics failed, write zeros
    write_metrics 0 0 0 0 0
    exit 0
fi

# Parse power values (in mW, converted to W)
CPU_POWER=$(power_watts "CPU Power:")
GPU_POWER=$(power_watts "GPU Power:")
ANE_POWER=$(power_watts "ANE Power:")
COMBINED_POWER=$(power_watts "Combined Power")

# Parse thermal pressure level (fourth word of "Current pressure level: X")
THERMAL_LEVEL=$(awk '/Current pressure level:/ { print $4; exit }' <<< "$POWER_OUTPUT")

case "${THERMAL_LEVEL:-Nominal}" in
    Nominal) THERMAL_VALUE=0 ;;
    Moderate) THERMAL_VALUE=1 ;;
    Heavy) THERMAL_VALUE=2 ;;
    # NOTE(review): macOS appears to report "Trapping"/"Sleeping" for the most
    # severe states; they are mapped to the top of the scale so they are not
    # shown as Nominal - confirm against powermetrics output.
    Critical | Trapping | Sleeping) THERMAL_VALUE=3 ;;
    *) THERMAL_VALUE=0 ;;
esac

# Write metrics
write_metrics "$CPU_POWER" "$GPU_POWER" "$ANE_POWER" "$COMBINED_POWER" "$THERMAL_VALUE"

View file

@ -271,6 +271,194 @@ data:
{ {
"collapsed": false, "collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 }, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 },
"id": 106,
"panels": [],
"title": "Power & Thermal",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 5 },
{ "color": "orange", "value": 10 },
{ "color": "red", "value": 15 }
]
},
"unit": "watt",
"decimals": 1
},
"overrides": []
},
"gridPos": { "h": 4, "w": 4, "x": 0, "y": 6 },
"id": 60,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"textMode": "auto"
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "macos_combined_power_watts{instance=~\"$instance\"}",
"refId": "A"
}
],
"title": "Total Power",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [
{ "options": { "0": { "color": "green", "index": 0, "text": "Nominal" } }, "type": "value" },
{ "options": { "1": { "color": "yellow", "index": 1, "text": "Moderate" } }, "type": "value" },
{ "options": { "2": { "color": "orange", "index": 2, "text": "Heavy" } }, "type": "value" },
{ "options": { "3": { "color": "red", "index": 3, "text": "Critical" } }, "type": "value" }
],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 1 },
{ "color": "orange", "value": 2 },
{ "color": "red", "value": 3 }
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": { "h": 4, "w": 4, "x": 4, "y": 6 },
"id": 61,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": ["lastNotNull"],
"fields": "",
"values": false
},
"textMode": "auto"
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "macos_thermal_pressure{instance=~\"$instance\"}",
"refId": "A"
}
],
"title": "Thermal Pressure",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 30,
"gradientMode": "opacity",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"insertNulls": false,
"lineInterpolation": "smooth",
"lineWidth": 2,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "never",
"spanNulls": false,
"stacking": { "group": "A", "mode": "normal" },
"thresholdsStyle": { "mode": "off" }
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [{ "color": "green", "value": null }]
},
"unit": "watt"
},
"overrides": [
{
"matcher": { "id": "byName", "options": "CPU" },
"properties": [
{ "id": "color", "value": { "fixedColor": "orange", "mode": "fixed" } }
]
},
{
"matcher": { "id": "byName", "options": "GPU" },
"properties": [
{ "id": "color", "value": { "fixedColor": "blue", "mode": "fixed" } }
]
},
{
"matcher": { "id": "byName", "options": "ANE" },
"properties": [
{ "id": "color", "value": { "fixedColor": "purple", "mode": "fixed" } }
]
}
]
},
"gridPos": { "h": 8, "w": 16, "x": 8, "y": 6 },
"id": 62,
"options": {
"legend": {
"calcs": ["mean", "max", "lastNotNull"],
"displayMode": "table",
"placement": "right",
"showLegend": true
},
"tooltip": { "mode": "multi", "sort": "desc" }
},
"targets": [
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "macos_cpu_power_watts{instance=~\"$instance\"}",
"legendFormat": "CPU",
"refId": "A"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "macos_gpu_power_watts{instance=~\"$instance\"}",
"legendFormat": "GPU",
"refId": "B"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"expr": "macos_ane_power_watts{instance=~\"$instance\"}",
"legendFormat": "ANE",
"refId": "C"
}
],
"title": "Power Consumption",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 },
"id": 101, "id": 101,
"panels": [], "panels": [],
"title": "CPU", "title": "CPU",
@ -350,7 +538,7 @@ data:
} }
] ]
}, },
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 }, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 15 },
"id": 10, "id": 10,
"options": { "options": {
"legend": { "legend": {
@ -435,7 +623,7 @@ data:
} }
] ]
}, },
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 }, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 15 },
"id": 11, "id": 11,
"options": { "options": {
"legend": { "legend": {
@ -471,7 +659,7 @@ data:
}, },
{ {
"collapsed": false, "collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 }, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 23 },
"id": 102, "id": 102,
"panels": [], "panels": [],
"title": "Memory (macOS)", "title": "Memory (macOS)",
@ -559,7 +747,7 @@ data:
} }
] ]
}, },
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 15 }, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 },
"id": 20, "id": 20,
"options": { "options": {
"legend": { "legend": {
@ -626,7 +814,7 @@ data:
}, },
"overrides": [] "overrides": []
}, },
"gridPos": { "h": 8, "w": 4, "x": 12, "y": 15 }, "gridPos": { "h": 8, "w": 4, "x": 12, "y": 24 },
"id": 21, "id": 21,
"options": { "options": {
"orientation": "auto", "orientation": "auto",
@ -702,7 +890,7 @@ data:
} }
] ]
}, },
"gridPos": { "h": 8, "w": 8, "x": 16, "y": 15 }, "gridPos": { "h": 8, "w": 8, "x": 16, "y": 24 },
"id": 22, "id": 22,
"options": { "options": {
"legend": { "legend": {
@ -732,7 +920,7 @@ data:
}, },
{ {
"collapsed": false, "collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 23 }, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 32 },
"id": 103, "id": 103,
"panels": [], "panels": [],
"title": "Disk", "title": "Disk",
@ -792,7 +980,7 @@ data:
} }
] ]
}, },
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 }, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 33 },
"id": 30, "id": 30,
"options": { "options": {
"legend": { "legend": {
@ -874,7 +1062,7 @@ data:
} }
] ]
}, },
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 }, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 33 },
"id": 31, "id": 31,
"options": { "options": {
"legend": { "legend": {
@ -904,7 +1092,7 @@ data:
}, },
{ {
"collapsed": false, "collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 32 }, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 41 },
"id": 104, "id": 104,
"panels": [], "panels": [],
"title": "Filesystem", "title": "Filesystem",
@ -930,7 +1118,7 @@ data:
}, },
"overrides": [] "overrides": []
}, },
"gridPos": { "h": 6, "w": 12, "x": 0, "y": 33 }, "gridPos": { "h": 6, "w": 12, "x": 0, "y": 42 },
"id": 40, "id": 40,
"options": { "options": {
"displayMode": "gradient", "displayMode": "gradient",
@ -991,7 +1179,7 @@ data:
}, },
"overrides": [] "overrides": []
}, },
"gridPos": { "h": 6, "w": 12, "x": 12, "y": 33 }, "gridPos": { "h": 6, "w": 12, "x": 12, "y": 42 },
"id": 41, "id": 41,
"options": { "options": {
"legend": { "legend": {
@ -1015,7 +1203,7 @@ data:
}, },
{ {
"collapsed": false, "collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 39 }, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 48 },
"id": 105, "id": 105,
"panels": [], "panels": [],
"title": "Network", "title": "Network",
@ -1075,7 +1263,7 @@ data:
} }
] ]
}, },
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 40 }, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 49 },
"id": 50, "id": 50,
"options": { "options": {
"legend": { "legend": {
@ -1157,7 +1345,7 @@ data:
} }
] ]
}, },
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 40 }, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 49 },
"id": 51, "id": 51,
"options": { "options": {
"legend": { "legend": {
@ -1220,7 +1408,7 @@ data:
}, },
"overrides": [] "overrides": []
}, },
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 48 }, "gridPos": { "h": 8, "w": 24, "x": 0, "y": 57 },
"id": 52, "id": 52,
"options": { "options": {
"legend": { "legend": {