Migrate observability stack to Kubernetes (#42)
Note: the name of this branch was chosen before the scope widened to encompass the entire observability stack.

Summary:
- Fix Grafana data source URLs (docker driver uses host.minikube.internal, not host.containers.internal)
- Migrate Prometheus and Loki from indri to Kubernetes with Tailscale Ingresses
- Expose CNPG PostgreSQL metrics via Tailscale and update dashboard to use cnpg_* metrics
- Update Alloy to push metrics/logs to k8s endpoints (prometheus.tail8d86e.ts.net, loki.tail8d86e.ts.net)
- Add ACL rule for port 9187 (CNPG metrics)
- Delete obsolete ansible roles for prometheus and loki

Changes:
- argocd/manifests/prometheus/ - New Prometheus StatefulSet with 20Gi PVC and Tailscale Ingress
- argocd/manifests/loki/ - New Loki StatefulSet with 20Gi PVC and Tailscale Ingress
- argocd/apps/prometheus.yaml, argocd/apps/loki.yaml - ArgoCD Applications
- argocd/manifests/grafana/values.yaml - Data sources now use k8s internal DNS
- argocd/manifests/databases/service-metrics-tailscale.yaml - CNPG metrics endpoint
- argocd/manifests/grafana-config/dashboards/configmap-postgresql.yaml - Updated to cnpg_* metrics
- ansible/roles/alloy/defaults/main.yml - Push to k8s Tailscale endpoints
- pulumi/policy.hujson - ACL for port 9187
- Deleted ansible/roles/prometheus/ and ansible/roles/loki/

Deployment and Testing:
- Stop prometheus and loki on indri
- Sync ArgoCD apps (apps, prometheus, loki, grafana)
- Run `mise run provision-indri -- --tags alloy`
- Verify Grafana dashboards show data

🤖 Generated with https://claude.ai/claude-code

Reviewed-on: https://forge.tail8d86e.ts.net/eblume/blumeops/pulls/42
This commit is contained in:
parent
5a829e0afd
commit
17023085cb
36 changed files with 569 additions and 270 deletions
|
|
@ -23,12 +23,8 @@
|
|||
tags: [borgmatic]
|
||||
|
||||
roles:
|
||||
- role: loki
|
||||
tags: loki
|
||||
- role: alloy
|
||||
tags: alloy
|
||||
- role: prometheus
|
||||
tags: prometheus
|
||||
- role: borgmatic
|
||||
tags: borgmatic
|
||||
- role: borgmatic_metrics
|
||||
|
|
|
|||
|
|
@ -1,14 +1,42 @@
|
|||
---
|
||||
# Grafana Alloy configuration
|
||||
#
|
||||
# BUILDING FROM SOURCE (required for CGO DNS resolution on macOS):
|
||||
#
|
||||
# Alloy must be built with CGO_ENABLED=1 to use macOS native DNS resolver,
|
||||
# which is required for Tailscale MagicDNS hostname resolution.
|
||||
# The Homebrew bottle is built with CGO_ENABLED=0.
|
||||
#
|
||||
# Build on dev machine (gilbert), then copy to indri:
|
||||
#
|
||||
# 1. Clone from forge mirror:
|
||||
# git clone ssh://forgejo@forge.tail8d86e.ts.net/eblume/alloy.git ~/code/3rd/alloy
|
||||
#
|
||||
# 2. Set up build tools via mise:
|
||||
# cd ~/code/3rd/alloy && mise use go@1.25 node yarn
|
||||
#
|
||||
# 3. Build with CGO enabled (default in Makefile):
|
||||
# cd ~/code/3rd/alloy && mise x -- make alloy
|
||||
#
|
||||
# 4. Copy binary to indri:
|
||||
# scp ~/code/3rd/alloy/build/alloy indri:~/.local/bin/alloy
|
||||
#
|
||||
# 5. Run ansible to deploy config and LaunchAgent
|
||||
|
||||
# Binary and paths
|
||||
alloy_binary: /Users/erichblume/.local/bin/alloy
|
||||
alloy_config_dir: /Users/erichblume/.config/grafana-alloy
|
||||
alloy_data_dir: /Users/erichblume/.local/share/grafana-alloy
|
||||
alloy_log_dir: /Users/erichblume/Library/Logs
|
||||
|
||||
# Textfile collector directory (same as node_exporter for compatibility)
|
||||
alloy_textfile_dir: /opt/homebrew/var/node_exporter/textfile
|
||||
|
||||
# Prometheus remote write endpoint
|
||||
alloy_prometheus_url: "http://localhost:9090/api/v1/write"
|
||||
# Prometheus remote write endpoint (k8s via Tailscale)
|
||||
alloy_prometheus_url: "https://prometheus.tail8d86e.ts.net/api/v1/write"
|
||||
|
||||
# Loki endpoint (used in Phase 2)
|
||||
alloy_loki_url: "http://localhost:3100/loki/api/v1/push"
|
||||
# Loki endpoint (k8s via Tailscale)
|
||||
alloy_loki_url: "https://loki.tail8d86e.ts.net/loki/api/v1/push"
|
||||
|
||||
# Instance label for metrics
|
||||
alloy_instance_label: indri
|
||||
|
|
@ -16,39 +44,21 @@ alloy_instance_label: indri
|
|||
# Scrape interval
|
||||
alloy_scrape_interval: "15s"
|
||||
|
||||
# Config paths
|
||||
alloy_config_dir: /opt/homebrew/etc/grafana-alloy
|
||||
alloy_data_dir: /opt/homebrew/var/lib/grafana-alloy/data
|
||||
|
||||
# Log paths to collect
|
||||
alloy_brew_logs:
|
||||
- path: /opt/homebrew/var/log/grafana-stdout.log
|
||||
service: grafana
|
||||
stream: stdout
|
||||
- path: /opt/homebrew/var/log/grafana-stderr.log
|
||||
service: grafana
|
||||
stream: stderr
|
||||
- path: /opt/homebrew/var/log/forgejo.log
|
||||
service: forgejo
|
||||
stream: stdout
|
||||
- path: /opt/homebrew/var/log/prometheus.err.log
|
||||
service: prometheus
|
||||
stream: stderr
|
||||
- path: /opt/homebrew/var/log/tailscaled.log
|
||||
service: tailscale
|
||||
stream: stdout
|
||||
- path: /opt/homebrew/var/transmission/transmission-daemon.log
|
||||
service: transmission
|
||||
stream: stdout
|
||||
# NOTE: postgresql and miniflux removed - now hosted in k8s
|
||||
|
||||
alloy_mcquack_logs:
|
||||
# NOTE: devpi logs removed - now hosted in k8s
|
||||
- path: /Users/erichblume/Library/Logs/mcquack.kiwix-serve.out.log
|
||||
service: kiwix
|
||||
- path: /Users/erichblume/Library/Logs/mcquack.alloy.out.log
|
||||
service: alloy
|
||||
stream: stdout
|
||||
- path: /Users/erichblume/Library/Logs/mcquack.kiwix-serve.err.log
|
||||
service: kiwix
|
||||
- path: /Users/erichblume/Library/Logs/mcquack.alloy.err.log
|
||||
service: alloy
|
||||
stream: stderr
|
||||
- path: /Users/erichblume/Library/Logs/mcquack.borgmatic.out.log
|
||||
service: borgmatic
|
||||
|
|
@ -75,8 +85,7 @@ alloy_collect_logs: true
|
|||
alloy_collect_zot: true
|
||||
alloy_zot_metrics_url: "http://localhost:5050/metrics"
|
||||
|
||||
# PostgreSQL metrics collection
|
||||
# NOTE: Disabled - brew postgresql removed, k8s CNPG metrics TBD
|
||||
# PostgreSQL metrics collection (disabled, CNPG metrics scraped directly by k8s Prometheus)
|
||||
alloy_collect_postgres: false
|
||||
alloy_postgres_host: localhost
|
||||
alloy_postgres_port: 5432
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
---
|
||||
- name: Restart alloy
|
||||
ansible.builtin.command: brew services restart grafana-alloy
|
||||
async: 120
|
||||
poll: 0
|
||||
ansible.builtin.shell: |
|
||||
launchctl unload ~/Library/LaunchAgents/mcquack.eblume.alloy.plist 2>/dev/null || true
|
||||
launchctl load ~/Library/LaunchAgents/mcquack.eblume.alloy.plist
|
||||
changed_when: true
|
||||
|
|
|
|||
|
|
@ -1,11 +1,18 @@
|
|||
---
|
||||
# Grafana Alloy installation and configuration
|
||||
# Replaces node_exporter for metrics, adds log collection
|
||||
# See defaults/main.yml for build instructions
|
||||
|
||||
- name: Install grafana-alloy via homebrew
|
||||
community.general.homebrew:
|
||||
name: grafana-alloy
|
||||
state: present
|
||||
- name: Verify alloy binary exists
|
||||
ansible.builtin.stat:
|
||||
path: "{{ alloy_binary }}"
|
||||
register: alloy_binary_stat
|
||||
|
||||
- name: Fail if alloy binary not found
|
||||
ansible.builtin.fail:
|
||||
msg: |
|
||||
Alloy binary not found at {{ alloy_binary }}.
|
||||
Please build from source first (see ansible/roles/alloy/defaults/main.yml)
|
||||
when: not alloy_binary_stat.stat.exists
|
||||
|
||||
- name: Ensure alloy config directory exists
|
||||
ansible.builtin.file:
|
||||
|
|
@ -68,8 +75,21 @@
|
|||
notify: Restart alloy
|
||||
no_log: true
|
||||
|
||||
- name: Ensure alloy service is started
|
||||
ansible.builtin.command: brew services start grafana-alloy
|
||||
register: alloy_brew_start
|
||||
changed_when: "'Successfully started' in alloy_brew_start.stdout"
|
||||
- name: Deploy alloy LaunchAgent plist
|
||||
ansible.builtin.template:
|
||||
src: alloy.plist.j2
|
||||
dest: ~/Library/LaunchAgents/mcquack.eblume.alloy.plist
|
||||
mode: '0644'
|
||||
notify: Restart alloy
|
||||
|
||||
- name: Check if alloy LaunchAgent is loaded
|
||||
ansible.builtin.command: launchctl list mcquack.eblume.alloy
|
||||
register: alloy_launchctl_check
|
||||
changed_when: false
|
||||
failed_when: false
|
||||
|
||||
- name: Load alloy LaunchAgent if not loaded
|
||||
ansible.builtin.command: launchctl load ~/Library/LaunchAgents/mcquack.eblume.alloy.plist
|
||||
when: alloy_launchctl_check.rc != 0
|
||||
changed_when: true
|
||||
failed_when: false
|
||||
|
|
|
|||
24
ansible/roles/alloy/templates/alloy.plist.j2
Normal file
24
ansible/roles/alloy/templates/alloy.plist.j2
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!-- {{ ansible_managed }} -->
|
||||
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
|
||||
<plist version="1.0">
|
||||
<dict>
|
||||
<key>Label</key>
|
||||
<string>mcquack.eblume.alloy</string>
|
||||
<key>ProgramArguments</key>
|
||||
<array>
|
||||
<string>{{ alloy_binary }}</string>
|
||||
<string>run</string>
|
||||
<string>{{ alloy_config_dir }}/config.alloy</string>
|
||||
<string>--storage.path={{ alloy_data_dir }}</string>
|
||||
</array>
|
||||
<key>RunAtLoad</key>
|
||||
<true/>
|
||||
<key>KeepAlive</key>
|
||||
<true/>
|
||||
<key>StandardOutPath</key>
|
||||
<string>{{ alloy_log_dir }}/mcquack.alloy.out.log</string>
|
||||
<key>StandardErrorPath</key>
|
||||
<string>{{ alloy_log_dir }}/mcquack.alloy.err.log</string>
|
||||
</dict>
|
||||
</plist>
|
||||
|
|
@ -43,7 +43,7 @@ prometheus.exporter.postgres "postgresql" {
|
|||
data_source_names = ["postgresql://{{ alloy_postgres_user }}:{{ alloy_postgres_password | urlencode }}@{{ alloy_postgres_host }}:{{ alloy_postgres_port }}/{{ alloy_postgres_database }}?sslmode=disable"]
|
||||
|
||||
// Custom queries for vacuum and XID monitoring
|
||||
custom_queries_config_path = "/opt/homebrew/etc/grafana-alloy/postgres_queries.yaml"
|
||||
custom_queries_config_path = "{{ alloy_config_dir }}/postgres_queries.yaml"
|
||||
}
|
||||
|
||||
// Scrape PostgreSQL metrics
|
||||
|
|
|
|||
|
|
@ -1,12 +0,0 @@
|
|||
---
|
||||
# Loki configuration
|
||||
|
||||
# Server settings
|
||||
loki_http_port: 3100
|
||||
|
||||
# Storage paths
|
||||
loki_data_dir: /opt/homebrew/var/loki
|
||||
loki_config_file: /opt/homebrew/etc/loki-local-config.yaml
|
||||
|
||||
# Retention settings
|
||||
loki_retention_period: 744h # 31 days
|
||||
|
|
@ -1,6 +0,0 @@
|
|||
---
|
||||
- name: Restart loki
|
||||
ansible.builtin.command: brew services restart loki
|
||||
async: 120
|
||||
poll: 0
|
||||
changed_when: true
|
||||
|
|
@ -1,2 +0,0 @@
|
|||
---
|
||||
dependencies: []
|
||||
|
|
@ -1,38 +0,0 @@
|
|||
---
|
||||
# Loki installation and configuration
|
||||
|
||||
- name: Install loki via homebrew
|
||||
community.general.homebrew:
|
||||
name: loki
|
||||
state: present
|
||||
|
||||
- name: Ensure loki data directory exists
|
||||
ansible.builtin.file:
|
||||
path: "{{ loki_data_dir }}"
|
||||
state: directory
|
||||
mode: '0755'
|
||||
|
||||
- name: Ensure loki chunks directory exists
|
||||
ansible.builtin.file:
|
||||
path: "{{ loki_data_dir }}/chunks"
|
||||
state: directory
|
||||
mode: '0755'
|
||||
|
||||
- name: Ensure loki rules directory exists
|
||||
ansible.builtin.file:
|
||||
path: "{{ loki_data_dir }}/rules"
|
||||
state: directory
|
||||
mode: '0755'
|
||||
|
||||
- name: Deploy loki configuration
|
||||
ansible.builtin.template:
|
||||
src: loki-config.yaml.j2
|
||||
dest: "{{ loki_config_file }}"
|
||||
mode: '0644'
|
||||
notify: Restart loki
|
||||
|
||||
- name: Ensure loki service is started
|
||||
ansible.builtin.command: brew services start loki
|
||||
register: loki_brew_start
|
||||
changed_when: "'Successfully started' in loki_brew_start.stdout"
|
||||
failed_when: false
|
||||
|
|
@ -1,54 +0,0 @@
|
|||
# {{ ansible_managed }}
|
||||
# Loki configuration for single-node deployment
|
||||
|
||||
auth_enabled: false
|
||||
|
||||
server:
|
||||
http_listen_port: {{ loki_http_port }}
|
||||
http_listen_address: 0.0.0.0
|
||||
grpc_listen_port: 9096
|
||||
|
||||
common:
|
||||
instance_addr: 127.0.0.1
|
||||
path_prefix: {{ loki_data_dir }}
|
||||
storage:
|
||||
filesystem:
|
||||
chunks_directory: {{ loki_data_dir }}/chunks
|
||||
rules_directory: {{ loki_data_dir }}/rules
|
||||
replication_factor: 1
|
||||
ring:
|
||||
kvstore:
|
||||
store: inmemory
|
||||
|
||||
query_range:
|
||||
results_cache:
|
||||
cache:
|
||||
embedded_cache:
|
||||
enabled: true
|
||||
max_size_mb: 100
|
||||
|
||||
schema_config:
|
||||
configs:
|
||||
- from: 2024-01-01
|
||||
store: tsdb
|
||||
object_store: filesystem
|
||||
schema: v13
|
||||
index:
|
||||
prefix: index_
|
||||
period: 24h
|
||||
|
||||
storage_config:
|
||||
tsdb_shipper:
|
||||
active_index_directory: {{ loki_data_dir }}/tsdb-index
|
||||
cache_location: {{ loki_data_dir }}/tsdb-cache
|
||||
|
||||
limits_config:
|
||||
retention_period: {{ loki_retention_period }}
|
||||
|
||||
compactor:
|
||||
working_directory: {{ loki_data_dir }}/compactor
|
||||
compaction_interval: 10m
|
||||
retention_enabled: true
|
||||
retention_delete_delay: 2h
|
||||
retention_delete_worker_count: 150
|
||||
delete_request_store: filesystem
|
||||
|
|
@ -3,3 +3,4 @@ minikube_metrics_dir: /opt/homebrew/var/node_exporter/textfile
|
|||
minikube_metrics_script: /Users/erichblume/bin/minikube-metrics
|
||||
minikube_metrics_interval: 60 # seconds between metric collection
|
||||
minikube_metrics_log_dir: /opt/homebrew/var/log
|
||||
minikube_metrics_user_home: /Users/erichblume
|
||||
|
|
|
|||
|
|
@ -5,6 +5,13 @@
|
|||
<dict>
|
||||
<key>Label</key>
|
||||
<string>mcquack.eblume.minikube-metrics</string>
|
||||
<key>EnvironmentVariables</key>
|
||||
<dict>
|
||||
<key>HOME</key>
|
||||
<string>{{ minikube_metrics_user_home }}</string>
|
||||
<key>PATH</key>
|
||||
<string>/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin</string>
|
||||
</dict>
|
||||
<key>ProgramArguments</key>
|
||||
<array>
|
||||
<string>{{ minikube_metrics_script }}</string>
|
||||
|
|
|
|||
|
|
@ -4,6 +4,10 @@
|
|||
|
||||
set -euo pipefail
|
||||
|
||||
# Use absolute paths for LaunchAgent compatibility
|
||||
MINIKUBE="/opt/homebrew/bin/minikube"
|
||||
KUBECTL="/opt/homebrew/bin/kubectl"
|
||||
|
||||
OUTPUT_FILE="{{ minikube_metrics_dir }}/minikube.prom"
|
||||
TEMP_FILE="${OUTPUT_FILE}.tmp"
|
||||
|
||||
|
|
@ -22,7 +26,7 @@ cat > "$TEMP_FILE" << 'HEADER'
|
|||
HEADER
|
||||
|
||||
# Check if minikube is running
|
||||
if minikube status --format='{% raw %}{{.Host}}{% endraw %}' 2>/dev/null | grep -q "Running"; then
|
||||
if $MINIKUBE status --format='{% raw %}{{.Host}}{% endraw %}' 2>/dev/null | grep -q "Running"; then
|
||||
echo "minikube_up 1" >> "$TEMP_FILE"
|
||||
else
|
||||
echo "minikube_up 0" >> "$TEMP_FILE"
|
||||
|
|
@ -35,22 +39,22 @@ else
|
|||
fi
|
||||
|
||||
# Check API server health
|
||||
if kubectl get --raw /healthz >/dev/null 2>&1; then
|
||||
if $KUBECTL get --raw /healthz >/dev/null 2>&1; then
|
||||
echo "minikube_apiserver_up 1" >> "$TEMP_FILE"
|
||||
else
|
||||
echo "minikube_apiserver_up 0" >> "$TEMP_FILE"
|
||||
fi
|
||||
|
||||
# Get node count
|
||||
NODE_COUNT=$(kubectl get nodes --no-headers 2>/dev/null | wc -l | tr -d ' ')
|
||||
NODE_COUNT=$($KUBECTL get nodes --no-headers 2>/dev/null | wc -l | tr -d ' ')
|
||||
echo "minikube_node_count ${NODE_COUNT:-0}" >> "$TEMP_FILE"
|
||||
|
||||
# Get pod count (all namespaces)
|
||||
POD_COUNT=$(kubectl get pods -A --no-headers 2>/dev/null | wc -l | tr -d ' ')
|
||||
POD_COUNT=$($KUBECTL get pods -A --no-headers 2>/dev/null | wc -l | tr -d ' ')
|
||||
echo "minikube_pod_count ${POD_COUNT:-0}" >> "$TEMP_FILE"
|
||||
|
||||
# Get namespace count
|
||||
NS_COUNT=$(kubectl get namespaces --no-headers 2>/dev/null | wc -l | tr -d ' ')
|
||||
NS_COUNT=$($KUBECTL get namespaces --no-headers 2>/dev/null | wc -l | tr -d ' ')
|
||||
echo "minikube_namespace_count ${NS_COUNT:-0}" >> "$TEMP_FILE"
|
||||
|
||||
# Atomic move
|
||||
|
|
|
|||
|
|
@ -1,4 +0,0 @@
|
|||
---
|
||||
- name: Restart prometheus
|
||||
ansible.builtin.command: brew services restart prometheus
|
||||
changed_when: true
|
||||
|
|
@ -1,25 +0,0 @@
|
|||
---
|
||||
- name: Install prometheus via homebrew
|
||||
community.general.homebrew:
|
||||
name: prometheus
|
||||
state: present
|
||||
|
||||
- name: Configure prometheus.yml
|
||||
ansible.builtin.template:
|
||||
src: prometheus.yml.j2
|
||||
dest: /opt/homebrew/etc/prometheus.yml
|
||||
mode: '0644'
|
||||
notify: Restart prometheus
|
||||
|
||||
- name: Configure prometheus.args
|
||||
ansible.builtin.template:
|
||||
src: prometheus.args.j2
|
||||
dest: /opt/homebrew/etc/prometheus.args
|
||||
mode: '0644'
|
||||
notify: Restart prometheus
|
||||
|
||||
- name: Ensure prometheus service is started
|
||||
ansible.builtin.command: brew services start prometheus
|
||||
register: prometheus_brew_start
|
||||
changed_when: "'Successfully started' in prometheus_brew_start.stdout"
|
||||
failed_when: false
|
||||
|
|
@ -1,4 +0,0 @@
|
|||
--config.file /opt/homebrew/etc/prometheus.yml
|
||||
--web.listen-address=0.0.0.0:9090
|
||||
--storage.tsdb.path /opt/homebrew/var/prometheus
|
||||
--web.enable-remote-write-receiver
|
||||
|
|
@ -1,15 +0,0 @@
|
|||
# {{ ansible_managed }}
|
||||
global:
|
||||
scrape_interval: 15s
|
||||
|
||||
# Note: indri system metrics are pushed via Alloy remote_write
|
||||
# Sifaka still uses traditional scraping via node_exporter
|
||||
|
||||
scrape_configs:
|
||||
- job_name: "node-exporter-sifaka"
|
||||
static_configs:
|
||||
- targets: ["sifaka:9100"]
|
||||
|
||||
- job_name: "loki"
|
||||
static_configs:
|
||||
- targets: ["localhost:3100"]
|
||||
Loading…
Add table
Add a link
Reference in a new issue