Migrate observability stack to Kubernetes (#42)
Note: the name of this branch was chosen before the scope widened to encompass the entire observability stack.

## Summary

- Fix Grafana data source URLs (docker driver uses host.minikube.internal, not host.containers.internal)
- Migrate Prometheus and Loki from indri to Kubernetes with Tailscale Ingresses
- Expose CNPG PostgreSQL metrics via Tailscale and update dashboard to use cnpg_* metrics
- Update Alloy to push metrics/logs to k8s endpoints (prometheus.tail8d86e.ts.net, loki.tail8d86e.ts.net)
- Add ACL rule for port 9187 (CNPG metrics)
- Delete obsolete ansible roles for prometheus and loki

## Changes

- argocd/manifests/prometheus/ - New Prometheus StatefulSet with 20Gi PVC and Tailscale Ingress
- argocd/manifests/loki/ - New Loki StatefulSet with 20Gi PVC and Tailscale Ingress
- argocd/apps/prometheus.yaml, argocd/apps/loki.yaml - ArgoCD Applications
- argocd/manifests/grafana/values.yaml - Data sources now use k8s internal DNS
- argocd/manifests/databases/service-metrics-tailscale.yaml - CNPG metrics endpoint
- argocd/manifests/grafana-config/dashboards/configmap-postgresql.yaml - Updated to cnpg_* metrics
- ansible/roles/alloy/defaults/main.yml - Push to k8s Tailscale endpoints
- pulumi/policy.hujson - ACL for port 9187
- Deleted ansible/roles/prometheus/ and ansible/roles/loki/

## Deployment and Testing

- Stop prometheus and loki on indri
- Sync ArgoCD apps (apps, prometheus, loki, grafana)
- Run `mise run provision-indri -- --tags alloy`
- Verify Grafana dashboards show data

🤖 Generated with https://claude.ai/claude-code

Reviewed-on: https://forge.tail8d86e.ts.net/eblume/blumeops/pulls/42
This commit is contained in:
parent
5a829e0afd
commit
17023085cb
36 changed files with 569 additions and 270 deletions
|
|
@ -23,12 +23,8 @@
|
||||||
tags: [borgmatic]
|
tags: [borgmatic]
|
||||||
|
|
||||||
roles:
|
roles:
|
||||||
- role: loki
|
|
||||||
tags: loki
|
|
||||||
- role: alloy
|
- role: alloy
|
||||||
tags: alloy
|
tags: alloy
|
||||||
- role: prometheus
|
|
||||||
tags: prometheus
|
|
||||||
- role: borgmatic
|
- role: borgmatic
|
||||||
tags: borgmatic
|
tags: borgmatic
|
||||||
- role: borgmatic_metrics
|
- role: borgmatic_metrics
|
||||||
|
|
|
||||||
|
|
@ -1,14 +1,42 @@
|
||||||
---
|
---
|
||||||
# Grafana Alloy configuration
|
# Grafana Alloy configuration
|
||||||
|
#
|
||||||
|
# BUILDING FROM SOURCE (required for CGO DNS resolution on macOS):
|
||||||
|
#
|
||||||
|
# Alloy must be built with CGO_ENABLED=1 to use macOS native DNS resolver,
|
||||||
|
# which is required for Tailscale MagicDNS hostname resolution.
|
||||||
|
# The Homebrew bottle is built with CGO_ENABLED=0.
|
||||||
|
#
|
||||||
|
# Build on dev machine (gilbert), then copy to indri:
|
||||||
|
#
|
||||||
|
# 1. Clone from forge mirror:
|
||||||
|
# git clone ssh://forgejo@forge.tail8d86e.ts.net/eblume/alloy.git ~/code/3rd/alloy
|
||||||
|
#
|
||||||
|
# 2. Set up build tools via mise:
|
||||||
|
# cd ~/code/3rd/alloy && mise use go@1.25 node yarn
|
||||||
|
#
|
||||||
|
# 3. Build with CGO enabled (default in Makefile):
|
||||||
|
# cd ~/code/3rd/alloy && mise x -- make alloy
|
||||||
|
#
|
||||||
|
# 4. Copy binary to indri:
|
||||||
|
# scp ~/code/3rd/alloy/build/alloy indri:~/.local/bin/alloy
|
||||||
|
#
|
||||||
|
# 5. Run ansible to deploy config and LaunchAgent
|
||||||
|
|
||||||
|
# Binary and paths
|
||||||
|
alloy_binary: /Users/erichblume/.local/bin/alloy
|
||||||
|
alloy_config_dir: /Users/erichblume/.config/grafana-alloy
|
||||||
|
alloy_data_dir: /Users/erichblume/.local/share/grafana-alloy
|
||||||
|
alloy_log_dir: /Users/erichblume/Library/Logs
|
||||||
|
|
||||||
# Textfile collector directory (same as node_exporter for compatibility)
|
# Textfile collector directory (same as node_exporter for compatibility)
|
||||||
alloy_textfile_dir: /opt/homebrew/var/node_exporter/textfile
|
alloy_textfile_dir: /opt/homebrew/var/node_exporter/textfile
|
||||||
|
|
||||||
# Prometheus remote write endpoint
|
# Prometheus remote write endpoint (k8s via Tailscale)
|
||||||
alloy_prometheus_url: "http://localhost:9090/api/v1/write"
|
alloy_prometheus_url: "https://prometheus.tail8d86e.ts.net/api/v1/write"
|
||||||
|
|
||||||
# Loki endpoint (used in Phase 2)
|
# Loki endpoint (k8s via Tailscale)
|
||||||
alloy_loki_url: "http://localhost:3100/loki/api/v1/push"
|
alloy_loki_url: "https://loki.tail8d86e.ts.net/loki/api/v1/push"
|
||||||
|
|
||||||
# Instance label for metrics
|
# Instance label for metrics
|
||||||
alloy_instance_label: indri
|
alloy_instance_label: indri
|
||||||
|
|
@ -16,39 +44,21 @@ alloy_instance_label: indri
|
||||||
# Scrape interval
|
# Scrape interval
|
||||||
alloy_scrape_interval: "15s"
|
alloy_scrape_interval: "15s"
|
||||||
|
|
||||||
# Config paths
|
|
||||||
alloy_config_dir: /opt/homebrew/etc/grafana-alloy
|
|
||||||
alloy_data_dir: /opt/homebrew/var/lib/grafana-alloy/data
|
|
||||||
|
|
||||||
# Log paths to collect
|
# Log paths to collect
|
||||||
alloy_brew_logs:
|
alloy_brew_logs:
|
||||||
- path: /opt/homebrew/var/log/grafana-stdout.log
|
|
||||||
service: grafana
|
|
||||||
stream: stdout
|
|
||||||
- path: /opt/homebrew/var/log/grafana-stderr.log
|
|
||||||
service: grafana
|
|
||||||
stream: stderr
|
|
||||||
- path: /opt/homebrew/var/log/forgejo.log
|
- path: /opt/homebrew/var/log/forgejo.log
|
||||||
service: forgejo
|
service: forgejo
|
||||||
stream: stdout
|
stream: stdout
|
||||||
- path: /opt/homebrew/var/log/prometheus.err.log
|
|
||||||
service: prometheus
|
|
||||||
stream: stderr
|
|
||||||
- path: /opt/homebrew/var/log/tailscaled.log
|
- path: /opt/homebrew/var/log/tailscaled.log
|
||||||
service: tailscale
|
service: tailscale
|
||||||
stream: stdout
|
stream: stdout
|
||||||
- path: /opt/homebrew/var/transmission/transmission-daemon.log
|
|
||||||
service: transmission
|
|
||||||
stream: stdout
|
|
||||||
# NOTE: postgresql and miniflux removed - now hosted in k8s
|
|
||||||
|
|
||||||
alloy_mcquack_logs:
|
alloy_mcquack_logs:
|
||||||
# NOTE: devpi logs removed - now hosted in k8s
|
- path: /Users/erichblume/Library/Logs/mcquack.alloy.out.log
|
||||||
- path: /Users/erichblume/Library/Logs/mcquack.kiwix-serve.out.log
|
service: alloy
|
||||||
service: kiwix
|
|
||||||
stream: stdout
|
stream: stdout
|
||||||
- path: /Users/erichblume/Library/Logs/mcquack.kiwix-serve.err.log
|
- path: /Users/erichblume/Library/Logs/mcquack.alloy.err.log
|
||||||
service: kiwix
|
service: alloy
|
||||||
stream: stderr
|
stream: stderr
|
||||||
- path: /Users/erichblume/Library/Logs/mcquack.borgmatic.out.log
|
- path: /Users/erichblume/Library/Logs/mcquack.borgmatic.out.log
|
||||||
service: borgmatic
|
service: borgmatic
|
||||||
|
|
@ -75,8 +85,7 @@ alloy_collect_logs: true
|
||||||
alloy_collect_zot: true
|
alloy_collect_zot: true
|
||||||
alloy_zot_metrics_url: "http://localhost:5050/metrics"
|
alloy_zot_metrics_url: "http://localhost:5050/metrics"
|
||||||
|
|
||||||
# PostgreSQL metrics collection
|
# PostgreSQL metrics collection (disabled, CNPG metrics scraped directly by k8s Prometheus)
|
||||||
# NOTE: Disabled - brew postgresql removed, k8s CNPG metrics TBD
|
|
||||||
alloy_collect_postgres: false
|
alloy_collect_postgres: false
|
||||||
alloy_postgres_host: localhost
|
alloy_postgres_host: localhost
|
||||||
alloy_postgres_port: 5432
|
alloy_postgres_port: 5432
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
---
|
---
|
||||||
- name: Restart alloy
|
- name: Restart alloy
|
||||||
ansible.builtin.command: brew services restart grafana-alloy
|
ansible.builtin.shell: |
|
||||||
async: 120
|
launchctl unload ~/Library/LaunchAgents/mcquack.eblume.alloy.plist 2>/dev/null || true
|
||||||
poll: 0
|
launchctl load ~/Library/LaunchAgents/mcquack.eblume.alloy.plist
|
||||||
changed_when: true
|
changed_when: true
|
||||||
|
|
|
||||||
|
|
@ -1,11 +1,18 @@
|
||||||
---
|
---
|
||||||
# Grafana Alloy installation and configuration
|
# Grafana Alloy installation and configuration
|
||||||
# Replaces node_exporter for metrics, adds log collection
|
# See defaults/main.yml for build instructions
|
||||||
|
|
||||||
- name: Install grafana-alloy via homebrew
|
- name: Verify alloy binary exists
|
||||||
community.general.homebrew:
|
ansible.builtin.stat:
|
||||||
name: grafana-alloy
|
path: "{{ alloy_binary }}"
|
||||||
state: present
|
register: alloy_binary_stat
|
||||||
|
|
||||||
|
- name: Fail if alloy binary not found
|
||||||
|
ansible.builtin.fail:
|
||||||
|
msg: |
|
||||||
|
Alloy binary not found at {{ alloy_binary }}.
|
||||||
|
Please build from source first (see ansible/roles/alloy/defaults/main.yml)
|
||||||
|
when: not alloy_binary_stat.stat.exists
|
||||||
|
|
||||||
- name: Ensure alloy config directory exists
|
- name: Ensure alloy config directory exists
|
||||||
ansible.builtin.file:
|
ansible.builtin.file:
|
||||||
|
|
@ -68,8 +75,21 @@
|
||||||
notify: Restart alloy
|
notify: Restart alloy
|
||||||
no_log: true
|
no_log: true
|
||||||
|
|
||||||
- name: Ensure alloy service is started
|
- name: Deploy alloy LaunchAgent plist
|
||||||
ansible.builtin.command: brew services start grafana-alloy
|
ansible.builtin.template:
|
||||||
register: alloy_brew_start
|
src: alloy.plist.j2
|
||||||
changed_when: "'Successfully started' in alloy_brew_start.stdout"
|
dest: ~/Library/LaunchAgents/mcquack.eblume.alloy.plist
|
||||||
|
mode: '0644'
|
||||||
|
notify: Restart alloy
|
||||||
|
|
||||||
|
- name: Check if alloy LaunchAgent is loaded
|
||||||
|
ansible.builtin.command: launchctl list mcquack.eblume.alloy
|
||||||
|
register: alloy_launchctl_check
|
||||||
|
changed_when: false
|
||||||
|
failed_when: false
|
||||||
|
|
||||||
|
- name: Load alloy LaunchAgent if not loaded
|
||||||
|
ansible.builtin.command: launchctl load ~/Library/LaunchAgents/mcquack.eblume.alloy.plist
|
||||||
|
when: alloy_launchctl_check.rc != 0
|
||||||
|
changed_when: true
|
||||||
failed_when: false
|
failed_when: false
|
||||||
|
|
|
||||||
24
ansible/roles/alloy/templates/alloy.plist.j2
Normal file
24
ansible/roles/alloy/templates/alloy.plist.j2
Normal file
|
|
@ -0,0 +1,24 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<!-- {{ ansible_managed }} -->
|
||||||
|
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
|
||||||
|
<plist version="1.0">
|
||||||
|
<dict>
|
||||||
|
<key>Label</key>
|
||||||
|
<string>mcquack.eblume.alloy</string>
|
||||||
|
<key>ProgramArguments</key>
|
||||||
|
<array>
|
||||||
|
<string>{{ alloy_binary }}</string>
|
||||||
|
<string>run</string>
|
||||||
|
<string>{{ alloy_config_dir }}/config.alloy</string>
|
||||||
|
<string>--storage.path={{ alloy_data_dir }}</string>
|
||||||
|
</array>
|
||||||
|
<key>RunAtLoad</key>
|
||||||
|
<true/>
|
||||||
|
<key>KeepAlive</key>
|
||||||
|
<true/>
|
||||||
|
<key>StandardOutPath</key>
|
||||||
|
<string>{{ alloy_log_dir }}/mcquack.alloy.out.log</string>
|
||||||
|
<key>StandardErrorPath</key>
|
||||||
|
<string>{{ alloy_log_dir }}/mcquack.alloy.err.log</string>
|
||||||
|
</dict>
|
||||||
|
</plist>
|
||||||
|
|
@ -43,7 +43,7 @@ prometheus.exporter.postgres "postgresql" {
|
||||||
data_source_names = ["postgresql://{{ alloy_postgres_user }}:{{ alloy_postgres_password | urlencode }}@{{ alloy_postgres_host }}:{{ alloy_postgres_port }}/{{ alloy_postgres_database }}?sslmode=disable"]
|
data_source_names = ["postgresql://{{ alloy_postgres_user }}:{{ alloy_postgres_password | urlencode }}@{{ alloy_postgres_host }}:{{ alloy_postgres_port }}/{{ alloy_postgres_database }}?sslmode=disable"]
|
||||||
|
|
||||||
// Custom queries for vacuum and XID monitoring
|
// Custom queries for vacuum and XID monitoring
|
||||||
custom_queries_config_path = "/opt/homebrew/etc/grafana-alloy/postgres_queries.yaml"
|
custom_queries_config_path = "{{ alloy_config_dir }}/postgres_queries.yaml"
|
||||||
}
|
}
|
||||||
|
|
||||||
// Scrape PostgreSQL metrics
|
// Scrape PostgreSQL metrics
|
||||||
|
|
|
||||||
|
|
@ -1,12 +0,0 @@
|
||||||
---
|
|
||||||
# Loki configuration
|
|
||||||
|
|
||||||
# Server settings
|
|
||||||
loki_http_port: 3100
|
|
||||||
|
|
||||||
# Storage paths
|
|
||||||
loki_data_dir: /opt/homebrew/var/loki
|
|
||||||
loki_config_file: /opt/homebrew/etc/loki-local-config.yaml
|
|
||||||
|
|
||||||
# Retention settings
|
|
||||||
loki_retention_period: 744h # 31 days
|
|
||||||
|
|
@ -1,6 +0,0 @@
|
||||||
---
|
|
||||||
- name: Restart loki
|
|
||||||
ansible.builtin.command: brew services restart loki
|
|
||||||
async: 120
|
|
||||||
poll: 0
|
|
||||||
changed_when: true
|
|
||||||
|
|
@ -1,2 +0,0 @@
|
||||||
---
|
|
||||||
dependencies: []
|
|
||||||
|
|
@ -1,38 +0,0 @@
|
||||||
---
|
|
||||||
# Loki installation and configuration
|
|
||||||
|
|
||||||
- name: Install loki via homebrew
|
|
||||||
community.general.homebrew:
|
|
||||||
name: loki
|
|
||||||
state: present
|
|
||||||
|
|
||||||
- name: Ensure loki data directory exists
|
|
||||||
ansible.builtin.file:
|
|
||||||
path: "{{ loki_data_dir }}"
|
|
||||||
state: directory
|
|
||||||
mode: '0755'
|
|
||||||
|
|
||||||
- name: Ensure loki chunks directory exists
|
|
||||||
ansible.builtin.file:
|
|
||||||
path: "{{ loki_data_dir }}/chunks"
|
|
||||||
state: directory
|
|
||||||
mode: '0755'
|
|
||||||
|
|
||||||
- name: Ensure loki rules directory exists
|
|
||||||
ansible.builtin.file:
|
|
||||||
path: "{{ loki_data_dir }}/rules"
|
|
||||||
state: directory
|
|
||||||
mode: '0755'
|
|
||||||
|
|
||||||
- name: Deploy loki configuration
|
|
||||||
ansible.builtin.template:
|
|
||||||
src: loki-config.yaml.j2
|
|
||||||
dest: "{{ loki_config_file }}"
|
|
||||||
mode: '0644'
|
|
||||||
notify: Restart loki
|
|
||||||
|
|
||||||
- name: Ensure loki service is started
|
|
||||||
ansible.builtin.command: brew services start loki
|
|
||||||
register: loki_brew_start
|
|
||||||
changed_when: "'Successfully started' in loki_brew_start.stdout"
|
|
||||||
failed_when: false
|
|
||||||
|
|
@ -1,54 +0,0 @@
|
||||||
# {{ ansible_managed }}
|
|
||||||
# Loki configuration for single-node deployment
|
|
||||||
|
|
||||||
auth_enabled: false
|
|
||||||
|
|
||||||
server:
|
|
||||||
http_listen_port: {{ loki_http_port }}
|
|
||||||
http_listen_address: 0.0.0.0
|
|
||||||
grpc_listen_port: 9096
|
|
||||||
|
|
||||||
common:
|
|
||||||
instance_addr: 127.0.0.1
|
|
||||||
path_prefix: {{ loki_data_dir }}
|
|
||||||
storage:
|
|
||||||
filesystem:
|
|
||||||
chunks_directory: {{ loki_data_dir }}/chunks
|
|
||||||
rules_directory: {{ loki_data_dir }}/rules
|
|
||||||
replication_factor: 1
|
|
||||||
ring:
|
|
||||||
kvstore:
|
|
||||||
store: inmemory
|
|
||||||
|
|
||||||
query_range:
|
|
||||||
results_cache:
|
|
||||||
cache:
|
|
||||||
embedded_cache:
|
|
||||||
enabled: true
|
|
||||||
max_size_mb: 100
|
|
||||||
|
|
||||||
schema_config:
|
|
||||||
configs:
|
|
||||||
- from: 2024-01-01
|
|
||||||
store: tsdb
|
|
||||||
object_store: filesystem
|
|
||||||
schema: v13
|
|
||||||
index:
|
|
||||||
prefix: index_
|
|
||||||
period: 24h
|
|
||||||
|
|
||||||
storage_config:
|
|
||||||
tsdb_shipper:
|
|
||||||
active_index_directory: {{ loki_data_dir }}/tsdb-index
|
|
||||||
cache_location: {{ loki_data_dir }}/tsdb-cache
|
|
||||||
|
|
||||||
limits_config:
|
|
||||||
retention_period: {{ loki_retention_period }}
|
|
||||||
|
|
||||||
compactor:
|
|
||||||
working_directory: {{ loki_data_dir }}/compactor
|
|
||||||
compaction_interval: 10m
|
|
||||||
retention_enabled: true
|
|
||||||
retention_delete_delay: 2h
|
|
||||||
retention_delete_worker_count: 150
|
|
||||||
delete_request_store: filesystem
|
|
||||||
|
|
@ -3,3 +3,4 @@ minikube_metrics_dir: /opt/homebrew/var/node_exporter/textfile
|
||||||
minikube_metrics_script: /Users/erichblume/bin/minikube-metrics
|
minikube_metrics_script: /Users/erichblume/bin/minikube-metrics
|
||||||
minikube_metrics_interval: 60 # seconds between metric collection
|
minikube_metrics_interval: 60 # seconds between metric collection
|
||||||
minikube_metrics_log_dir: /opt/homebrew/var/log
|
minikube_metrics_log_dir: /opt/homebrew/var/log
|
||||||
|
minikube_metrics_user_home: /Users/erichblume
|
||||||
|
|
|
||||||
|
|
@ -5,6 +5,13 @@
|
||||||
<dict>
|
<dict>
|
||||||
<key>Label</key>
|
<key>Label</key>
|
||||||
<string>mcquack.eblume.minikube-metrics</string>
|
<string>mcquack.eblume.minikube-metrics</string>
|
||||||
|
<key>EnvironmentVariables</key>
|
||||||
|
<dict>
|
||||||
|
<key>HOME</key>
|
||||||
|
<string>{{ minikube_metrics_user_home }}</string>
|
||||||
|
<key>PATH</key>
|
||||||
|
<string>/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin</string>
|
||||||
|
</dict>
|
||||||
<key>ProgramArguments</key>
|
<key>ProgramArguments</key>
|
||||||
<array>
|
<array>
|
||||||
<string>{{ minikube_metrics_script }}</string>
|
<string>{{ minikube_metrics_script }}</string>
|
||||||
|
|
|
||||||
|
|
@ -4,6 +4,10 @@
|
||||||
|
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
|
# Use absolute paths for LaunchAgent compatibility
|
||||||
|
MINIKUBE="/opt/homebrew/bin/minikube"
|
||||||
|
KUBECTL="/opt/homebrew/bin/kubectl"
|
||||||
|
|
||||||
OUTPUT_FILE="{{ minikube_metrics_dir }}/minikube.prom"
|
OUTPUT_FILE="{{ minikube_metrics_dir }}/minikube.prom"
|
||||||
TEMP_FILE="${OUTPUT_FILE}.tmp"
|
TEMP_FILE="${OUTPUT_FILE}.tmp"
|
||||||
|
|
||||||
|
|
@ -22,7 +26,7 @@ cat > "$TEMP_FILE" << 'HEADER'
|
||||||
HEADER
|
HEADER
|
||||||
|
|
||||||
# Check if minikube is running
|
# Check if minikube is running
|
||||||
if minikube status --format='{% raw %}{{.Host}}{% endraw %}' 2>/dev/null | grep -q "Running"; then
|
if $MINIKUBE status --format='{% raw %}{{.Host}}{% endraw %}' 2>/dev/null | grep -q "Running"; then
|
||||||
echo "minikube_up 1" >> "$TEMP_FILE"
|
echo "minikube_up 1" >> "$TEMP_FILE"
|
||||||
else
|
else
|
||||||
echo "minikube_up 0" >> "$TEMP_FILE"
|
echo "minikube_up 0" >> "$TEMP_FILE"
|
||||||
|
|
@ -35,22 +39,22 @@ else
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Check API server health
|
# Check API server health
|
||||||
if kubectl get --raw /healthz >/dev/null 2>&1; then
|
if $KUBECTL get --raw /healthz >/dev/null 2>&1; then
|
||||||
echo "minikube_apiserver_up 1" >> "$TEMP_FILE"
|
echo "minikube_apiserver_up 1" >> "$TEMP_FILE"
|
||||||
else
|
else
|
||||||
echo "minikube_apiserver_up 0" >> "$TEMP_FILE"
|
echo "minikube_apiserver_up 0" >> "$TEMP_FILE"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Get node count
|
# Get node count
|
||||||
NODE_COUNT=$(kubectl get nodes --no-headers 2>/dev/null | wc -l | tr -d ' ')
|
NODE_COUNT=$($KUBECTL get nodes --no-headers 2>/dev/null | wc -l | tr -d ' ')
|
||||||
echo "minikube_node_count ${NODE_COUNT:-0}" >> "$TEMP_FILE"
|
echo "minikube_node_count ${NODE_COUNT:-0}" >> "$TEMP_FILE"
|
||||||
|
|
||||||
# Get pod count (all namespaces)
|
# Get pod count (all namespaces)
|
||||||
POD_COUNT=$(kubectl get pods -A --no-headers 2>/dev/null | wc -l | tr -d ' ')
|
POD_COUNT=$($KUBECTL get pods -A --no-headers 2>/dev/null | wc -l | tr -d ' ')
|
||||||
echo "minikube_pod_count ${POD_COUNT:-0}" >> "$TEMP_FILE"
|
echo "minikube_pod_count ${POD_COUNT:-0}" >> "$TEMP_FILE"
|
||||||
|
|
||||||
# Get namespace count
|
# Get namespace count
|
||||||
NS_COUNT=$(kubectl get namespaces --no-headers 2>/dev/null | wc -l | tr -d ' ')
|
NS_COUNT=$($KUBECTL get namespaces --no-headers 2>/dev/null | wc -l | tr -d ' ')
|
||||||
echo "minikube_namespace_count ${NS_COUNT:-0}" >> "$TEMP_FILE"
|
echo "minikube_namespace_count ${NS_COUNT:-0}" >> "$TEMP_FILE"
|
||||||
|
|
||||||
# Atomic move
|
# Atomic move
|
||||||
|
|
|
||||||
|
|
@ -1,4 +0,0 @@
|
||||||
---
|
|
||||||
- name: Restart prometheus
|
|
||||||
ansible.builtin.command: brew services restart prometheus
|
|
||||||
changed_when: true
|
|
||||||
|
|
@ -1,25 +0,0 @@
|
||||||
---
|
|
||||||
- name: Install prometheus via homebrew
|
|
||||||
community.general.homebrew:
|
|
||||||
name: prometheus
|
|
||||||
state: present
|
|
||||||
|
|
||||||
- name: Configure prometheus.yml
|
|
||||||
ansible.builtin.template:
|
|
||||||
src: prometheus.yml.j2
|
|
||||||
dest: /opt/homebrew/etc/prometheus.yml
|
|
||||||
mode: '0644'
|
|
||||||
notify: Restart prometheus
|
|
||||||
|
|
||||||
- name: Configure prometheus.args
|
|
||||||
ansible.builtin.template:
|
|
||||||
src: prometheus.args.j2
|
|
||||||
dest: /opt/homebrew/etc/prometheus.args
|
|
||||||
mode: '0644'
|
|
||||||
notify: Restart prometheus
|
|
||||||
|
|
||||||
- name: Ensure prometheus service is started
|
|
||||||
ansible.builtin.command: brew services start prometheus
|
|
||||||
register: prometheus_brew_start
|
|
||||||
changed_when: "'Successfully started' in prometheus_brew_start.stdout"
|
|
||||||
failed_when: false
|
|
||||||
|
|
@ -1,4 +0,0 @@
|
||||||
--config.file /opt/homebrew/etc/prometheus.yml
|
|
||||||
--web.listen-address=0.0.0.0:9090
|
|
||||||
--storage.tsdb.path /opt/homebrew/var/prometheus
|
|
||||||
--web.enable-remote-write-receiver
|
|
||||||
|
|
@ -1,15 +0,0 @@
|
||||||
# {{ ansible_managed }}
|
|
||||||
global:
|
|
||||||
scrape_interval: 15s
|
|
||||||
|
|
||||||
# Note: indri system metrics are pushed via Alloy remote_write
|
|
||||||
# Sifaka still uses traditional scraping via node_exporter
|
|
||||||
|
|
||||||
scrape_configs:
|
|
||||||
- job_name: "node-exporter-sifaka"
|
|
||||||
static_configs:
|
|
||||||
- targets: ["sifaka:9100"]
|
|
||||||
|
|
||||||
- job_name: "loki"
|
|
||||||
static_configs:
|
|
||||||
- targets: ["localhost:3100"]
|
|
||||||
17
argocd/apps/loki.yaml
Normal file
17
argocd/apps/loki.yaml
Normal file
|
|
@ -0,0 +1,17 @@
|
||||||
|
apiVersion: argoproj.io/v1alpha1
|
||||||
|
kind: Application
|
||||||
|
metadata:
|
||||||
|
name: loki
|
||||||
|
namespace: argocd
|
||||||
|
spec:
|
||||||
|
project: default
|
||||||
|
source:
|
||||||
|
repoURL: ssh://forgejo@indri.tail8d86e.ts.net:2200/eblume/blumeops.git
|
||||||
|
targetRevision: main
|
||||||
|
path: argocd/manifests/loki
|
||||||
|
destination:
|
||||||
|
server: https://kubernetes.default.svc
|
||||||
|
namespace: monitoring
|
||||||
|
syncPolicy:
|
||||||
|
syncOptions:
|
||||||
|
- CreateNamespace=true
|
||||||
17
argocd/apps/prometheus.yaml
Normal file
17
argocd/apps/prometheus.yaml
Normal file
|
|
@ -0,0 +1,17 @@
|
||||||
|
apiVersion: argoproj.io/v1alpha1
|
||||||
|
kind: Application
|
||||||
|
metadata:
|
||||||
|
name: prometheus
|
||||||
|
namespace: argocd
|
||||||
|
spec:
|
||||||
|
project: default
|
||||||
|
source:
|
||||||
|
repoURL: ssh://forgejo@indri.tail8d86e.ts.net:2200/eblume/blumeops.git
|
||||||
|
targetRevision: main
|
||||||
|
path: argocd/manifests/prometheus
|
||||||
|
destination:
|
||||||
|
server: https://kubernetes.default.svc
|
||||||
|
namespace: monitoring
|
||||||
|
syncPolicy:
|
||||||
|
syncOptions:
|
||||||
|
- CreateNamespace=true
|
||||||
|
|
@ -6,3 +6,4 @@ namespace: databases
|
||||||
resources:
|
resources:
|
||||||
- blumeops-pg.yaml
|
- blumeops-pg.yaml
|
||||||
- service-tailscale.yaml
|
- service-tailscale.yaml
|
||||||
|
- service-metrics-tailscale.yaml
|
||||||
|
|
|
||||||
22
argocd/manifests/databases/service-metrics-tailscale.yaml
Normal file
22
argocd/manifests/databases/service-metrics-tailscale.yaml
Normal file
|
|
@ -0,0 +1,22 @@
|
||||||
|
# Tailscale LoadBalancer for CNPG metrics access
|
||||||
|
# Exposes native postgres_exporter metrics on port 9187
|
||||||
|
# Canonical hostname: cnpg-metrics.tail8d86e.ts.net
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Service
|
||||||
|
metadata:
|
||||||
|
name: blumeops-pg-metrics-tailscale
|
||||||
|
namespace: databases
|
||||||
|
annotations:
|
||||||
|
tailscale.com/hostname: "cnpg-metrics"
|
||||||
|
tailscale.com/proxy-class: "default"
|
||||||
|
spec:
|
||||||
|
type: LoadBalancer
|
||||||
|
loadBalancerClass: tailscale
|
||||||
|
selector:
|
||||||
|
cnpg.io/cluster: blumeops-pg
|
||||||
|
role: primary
|
||||||
|
ports:
|
||||||
|
- name: metrics
|
||||||
|
port: 9187
|
||||||
|
targetPort: 9187
|
||||||
|
protocol: TCP
|
||||||
|
|
@ -54,7 +54,7 @@ data:
|
||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "pg_up",
|
"expr": "cnpg_collector_up",
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
|
@ -95,7 +95,7 @@ data:
|
||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "pg_stat_activity_count{state=\"active\"}",
|
"expr": "cnpg_backends_total{state=\"active\"}",
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
|
@ -136,7 +136,7 @@ data:
|
||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sum(pg_stat_activity_count)",
|
"expr": "sum(cnpg_backends_total)",
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
|
@ -177,7 +177,7 @@ data:
|
||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sum(pg_database_size_bytes)",
|
"expr": "sum(cnpg_pg_database_size_bytes)",
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
|
@ -249,7 +249,7 @@ data:
|
||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "pg_stat_activity_count",
|
"expr": "cnpg_backends_total",
|
||||||
"legendFormat": "{{state}}",
|
"legendFormat": "{{state}}",
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
}
|
}
|
||||||
|
|
@ -322,7 +322,7 @@ data:
|
||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "pg_database_size_bytes{datname!~\"template.*\"}",
|
"expr": "cnpg_pg_database_size_bytes{datname!~\"template.*\"}",
|
||||||
"legendFormat": "{{datname}}",
|
"legendFormat": "{{datname}}",
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
}
|
}
|
||||||
|
|
@ -395,22 +395,22 @@ data:
|
||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "rate(pg_stat_database_tup_fetched{datname!~\"template.*\"}[5m])",
|
"expr": "rate(cnpg_pg_stat_database_tup_fetched{datname!~\"template.*\"}[5m])",
|
||||||
"legendFormat": "{{datname}} fetched",
|
"legendFormat": "{{datname}} fetched",
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"expr": "rate(pg_stat_database_tup_inserted{datname!~\"template.*\"}[5m])",
|
"expr": "rate(cnpg_pg_stat_database_tup_inserted{datname!~\"template.*\"}[5m])",
|
||||||
"legendFormat": "{{datname}} inserted",
|
"legendFormat": "{{datname}} inserted",
|
||||||
"refId": "B"
|
"refId": "B"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"expr": "rate(pg_stat_database_tup_updated{datname!~\"template.*\"}[5m])",
|
"expr": "rate(cnpg_pg_stat_database_tup_updated{datname!~\"template.*\"}[5m])",
|
||||||
"legendFormat": "{{datname}} updated",
|
"legendFormat": "{{datname}} updated",
|
||||||
"refId": "C"
|
"refId": "C"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"expr": "rate(pg_stat_database_tup_deleted{datname!~\"template.*\"}[5m])",
|
"expr": "rate(cnpg_pg_stat_database_tup_deleted{datname!~\"template.*\"}[5m])",
|
||||||
"legendFormat": "{{datname}} deleted",
|
"legendFormat": "{{datname}} deleted",
|
||||||
"refId": "D"
|
"refId": "D"
|
||||||
}
|
}
|
||||||
|
|
@ -483,12 +483,12 @@ data:
|
||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "rate(pg_stat_database_xact_commit{datname!~\"template.*\"}[5m])",
|
"expr": "rate(cnpg_pg_stat_database_xact_commit{datname!~\"template.*\"}[5m])",
|
||||||
"legendFormat": "{{datname}} commits",
|
"legendFormat": "{{datname}} commits",
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"expr": "rate(pg_stat_database_xact_rollback{datname!~\"template.*\"}[5m])",
|
"expr": "rate(cnpg_pg_stat_database_xact_rollback{datname!~\"template.*\"}[5m])",
|
||||||
"legendFormat": "{{datname}} rollbacks",
|
"legendFormat": "{{datname}} rollbacks",
|
||||||
"refId": "B"
|
"refId": "B"
|
||||||
}
|
}
|
||||||
|
|
@ -561,7 +561,7 @@ data:
|
||||||
},
|
},
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "pg_database_xid_age_xid_age",
|
"expr": "cnpg_pg_database_xid_age",
|
||||||
"legendFormat": "{{datname}}",
|
"legendFormat": "{{datname}}",
|
||||||
"refId": "A"
|
"refId": "A"
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -24,8 +24,7 @@ grafana.ini:
|
||||||
check_for_updates: false
|
check_for_updates: false
|
||||||
reporting_enabled: false
|
reporting_enabled: false
|
||||||
|
|
||||||
# Datasources - point to indri services via podman host gateway
|
# Datasources - point to k8s-internal services
|
||||||
# host.containers.internal resolves to the podman host (indri) from inside minikube
|
|
||||||
datasources:
|
datasources:
|
||||||
datasources.yaml:
|
datasources.yaml:
|
||||||
apiVersion: 1
|
apiVersion: 1
|
||||||
|
|
@ -35,7 +34,7 @@ datasources:
|
||||||
access: proxy
|
access: proxy
|
||||||
orgId: 1
|
orgId: 1
|
||||||
uid: prometheus
|
uid: prometheus
|
||||||
url: http://host.containers.internal:9090
|
url: http://prometheus.monitoring.svc.cluster.local:9090
|
||||||
isDefault: true
|
isDefault: true
|
||||||
editable: false
|
editable: false
|
||||||
- name: Loki
|
- name: Loki
|
||||||
|
|
@ -43,7 +42,7 @@ datasources:
|
||||||
access: proxy
|
access: proxy
|
||||||
orgId: 1
|
orgId: 1
|
||||||
uid: loki
|
uid: loki
|
||||||
url: http://host.containers.internal:3100
|
url: http://loki.monitoring.svc.cluster.local:3100
|
||||||
editable: false
|
editable: false
|
||||||
|
|
||||||
# Dashboard provisioning - sidecar watches for ConfigMaps with label
|
# Dashboard provisioning - sidecar watches for ConfigMaps with label
|
||||||
|
|
|
||||||
58
argocd/manifests/loki/configmap.yaml
Normal file
58
argocd/manifests/loki/configmap.yaml
Normal file
|
|
@ -0,0 +1,58 @@
|
||||||
|
apiVersion: v1
|
||||||
|
kind: ConfigMap
|
||||||
|
metadata:
|
||||||
|
name: loki-config
|
||||||
|
namespace: monitoring
|
||||||
|
data:
|
||||||
|
loki-config.yaml: |
|
||||||
|
auth_enabled: false
|
||||||
|
|
||||||
|
server:
|
||||||
|
http_listen_port: 3100
|
||||||
|
http_listen_address: 0.0.0.0
|
||||||
|
grpc_listen_port: 9096
|
||||||
|
|
||||||
|
common:
|
||||||
|
instance_addr: 127.0.0.1
|
||||||
|
path_prefix: /loki
|
||||||
|
storage:
|
||||||
|
filesystem:
|
||||||
|
chunks_directory: /loki/chunks
|
||||||
|
rules_directory: /loki/rules
|
||||||
|
replication_factor: 1
|
||||||
|
ring:
|
||||||
|
kvstore:
|
||||||
|
store: inmemory
|
||||||
|
|
||||||
|
query_range:
|
||||||
|
results_cache:
|
||||||
|
cache:
|
||||||
|
embedded_cache:
|
||||||
|
enabled: true
|
||||||
|
max_size_mb: 100
|
||||||
|
|
||||||
|
schema_config:
|
||||||
|
configs:
|
||||||
|
- from: 2024-01-01
|
||||||
|
store: tsdb
|
||||||
|
object_store: filesystem
|
||||||
|
schema: v13
|
||||||
|
index:
|
||||||
|
prefix: index_
|
||||||
|
period: 24h
|
||||||
|
|
||||||
|
storage_config:
|
||||||
|
tsdb_shipper:
|
||||||
|
active_index_directory: /loki/tsdb-index
|
||||||
|
cache_location: /loki/tsdb-cache
|
||||||
|
|
||||||
|
limits_config:
|
||||||
|
retention_period: 744h # 31 days
|
||||||
|
|
||||||
|
compactor:
|
||||||
|
working_directory: /loki/compactor
|
||||||
|
compaction_interval: 10m
|
||||||
|
retention_enabled: true
|
||||||
|
retention_delete_delay: 2h
|
||||||
|
retention_delete_worker_count: 150
|
||||||
|
delete_request_store: filesystem
|
||||||
25
argocd/manifests/loki/ingress-tailscale.yaml
Normal file
25
argocd/manifests/loki/ingress-tailscale.yaml
Normal file
|
|
@ -0,0 +1,25 @@
|
||||||
|
# Tailscale Ingress for Loki
|
||||||
|
# Allows Alloy on indri to push logs
|
||||||
|
apiVersion: networking.k8s.io/v1
|
||||||
|
kind: Ingress
|
||||||
|
metadata:
|
||||||
|
name: loki-tailscale
|
||||||
|
namespace: monitoring
|
||||||
|
annotations:
|
||||||
|
tailscale.com/funnel: "false"
|
||||||
|
spec:
|
||||||
|
ingressClassName: tailscale
|
||||||
|
rules:
|
||||||
|
- host: loki
|
||||||
|
http:
|
||||||
|
paths:
|
||||||
|
- path: /
|
||||||
|
pathType: Prefix
|
||||||
|
backend:
|
||||||
|
service:
|
||||||
|
name: loki
|
||||||
|
port:
|
||||||
|
number: 3100
|
||||||
|
tls:
|
||||||
|
- hosts:
|
||||||
|
- loki
|
||||||
10
argocd/manifests/loki/kustomization.yaml
Normal file
10
argocd/manifests/loki/kustomization.yaml
Normal file
|
|
@ -0,0 +1,10 @@
|
||||||
|
---
# Kustomization for the Loki manifests; every listed resource is placed in
# the `monitoring` namespace.
kind: Kustomization
apiVersion: kustomize.config.k8s.io/v1beta1

namespace: monitoring

# Files are resolved relative to this kustomization.yaml.
resources:
  - configmap.yaml
  - statefulset.yaml
  - service.yaml
  - ingress-tailscale.yaml
|
||||||
16
argocd/manifests/loki/service.yaml
Normal file
16
argocd/manifests/loki/service.yaml
Normal file
|
|
@ -0,0 +1,16 @@
|
||||||
|
---
# ClusterIP Service in front of the pods labelled `app: loki`.
# Exposes two named ports: http (3100) and grpc (9096).
apiVersion: v1
kind: Service
metadata:
  namespace: monitoring
  name: loki
spec:
  type: ClusterIP
  selector:
    app: loki
  ports:
    - name: http
      targetPort: 3100
      port: 3100
    - name: grpc
      targetPort: 9096
      port: 9096
|
||||||
66
argocd/manifests/loki/statefulset.yaml
Normal file
66
argocd/manifests/loki/statefulset.yaml
Normal file
|
|
@ -0,0 +1,66 @@
|
||||||
|
---
# Single-replica Loki StatefulSet.
# - Config is read from the `loki-config` ConfigMap mounted at /etc/loki.
# - Data lives on a 20Gi ReadWriteOnce volume (claim template `data`)
#   mounted at /loki.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  namespace: monitoring
  name: loki
spec:
  replicas: 1
  serviceName: loki
  selector:
    matchLabels:
      app: loki
  template:
    metadata:
      labels:
        app: loki
    spec:
      # Run as the non-root uid 10001; fsGroup uses the same id for the
      # mounted volumes.
      securityContext:
        runAsNonRoot: true
        runAsUser: 10001
        fsGroup: 10001
      volumes:
        - name: config
          configMap:
            name: loki-config
      containers:
        - name: loki
          image: grafana/loki:3.3.2
          args:
            - -config.file=/etc/loki/loki-config.yaml
          ports:
            - containerPort: 3100
              name: http
            - containerPort: 9096
              name: grpc
          volumeMounts:
            - mountPath: /etc/loki
              name: config
            - mountPath: /loki
              name: data
          resources:
            requests:
              cpu: 100m
              memory: 256Mi
            limits:
              cpu: 500m
              memory: 1Gi
          # Both probes poll the same /ready endpoint on the http port; the
          # liveness probe simply starts later and polls less often.
          readinessProbe:
            initialDelaySeconds: 10
            periodSeconds: 5
            httpGet:
              port: 3100
              path: /ready
          livenessProbe:
            initialDelaySeconds: 45
            periodSeconds: 10
            httpGet:
              port: 3100
              path: /ready
  volumeClaimTemplates:
    - metadata:
        name: data
      spec:
        accessModes:
          - ReadWriteOnce
        resources:
          requests:
            storage: 20Gi
|
||||||
38
argocd/manifests/prometheus/configmap.yaml
Normal file
38
argocd/manifests/prometheus/configmap.yaml
Normal file
|
|
@ -0,0 +1,38 @@
|
||||||
|
---
# ConfigMap holding the Prometheus server configuration as a single
# `prometheus.yml` key. The block below is a literal string: everything in
# it, comments included, is handed to Prometheus verbatim.
kind: ConfigMap
apiVersion: v1
metadata:
  namespace: monitoring
  name: prometheus-config
data:
  prometheus.yml: |
    global:
      scrape_interval: 15s
      evaluation_interval: 15s

    # Indri system metrics are pushed via Alloy remote_write
    # K8s services are scraped directly

    scrape_configs:
      # Sifaka NAS node-exporter (via LAN - Docker NATs through indri)
      # Using LAN IP since k8s pods can reach LAN via Docker NAT (same as NFS mounts)
      # If IP changes, fallback: create Tailscale egress in tailscale-operator/egress-sifaka.yaml
      - job_name: "node-exporter-sifaka"
        static_configs:
          - targets: ["192.168.1.203:9100"]

      # CNPG PostgreSQL metrics (k8s internal)
      - job_name: "cnpg-postgres"
        static_configs:
          - targets: ["blumeops-pg-metrics-tailscale.databases.svc.cluster.local:9187"]
            labels:
              instance: "blumeops-pg"

      # Prometheus self-monitoring
      - job_name: "prometheus"
        static_configs:
          - targets: ["localhost:9090"]

      # Loki metrics
      - job_name: "loki"
        static_configs:
          - targets: ["loki.monitoring.svc.cluster.local:3100"]
|
||||||
25
argocd/manifests/prometheus/ingress-tailscale.yaml
Normal file
25
argocd/manifests/prometheus/ingress-tailscale.yaml
Normal file
|
|
@ -0,0 +1,25 @@
|
||||||
|
---
# Ingress of class "tailscale" that publishes the in-cluster `prometheus`
# Service (port 9090) under the host name "prometheus".
# Purpose: lets Alloy on indri push metrics via remote_write.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  namespace: monitoring
  name: prometheus-tailscale
  annotations:
    # Kept as a quoted string: annotation values must be strings, not booleans.
    tailscale.com/funnel: 'false'
spec:
  ingressClassName: tailscale
  tls:
    - hosts:
        - prometheus
  rules:
    - host: prometheus
      http:
        paths:
          # Prefix match on "/" forwards every request path to the backend.
          - pathType: Prefix
            path: /
            backend:
              service:
                name: prometheus
                port:
                  number: 9090
|
||||||
10
argocd/manifests/prometheus/kustomization.yaml
Normal file
10
argocd/manifests/prometheus/kustomization.yaml
Normal file
|
|
@ -0,0 +1,10 @@
|
||||||
|
---
# Kustomization for the Prometheus manifests; every listed resource is
# placed in the `monitoring` namespace.
kind: Kustomization
apiVersion: kustomize.config.k8s.io/v1beta1

namespace: monitoring

# Files are resolved relative to this kustomization.yaml.
resources:
  - configmap.yaml
  - statefulset.yaml
  - service.yaml
  - ingress-tailscale.yaml
|
||||||
13
argocd/manifests/prometheus/service.yaml
Normal file
13
argocd/manifests/prometheus/service.yaml
Normal file
|
|
@ -0,0 +1,13 @@
|
||||||
|
---
# ClusterIP Service in front of the pods labelled `app: prometheus`,
# exposing the single named port http (9090).
apiVersion: v1
kind: Service
metadata:
  namespace: monitoring
  name: prometheus
spec:
  type: ClusterIP
  selector:
    app: prometheus
  ports:
    - name: http
      targetPort: 9090
      port: 9090
|
||||||
68
argocd/manifests/prometheus/statefulset.yaml
Normal file
68
argocd/manifests/prometheus/statefulset.yaml
Normal file
|
|
@ -0,0 +1,68 @@
|
||||||
|
---
# Single-replica Prometheus StatefulSet.
# - Config is read from the `prometheus-config` ConfigMap mounted at
#   /etc/prometheus.
# - TSDB data lives on a 20Gi ReadWriteOnce volume (claim template `data`)
#   mounted at /prometheus.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  namespace: monitoring
  name: prometheus
spec:
  replicas: 1
  serviceName: prometheus
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      # Run as the non-root uid 65534; fsGroup uses the same id for the
      # mounted volumes.
      securityContext:
        runAsNonRoot: true
        runAsUser: 65534
        fsGroup: 65534
      volumes:
        - name: config
          configMap:
            name: prometheus-config
      containers:
        - name: prometheus
          image: prom/prometheus:v3.2.1
          args:
            - --config.file=/etc/prometheus/prometheus.yml
            - --storage.tsdb.path=/prometheus
            - --storage.tsdb.retention.time=15d
            # Accept samples pushed with remote_write.
            - --web.enable-remote-write-receiver
            - --web.enable-lifecycle
          ports:
            - containerPort: 9090
              name: http
          volumeMounts:
            - mountPath: /etc/prometheus
              name: config
            - mountPath: /prometheus
              name: data
          resources:
            requests:
              cpu: 100m
              memory: 256Mi
            limits:
              cpu: 500m
              memory: 1Gi
          # Readiness and liveness use separate endpoints on the http port.
          readinessProbe:
            initialDelaySeconds: 5
            periodSeconds: 5
            httpGet:
              port: 9090
              path: /-/ready
          livenessProbe:
            initialDelaySeconds: 30
            periodSeconds: 15
            httpGet:
              port: 9090
              path: /-/healthy
  volumeClaimTemplates:
    - metadata:
        name: data
      spec:
        accessModes:
          - ReadWriteOnce
        resources:
          requests:
            storage: 20Gi
|
||||||
|
|
@ -14,7 +14,7 @@ check_service() {
|
||||||
local name="$1"
|
local name="$1"
|
||||||
local check_cmd="$2"
|
local check_cmd="$2"
|
||||||
|
|
||||||
printf "%-20s " "$name..."
|
printf "%-24s " "$name..."
|
||||||
if eval "$check_cmd" > /dev/null 2>&1; then
|
if eval "$check_cmd" > /dev/null 2>&1; then
|
||||||
echo -e "${GREEN}OK${NC}"
|
echo -e "${GREEN}OK${NC}"
|
||||||
else
|
else
|
||||||
|
|
@ -27,7 +27,7 @@ check_http() {
|
||||||
local name="$1"
|
local name="$1"
|
||||||
local url="$2"
|
local url="$2"
|
||||||
|
|
||||||
printf "%-20s " "$name..."
|
printf "%-24s " "$name..."
|
||||||
if curl -sf --max-time 5 "$url" > /dev/null 2>&1; then
|
if curl -sf --max-time 5 "$url" > /dev/null 2>&1; then
|
||||||
echo -e "${GREEN}OK${NC}"
|
echo -e "${GREEN}OK${NC}"
|
||||||
else
|
else
|
||||||
|
|
@ -40,39 +40,23 @@ echo "Checking indri services..."
|
||||||
echo "=========================="
|
echo "=========================="
|
||||||
echo ""
|
echo ""
|
||||||
|
|
||||||
# Check via SSH that services are running on indri
|
# Local services on indri
|
||||||
echo "Local services (via launchctl/brew services):"
|
echo "Local services on indri:"
|
||||||
check_service "loki" "ssh indri 'brew services list | grep loki | grep started'"
|
check_service "forgejo (brew)" "ssh indri 'brew services list | grep forgejo | grep started'"
|
||||||
check_service "alloy" "ssh indri 'brew services list | grep grafana-alloy | grep started'"
|
check_service "alloy" "ssh indri 'launchctl list mcquack.eblume.alloy | grep -v \"^-\"'"
|
||||||
check_service "prometheus" "ssh indri 'brew services list | grep prometheus | grep started'"
|
check_service "borgmatic" "ssh indri 'launchctl list mcquack.eblume.borgmatic | grep -v \"^-\"'"
|
||||||
check_service "grafana" "ssh indri 'brew services list | grep grafana | grep started'"
|
check_service "borgmatic-metrics" "ssh indri 'launchctl list mcquack.borgmatic-metrics | grep -v \"^-\"'"
|
||||||
check_service "transmission" "ssh indri 'brew services list | grep transmission | grep started'"
|
check_service "zot" "ssh indri 'launchctl list mcquack.eblume.zot | grep -v \"^-\"'"
|
||||||
check_service "transmission-metrics" "ssh indri 'launchctl list | grep transmission-metrics | grep -v \"^-\"'"
|
check_service "zot-metrics" "ssh indri 'launchctl list mcquack.zot-metrics | grep -v \"^-\"'"
|
||||||
check_service "kiwix-serve" "ssh indri 'launchctl list | grep kiwix | grep -v \"^-\"'"
|
check_service "minikube-metrics" "ssh indri 'launchctl list mcquack.minikube-metrics | grep -v \"^-\"'"
|
||||||
check_service "forgejo" "ssh indri 'brew services list | grep forgejo | grep started'"
|
check_service "plex-metrics" "ssh indri 'launchctl list mcquack.plex-metrics | grep -v \"^-\"'"
|
||||||
check_service "devpi" "ssh indri 'launchctl list | grep devpi | grep -v \"^-\"'"
|
|
||||||
# NOTE: postgresql and miniflux moved to k8s - checked below
|
|
||||||
check_service "zot" "ssh indri 'launchctl list | grep mcquack.eblume.zot | grep -v \"^-\"'"
|
|
||||||
check_service "zot-metrics" "ssh indri 'launchctl list | grep zot-metrics | grep -v \"^-\"'"
|
|
||||||
check_service "minikube-metrics" "ssh indri 'launchctl list | grep minikube-metrics | grep -v \"^-\"'"
|
|
||||||
|
|
||||||
echo ""
|
echo ""
|
||||||
echo "HTTP endpoints (via Tailscale):"
|
echo "Metrics textfiles:"
|
||||||
check_http "Loki" "http://indri:3100/ready"
|
check_service "borgmatic.prom" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/borgmatic.prom'"
|
||||||
check_http "Prometheus" "http://indri:9090/-/healthy"
|
check_service "zot.prom" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/zot.prom'"
|
||||||
check_http "Grafana" "https://grafana.tail8d86e.ts.net/api/health"
|
check_service "minikube.prom" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/minikube.prom'"
|
||||||
check_http "Kiwix" "https://kiwix.tail8d86e.ts.net/"
|
check_service "plex.prom" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/plex.prom'"
|
||||||
check_http "Forgejo" "https://forge.tail8d86e.ts.net/"
|
|
||||||
check_http "Devpi" "https://pypi.tail8d86e.ts.net/+api"
|
|
||||||
check_http "Miniflux" "https://feed.tail8d86e.ts.net/healthcheck"
|
|
||||||
# Transmission RPC is localhost-only by design, check via SSH
|
|
||||||
check_service "Transmission RPC" "ssh indri 'curl -sf http://127.0.0.1:9091/transmission/rpc'"
|
|
||||||
# Check that transmission metrics are being collected
|
|
||||||
check_service "Transmission metrics" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/transmission.prom'"
|
|
||||||
# Zot registry (via Tailscale service)
|
|
||||||
check_http "Zot Registry" "https://registry.tail8d86e.ts.net/v2/_catalog"
|
|
||||||
check_service "Zot metrics file" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/zot.prom'"
|
|
||||||
check_service "Minikube metrics file" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/minikube.prom'"
|
|
||||||
|
|
||||||
echo ""
|
echo ""
|
||||||
echo "Kubernetes cluster:"
|
echo "Kubernetes cluster:"
|
||||||
|
|
@ -81,14 +65,43 @@ check_service "k8s-apiserver (indri)" "ssh indri 'kubectl get --raw /healthz'"
|
||||||
check_service "k8s-apiserver (remote)" "kubectl --kubeconfig=$HOME/.kube/minikube-indri/config.yml --context=minikube-indri get --raw /healthz"
|
check_service "k8s-apiserver (remote)" "kubectl --kubeconfig=$HOME/.kube/minikube-indri/config.yml --context=minikube-indri get --raw /healthz"
|
||||||
|
|
||||||
echo ""
|
echo ""
|
||||||
echo "Kubernetes workloads (via Tailscale):"
|
echo "HTTP endpoints (via Tailscale):"
|
||||||
|
check_http "Prometheus" "https://prometheus.tail8d86e.ts.net/-/healthy"
|
||||||
|
check_http "Loki" "https://loki.tail8d86e.ts.net/ready"
|
||||||
|
check_http "Grafana" "https://grafana.tail8d86e.ts.net/api/health"
|
||||||
check_http "ArgoCD" "https://argocd.tail8d86e.ts.net/healthz"
|
check_http "ArgoCD" "https://argocd.tail8d86e.ts.net/healthz"
|
||||||
# k8s PostgreSQL - check TCP connection (no auth needed for pg_isready)
|
check_http "Forgejo" "https://forge.tail8d86e.ts.net/"
|
||||||
|
check_http "Zot Registry" "https://registry.tail8d86e.ts.net/v2/_catalog"
|
||||||
|
check_http "Kiwix" "https://kiwix.tail8d86e.ts.net/"
|
||||||
|
check_http "Miniflux" "https://feed.tail8d86e.ts.net/healthcheck"
|
||||||
|
check_http "Devpi" "https://pypi.tail8d86e.ts.net/+api"
|
||||||
|
check_http "Transmission" "https://torrent.tail8d86e.ts.net/"
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo "Database:"
|
||||||
check_service "PostgreSQL (k8s)" "pg_isready -h pg.tail8d86e.ts.net -p 5432"
|
check_service "PostgreSQL (k8s)" "pg_isready -h pg.tail8d86e.ts.net -p 5432"
|
||||||
# k8s miniflux pod
|
|
||||||
check_service "Miniflux pod" "kubectl --context=minikube-indri -n miniflux get pods -l app=miniflux -o jsonpath='{.items[0].status.phase}' | grep -q Running"
|
echo ""
|
||||||
# ArgoCD apps sync status
|
echo "Kubernetes pods:"
|
||||||
check_service "ArgoCD apps synced" "kubectl --context=minikube-indri get applications -n argocd -o jsonpath='{.items[*].status.sync.status}' | grep -v OutOfSync"
|
check_service "prometheus-0" "kubectl --context=minikube-indri -n monitoring get pod prometheus-0 -o jsonpath='{.status.phase}' | grep -q Running"
|
||||||
|
check_service "loki-0" "kubectl --context=minikube-indri -n monitoring get pod loki-0 -o jsonpath='{.status.phase}' | grep -q Running"
|
||||||
|
check_service "grafana" "kubectl --context=minikube-indri -n monitoring get pods -l app.kubernetes.io/name=grafana -o jsonpath='{.items[0].status.phase}' | grep -q Running"
|
||||||
|
check_service "miniflux" "kubectl --context=minikube-indri -n miniflux get pods -l app=miniflux -o jsonpath='{.items[0].status.phase}' | grep -q Running"
|
||||||
|
check_service "blumeops-pg" "kubectl --context=minikube-indri -n databases get pods -l cnpg.io/cluster=blumeops-pg -o jsonpath='{.items[0].status.phase}' | grep -q Running"
|
||||||
|
|
||||||
|
echo ""
|
||||||
|
echo "ArgoCD app sync status:"
|
||||||
|
printf "%-20s %-12s %-12s %s\n" "NAME" "SYNC" "HEALTH" "TARGET"
|
||||||
|
while read -r name sync health target; do
|
||||||
|
if [[ "$sync" == "Synced" ]]; then
|
||||||
|
printf "%-20s ${GREEN}%-12s${NC} %-12s %s\n" "$name" "$sync" "$health" "$target"
|
||||||
|
elif [[ "$sync" == "OutOfSync" ]]; then
|
||||||
|
printf "%-20s ${RED}%-12s${NC} %-12s %s\n" "$name" "$sync" "$health" "$target"
|
||||||
|
FAILED=1
|
||||||
|
else
|
||||||
|
printf "%-20s %-12s %-12s %s\n" "$name" "$sync" "$health" "$target"
|
||||||
|
fi
|
||||||
|
done < <(kubectl --context=minikube-indri get applications -n argocd --no-headers -o custom-columns='NAME:.metadata.name,SYNC:.status.sync.status,HEALTH:.status.health.status,TARGET:.spec.source.targetRevision' 2>/dev/null)
|
||||||
|
|
||||||
echo ""
|
echo ""
|
||||||
if [ $FAILED -eq 0 ]; then
|
if [ $FAILED -eq 0 ]; then
|
||||||
|
|
|
||||||
|
|
@ -74,11 +74,11 @@
|
||||||
"dst": ["tag:homelab"],
|
"dst": ["tag:homelab"],
|
||||||
"ip": ["tcp:3001", "tcp:2200"],
|
"ip": ["tcp:3001", "tcp:2200"],
|
||||||
},
|
},
|
||||||
// Homelab can reach k8s PostgreSQL for borgmatic backups
|
// Homelab can reach k8s services: PostgreSQL, CNPG metrics, Prometheus/Loki
|
||||||
{
|
{
|
||||||
"src": ["tag:homelab"],
|
"src": ["tag:homelab"],
|
||||||
"dst": ["tag:k8s"],
|
"dst": ["tag:k8s"],
|
||||||
"ip": ["tcp:5432"],
|
"ip": ["tcp:443", "tcp:5432", "tcp:9187"],
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
|
|
||||||
|
|
@ -141,10 +141,10 @@
|
||||||
"accept": ["tag:kiwix:443", "tag:forge:443", "tag:feed:443", "tag:pg:5432"],
|
"accept": ["tag:kiwix:443", "tag:forge:443", "tag:feed:443", "tag:pg:5432"],
|
||||||
"deny": ["tag:grafana:443", "tag:loki:3100", "tag:nas:445", "tag:registry:443", "tag:k8s-api:443"],
|
"deny": ["tag:grafana:443", "tag:loki:3100", "tag:nas:445", "tag:registry:443", "tag:k8s-api:443"],
|
||||||
},
|
},
|
||||||
// Homelab can reach homelab and NAS
|
// Homelab can reach homelab, NAS, and k8s services (postgres, metrics, prometheus/loki)
|
||||||
{
|
{
|
||||||
"src": "tag:homelab",
|
"src": "tag:homelab",
|
||||||
"accept": ["tag:homelab:22", "tag:nas:445"],
|
"accept": ["tag:homelab:22", "tag:nas:445", "tag:k8s:443", "tag:k8s:5432", "tag:k8s:9187"],
|
||||||
},
|
},
|
||||||
// K8s workloads can reach registry and forge (on indri:3001 HTTP, :2200 SSH)
|
// K8s workloads can reach registry and forge (on indri:3001 HTTP, :2200 SSH)
|
||||||
{
|
{
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue