Migrate observability stack to Kubernetes (#42)
Note: the name of this branch was chosen before the scope widened to encompass the entire observability stack.

Summary
- Fix Grafana data source URLs (docker driver uses host.minikube.internal, not host.containers.internal)
- Migrate Prometheus and Loki from indri to Kubernetes with Tailscale Ingresses
- Expose CNPG PostgreSQL metrics via Tailscale and update dashboard to use cnpg_* metrics
- Update Alloy to push metrics/logs to k8s endpoints (prometheus.tail8d86e.ts.net, loki.tail8d86e.ts.net)
- Add ACL rule for port 9187 (CNPG metrics)
- Delete obsolete ansible roles for prometheus and loki

Changes
- argocd/manifests/prometheus/ - New Prometheus StatefulSet with 20Gi PVC and Tailscale Ingress
- argocd/manifests/loki/ - New Loki StatefulSet with 20Gi PVC and Tailscale Ingress
- argocd/apps/prometheus.yaml, argocd/apps/loki.yaml - ArgoCD Applications
- argocd/manifests/grafana/values.yaml - Data sources now use k8s internal DNS
- argocd/manifests/databases/service-metrics-tailscale.yaml - CNPG metrics endpoint
- argocd/manifests/grafana-config/dashboards/configmap-postgresql.yaml - Updated to cnpg_* metrics
- ansible/roles/alloy/defaults/main.yml - Push to k8s Tailscale endpoints
- pulumi/policy.hujson - ACL for port 9187
- Deleted ansible/roles/prometheus/ and ansible/roles/loki/

Deployment and Testing
- Stop prometheus and loki on indri
- Sync ArgoCD apps (apps, prometheus, loki, grafana)
- Run `mise run provision-indri -- --tags alloy`
- Verify Grafana dashboards show data

🤖 Generated with https://claude.ai/claude-code

Reviewed-on: https://forge.tail8d86e.ts.net/eblume/blumeops/pulls/42
This commit is contained in:
parent
5a829e0afd
commit
17023085cb
36 changed files with 569 additions and 270 deletions
|
|
@ -14,7 +14,7 @@ check_service() {
|
|||
local name="$1"
|
||||
local check_cmd="$2"
|
||||
|
||||
printf "%-20s " "$name..."
|
||||
printf "%-24s " "$name..."
|
||||
if eval "$check_cmd" > /dev/null 2>&1; then
|
||||
echo -e "${GREEN}OK${NC}"
|
||||
else
|
||||
|
|
@ -27,7 +27,7 @@ check_http() {
|
|||
local name="$1"
|
||||
local url="$2"
|
||||
|
||||
printf "%-20s " "$name..."
|
||||
printf "%-24s " "$name..."
|
||||
if curl -sf --max-time 5 "$url" > /dev/null 2>&1; then
|
||||
echo -e "${GREEN}OK${NC}"
|
||||
else
|
||||
|
|
@ -40,39 +40,23 @@ echo "Checking indri services..."
|
|||
echo "=========================="
|
||||
echo ""
|
||||
|
||||
# Check via SSH that services are running on indri
|
||||
echo "Local services (via launchctl/brew services):"
|
||||
check_service "loki" "ssh indri 'brew services list | grep loki | grep started'"
|
||||
check_service "alloy" "ssh indri 'brew services list | grep grafana-alloy | grep started'"
|
||||
check_service "prometheus" "ssh indri 'brew services list | grep prometheus | grep started'"
|
||||
check_service "grafana" "ssh indri 'brew services list | grep grafana | grep started'"
|
||||
check_service "transmission" "ssh indri 'brew services list | grep transmission | grep started'"
|
||||
check_service "transmission-metrics" "ssh indri 'launchctl list | grep transmission-metrics | grep -v \"^-\"'"
|
||||
check_service "kiwix-serve" "ssh indri 'launchctl list | grep kiwix | grep -v \"^-\"'"
|
||||
check_service "forgejo" "ssh indri 'brew services list | grep forgejo | grep started'"
|
||||
check_service "devpi" "ssh indri 'launchctl list | grep devpi | grep -v \"^-\"'"
|
||||
# NOTE: postgresql and miniflux moved to k8s - checked below
|
||||
check_service "zot" "ssh indri 'launchctl list | grep mcquack.eblume.zot | grep -v \"^-\"'"
|
||||
check_service "zot-metrics" "ssh indri 'launchctl list | grep zot-metrics | grep -v \"^-\"'"
|
||||
check_service "minikube-metrics" "ssh indri 'launchctl list | grep minikube-metrics | grep -v \"^-\"'"
|
||||
# Local services on indri
|
||||
echo "Local services on indri:"
|
||||
check_service "forgejo (brew)" "ssh indri 'brew services list | grep forgejo | grep started'"
|
||||
check_service "alloy" "ssh indri 'launchctl list mcquack.eblume.alloy | grep -v \"^-\"'"
|
||||
check_service "borgmatic" "ssh indri 'launchctl list mcquack.eblume.borgmatic | grep -v \"^-\"'"
|
||||
check_service "borgmatic-metrics" "ssh indri 'launchctl list mcquack.borgmatic-metrics | grep -v \"^-\"'"
|
||||
check_service "zot" "ssh indri 'launchctl list mcquack.eblume.zot | grep -v \"^-\"'"
|
||||
check_service "zot-metrics" "ssh indri 'launchctl list mcquack.zot-metrics | grep -v \"^-\"'"
|
||||
check_service "minikube-metrics" "ssh indri 'launchctl list mcquack.minikube-metrics | grep -v \"^-\"'"
|
||||
check_service "plex-metrics" "ssh indri 'launchctl list mcquack.plex-metrics | grep -v \"^-\"'"
|
||||
|
||||
echo ""
|
||||
echo "HTTP endpoints (via Tailscale):"
|
||||
check_http "Loki" "http://indri:3100/ready"
|
||||
check_http "Prometheus" "http://indri:9090/-/healthy"
|
||||
check_http "Grafana" "https://grafana.tail8d86e.ts.net/api/health"
|
||||
check_http "Kiwix" "https://kiwix.tail8d86e.ts.net/"
|
||||
check_http "Forgejo" "https://forge.tail8d86e.ts.net/"
|
||||
check_http "Devpi" "https://pypi.tail8d86e.ts.net/+api"
|
||||
check_http "Miniflux" "https://feed.tail8d86e.ts.net/healthcheck"
|
||||
# Transmission RPC is localhost-only by design, check via SSH
|
||||
check_service "Transmission RPC" "ssh indri 'curl -sf http://127.0.0.1:9091/transmission/rpc'"
|
||||
# Check that transmission metrics are being collected
|
||||
check_service "Transmission metrics" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/transmission.prom'"
|
||||
# Zot registry (via Tailscale service)
|
||||
check_http "Zot Registry" "https://registry.tail8d86e.ts.net/v2/_catalog"
|
||||
check_service "Zot metrics file" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/zot.prom'"
|
||||
check_service "Minikube metrics file" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/minikube.prom'"
|
||||
echo "Metrics textfiles:"
|
||||
check_service "borgmatic.prom" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/borgmatic.prom'"
|
||||
check_service "zot.prom" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/zot.prom'"
|
||||
check_service "minikube.prom" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/minikube.prom'"
|
||||
check_service "plex.prom" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/plex.prom'"
|
||||
|
||||
echo ""
|
||||
echo "Kubernetes cluster:"
|
||||
|
|
@ -81,14 +65,43 @@ check_service "k8s-apiserver (indri)" "ssh indri 'kubectl get --raw /healthz'"
|
|||
check_service "k8s-apiserver (remote)" "kubectl --kubeconfig=$HOME/.kube/minikube-indri/config.yml --context=minikube-indri get --raw /healthz"
|
||||
|
||||
echo ""
|
||||
echo "Kubernetes workloads (via Tailscale):"
|
||||
echo "HTTP endpoints (via Tailscale):"
|
||||
check_http "Prometheus" "https://prometheus.tail8d86e.ts.net/-/healthy"
|
||||
check_http "Loki" "https://loki.tail8d86e.ts.net/ready"
|
||||
check_http "Grafana" "https://grafana.tail8d86e.ts.net/api/health"
|
||||
check_http "ArgoCD" "https://argocd.tail8d86e.ts.net/healthz"
|
||||
# k8s PostgreSQL - check TCP connection (no auth needed for pg_isready)
|
||||
check_http "Forgejo" "https://forge.tail8d86e.ts.net/"
|
||||
check_http "Zot Registry" "https://registry.tail8d86e.ts.net/v2/_catalog"
|
||||
check_http "Kiwix" "https://kiwix.tail8d86e.ts.net/"
|
||||
check_http "Miniflux" "https://feed.tail8d86e.ts.net/healthcheck"
|
||||
check_http "Devpi" "https://pypi.tail8d86e.ts.net/+api"
|
||||
check_http "Transmission" "https://torrent.tail8d86e.ts.net/"
|
||||
|
||||
echo ""
|
||||
echo "Database:"
|
||||
check_service "PostgreSQL (k8s)" "pg_isready -h pg.tail8d86e.ts.net -p 5432"
|
||||
# k8s miniflux pod
|
||||
check_service "Miniflux pod" "kubectl --context=minikube-indri -n miniflux get pods -l app=miniflux -o jsonpath='{.items[0].status.phase}' | grep -q Running"
|
||||
# ArgoCD apps sync status
|
||||
check_service "ArgoCD apps synced" "kubectl --context=minikube-indri get applications -n argocd -o jsonpath='{.items[*].status.sync.status}' | grep -v OutOfSync"
|
||||
|
||||
echo ""
|
||||
echo "Kubernetes pods:"
|
||||
check_service "prometheus-0" "kubectl --context=minikube-indri -n monitoring get pod prometheus-0 -o jsonpath='{.status.phase}' | grep -q Running"
|
||||
check_service "loki-0" "kubectl --context=minikube-indri -n monitoring get pod loki-0 -o jsonpath='{.status.phase}' | grep -q Running"
|
||||
check_service "grafana" "kubectl --context=minikube-indri -n monitoring get pods -l app.kubernetes.io/name=grafana -o jsonpath='{.items[0].status.phase}' | grep -q Running"
|
||||
check_service "miniflux" "kubectl --context=minikube-indri -n miniflux get pods -l app=miniflux -o jsonpath='{.items[0].status.phase}' | grep -q Running"
|
||||
check_service "blumeops-pg" "kubectl --context=minikube-indri -n databases get pods -l cnpg.io/cluster=blumeops-pg -o jsonpath='{.items[0].status.phase}' | grep -q Running"
|
||||
|
||||
echo ""
|
||||
echo "ArgoCD app sync status:"
|
||||
printf "%-20s %-12s %-12s %s\n" "NAME" "SYNC" "HEALTH" "TARGET"
|
||||
while read -r name sync health target; do
|
||||
if [[ "$sync" == "Synced" ]]; then
|
||||
printf "%-20s ${GREEN}%-12s${NC} %-12s %s\n" "$name" "$sync" "$health" "$target"
|
||||
elif [[ "$sync" == "OutOfSync" ]]; then
|
||||
printf "%-20s ${RED}%-12s${NC} %-12s %s\n" "$name" "$sync" "$health" "$target"
|
||||
FAILED=1
|
||||
else
|
||||
printf "%-20s %-12s %-12s %s\n" "$name" "$sync" "$health" "$target"
|
||||
fi
|
||||
done < <(kubectl --context=minikube-indri get applications -n argocd --no-headers -o custom-columns='NAME:.metadata.name,SYNC:.status.sync.status,HEALTH:.status.health.status,TARGET:.spec.source.targetRevision' 2>/dev/null)
|
||||
|
||||
echo ""
|
||||
if [ $FAILED -eq 0 ]; then
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue