diff --git a/ansible/playbooks/indri.yml b/ansible/playbooks/indri.yml index cc4ff27..c3d5112 100644 --- a/ansible/playbooks/indri.yml +++ b/ansible/playbooks/indri.yml @@ -23,12 +23,8 @@ tags: [borgmatic] roles: - - role: loki - tags: loki - role: alloy tags: alloy - - role: prometheus - tags: prometheus - role: borgmatic tags: borgmatic - role: borgmatic_metrics diff --git a/ansible/roles/alloy/defaults/main.yml b/ansible/roles/alloy/defaults/main.yml index ec867f9..85f420c 100644 --- a/ansible/roles/alloy/defaults/main.yml +++ b/ansible/roles/alloy/defaults/main.yml @@ -1,14 +1,42 @@ --- # Grafana Alloy configuration +# +# BUILDING FROM SOURCE (required for CGO DNS resolution on macOS): +# +# Alloy must be built with CGO_ENABLED=1 to use macOS native DNS resolver, +# which is required for Tailscale MagicDNS hostname resolution. +# The Homebrew bottle is built with CGO_ENABLED=0. +# +# Build on dev machine (gilbert), then copy to indri: +# +# 1. Clone from forge mirror: +# git clone ssh://forgejo@forge.tail8d86e.ts.net/eblume/alloy.git ~/code/3rd/alloy +# +# 2. Set up build tools via mise: +# cd ~/code/3rd/alloy && mise use go@1.25 node yarn +# +# 3. Build with CGO enabled (default in Makefile): +# cd ~/code/3rd/alloy && mise x -- make alloy +# +# 4. Copy binary to indri: +# scp ~/code/3rd/alloy/build/alloy indri:~/.local/bin/alloy +# +# 5. 
Run ansible to deploy config and LaunchAgent + +# Binary and paths +alloy_binary: /Users/erichblume/.local/bin/alloy +alloy_config_dir: /Users/erichblume/.config/grafana-alloy +alloy_data_dir: /Users/erichblume/.local/share/grafana-alloy +alloy_log_dir: /Users/erichblume/Library/Logs # Textfile collector directory (same as node_exporter for compatibility) alloy_textfile_dir: /opt/homebrew/var/node_exporter/textfile -# Prometheus remote write endpoint -alloy_prometheus_url: "http://localhost:9090/api/v1/write" +# Prometheus remote write endpoint (k8s via Tailscale) +alloy_prometheus_url: "https://prometheus.tail8d86e.ts.net/api/v1/write" -# Loki endpoint (used in Phase 2) -alloy_loki_url: "http://localhost:3100/loki/api/v1/push" +# Loki endpoint (k8s via Tailscale) +alloy_loki_url: "https://loki.tail8d86e.ts.net/loki/api/v1/push" # Instance label for metrics alloy_instance_label: indri @@ -16,39 +44,21 @@ alloy_instance_label: indri # Scrape interval alloy_scrape_interval: "15s" -# Config paths -alloy_config_dir: /opt/homebrew/etc/grafana-alloy -alloy_data_dir: /opt/homebrew/var/lib/grafana-alloy/data - # Log paths to collect alloy_brew_logs: - - path: /opt/homebrew/var/log/grafana-stdout.log - service: grafana - stream: stdout - - path: /opt/homebrew/var/log/grafana-stderr.log - service: grafana - stream: stderr - path: /opt/homebrew/var/log/forgejo.log service: forgejo stream: stdout - - path: /opt/homebrew/var/log/prometheus.err.log - service: prometheus - stream: stderr - path: /opt/homebrew/var/log/tailscaled.log service: tailscale stream: stdout - - path: /opt/homebrew/var/transmission/transmission-daemon.log - service: transmission - stream: stdout - # NOTE: postgresql and miniflux removed - now hosted in k8s alloy_mcquack_logs: - # NOTE: devpi logs removed - now hosted in k8s - - path: /Users/erichblume/Library/Logs/mcquack.kiwix-serve.out.log - service: kiwix + - path: /Users/erichblume/Library/Logs/mcquack.alloy.out.log + service: alloy stream: stdout - - 
path: /Users/erichblume/Library/Logs/mcquack.kiwix-serve.err.log - service: kiwix + - path: /Users/erichblume/Library/Logs/mcquack.alloy.err.log + service: alloy stream: stderr - path: /Users/erichblume/Library/Logs/mcquack.borgmatic.out.log service: borgmatic @@ -75,8 +85,7 @@ alloy_collect_logs: true alloy_collect_zot: true alloy_zot_metrics_url: "http://localhost:5050/metrics" -# PostgreSQL metrics collection -# NOTE: Disabled - brew postgresql removed, k8s CNPG metrics TBD +# PostgreSQL metrics collection (disabled, CNPG metrics scraped directly by k8s Prometheus) alloy_collect_postgres: false alloy_postgres_host: localhost alloy_postgres_port: 5432 diff --git a/ansible/roles/alloy/handlers/main.yml b/ansible/roles/alloy/handlers/main.yml index 5948838..4132dfb 100644 --- a/ansible/roles/alloy/handlers/main.yml +++ b/ansible/roles/alloy/handlers/main.yml @@ -1,6 +1,6 @@ --- - name: Restart alloy - ansible.builtin.command: brew services restart grafana-alloy - async: 120 - poll: 0 + ansible.builtin.shell: | + launchctl unload ~/Library/LaunchAgents/mcquack.eblume.alloy.plist 2>/dev/null || true + launchctl load ~/Library/LaunchAgents/mcquack.eblume.alloy.plist changed_when: true diff --git a/ansible/roles/alloy/tasks/main.yml b/ansible/roles/alloy/tasks/main.yml index 644a6b2..99d256d 100644 --- a/ansible/roles/alloy/tasks/main.yml +++ b/ansible/roles/alloy/tasks/main.yml @@ -1,11 +1,18 @@ --- # Grafana Alloy installation and configuration -# Replaces node_exporter for metrics, adds log collection +# See defaults/main.yml for build instructions -- name: Install grafana-alloy via homebrew - community.general.homebrew: - name: grafana-alloy - state: present +- name: Verify alloy binary exists + ansible.builtin.stat: + path: "{{ alloy_binary }}" + register: alloy_binary_stat + +- name: Fail if alloy binary not found + ansible.builtin.fail: + msg: | + Alloy binary not found at {{ alloy_binary }}. 
+ Please build from source first (see ansible/roles/alloy/defaults/main.yml) + when: not alloy_binary_stat.stat.exists - name: Ensure alloy config directory exists ansible.builtin.file: @@ -68,8 +75,20 @@ notify: Restart alloy no_log: true -- name: Ensure alloy service is started - ansible.builtin.command: brew services start grafana-alloy - register: alloy_brew_start - changed_when: "'Successfully started' in alloy_brew_start.stdout" +- name: Deploy alloy LaunchAgent plist + ansible.builtin.template: + src: alloy.plist.j2 + dest: ~/Library/LaunchAgents/mcquack.eblume.alloy.plist + mode: '0644' + notify: Restart alloy + +- name: Check if alloy LaunchAgent is loaded + ansible.builtin.command: launchctl list mcquack.eblume.alloy + register: alloy_launchctl_check + changed_when: false + failed_when: false + +- name: Load alloy LaunchAgent if not loaded + ansible.builtin.command: launchctl load ~/Library/LaunchAgents/mcquack.eblume.alloy.plist + when: alloy_launchctl_check.rc != 0 + changed_when: true - failed_when: false diff --git a/ansible/roles/alloy/templates/alloy.plist.j2 b/ansible/roles/alloy/templates/alloy.plist.j2 new file mode 100644 index 0000000..a3a2353 --- /dev/null +++ b/ansible/roles/alloy/templates/alloy.plist.j2 @@ -0,0 +1,24 @@ + + + + + + Label + mcquack.eblume.alloy + ProgramArguments + + {{ alloy_binary }} + run + {{ alloy_config_dir }}/config.alloy + --storage.path={{ alloy_data_dir }} + + RunAtLoad + + KeepAlive + + StandardOutPath + {{ alloy_log_dir }}/mcquack.alloy.out.log + StandardErrorPath + {{ alloy_log_dir }}/mcquack.alloy.err.log + + diff --git a/ansible/roles/alloy/templates/config.alloy.j2 b/ansible/roles/alloy/templates/config.alloy.j2 index d6d2e75..1702505 100644 --- a/ansible/roles/alloy/templates/config.alloy.j2 +++ b/ansible/roles/alloy/templates/config.alloy.j2 @@ -43,7 +43,7 @@ prometheus.exporter.postgres "postgresql" { data_source_names = ["postgresql://{{ alloy_postgres_user }}:{{ alloy_postgres_password | urlencode }}@{{
alloy_postgres_host }}:{{ alloy_postgres_port }}/{{ alloy_postgres_database }}?sslmode=disable"] // Custom queries for vacuum and XID monitoring - custom_queries_config_path = "/opt/homebrew/etc/grafana-alloy/postgres_queries.yaml" + custom_queries_config_path = "{{ alloy_config_dir }}/postgres_queries.yaml" } // Scrape PostgreSQL metrics diff --git a/ansible/roles/loki/defaults/main.yml b/ansible/roles/loki/defaults/main.yml deleted file mode 100644 index 1f7d62e..0000000 --- a/ansible/roles/loki/defaults/main.yml +++ /dev/null @@ -1,12 +0,0 @@ ---- -# Loki configuration - -# Server settings -loki_http_port: 3100 - -# Storage paths -loki_data_dir: /opt/homebrew/var/loki -loki_config_file: /opt/homebrew/etc/loki-local-config.yaml - -# Retention settings -loki_retention_period: 744h # 31 days diff --git a/ansible/roles/loki/handlers/main.yml b/ansible/roles/loki/handlers/main.yml deleted file mode 100644 index 3470e8e..0000000 --- a/ansible/roles/loki/handlers/main.yml +++ /dev/null @@ -1,6 +0,0 @@ ---- -- name: Restart loki - ansible.builtin.command: brew services restart loki - async: 120 - poll: 0 - changed_when: true diff --git a/ansible/roles/loki/meta/main.yml b/ansible/roles/loki/meta/main.yml deleted file mode 100644 index 23d65c7..0000000 --- a/ansible/roles/loki/meta/main.yml +++ /dev/null @@ -1,2 +0,0 @@ ---- -dependencies: [] diff --git a/ansible/roles/loki/tasks/main.yml b/ansible/roles/loki/tasks/main.yml deleted file mode 100644 index ab76419..0000000 --- a/ansible/roles/loki/tasks/main.yml +++ /dev/null @@ -1,38 +0,0 @@ ---- -# Loki installation and configuration - -- name: Install loki via homebrew - community.general.homebrew: - name: loki - state: present - -- name: Ensure loki data directory exists - ansible.builtin.file: - path: "{{ loki_data_dir }}" - state: directory - mode: '0755' - -- name: Ensure loki chunks directory exists - ansible.builtin.file: - path: "{{ loki_data_dir }}/chunks" - state: directory - mode: '0755' - -- name: Ensure loki 
rules directory exists - ansible.builtin.file: - path: "{{ loki_data_dir }}/rules" - state: directory - mode: '0755' - -- name: Deploy loki configuration - ansible.builtin.template: - src: loki-config.yaml.j2 - dest: "{{ loki_config_file }}" - mode: '0644' - notify: Restart loki - -- name: Ensure loki service is started - ansible.builtin.command: brew services start loki - register: loki_brew_start - changed_when: "'Successfully started' in loki_brew_start.stdout" - failed_when: false diff --git a/ansible/roles/loki/templates/loki-config.yaml.j2 b/ansible/roles/loki/templates/loki-config.yaml.j2 deleted file mode 100644 index 465d267..0000000 --- a/ansible/roles/loki/templates/loki-config.yaml.j2 +++ /dev/null @@ -1,54 +0,0 @@ -# {{ ansible_managed }} -# Loki configuration for single-node deployment - -auth_enabled: false - -server: - http_listen_port: {{ loki_http_port }} - http_listen_address: 0.0.0.0 - grpc_listen_port: 9096 - -common: - instance_addr: 127.0.0.1 - path_prefix: {{ loki_data_dir }} - storage: - filesystem: - chunks_directory: {{ loki_data_dir }}/chunks - rules_directory: {{ loki_data_dir }}/rules - replication_factor: 1 - ring: - kvstore: - store: inmemory - -query_range: - results_cache: - cache: - embedded_cache: - enabled: true - max_size_mb: 100 - -schema_config: - configs: - - from: 2024-01-01 - store: tsdb - object_store: filesystem - schema: v13 - index: - prefix: index_ - period: 24h - -storage_config: - tsdb_shipper: - active_index_directory: {{ loki_data_dir }}/tsdb-index - cache_location: {{ loki_data_dir }}/tsdb-cache - -limits_config: - retention_period: {{ loki_retention_period }} - -compactor: - working_directory: {{ loki_data_dir }}/compactor - compaction_interval: 10m - retention_enabled: true - retention_delete_delay: 2h - retention_delete_worker_count: 150 - delete_request_store: filesystem diff --git a/ansible/roles/minikube_metrics/defaults/main.yml b/ansible/roles/minikube_metrics/defaults/main.yml index 68fd672..91ae59c 
100644 --- a/ansible/roles/minikube_metrics/defaults/main.yml +++ b/ansible/roles/minikube_metrics/defaults/main.yml @@ -3,3 +3,4 @@ minikube_metrics_dir: /opt/homebrew/var/node_exporter/textfile minikube_metrics_script: /Users/erichblume/bin/minikube-metrics minikube_metrics_interval: 60 # seconds between metric collection minikube_metrics_log_dir: /opt/homebrew/var/log +minikube_metrics_user_home: /Users/erichblume diff --git a/ansible/roles/minikube_metrics/templates/minikube-metrics.plist.j2 b/ansible/roles/minikube_metrics/templates/minikube-metrics.plist.j2 index 4e751d7..fe2198b 100644 --- a/ansible/roles/minikube_metrics/templates/minikube-metrics.plist.j2 +++ b/ansible/roles/minikube_metrics/templates/minikube-metrics.plist.j2 @@ -5,6 +5,13 @@ Label mcquack.eblume.minikube-metrics + EnvironmentVariables + + HOME + {{ minikube_metrics_user_home }} + PATH + /opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin + ProgramArguments {{ minikube_metrics_script }} diff --git a/ansible/roles/minikube_metrics/templates/minikube-metrics.sh.j2 b/ansible/roles/minikube_metrics/templates/minikube-metrics.sh.j2 index 447c5a5..68521d6 100644 --- a/ansible/roles/minikube_metrics/templates/minikube-metrics.sh.j2 +++ b/ansible/roles/minikube_metrics/templates/minikube-metrics.sh.j2 @@ -4,6 +4,10 @@ set -euo pipefail +# Use absolute paths for LaunchAgent compatibility +MINIKUBE="/opt/homebrew/bin/minikube" +KUBECTL="/opt/homebrew/bin/kubectl" + OUTPUT_FILE="{{ minikube_metrics_dir }}/minikube.prom" TEMP_FILE="${OUTPUT_FILE}.tmp" @@ -22,7 +26,7 @@ cat > "$TEMP_FILE" << 'HEADER' HEADER # Check if minikube is running -if minikube status --format='{% raw %}{{.Host}}{% endraw %}' 2>/dev/null | grep -q "Running"; then +if $MINIKUBE status --format='{% raw %}{{.Host}}{% endraw %}' 2>/dev/null | grep -q "Running"; then echo "minikube_up 1" >> "$TEMP_FILE" else echo "minikube_up 0" >> "$TEMP_FILE" @@ -35,22 +39,22 @@ else fi # Check API server health -if kubectl get --raw /healthz >/dev/null
2>&1; then +if $KUBECTL get --raw /healthz >/dev/null 2>&1; then echo "minikube_apiserver_up 1" >> "$TEMP_FILE" else echo "minikube_apiserver_up 0" >> "$TEMP_FILE" fi # Get node count -NODE_COUNT=$(kubectl get nodes --no-headers 2>/dev/null | wc -l | tr -d ' ') +NODE_COUNT=$($KUBECTL get nodes --no-headers 2>/dev/null | wc -l | tr -d ' ' || true) echo "minikube_node_count ${NODE_COUNT:-0}" >> "$TEMP_FILE" # Get pod count (all namespaces) -POD_COUNT=$(kubectl get pods -A --no-headers 2>/dev/null | wc -l | tr -d ' ') +POD_COUNT=$($KUBECTL get pods -A --no-headers 2>/dev/null | wc -l | tr -d ' ' || true) echo "minikube_pod_count ${POD_COUNT:-0}" >> "$TEMP_FILE" # Get namespace count -NS_COUNT=$(kubectl get namespaces --no-headers 2>/dev/null | wc -l | tr -d ' ') +NS_COUNT=$($KUBECTL get namespaces --no-headers 2>/dev/null | wc -l | tr -d ' ' || true) echo "minikube_namespace_count ${NS_COUNT:-0}" >> "$TEMP_FILE" # Atomic move diff --git a/ansible/roles/prometheus/handlers/main.yml b/ansible/roles/prometheus/handlers/main.yml deleted file mode 100644 index ee64300..0000000 --- a/ansible/roles/prometheus/handlers/main.yml +++ /dev/null @@ -1,4 +0,0 @@ ---- -- name: Restart prometheus - ansible.builtin.command: brew services restart prometheus - changed_when: true diff --git a/ansible/roles/prometheus/tasks/main.yml b/ansible/roles/prometheus/tasks/main.yml deleted file mode 100644 index c21d642..0000000 --- a/ansible/roles/prometheus/tasks/main.yml +++ /dev/null @@ -1,25 +0,0 @@ ---- -- name: Install prometheus via homebrew - community.general.homebrew: - name: prometheus - state: present - -- name: Configure prometheus.yml - ansible.builtin.template: - src: prometheus.yml.j2 - dest: /opt/homebrew/etc/prometheus.yml - mode: '0644' - notify: Restart prometheus - -- name: Configure prometheus.args - ansible.builtin.template: - src: prometheus.args.j2 - dest: /opt/homebrew/etc/prometheus.args - mode: '0644' - notify: Restart prometheus - -- name: Ensure prometheus service is started -
ansible.builtin.command: brew services start prometheus - register: prometheus_brew_start - changed_when: "'Successfully started' in prometheus_brew_start.stdout" - failed_when: false diff --git a/ansible/roles/prometheus/templates/prometheus.args.j2 b/ansible/roles/prometheus/templates/prometheus.args.j2 deleted file mode 100644 index ac09616..0000000 --- a/ansible/roles/prometheus/templates/prometheus.args.j2 +++ /dev/null @@ -1,4 +0,0 @@ ---config.file /opt/homebrew/etc/prometheus.yml ---web.listen-address=0.0.0.0:9090 ---storage.tsdb.path /opt/homebrew/var/prometheus ---web.enable-remote-write-receiver diff --git a/ansible/roles/prometheus/templates/prometheus.yml.j2 b/ansible/roles/prometheus/templates/prometheus.yml.j2 deleted file mode 100644 index 1366ae4..0000000 --- a/ansible/roles/prometheus/templates/prometheus.yml.j2 +++ /dev/null @@ -1,15 +0,0 @@ -# {{ ansible_managed }} -global: - scrape_interval: 15s - -# Note: indri system metrics are pushed via Alloy remote_write -# Sifaka still uses traditional scraping via node_exporter - -scrape_configs: - - job_name: "node-exporter-sifaka" - static_configs: - - targets: ["sifaka:9100"] - - - job_name: "loki" - static_configs: - - targets: ["localhost:3100"] diff --git a/argocd/apps/loki.yaml b/argocd/apps/loki.yaml new file mode 100644 index 0000000..cb9dd41 --- /dev/null +++ b/argocd/apps/loki.yaml @@ -0,0 +1,17 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: loki + namespace: argocd +spec: + project: default + source: + repoURL: ssh://forgejo@indri.tail8d86e.ts.net:2200/eblume/blumeops.git + targetRevision: main + path: argocd/manifests/loki + destination: + server: https://kubernetes.default.svc + namespace: monitoring + syncPolicy: + syncOptions: + - CreateNamespace=true diff --git a/argocd/apps/prometheus.yaml b/argocd/apps/prometheus.yaml new file mode 100644 index 0000000..b53a243 --- /dev/null +++ b/argocd/apps/prometheus.yaml @@ -0,0 +1,17 @@ +apiVersion: 
argoproj.io/v1alpha1 +kind: Application +metadata: + name: prometheus + namespace: argocd +spec: + project: default + source: + repoURL: ssh://forgejo@indri.tail8d86e.ts.net:2200/eblume/blumeops.git + targetRevision: main + path: argocd/manifests/prometheus + destination: + server: https://kubernetes.default.svc + namespace: monitoring + syncPolicy: + syncOptions: + - CreateNamespace=true diff --git a/argocd/manifests/databases/kustomization.yaml b/argocd/manifests/databases/kustomization.yaml index a115143..e44bdaf 100644 --- a/argocd/manifests/databases/kustomization.yaml +++ b/argocd/manifests/databases/kustomization.yaml @@ -6,3 +6,4 @@ namespace: databases resources: - blumeops-pg.yaml - service-tailscale.yaml + - service-metrics-tailscale.yaml diff --git a/argocd/manifests/databases/service-metrics-tailscale.yaml b/argocd/manifests/databases/service-metrics-tailscale.yaml new file mode 100644 index 0000000..1eeddd7 --- /dev/null +++ b/argocd/manifests/databases/service-metrics-tailscale.yaml @@ -0,0 +1,22 @@ +# Tailscale LoadBalancer for CNPG metrics access +# Exposes native postgres_exporter metrics on port 9187 +# Canonical hostname: cnpg-metrics.tail8d86e.ts.net +apiVersion: v1 +kind: Service +metadata: + name: blumeops-pg-metrics-tailscale + namespace: databases + annotations: + tailscale.com/hostname: "cnpg-metrics" + tailscale.com/proxy-class: "default" +spec: + type: LoadBalancer + loadBalancerClass: tailscale + selector: + cnpg.io/cluster: blumeops-pg + role: primary + ports: + - name: metrics + port: 9187 + targetPort: 9187 + protocol: TCP diff --git a/argocd/manifests/grafana-config/dashboards/configmap-postgresql.yaml b/argocd/manifests/grafana-config/dashboards/configmap-postgresql.yaml index cb1f6a5..39d05f2 100644 --- a/argocd/manifests/grafana-config/dashboards/configmap-postgresql.yaml +++ b/argocd/manifests/grafana-config/dashboards/configmap-postgresql.yaml @@ -54,7 +54,7 @@ data: }, "targets": [ { - "expr": "pg_up", + "expr": 
"cnpg_collector_up", "refId": "A" } ], @@ -95,7 +95,7 @@ data: }, "targets": [ { - "expr": "pg_stat_activity_count{state=\"active\"}", + "expr": "cnpg_backends_total{state=\"active\"}", "refId": "A" } ], @@ -136,7 +136,7 @@ data: }, "targets": [ { - "expr": "sum(pg_stat_activity_count)", + "expr": "sum(cnpg_backends_total)", "refId": "A" } ], @@ -177,7 +177,7 @@ data: }, "targets": [ { - "expr": "sum(pg_database_size_bytes)", + "expr": "sum(cnpg_pg_database_size_bytes)", "refId": "A" } ], @@ -249,7 +249,7 @@ data: }, "targets": [ { - "expr": "pg_stat_activity_count", + "expr": "cnpg_backends_total", "legendFormat": "{{state}}", "refId": "A" } @@ -322,7 +322,7 @@ data: }, "targets": [ { - "expr": "pg_database_size_bytes{datname!~\"template.*\"}", + "expr": "cnpg_pg_database_size_bytes{datname!~\"template.*\"}", "legendFormat": "{{datname}}", "refId": "A" } @@ -395,22 +395,22 @@ data: }, "targets": [ { - "expr": "rate(pg_stat_database_tup_fetched{datname!~\"template.*\"}[5m])", + "expr": "rate(cnpg_pg_stat_database_tup_fetched{datname!~\"template.*\"}[5m])", "legendFormat": "{{datname}} fetched", "refId": "A" }, { - "expr": "rate(pg_stat_database_tup_inserted{datname!~\"template.*\"}[5m])", + "expr": "rate(cnpg_pg_stat_database_tup_inserted{datname!~\"template.*\"}[5m])", "legendFormat": "{{datname}} inserted", "refId": "B" }, { - "expr": "rate(pg_stat_database_tup_updated{datname!~\"template.*\"}[5m])", + "expr": "rate(cnpg_pg_stat_database_tup_updated{datname!~\"template.*\"}[5m])", "legendFormat": "{{datname}} updated", "refId": "C" }, { - "expr": "rate(pg_stat_database_tup_deleted{datname!~\"template.*\"}[5m])", + "expr": "rate(cnpg_pg_stat_database_tup_deleted{datname!~\"template.*\"}[5m])", "legendFormat": "{{datname}} deleted", "refId": "D" } @@ -483,12 +483,12 @@ data: }, "targets": [ { - "expr": "rate(pg_stat_database_xact_commit{datname!~\"template.*\"}[5m])", + "expr": "rate(cnpg_pg_stat_database_xact_commit{datname!~\"template.*\"}[5m])", "legendFormat": 
"{{datname}} commits", "refId": "A" }, { - "expr": "rate(pg_stat_database_xact_rollback{datname!~\"template.*\"}[5m])", + "expr": "rate(cnpg_pg_stat_database_xact_rollback{datname!~\"template.*\"}[5m])", "legendFormat": "{{datname}} rollbacks", "refId": "B" } @@ -561,7 +561,7 @@ data: }, "targets": [ { - "expr": "pg_database_xid_age_xid_age", + "expr": "cnpg_pg_database_xid_age", "legendFormat": "{{datname}}", "refId": "A" } diff --git a/argocd/manifests/grafana/values.yaml b/argocd/manifests/grafana/values.yaml index bb2a28f..db2e1a1 100644 --- a/argocd/manifests/grafana/values.yaml +++ b/argocd/manifests/grafana/values.yaml @@ -24,8 +24,7 @@ grafana.ini: check_for_updates: false reporting_enabled: false -# Datasources - point to indri services via podman host gateway -# host.containers.internal resolves to the podman host (indri) from inside minikube +# Datasources - point to k8s-internal services datasources: datasources.yaml: apiVersion: 1 @@ -35,7 +34,7 @@ datasources: access: proxy orgId: 1 uid: prometheus - url: http://host.containers.internal:9090 + url: http://prometheus.monitoring.svc.cluster.local:9090 isDefault: true editable: false - name: Loki @@ -43,7 +42,7 @@ datasources: access: proxy orgId: 1 uid: loki - url: http://host.containers.internal:3100 + url: http://loki.monitoring.svc.cluster.local:3100 editable: false # Dashboard provisioning - sidecar watches for ConfigMaps with label diff --git a/argocd/manifests/loki/configmap.yaml b/argocd/manifests/loki/configmap.yaml new file mode 100644 index 0000000..19c516b --- /dev/null +++ b/argocd/manifests/loki/configmap.yaml @@ -0,0 +1,58 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: loki-config + namespace: monitoring +data: + loki-config.yaml: | + auth_enabled: false + + server: + http_listen_port: 3100 + http_listen_address: 0.0.0.0 + grpc_listen_port: 9096 + + common: + instance_addr: 127.0.0.1 + path_prefix: /loki + storage: + filesystem: + chunks_directory: /loki/chunks + rules_directory: 
/loki/rules + replication_factor: 1 + ring: + kvstore: + store: inmemory + + query_range: + results_cache: + cache: + embedded_cache: + enabled: true + max_size_mb: 100 + + schema_config: + configs: + - from: 2024-01-01 + store: tsdb + object_store: filesystem + schema: v13 + index: + prefix: index_ + period: 24h + + storage_config: + tsdb_shipper: + active_index_directory: /loki/tsdb-index + cache_location: /loki/tsdb-cache + + limits_config: + retention_period: 744h # 31 days + + compactor: + working_directory: /loki/compactor + compaction_interval: 10m + retention_enabled: true + retention_delete_delay: 2h + retention_delete_worker_count: 150 + delete_request_store: filesystem diff --git a/argocd/manifests/loki/ingress-tailscale.yaml b/argocd/manifests/loki/ingress-tailscale.yaml new file mode 100644 index 0000000..bee0148 --- /dev/null +++ b/argocd/manifests/loki/ingress-tailscale.yaml @@ -0,0 +1,25 @@ +# Tailscale Ingress for Loki +# Allows Alloy on indri to push logs +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: loki-tailscale + namespace: monitoring + annotations: + tailscale.com/funnel: "false" +spec: + ingressClassName: tailscale + rules: + - host: loki + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: loki + port: + number: 3100 + tls: + - hosts: + - loki diff --git a/argocd/manifests/loki/kustomization.yaml b/argocd/manifests/loki/kustomization.yaml new file mode 100644 index 0000000..1c65acb --- /dev/null +++ b/argocd/manifests/loki/kustomization.yaml @@ -0,0 +1,10 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: monitoring + +resources: + - configmap.yaml + - statefulset.yaml + - service.yaml + - ingress-tailscale.yaml diff --git a/argocd/manifests/loki/service.yaml b/argocd/manifests/loki/service.yaml new file mode 100644 index 0000000..74b688e --- /dev/null +++ b/argocd/manifests/loki/service.yaml @@ -0,0 +1,16 @@ +apiVersion: v1 +kind: Service +metadata: + name: loki 
+ namespace: monitoring +spec: + selector: + app: loki + ports: + - name: http + port: 3100 + targetPort: 3100 + - name: grpc + port: 9096 + targetPort: 9096 + type: ClusterIP diff --git a/argocd/manifests/loki/statefulset.yaml b/argocd/manifests/loki/statefulset.yaml new file mode 100644 index 0000000..18067b4 --- /dev/null +++ b/argocd/manifests/loki/statefulset.yaml @@ -0,0 +1,66 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: loki + namespace: monitoring +spec: + serviceName: loki + replicas: 1 + selector: + matchLabels: + app: loki + template: + metadata: + labels: + app: loki + spec: + securityContext: + fsGroup: 10001 + runAsNonRoot: true + runAsUser: 10001 + containers: + - name: loki + image: grafana/loki:3.3.2 + args: + - -config.file=/etc/loki/loki-config.yaml + ports: + - name: http + containerPort: 3100 + - name: grpc + containerPort: 9096 + volumeMounts: + - name: config + mountPath: /etc/loki + - name: data + mountPath: /loki + resources: + requests: + memory: "256Mi" + cpu: "100m" + limits: + memory: "1Gi" + cpu: "500m" + livenessProbe: + httpGet: + path: /ready + port: 3100 + initialDelaySeconds: 45 + periodSeconds: 10 + readinessProbe: + httpGet: + path: /ready + port: 3100 + initialDelaySeconds: 10 + periodSeconds: 5 + volumes: + - name: config + configMap: + name: loki-config + volumeClaimTemplates: + - metadata: + name: data + spec: + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 20Gi diff --git a/argocd/manifests/prometheus/configmap.yaml b/argocd/manifests/prometheus/configmap.yaml new file mode 100644 index 0000000..7ae945a --- /dev/null +++ b/argocd/manifests/prometheus/configmap.yaml @@ -0,0 +1,38 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: prometheus-config + namespace: monitoring +data: + prometheus.yml: | + global: + scrape_interval: 15s + evaluation_interval: 15s + + # Indri system metrics are pushed via Alloy remote_write + # K8s services are scraped directly + + scrape_configs: + # Sifaka 
NAS node-exporter (via LAN - Docker NATs through indri) + # Using LAN IP since k8s pods can reach LAN via Docker NAT (same as NFS mounts) + # If IP changes, fallback: create Tailscale egress in tailscale-operator/egress-sifaka.yaml + - job_name: "node-exporter-sifaka" + static_configs: + - targets: ["192.168.1.203:9100"] + + # CNPG PostgreSQL metrics (k8s internal) + - job_name: "cnpg-postgres" + static_configs: + - targets: ["blumeops-pg-metrics-tailscale.databases.svc.cluster.local:9187"] + labels: + instance: "blumeops-pg" + + # Prometheus self-monitoring + - job_name: "prometheus" + static_configs: + - targets: ["localhost:9090"] + + # Loki metrics + - job_name: "loki" + static_configs: + - targets: ["loki.monitoring.svc.cluster.local:3100"] diff --git a/argocd/manifests/prometheus/ingress-tailscale.yaml b/argocd/manifests/prometheus/ingress-tailscale.yaml new file mode 100644 index 0000000..1aeaa34 --- /dev/null +++ b/argocd/manifests/prometheus/ingress-tailscale.yaml @@ -0,0 +1,25 @@ +# Tailscale Ingress for Prometheus +# Allows Alloy on indri to push metrics via remote_write +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: prometheus-tailscale + namespace: monitoring + annotations: + tailscale.com/funnel: "false" +spec: + ingressClassName: tailscale + rules: + - host: prometheus + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: prometheus + port: + number: 9090 + tls: + - hosts: + - prometheus diff --git a/argocd/manifests/prometheus/kustomization.yaml b/argocd/manifests/prometheus/kustomization.yaml new file mode 100644 index 0000000..1c65acb --- /dev/null +++ b/argocd/manifests/prometheus/kustomization.yaml @@ -0,0 +1,10 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: monitoring + +resources: + - configmap.yaml + - statefulset.yaml + - service.yaml + - ingress-tailscale.yaml diff --git a/argocd/manifests/prometheus/service.yaml b/argocd/manifests/prometheus/service.yaml new 
file mode 100644 index 0000000..84d1909 --- /dev/null +++ b/argocd/manifests/prometheus/service.yaml @@ -0,0 +1,13 @@ +apiVersion: v1 +kind: Service +metadata: + name: prometheus + namespace: monitoring +spec: + selector: + app: prometheus + ports: + - name: http + port: 9090 + targetPort: 9090 + type: ClusterIP diff --git a/argocd/manifests/prometheus/statefulset.yaml b/argocd/manifests/prometheus/statefulset.yaml new file mode 100644 index 0000000..651451f --- /dev/null +++ b/argocd/manifests/prometheus/statefulset.yaml @@ -0,0 +1,68 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: prometheus + namespace: monitoring +spec: + serviceName: prometheus + replicas: 1 + selector: + matchLabels: + app: prometheus + template: + metadata: + labels: + app: prometheus + spec: + securityContext: + fsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + containers: + - name: prometheus + image: prom/prometheus:v3.2.1 + args: + - --config.file=/etc/prometheus/prometheus.yml + - --storage.tsdb.path=/prometheus + - --storage.tsdb.retention.time=15d + - --web.enable-remote-write-receiver + - --web.enable-lifecycle + ports: + - name: http + containerPort: 9090 + volumeMounts: + - name: config + mountPath: /etc/prometheus + - name: data + mountPath: /prometheus + resources: + requests: + memory: "256Mi" + cpu: "100m" + limits: + memory: "1Gi" + cpu: "500m" + livenessProbe: + httpGet: + path: /-/healthy + port: 9090 + initialDelaySeconds: 30 + periodSeconds: 15 + readinessProbe: + httpGet: + path: /-/ready + port: 9090 + initialDelaySeconds: 5 + periodSeconds: 5 + volumes: + - name: config + configMap: + name: prometheus-config + volumeClaimTemplates: + - metadata: + name: data + spec: + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 20Gi diff --git a/mise-tasks/indri-services-check b/mise-tasks/indri-services-check index dcda013..d1cd525 100755 --- a/mise-tasks/indri-services-check +++ b/mise-tasks/indri-services-check @@ -14,7 +14,7 @@ 
check_service() { local name="$1" local check_cmd="$2" - printf "%-20s " "$name..." + printf "%-24s " "$name..." if eval "$check_cmd" > /dev/null 2>&1; then echo -e "${GREEN}OK${NC}" else @@ -27,7 +27,7 @@ check_http() { local name="$1" local url="$2" - printf "%-20s " "$name..." + printf "%-24s " "$name..." if curl -sf --max-time 5 "$url" > /dev/null 2>&1; then echo -e "${GREEN}OK${NC}" else @@ -40,39 +40,23 @@ echo "Checking indri services..." echo "==========================" echo "" -# Check via SSH that services are running on indri -echo "Local services (via launchctl/brew services):" -check_service "loki" "ssh indri 'brew services list | grep loki | grep started'" -check_service "alloy" "ssh indri 'brew services list | grep grafana-alloy | grep started'" -check_service "prometheus" "ssh indri 'brew services list | grep prometheus | grep started'" -check_service "grafana" "ssh indri 'brew services list | grep grafana | grep started'" -check_service "transmission" "ssh indri 'brew services list | grep transmission | grep started'" -check_service "transmission-metrics" "ssh indri 'launchctl list | grep transmission-metrics | grep -v \"^-\"'" -check_service "kiwix-serve" "ssh indri 'launchctl list | grep kiwix | grep -v \"^-\"'" -check_service "forgejo" "ssh indri 'brew services list | grep forgejo | grep started'" -check_service "devpi" "ssh indri 'launchctl list | grep devpi | grep -v \"^-\"'" -# NOTE: postgresql and miniflux moved to k8s - checked below -check_service "zot" "ssh indri 'launchctl list | grep mcquack.eblume.zot | grep -v \"^-\"'" -check_service "zot-metrics" "ssh indri 'launchctl list | grep zot-metrics | grep -v \"^-\"'" -check_service "minikube-metrics" "ssh indri 'launchctl list | grep minikube-metrics | grep -v \"^-\"'" +# Local services on indri +echo "Local services on indri:" +check_service "forgejo (brew)" "ssh indri 'brew services list | grep forgejo | grep started'" +check_service "alloy" "ssh indri 'launchctl list 
mcquack.eblume.alloy | grep -v \"^-\"'" +check_service "borgmatic" "ssh indri 'launchctl list mcquack.eblume.borgmatic | grep -v \"^-\"'" +check_service "borgmatic-metrics" "ssh indri 'launchctl list mcquack.borgmatic-metrics | grep -v \"^-\"'" +check_service "zot" "ssh indri 'launchctl list mcquack.eblume.zot | grep -v \"^-\"'" +check_service "zot-metrics" "ssh indri 'launchctl list mcquack.zot-metrics | grep -v \"^-\"'" +check_service "minikube-metrics" "ssh indri 'launchctl list mcquack.minikube-metrics | grep -v \"^-\"'" +check_service "plex-metrics" "ssh indri 'launchctl list mcquack.plex-metrics | grep -v \"^-\"'" echo "" -echo "HTTP endpoints (via Tailscale):" -check_http "Loki" "http://indri:3100/ready" -check_http "Prometheus" "http://indri:9090/-/healthy" -check_http "Grafana" "https://grafana.tail8d86e.ts.net/api/health" -check_http "Kiwix" "https://kiwix.tail8d86e.ts.net/" -check_http "Forgejo" "https://forge.tail8d86e.ts.net/" -check_http "Devpi" "https://pypi.tail8d86e.ts.net/+api" -check_http "Miniflux" "https://feed.tail8d86e.ts.net/healthcheck" -# Transmission RPC is localhost-only by design, check via SSH -check_service "Transmission RPC" "ssh indri 'curl -sf http://127.0.0.1:9091/transmission/rpc'" -# Check that transmission metrics are being collected -check_service "Transmission metrics" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/transmission.prom'" -# Zot registry (via Tailscale service) -check_http "Zot Registry" "https://registry.tail8d86e.ts.net/v2/_catalog" -check_service "Zot metrics file" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/zot.prom'" -check_service "Minikube metrics file" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/minikube.prom'" +echo "Metrics textfiles:" +check_service "borgmatic.prom" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/borgmatic.prom'" +check_service "zot.prom" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/zot.prom'" +check_service 
"minikube.prom" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/minikube.prom'" +check_service "plex.prom" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/plex.prom'" echo "" echo "Kubernetes cluster:" @@ -81,14 +65,43 @@ check_service "k8s-apiserver (indri)" "ssh indri 'kubectl get --raw /healthz'" check_service "k8s-apiserver (remote)" "kubectl --kubeconfig=$HOME/.kube/minikube-indri/config.yml --context=minikube-indri get --raw /healthz" echo "" -echo "Kubernetes workloads (via Tailscale):" +echo "HTTP endpoints (via Tailscale):" +check_http "Prometheus" "https://prometheus.tail8d86e.ts.net/-/healthy" +check_http "Loki" "https://loki.tail8d86e.ts.net/ready" +check_http "Grafana" "https://grafana.tail8d86e.ts.net/api/health" check_http "ArgoCD" "https://argocd.tail8d86e.ts.net/healthz" -# k8s PostgreSQL - check TCP connection (no auth needed for pg_isready) +check_http "Forgejo" "https://forge.tail8d86e.ts.net/" +check_http "Zot Registry" "https://registry.tail8d86e.ts.net/v2/_catalog" +check_http "Kiwix" "https://kiwix.tail8d86e.ts.net/" +check_http "Miniflux" "https://feed.tail8d86e.ts.net/healthcheck" +check_http "Devpi" "https://pypi.tail8d86e.ts.net/+api" +check_http "Transmission" "https://torrent.tail8d86e.ts.net/" + +echo "" +echo "Database:" check_service "PostgreSQL (k8s)" "pg_isready -h pg.tail8d86e.ts.net -p 5432" -# k8s miniflux pod -check_service "Miniflux pod" "kubectl --context=minikube-indri -n miniflux get pods -l app=miniflux -o jsonpath='{.items[0].status.phase}' | grep -q Running" -# ArgoCD apps sync status -check_service "ArgoCD apps synced" "kubectl --context=minikube-indri get applications -n argocd -o jsonpath='{.items[*].status.sync.status}' | grep -v OutOfSync" + +echo "" +echo "Kubernetes pods:" +check_service "prometheus-0" "kubectl --context=minikube-indri -n monitoring get pod prometheus-0 -o jsonpath='{.status.phase}' | grep -q Running" +check_service "loki-0" "kubectl --context=minikube-indri -n 
monitoring get pod loki-0 -o jsonpath='{.status.phase}' | grep -q Running" +check_service "grafana" "kubectl --context=minikube-indri -n monitoring get pods -l app.kubernetes.io/name=grafana -o jsonpath='{.items[0].status.phase}' | grep -q Running" +check_service "miniflux" "kubectl --context=minikube-indri -n miniflux get pods -l app=miniflux -o jsonpath='{.items[0].status.phase}' | grep -q Running" +check_service "blumeops-pg" "kubectl --context=minikube-indri -n databases get pods -l cnpg.io/cluster=blumeops-pg -o jsonpath='{.items[0].status.phase}' | grep -q Running" + +echo "" +echo "ArgoCD app sync status:" +printf "%-20s %-12s %-12s %s\n" "NAME" "SYNC" "HEALTH" "TARGET" +while read -r name sync health target; do + if [[ "$sync" == "Synced" ]]; then + printf "%-20s ${GREEN}%-12s${NC} %-12s %s\n" "$name" "$sync" "$health" "$target" + elif [[ "$sync" == "OutOfSync" ]]; then + printf "%-20s ${RED}%-12s${NC} %-12s %s\n" "$name" "$sync" "$health" "$target" + FAILED=1 + else + printf "%-20s %-12s %-12s %s\n" "$name" "$sync" "$health" "$target" + fi +done < <(kubectl --context=minikube-indri get applications -n argocd --no-headers -o custom-columns='NAME:.metadata.name,SYNC:.status.sync.status,HEALTH:.status.health.status,TARGET:.spec.source.targetRevision' 2>/dev/null) echo "" if [ $FAILED -eq 0 ]; then diff --git a/pulumi/policy.hujson b/pulumi/policy.hujson index 142326b..7f18820 100644 --- a/pulumi/policy.hujson +++ b/pulumi/policy.hujson @@ -74,11 +74,11 @@ "dst": ["tag:homelab"], "ip": ["tcp:3001", "tcp:2200"], }, - // Homelab can reach k8s PostgreSQL for borgmatic backups + // Homelab can reach k8s services: PostgreSQL, CNPG metrics, Prometheus/Loki { "src": ["tag:homelab"], "dst": ["tag:k8s"], - "ip": ["tcp:5432"], + "ip": ["tcp:443", "tcp:5432", "tcp:9187"], }, ], @@ -141,10 +141,10 @@ "accept": ["tag:kiwix:443", "tag:forge:443", "tag:feed:443", "tag:pg:5432"], "deny": ["tag:grafana:443", "tag:loki:3100", "tag:nas:445", "tag:registry:443", 
"tag:k8s-api:443"], }, - // Homelab can reach homelab and NAS + // Homelab can reach homelab, NAS, and k8s services (postgres, metrics, prometheus/loki) { "src": "tag:homelab", - "accept": ["tag:homelab:22", "tag:nas:445"], + "accept": ["tag:homelab:22", "tag:nas:445", "tag:k8s:443", "tag:k8s:5432", "tag:k8s:9187"], }, // K8s workloads can reach registry and forge (on indri:3001 HTTP, :2200 SSH) {