From 0c9c306917ec7ac4aa5f77474a70704a6cb73855 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Wed, 21 Jan 2026 21:46:28 -0800 Subject: [PATCH 01/11] Fix Grafana datasource URLs for docker driver After minikube migration from podman to docker driver, the hostname host.containers.internal no longer resolves. Use host.minikube.internal which is the correct hostname for docker driver. Co-Authored-By: Claude Opus 4.5 --- argocd/manifests/grafana/values.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/argocd/manifests/grafana/values.yaml b/argocd/manifests/grafana/values.yaml index bb2a28f..00614bb 100644 --- a/argocd/manifests/grafana/values.yaml +++ b/argocd/manifests/grafana/values.yaml @@ -24,8 +24,8 @@ grafana.ini: check_for_updates: false reporting_enabled: false -# Datasources - point to indri services via podman host gateway -# host.containers.internal resolves to the podman host (indri) from inside minikube +# Datasources - point to indri services via docker host gateway +# host.minikube.internal resolves to the docker host (indri) from inside minikube datasources: datasources.yaml: apiVersion: 1 @@ -35,7 +35,7 @@ datasources: access: proxy orgId: 1 uid: prometheus - url: http://host.containers.internal:9090 + url: http://host.minikube.internal:9090 isDefault: true editable: false - name: Loki @@ -43,7 +43,7 @@ datasources: access: proxy orgId: 1 uid: loki - url: http://host.containers.internal:3100 + url: http://host.minikube.internal:3100 editable: false # Dashboard provisioning - sidecar watches for ConfigMaps with label -- 2.50.1 (Apple Git-155) From 329f58499b5c78aab4710b0933376c5988e96d80 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Wed, 21 Jan 2026 22:03:28 -0800 Subject: [PATCH 02/11] Add CNPG metrics collection for PostgreSQL dashboard - Add Tailscale service exposing CNPG metrics on port 9187 (cnpg-metrics.tail8d86e.ts.net) - Add Prometheus scrape config for cnpg-postgres job - Update PostgreSQL dashboard to use CNPG 
metric names (cnpg_* prefix) Co-Authored-By: Claude Opus 4.5 --- .../prometheus/templates/prometheus.yml.j2 | 6 +++++ argocd/manifests/databases/kustomization.yaml | 1 + .../databases/service-metrics-tailscale.yaml | 22 ++++++++++++++++ .../dashboards/configmap-postgresql.yaml | 26 +++++++++---------- 4 files changed, 42 insertions(+), 13 deletions(-) create mode 100644 argocd/manifests/databases/service-metrics-tailscale.yaml diff --git a/ansible/roles/prometheus/templates/prometheus.yml.j2 b/ansible/roles/prometheus/templates/prometheus.yml.j2 index 1366ae4..4271805 100644 --- a/ansible/roles/prometheus/templates/prometheus.yml.j2 +++ b/ansible/roles/prometheus/templates/prometheus.yml.j2 @@ -13,3 +13,9 @@ scrape_configs: - job_name: "loki" static_configs: - targets: ["localhost:3100"] + + - job_name: "cnpg-postgres" + static_configs: + - targets: ["cnpg-metrics.tail8d86e.ts.net:9187"] + labels: + instance: "blumeops-pg" diff --git a/argocd/manifests/databases/kustomization.yaml b/argocd/manifests/databases/kustomization.yaml index a115143..e44bdaf 100644 --- a/argocd/manifests/databases/kustomization.yaml +++ b/argocd/manifests/databases/kustomization.yaml @@ -6,3 +6,4 @@ namespace: databases resources: - blumeops-pg.yaml - service-tailscale.yaml + - service-metrics-tailscale.yaml diff --git a/argocd/manifests/databases/service-metrics-tailscale.yaml b/argocd/manifests/databases/service-metrics-tailscale.yaml new file mode 100644 index 0000000..1eeddd7 --- /dev/null +++ b/argocd/manifests/databases/service-metrics-tailscale.yaml @@ -0,0 +1,22 @@ +# Tailscale LoadBalancer for CNPG metrics access +# Exposes native postgres_exporter metrics on port 9187 +# Canonical hostname: cnpg-metrics.tail8d86e.ts.net +apiVersion: v1 +kind: Service +metadata: + name: blumeops-pg-metrics-tailscale + namespace: databases + annotations: + tailscale.com/hostname: "cnpg-metrics" + tailscale.com/proxy-class: "default" +spec: + type: LoadBalancer + loadBalancerClass: tailscale + 
selector: + cnpg.io/cluster: blumeops-pg + role: primary + ports: + - name: metrics + port: 9187 + targetPort: 9187 + protocol: TCP diff --git a/argocd/manifests/grafana-config/dashboards/configmap-postgresql.yaml b/argocd/manifests/grafana-config/dashboards/configmap-postgresql.yaml index cb1f6a5..39d05f2 100644 --- a/argocd/manifests/grafana-config/dashboards/configmap-postgresql.yaml +++ b/argocd/manifests/grafana-config/dashboards/configmap-postgresql.yaml @@ -54,7 +54,7 @@ data: }, "targets": [ { - "expr": "pg_up", + "expr": "cnpg_collector_up", "refId": "A" } ], @@ -95,7 +95,7 @@ data: }, "targets": [ { - "expr": "pg_stat_activity_count{state=\"active\"}", + "expr": "cnpg_backends_total{state=\"active\"}", "refId": "A" } ], @@ -136,7 +136,7 @@ data: }, "targets": [ { - "expr": "sum(pg_stat_activity_count)", + "expr": "sum(cnpg_backends_total)", "refId": "A" } ], @@ -177,7 +177,7 @@ data: }, "targets": [ { - "expr": "sum(pg_database_size_bytes)", + "expr": "sum(cnpg_pg_database_size_bytes)", "refId": "A" } ], @@ -249,7 +249,7 @@ data: }, "targets": [ { - "expr": "pg_stat_activity_count", + "expr": "cnpg_backends_total", "legendFormat": "{{state}}", "refId": "A" } @@ -322,7 +322,7 @@ data: }, "targets": [ { - "expr": "pg_database_size_bytes{datname!~\"template.*\"}", + "expr": "cnpg_pg_database_size_bytes{datname!~\"template.*\"}", "legendFormat": "{{datname}}", "refId": "A" } @@ -395,22 +395,22 @@ data: }, "targets": [ { - "expr": "rate(pg_stat_database_tup_fetched{datname!~\"template.*\"}[5m])", + "expr": "rate(cnpg_pg_stat_database_tup_fetched{datname!~\"template.*\"}[5m])", "legendFormat": "{{datname}} fetched", "refId": "A" }, { - "expr": "rate(pg_stat_database_tup_inserted{datname!~\"template.*\"}[5m])", + "expr": "rate(cnpg_pg_stat_database_tup_inserted{datname!~\"template.*\"}[5m])", "legendFormat": "{{datname}} inserted", "refId": "B" }, { - "expr": "rate(pg_stat_database_tup_updated{datname!~\"template.*\"}[5m])", + "expr": 
"rate(cnpg_pg_stat_database_tup_updated{datname!~\"template.*\"}[5m])", "legendFormat": "{{datname}} updated", "refId": "C" }, { - "expr": "rate(pg_stat_database_tup_deleted{datname!~\"template.*\"}[5m])", + "expr": "rate(cnpg_pg_stat_database_tup_deleted{datname!~\"template.*\"}[5m])", "legendFormat": "{{datname}} deleted", "refId": "D" } @@ -483,12 +483,12 @@ data: }, "targets": [ { - "expr": "rate(pg_stat_database_xact_commit{datname!~\"template.*\"}[5m])", + "expr": "rate(cnpg_pg_stat_database_xact_commit{datname!~\"template.*\"}[5m])", "legendFormat": "{{datname}} commits", "refId": "A" }, { - "expr": "rate(pg_stat_database_xact_rollback{datname!~\"template.*\"}[5m])", + "expr": "rate(cnpg_pg_stat_database_xact_rollback{datname!~\"template.*\"}[5m])", "legendFormat": "{{datname}} rollbacks", "refId": "B" } @@ -561,7 +561,7 @@ data: }, "targets": [ { - "expr": "pg_database_xid_age_xid_age", + "expr": "cnpg_pg_database_xid_age", "legendFormat": "{{datname}}", "refId": "A" } -- 2.50.1 (Apple Git-155) From 74c218063d8420c17f8dbb66677c20ce3e14ecfd Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 22 Jan 2026 07:17:07 -0800 Subject: [PATCH 03/11] Allow homelab to scrape CNPG metrics on port 9187 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add tcp:9187 to tag:homelab → tag:k8s ACL rule for Prometheus to scrape CloudNativePG metrics endpoint. 
Co-Authored-By: Claude Opus 4.5 --- pulumi/policy.hujson | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pulumi/policy.hujson b/pulumi/policy.hujson index 142326b..53215f5 100644 --- a/pulumi/policy.hujson +++ b/pulumi/policy.hujson @@ -74,11 +74,11 @@ "dst": ["tag:homelab"], "ip": ["tcp:3001", "tcp:2200"], }, - // Homelab can reach k8s PostgreSQL for borgmatic backups + // Homelab can reach k8s PostgreSQL for borgmatic backups and metrics scraping { "src": ["tag:homelab"], "dst": ["tag:k8s"], - "ip": ["tcp:5432"], + "ip": ["tcp:5432", "tcp:9187"], }, ], @@ -141,10 +141,10 @@ "accept": ["tag:kiwix:443", "tag:forge:443", "tag:feed:443", "tag:pg:5432"], "deny": ["tag:grafana:443", "tag:loki:3100", "tag:nas:445", "tag:registry:443", "tag:k8s-api:443"], }, - // Homelab can reach homelab and NAS + // Homelab can reach homelab, NAS, and k8s metrics { "src": "tag:homelab", - "accept": ["tag:homelab:22", "tag:nas:445"], + "accept": ["tag:homelab:22", "tag:nas:445", "tag:k8s:9187"], }, // K8s workloads can reach registry and forge (on indri:3001 HTTP, :2200 SSH) { -- 2.50.1 (Apple Git-155) From 7633a9b7a415c67277c8535ce2780d295f87cc48 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 22 Jan 2026 07:48:45 -0800 Subject: [PATCH 04/11] Migrate Prometheus and Loki to Kubernetes Major observability stack migration: - Deploy Prometheus in k8s with 20Gi PVC, Tailscale Ingress - Deploy Loki in k8s with 20Gi PVC, Tailscale Ingress - Update Grafana to use k8s-internal endpoints for data sources - Update Alloy on indri to push to k8s via Tailscale endpoints - Prometheus scrapes sifaka via LAN IP (Docker NAT, same as NFS) - Deprecate ansible prometheus/loki roles Alloy on indri continues to collect: - System metrics (via prometheus.exporter.unix) - Textfile metrics (borgmatic, plex) - Logs (forgejo, tailscale, borgmatic, zot, plex) Co-Authored-By: Claude Opus 4.5 --- ansible/playbooks/indri.yml | 4 -- ansible/roles/alloy/defaults/main.yml | 31 ++------- 
ansible/roles/loki/defaults/main.yml | 12 ---- ansible/roles/loki/handlers/main.yml | 6 -- ansible/roles/loki/meta/main.yml | 2 - ansible/roles/loki/tasks/main.yml | 38 ----------- .../roles/loki/templates/loki-config.yaml.j2 | 54 --------------- ansible/roles/prometheus/handlers/main.yml | 4 -- ansible/roles/prometheus/tasks/main.yml | 25 ------- .../prometheus/templates/prometheus.args.j2 | 4 -- .../prometheus/templates/prometheus.yml.j2 | 21 ------ argocd/apps/loki.yaml | 17 +++++ argocd/apps/prometheus.yaml | 17 +++++ argocd/manifests/grafana/values.yaml | 7 +- argocd/manifests/loki/configmap.yaml | 58 ++++++++++++++++ argocd/manifests/loki/ingress-tailscale.yaml | 25 +++++++ argocd/manifests/loki/kustomization.yaml | 10 +++ argocd/manifests/loki/service.yaml | 16 +++++ argocd/manifests/loki/statefulset.yaml | 66 ++++++++++++++++++ argocd/manifests/prometheus/configmap.yaml | 38 +++++++++++ .../prometheus/ingress-tailscale.yaml | 25 +++++++ .../manifests/prometheus/kustomization.yaml | 10 +++ argocd/manifests/prometheus/service.yaml | 13 ++++ argocd/manifests/prometheus/statefulset.yaml | 68 +++++++++++++++++++ 24 files changed, 371 insertions(+), 200 deletions(-) delete mode 100644 ansible/roles/loki/defaults/main.yml delete mode 100644 ansible/roles/loki/handlers/main.yml delete mode 100644 ansible/roles/loki/meta/main.yml delete mode 100644 ansible/roles/loki/tasks/main.yml delete mode 100644 ansible/roles/loki/templates/loki-config.yaml.j2 delete mode 100644 ansible/roles/prometheus/handlers/main.yml delete mode 100644 ansible/roles/prometheus/tasks/main.yml delete mode 100644 ansible/roles/prometheus/templates/prometheus.args.j2 delete mode 100644 ansible/roles/prometheus/templates/prometheus.yml.j2 create mode 100644 argocd/apps/loki.yaml create mode 100644 argocd/apps/prometheus.yaml create mode 100644 argocd/manifests/loki/configmap.yaml create mode 100644 argocd/manifests/loki/ingress-tailscale.yaml create mode 100644 
argocd/manifests/loki/kustomization.yaml create mode 100644 argocd/manifests/loki/service.yaml create mode 100644 argocd/manifests/loki/statefulset.yaml create mode 100644 argocd/manifests/prometheus/configmap.yaml create mode 100644 argocd/manifests/prometheus/ingress-tailscale.yaml create mode 100644 argocd/manifests/prometheus/kustomization.yaml create mode 100644 argocd/manifests/prometheus/service.yaml create mode 100644 argocd/manifests/prometheus/statefulset.yaml diff --git a/ansible/playbooks/indri.yml b/ansible/playbooks/indri.yml index cc4ff27..c3d5112 100644 --- a/ansible/playbooks/indri.yml +++ b/ansible/playbooks/indri.yml @@ -23,12 +23,8 @@ tags: [borgmatic] roles: - - role: loki - tags: loki - role: alloy tags: alloy - - role: prometheus - tags: prometheus - role: borgmatic tags: borgmatic - role: borgmatic_metrics diff --git a/ansible/roles/alloy/defaults/main.yml b/ansible/roles/alloy/defaults/main.yml index ec867f9..b01c845 100644 --- a/ansible/roles/alloy/defaults/main.yml +++ b/ansible/roles/alloy/defaults/main.yml @@ -4,11 +4,11 @@ # Textfile collector directory (same as node_exporter for compatibility) alloy_textfile_dir: /opt/homebrew/var/node_exporter/textfile -# Prometheus remote write endpoint -alloy_prometheus_url: "http://localhost:9090/api/v1/write" +# Prometheus remote write endpoint (k8s via Tailscale) +alloy_prometheus_url: "https://prometheus.tail8d86e.ts.net/api/v1/write" -# Loki endpoint (used in Phase 2) -alloy_loki_url: "http://localhost:3100/loki/api/v1/push" +# Loki endpoint (k8s via Tailscale) +alloy_loki_url: "https://loki.tail8d86e.ts.net/loki/api/v1/push" # Instance label for metrics alloy_instance_label: indri @@ -22,34 +22,14 @@ alloy_data_dir: /opt/homebrew/var/lib/grafana-alloy/data # Log paths to collect alloy_brew_logs: - - path: /opt/homebrew/var/log/grafana-stdout.log - service: grafana - stream: stdout - - path: /opt/homebrew/var/log/grafana-stderr.log - service: grafana - stream: stderr - path: 
/opt/homebrew/var/log/forgejo.log service: forgejo stream: stdout - - path: /opt/homebrew/var/log/prometheus.err.log - service: prometheus - stream: stderr - path: /opt/homebrew/var/log/tailscaled.log service: tailscale stream: stdout - - path: /opt/homebrew/var/transmission/transmission-daemon.log - service: transmission - stream: stdout - # NOTE: postgresql and miniflux removed - now hosted in k8s alloy_mcquack_logs: - # NOTE: devpi logs removed - now hosted in k8s - - path: /Users/erichblume/Library/Logs/mcquack.kiwix-serve.out.log - service: kiwix - stream: stdout - - path: /Users/erichblume/Library/Logs/mcquack.kiwix-serve.err.log - service: kiwix - stream: stderr - path: /Users/erichblume/Library/Logs/mcquack.borgmatic.out.log service: borgmatic stream: stdout @@ -75,8 +55,7 @@ alloy_collect_logs: true alloy_collect_zot: true alloy_zot_metrics_url: "http://localhost:5050/metrics" -# PostgreSQL metrics collection -# NOTE: Disabled - brew postgresql removed, k8s CNPG metrics TBD +# PostgreSQL metrics collection (disabled, CNPG metrics scraped directly by k8s Prometheus) alloy_collect_postgres: false alloy_postgres_host: localhost alloy_postgres_port: 5432 diff --git a/ansible/roles/loki/defaults/main.yml b/ansible/roles/loki/defaults/main.yml deleted file mode 100644 index 1f7d62e..0000000 --- a/ansible/roles/loki/defaults/main.yml +++ /dev/null @@ -1,12 +0,0 @@ ---- -# Loki configuration - -# Server settings -loki_http_port: 3100 - -# Storage paths -loki_data_dir: /opt/homebrew/var/loki -loki_config_file: /opt/homebrew/etc/loki-local-config.yaml - -# Retention settings -loki_retention_period: 744h # 31 days diff --git a/ansible/roles/loki/handlers/main.yml b/ansible/roles/loki/handlers/main.yml deleted file mode 100644 index 3470e8e..0000000 --- a/ansible/roles/loki/handlers/main.yml +++ /dev/null @@ -1,6 +0,0 @@ ---- -- name: Restart loki - ansible.builtin.command: brew services restart loki - async: 120 - poll: 0 - changed_when: true diff --git 
a/ansible/roles/loki/meta/main.yml b/ansible/roles/loki/meta/main.yml deleted file mode 100644 index 23d65c7..0000000 --- a/ansible/roles/loki/meta/main.yml +++ /dev/null @@ -1,2 +0,0 @@ ---- -dependencies: [] diff --git a/ansible/roles/loki/tasks/main.yml b/ansible/roles/loki/tasks/main.yml deleted file mode 100644 index ab76419..0000000 --- a/ansible/roles/loki/tasks/main.yml +++ /dev/null @@ -1,38 +0,0 @@ ---- -# Loki installation and configuration - -- name: Install loki via homebrew - community.general.homebrew: - name: loki - state: present - -- name: Ensure loki data directory exists - ansible.builtin.file: - path: "{{ loki_data_dir }}" - state: directory - mode: '0755' - -- name: Ensure loki chunks directory exists - ansible.builtin.file: - path: "{{ loki_data_dir }}/chunks" - state: directory - mode: '0755' - -- name: Ensure loki rules directory exists - ansible.builtin.file: - path: "{{ loki_data_dir }}/rules" - state: directory - mode: '0755' - -- name: Deploy loki configuration - ansible.builtin.template: - src: loki-config.yaml.j2 - dest: "{{ loki_config_file }}" - mode: '0644' - notify: Restart loki - -- name: Ensure loki service is started - ansible.builtin.command: brew services start loki - register: loki_brew_start - changed_when: "'Successfully started' in loki_brew_start.stdout" - failed_when: false diff --git a/ansible/roles/loki/templates/loki-config.yaml.j2 b/ansible/roles/loki/templates/loki-config.yaml.j2 deleted file mode 100644 index 465d267..0000000 --- a/ansible/roles/loki/templates/loki-config.yaml.j2 +++ /dev/null @@ -1,54 +0,0 @@ -# {{ ansible_managed }} -# Loki configuration for single-node deployment - -auth_enabled: false - -server: - http_listen_port: {{ loki_http_port }} - http_listen_address: 0.0.0.0 - grpc_listen_port: 9096 - -common: - instance_addr: 127.0.0.1 - path_prefix: {{ loki_data_dir }} - storage: - filesystem: - chunks_directory: {{ loki_data_dir }}/chunks - rules_directory: {{ loki_data_dir }}/rules - 
replication_factor: 1 - ring: - kvstore: - store: inmemory - -query_range: - results_cache: - cache: - embedded_cache: - enabled: true - max_size_mb: 100 - -schema_config: - configs: - - from: 2024-01-01 - store: tsdb - object_store: filesystem - schema: v13 - index: - prefix: index_ - period: 24h - -storage_config: - tsdb_shipper: - active_index_directory: {{ loki_data_dir }}/tsdb-index - cache_location: {{ loki_data_dir }}/tsdb-cache - -limits_config: - retention_period: {{ loki_retention_period }} - -compactor: - working_directory: {{ loki_data_dir }}/compactor - compaction_interval: 10m - retention_enabled: true - retention_delete_delay: 2h - retention_delete_worker_count: 150 - delete_request_store: filesystem diff --git a/ansible/roles/prometheus/handlers/main.yml b/ansible/roles/prometheus/handlers/main.yml deleted file mode 100644 index ee64300..0000000 --- a/ansible/roles/prometheus/handlers/main.yml +++ /dev/null @@ -1,4 +0,0 @@ ---- -- name: Restart prometheus - ansible.builtin.command: brew services restart prometheus - changed_when: true diff --git a/ansible/roles/prometheus/tasks/main.yml b/ansible/roles/prometheus/tasks/main.yml deleted file mode 100644 index c21d642..0000000 --- a/ansible/roles/prometheus/tasks/main.yml +++ /dev/null @@ -1,25 +0,0 @@ ---- -- name: Install prometheus via homebrew - community.general.homebrew: - name: prometheus - state: present - -- name: Configure prometheus.yml - ansible.builtin.template: - src: prometheus.yml.j2 - dest: /opt/homebrew/etc/prometheus.yml - mode: '0644' - notify: Restart prometheus - -- name: Configure prometheus.args - ansible.builtin.template: - src: prometheus.args.j2 - dest: /opt/homebrew/etc/prometheus.args - mode: '0644' - notify: Restart prometheus - -- name: Ensure prometheus service is started - ansible.builtin.command: brew services start prometheus - register: prometheus_brew_start - changed_when: "'Successfully started' in prometheus_brew_start.stdout" - failed_when: false diff --git 
a/ansible/roles/prometheus/templates/prometheus.args.j2 b/ansible/roles/prometheus/templates/prometheus.args.j2 deleted file mode 100644 index ac09616..0000000 --- a/ansible/roles/prometheus/templates/prometheus.args.j2 +++ /dev/null @@ -1,4 +0,0 @@ ---config.file /opt/homebrew/etc/prometheus.yml ---web.listen-address=0.0.0.0:9090 ---storage.tsdb.path /opt/homebrew/var/prometheus ---web.enable-remote-write-receiver diff --git a/ansible/roles/prometheus/templates/prometheus.yml.j2 b/ansible/roles/prometheus/templates/prometheus.yml.j2 deleted file mode 100644 index 4271805..0000000 --- a/ansible/roles/prometheus/templates/prometheus.yml.j2 +++ /dev/null @@ -1,21 +0,0 @@ -# {{ ansible_managed }} -global: - scrape_interval: 15s - -# Note: indri system metrics are pushed via Alloy remote_write -# Sifaka still uses traditional scraping via node_exporter - -scrape_configs: - - job_name: "node-exporter-sifaka" - static_configs: - - targets: ["sifaka:9100"] - - - job_name: "loki" - static_configs: - - targets: ["localhost:3100"] - - - job_name: "cnpg-postgres" - static_configs: - - targets: ["cnpg-metrics.tail8d86e.ts.net:9187"] - labels: - instance: "blumeops-pg" diff --git a/argocd/apps/loki.yaml b/argocd/apps/loki.yaml new file mode 100644 index 0000000..cb9dd41 --- /dev/null +++ b/argocd/apps/loki.yaml @@ -0,0 +1,17 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: loki + namespace: argocd +spec: + project: default + source: + repoURL: ssh://forgejo@indri.tail8d86e.ts.net:2200/eblume/blumeops.git + targetRevision: main + path: argocd/manifests/loki + destination: + server: https://kubernetes.default.svc + namespace: monitoring + syncPolicy: + syncOptions: + - CreateNamespace=true diff --git a/argocd/apps/prometheus.yaml b/argocd/apps/prometheus.yaml new file mode 100644 index 0000000..b53a243 --- /dev/null +++ b/argocd/apps/prometheus.yaml @@ -0,0 +1,17 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: prometheus + 
namespace: argocd +spec: + project: default + source: + repoURL: ssh://forgejo@indri.tail8d86e.ts.net:2200/eblume/blumeops.git + targetRevision: main + path: argocd/manifests/prometheus + destination: + server: https://kubernetes.default.svc + namespace: monitoring + syncPolicy: + syncOptions: + - CreateNamespace=true diff --git a/argocd/manifests/grafana/values.yaml b/argocd/manifests/grafana/values.yaml index 00614bb..db2e1a1 100644 --- a/argocd/manifests/grafana/values.yaml +++ b/argocd/manifests/grafana/values.yaml @@ -24,8 +24,7 @@ grafana.ini: check_for_updates: false reporting_enabled: false -# Datasources - point to indri services via docker host gateway -# host.minikube.internal resolves to the docker host (indri) from inside minikube +# Datasources - point to k8s-internal services datasources: datasources.yaml: apiVersion: 1 @@ -35,7 +34,7 @@ datasources: access: proxy orgId: 1 uid: prometheus - url: http://host.minikube.internal:9090 + url: http://prometheus.monitoring.svc.cluster.local:9090 isDefault: true editable: false - name: Loki @@ -43,7 +42,7 @@ datasources: access: proxy orgId: 1 uid: loki - url: http://host.minikube.internal:3100 + url: http://loki.monitoring.svc.cluster.local:3100 editable: false # Dashboard provisioning - sidecar watches for ConfigMaps with label diff --git a/argocd/manifests/loki/configmap.yaml b/argocd/manifests/loki/configmap.yaml new file mode 100644 index 0000000..19c516b --- /dev/null +++ b/argocd/manifests/loki/configmap.yaml @@ -0,0 +1,58 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: loki-config + namespace: monitoring +data: + loki-config.yaml: | + auth_enabled: false + + server: + http_listen_port: 3100 + http_listen_address: 0.0.0.0 + grpc_listen_port: 9096 + + common: + instance_addr: 127.0.0.1 + path_prefix: /loki + storage: + filesystem: + chunks_directory: /loki/chunks + rules_directory: /loki/rules + replication_factor: 1 + ring: + kvstore: + store: inmemory + + query_range: + results_cache: + cache: 
+ embedded_cache: + enabled: true + max_size_mb: 100 + + schema_config: + configs: + - from: 2024-01-01 + store: tsdb + object_store: filesystem + schema: v13 + index: + prefix: index_ + period: 24h + + storage_config: + tsdb_shipper: + active_index_directory: /loki/tsdb-index + cache_location: /loki/tsdb-cache + + limits_config: + retention_period: 744h # 31 days + + compactor: + working_directory: /loki/compactor + compaction_interval: 10m + retention_enabled: true + retention_delete_delay: 2h + retention_delete_worker_count: 150 + delete_request_store: filesystem diff --git a/argocd/manifests/loki/ingress-tailscale.yaml b/argocd/manifests/loki/ingress-tailscale.yaml new file mode 100644 index 0000000..bee0148 --- /dev/null +++ b/argocd/manifests/loki/ingress-tailscale.yaml @@ -0,0 +1,25 @@ +# Tailscale Ingress for Loki +# Allows Alloy on indri to push logs +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: loki-tailscale + namespace: monitoring + annotations: + tailscale.com/funnel: "false" +spec: + ingressClassName: tailscale + rules: + - host: loki + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: loki + port: + number: 3100 + tls: + - hosts: + - loki diff --git a/argocd/manifests/loki/kustomization.yaml b/argocd/manifests/loki/kustomization.yaml new file mode 100644 index 0000000..1c65acb --- /dev/null +++ b/argocd/manifests/loki/kustomization.yaml @@ -0,0 +1,10 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: monitoring + +resources: + - configmap.yaml + - statefulset.yaml + - service.yaml + - ingress-tailscale.yaml diff --git a/argocd/manifests/loki/service.yaml b/argocd/manifests/loki/service.yaml new file mode 100644 index 0000000..74b688e --- /dev/null +++ b/argocd/manifests/loki/service.yaml @@ -0,0 +1,16 @@ +apiVersion: v1 +kind: Service +metadata: + name: loki + namespace: monitoring +spec: + selector: + app: loki + ports: + - name: http + port: 3100 + targetPort: 3100 + - 
name: grpc + port: 9096 + targetPort: 9096 + type: ClusterIP diff --git a/argocd/manifests/loki/statefulset.yaml b/argocd/manifests/loki/statefulset.yaml new file mode 100644 index 0000000..18067b4 --- /dev/null +++ b/argocd/manifests/loki/statefulset.yaml @@ -0,0 +1,66 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: loki + namespace: monitoring +spec: + serviceName: loki + replicas: 1 + selector: + matchLabels: + app: loki + template: + metadata: + labels: + app: loki + spec: + securityContext: + fsGroup: 10001 + runAsNonRoot: true + runAsUser: 10001 + containers: + - name: loki + image: grafana/loki:3.3.2 + args: + - -config.file=/etc/loki/loki-config.yaml + ports: + - name: http + containerPort: 3100 + - name: grpc + containerPort: 9096 + volumeMounts: + - name: config + mountPath: /etc/loki + - name: data + mountPath: /loki + resources: + requests: + memory: "256Mi" + cpu: "100m" + limits: + memory: "1Gi" + cpu: "500m" + livenessProbe: + httpGet: + path: /ready + port: 3100 + initialDelaySeconds: 45 + periodSeconds: 10 + readinessProbe: + httpGet: + path: /ready + port: 3100 + initialDelaySeconds: 10 + periodSeconds: 5 + volumes: + - name: config + configMap: + name: loki-config + volumeClaimTemplates: + - metadata: + name: data + spec: + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 20Gi diff --git a/argocd/manifests/prometheus/configmap.yaml b/argocd/manifests/prometheus/configmap.yaml new file mode 100644 index 0000000..7ae945a --- /dev/null +++ b/argocd/manifests/prometheus/configmap.yaml @@ -0,0 +1,38 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: prometheus-config + namespace: monitoring +data: + prometheus.yml: | + global: + scrape_interval: 15s + evaluation_interval: 15s + + # Indri system metrics are pushed via Alloy remote_write + # K8s services are scraped directly + + scrape_configs: + # Sifaka NAS node-exporter (via LAN - Docker NATs through indri) + # Using LAN IP since k8s pods can reach LAN via Docker 
NAT (same as NFS mounts) + # If IP changes, fallback: create Tailscale egress in tailscale-operator/egress-sifaka.yaml + - job_name: "node-exporter-sifaka" + static_configs: + - targets: ["192.168.1.203:9100"] + + # CNPG PostgreSQL metrics (k8s internal) + - job_name: "cnpg-postgres" + static_configs: + - targets: ["blumeops-pg-metrics-tailscale.databases.svc.cluster.local:9187"] + labels: + instance: "blumeops-pg" + + # Prometheus self-monitoring + - job_name: "prometheus" + static_configs: + - targets: ["localhost:9090"] + + # Loki metrics + - job_name: "loki" + static_configs: + - targets: ["loki.monitoring.svc.cluster.local:3100"] diff --git a/argocd/manifests/prometheus/ingress-tailscale.yaml b/argocd/manifests/prometheus/ingress-tailscale.yaml new file mode 100644 index 0000000..1aeaa34 --- /dev/null +++ b/argocd/manifests/prometheus/ingress-tailscale.yaml @@ -0,0 +1,25 @@ +# Tailscale Ingress for Prometheus +# Allows Alloy on indri to push metrics via remote_write +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: prometheus-tailscale + namespace: monitoring + annotations: + tailscale.com/funnel: "false" +spec: + ingressClassName: tailscale + rules: + - host: prometheus + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: prometheus + port: + number: 9090 + tls: + - hosts: + - prometheus diff --git a/argocd/manifests/prometheus/kustomization.yaml b/argocd/manifests/prometheus/kustomization.yaml new file mode 100644 index 0000000..1c65acb --- /dev/null +++ b/argocd/manifests/prometheus/kustomization.yaml @@ -0,0 +1,10 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: monitoring + +resources: + - configmap.yaml + - statefulset.yaml + - service.yaml + - ingress-tailscale.yaml diff --git a/argocd/manifests/prometheus/service.yaml b/argocd/manifests/prometheus/service.yaml new file mode 100644 index 0000000..84d1909 --- /dev/null +++ b/argocd/manifests/prometheus/service.yaml @@ -0,0 +1,13 
@@ +apiVersion: v1 +kind: Service +metadata: + name: prometheus + namespace: monitoring +spec: + selector: + app: prometheus + ports: + - name: http + port: 9090 + targetPort: 9090 + type: ClusterIP diff --git a/argocd/manifests/prometheus/statefulset.yaml b/argocd/manifests/prometheus/statefulset.yaml new file mode 100644 index 0000000..651451f --- /dev/null +++ b/argocd/manifests/prometheus/statefulset.yaml @@ -0,0 +1,68 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: prometheus + namespace: monitoring +spec: + serviceName: prometheus + replicas: 1 + selector: + matchLabels: + app: prometheus + template: + metadata: + labels: + app: prometheus + spec: + securityContext: + fsGroup: 65534 + runAsNonRoot: true + runAsUser: 65534 + containers: + - name: prometheus + image: prom/prometheus:v3.2.1 + args: + - --config.file=/etc/prometheus/prometheus.yml + - --storage.tsdb.path=/prometheus + - --storage.tsdb.retention.time=15d + - --web.enable-remote-write-receiver + - --web.enable-lifecycle + ports: + - name: http + containerPort: 9090 + volumeMounts: + - name: config + mountPath: /etc/prometheus + - name: data + mountPath: /prometheus + resources: + requests: + memory: "256Mi" + cpu: "100m" + limits: + memory: "1Gi" + cpu: "500m" + livenessProbe: + httpGet: + path: /-/healthy + port: 9090 + initialDelaySeconds: 30 + periodSeconds: 15 + readinessProbe: + httpGet: + path: /-/ready + port: 9090 + initialDelaySeconds: 5 + periodSeconds: 5 + volumes: + - name: config + configMap: + name: prometheus-config + volumeClaimTemplates: + - metadata: + name: data + spec: + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 20Gi -- 2.50.1 (Apple Git-155) From 45519f2cd2088d53e57fe00520924bf7dc67df58 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 22 Jan 2026 10:33:18 -0800 Subject: [PATCH 05/11] Add port 443 to homelab->k8s ACL for Prometheus/Loki --- pulumi/policy.hujson | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git 
a/pulumi/policy.hujson b/pulumi/policy.hujson index 53215f5..7f18820 100644 --- a/pulumi/policy.hujson +++ b/pulumi/policy.hujson @@ -74,11 +74,11 @@ "dst": ["tag:homelab"], "ip": ["tcp:3001", "tcp:2200"], }, - // Homelab can reach k8s PostgreSQL for borgmatic backups and metrics scraping + // Homelab can reach k8s services: PostgreSQL, CNPG metrics, Prometheus/Loki { "src": ["tag:homelab"], "dst": ["tag:k8s"], - "ip": ["tcp:5432", "tcp:9187"], + "ip": ["tcp:443", "tcp:5432", "tcp:9187"], }, ], @@ -141,10 +141,10 @@ "accept": ["tag:kiwix:443", "tag:forge:443", "tag:feed:443", "tag:pg:5432"], "deny": ["tag:grafana:443", "tag:loki:3100", "tag:nas:445", "tag:registry:443", "tag:k8s-api:443"], }, - // Homelab can reach homelab, NAS, and k8s metrics + // Homelab can reach homelab, NAS, and k8s services (postgres, metrics, prometheus/loki) { "src": "tag:homelab", - "accept": ["tag:homelab:22", "tag:nas:445", "tag:k8s:9187"], + "accept": ["tag:homelab:22", "tag:nas:445", "tag:k8s:443", "tag:k8s:5432", "tag:k8s:9187"], }, // K8s workloads can reach registry and forge (on indri:3001 HTTP, :2200 SSH) { -- 2.50.1 (Apple Git-155) From 3f9d4aefce3f0393ba5a4e90d08330d062667230 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 22 Jan 2026 10:52:13 -0800 Subject: [PATCH 06/11] Switch Alloy from Homebrew to source-built binary with LaunchAgent CGO-enabled build required for macOS native DNS resolver (Tailscale MagicDNS). Homebrew bottle is built with CGO_ENABLED=0 which uses Go's pure DNS resolver that doesn't respect /etc/resolver/* on macOS. 
- Remove Homebrew installation, use ~/.local/bin/alloy - Add LaunchAgent plist (mcquack.eblume.alloy) - Update config paths to ~/.config/grafana-alloy - Add build instructions in defaults/main.yml - Add alloy's own logs to mcquack_logs collection --- ansible/roles/alloy/defaults/main.yml | 38 +++++++++++++++++-- ansible/roles/alloy/handlers/main.yml | 6 +-- ansible/roles/alloy/tasks/main.yml | 38 ++++++++++++++----- ansible/roles/alloy/templates/alloy.plist.j2 | 24 ++++++++++++ ansible/roles/alloy/templates/config.alloy.j2 | 2 +- 5 files changed, 91 insertions(+), 17 deletions(-) create mode 100644 ansible/roles/alloy/templates/alloy.plist.j2 diff --git a/ansible/roles/alloy/defaults/main.yml b/ansible/roles/alloy/defaults/main.yml index b01c845..85f420c 100644 --- a/ansible/roles/alloy/defaults/main.yml +++ b/ansible/roles/alloy/defaults/main.yml @@ -1,5 +1,33 @@ --- # Grafana Alloy configuration +# +# BUILDING FROM SOURCE (required for CGO DNS resolution on macOS): +# +# Alloy must be built with CGO_ENABLED=1 to use macOS native DNS resolver, +# which is required for Tailscale MagicDNS hostname resolution. +# The Homebrew bottle is built with CGO_ENABLED=0. +# +# Build on dev machine (gilbert), then copy to indri: +# +# 1. Clone from forge mirror: +# git clone ssh://forgejo@forge.tail8d86e.ts.net/eblume/alloy.git ~/code/3rd/alloy +# +# 2. Set up build tools via mise: +# cd ~/code/3rd/alloy && mise use go@1.25 node yarn +# +# 3. Build with CGO enabled (default in Makefile): +# cd ~/code/3rd/alloy && mise x -- make alloy +# +# 4. Copy binary to indri: +# scp ~/code/3rd/alloy/build/alloy indri:~/.local/bin/alloy +# +# 5. 
Run ansible to deploy config and LaunchAgent + +# Binary and paths +alloy_binary: /Users/erichblume/.local/bin/alloy +alloy_config_dir: /Users/erichblume/.config/grafana-alloy +alloy_data_dir: /Users/erichblume/.local/share/grafana-alloy +alloy_log_dir: /Users/erichblume/Library/Logs # Textfile collector directory (same as node_exporter for compatibility) alloy_textfile_dir: /opt/homebrew/var/node_exporter/textfile @@ -16,10 +44,6 @@ alloy_instance_label: indri # Scrape interval alloy_scrape_interval: "15s" -# Config paths -alloy_config_dir: /opt/homebrew/etc/grafana-alloy -alloy_data_dir: /opt/homebrew/var/lib/grafana-alloy/data - # Log paths to collect alloy_brew_logs: - path: /opt/homebrew/var/log/forgejo.log @@ -30,6 +54,12 @@ alloy_brew_logs: stream: stdout alloy_mcquack_logs: + - path: /Users/erichblume/Library/Logs/mcquack.alloy.out.log + service: alloy + stream: stdout + - path: /Users/erichblume/Library/Logs/mcquack.alloy.err.log + service: alloy + stream: stderr - path: /Users/erichblume/Library/Logs/mcquack.borgmatic.out.log service: borgmatic stream: stdout diff --git a/ansible/roles/alloy/handlers/main.yml b/ansible/roles/alloy/handlers/main.yml index 5948838..4132dfb 100644 --- a/ansible/roles/alloy/handlers/main.yml +++ b/ansible/roles/alloy/handlers/main.yml @@ -1,6 +1,6 @@ --- - name: Restart alloy - ansible.builtin.command: brew services restart grafana-alloy - async: 120 - poll: 0 + ansible.builtin.shell: | + launchctl unload ~/Library/LaunchAgents/mcquack.eblume.alloy.plist 2>/dev/null || true + launchctl load ~/Library/LaunchAgents/mcquack.eblume.alloy.plist changed_when: true diff --git a/ansible/roles/alloy/tasks/main.yml b/ansible/roles/alloy/tasks/main.yml index 644a6b2..99d256d 100644 --- a/ansible/roles/alloy/tasks/main.yml +++ b/ansible/roles/alloy/tasks/main.yml @@ -1,11 +1,18 @@ --- # Grafana Alloy installation and configuration -# Replaces node_exporter for metrics, adds log collection +# See defaults/main.yml for build instructions 
-- name: Install grafana-alloy via homebrew - community.general.homebrew: - name: grafana-alloy - state: present +- name: Verify alloy binary exists + ansible.builtin.stat: + path: "{{ alloy_binary }}" + register: alloy_binary_stat + +- name: Fail if alloy binary not found + ansible.builtin.fail: + msg: | + Alloy binary not found at {{ alloy_binary }}. + Please build from source first (see ansible/roles/alloy/defaults/main.yml) + when: not alloy_binary_stat.stat.exists - name: Ensure alloy config directory exists ansible.builtin.file: @@ -68,8 +75,21 @@ notify: Restart alloy no_log: true -- name: Ensure alloy service is started - ansible.builtin.command: brew services start grafana-alloy - register: alloy_brew_start - changed_when: "'Successfully started' in alloy_brew_start.stdout" +- name: Deploy alloy LaunchAgent plist + ansible.builtin.template: + src: alloy.plist.j2 + dest: ~/Library/LaunchAgents/mcquack.eblume.alloy.plist + mode: '0644' + notify: Restart alloy + +- name: Check if alloy LaunchAgent is loaded + ansible.builtin.command: launchctl list mcquack.eblume.alloy + register: alloy_launchctl_check + changed_when: false + failed_when: false + +- name: Load alloy LaunchAgent if not loaded + ansible.builtin.command: launchctl load ~/Library/LaunchAgents/mcquack.eblume.alloy.plist + when: alloy_launchctl_check.rc != 0 + changed_when: true failed_when: false diff --git a/ansible/roles/alloy/templates/alloy.plist.j2 b/ansible/roles/alloy/templates/alloy.plist.j2 new file mode 100644 index 0000000..a3a2353 --- /dev/null +++ b/ansible/roles/alloy/templates/alloy.plist.j2 @@ -0,0 +1,24 @@ + + + + + + Label + mcquack.eblume.alloy + ProgramArguments + + {{ alloy_binary }} + run + {{ alloy_config_dir }}/config.alloy + --storage.path={{ alloy_data_dir }} + + RunAtLoad + + KeepAlive + + StandardOutPath + {{ alloy_log_dir }}/mcquack.alloy.out.log + StandardErrorPath + {{ alloy_log_dir }}/mcquack.alloy.err.log + + diff --git 
a/ansible/roles/alloy/templates/config.alloy.j2 b/ansible/roles/alloy/templates/config.alloy.j2 index d6d2e75..1702505 100644 --- a/ansible/roles/alloy/templates/config.alloy.j2 +++ b/ansible/roles/alloy/templates/config.alloy.j2 @@ -43,7 +43,7 @@ prometheus.exporter.postgres "postgresql" { data_source_names = ["postgresql://{{ alloy_postgres_user }}:{{ alloy_postgres_password | urlencode }}@{{ alloy_postgres_host }}:{{ alloy_postgres_port }}/{{ alloy_postgres_database }}?sslmode=disable"] // Custom queries for vacuum and XID monitoring - custom_queries_config_path = "/opt/homebrew/etc/grafana-alloy/postgres_queries.yaml" + custom_queries_config_path = "{{ alloy_config_dir }}/postgres_queries.yaml" } // Scrape PostgreSQL metrics -- 2.50.1 (Apple Git-155) From b457e45d9aaf12adde6fa264f7be789174584d7a Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 22 Jan 2026 11:25:42 -0800 Subject: [PATCH 07/11] Update indri-services-check for k8s observability stack - Remove checks for local prometheus/loki/grafana (now in k8s) - Update alloy check to use launchctl (no longer brew service) - Add k8s pod health checks for monitoring stack - Update HTTP endpoints to use Tailscale URLs - Reorganize sections for clarity --- mise-tasks/indri-services-check | 77 ++++++++++++++++----------------- 1 file changed, 38 insertions(+), 39 deletions(-) diff --git a/mise-tasks/indri-services-check b/mise-tasks/indri-services-check index dcda013..1019bee 100755 --- a/mise-tasks/indri-services-check +++ b/mise-tasks/indri-services-check @@ -14,7 +14,7 @@ check_service() { local name="$1" local check_cmd="$2" - printf "%-20s " "$name..." + printf "%-24s " "$name..." if eval "$check_cmd" > /dev/null 2>&1; then echo -e "${GREEN}OK${NC}" else @@ -27,7 +27,7 @@ check_http() { local name="$1" local url="$2" - printf "%-20s " "$name..." + printf "%-24s " "$name..." 
if curl -sf --max-time 5 "$url" > /dev/null 2>&1; then echo -e "${GREEN}OK${NC}" else @@ -40,39 +40,23 @@ echo "Checking indri services..." echo "==========================" echo "" -# Check via SSH that services are running on indri -echo "Local services (via launchctl/brew services):" -check_service "loki" "ssh indri 'brew services list | grep loki | grep started'" -check_service "alloy" "ssh indri 'brew services list | grep grafana-alloy | grep started'" -check_service "prometheus" "ssh indri 'brew services list | grep prometheus | grep started'" -check_service "grafana" "ssh indri 'brew services list | grep grafana | grep started'" -check_service "transmission" "ssh indri 'brew services list | grep transmission | grep started'" -check_service "transmission-metrics" "ssh indri 'launchctl list | grep transmission-metrics | grep -v \"^-\"'" -check_service "kiwix-serve" "ssh indri 'launchctl list | grep kiwix | grep -v \"^-\"'" -check_service "forgejo" "ssh indri 'brew services list | grep forgejo | grep started'" -check_service "devpi" "ssh indri 'launchctl list | grep devpi | grep -v \"^-\"'" -# NOTE: postgresql and miniflux moved to k8s - checked below -check_service "zot" "ssh indri 'launchctl list | grep mcquack.eblume.zot | grep -v \"^-\"'" -check_service "zot-metrics" "ssh indri 'launchctl list | grep zot-metrics | grep -v \"^-\"'" -check_service "minikube-metrics" "ssh indri 'launchctl list | grep minikube-metrics | grep -v \"^-\"'" +# Local services on indri +echo "Local services on indri:" +check_service "forgejo (brew)" "ssh indri 'brew services list | grep forgejo | grep started'" +check_service "alloy" "ssh indri 'launchctl list mcquack.eblume.alloy | grep -v \"^-\"'" +check_service "borgmatic" "ssh indri 'launchctl list mcquack.eblume.borgmatic | grep -v \"^-\"'" +check_service "borgmatic-metrics" "ssh indri 'launchctl list mcquack.borgmatic-metrics | grep -v \"^-\"'" +check_service "zot" "ssh indri 'launchctl list mcquack.eblume.zot | grep -v
\"^-\"'" +check_service "zot-metrics" "ssh indri 'launchctl list mcquack.zot-metrics | grep -v \"^-\"'" +check_service "minikube-metrics" "ssh indri 'launchctl list mcquack.eblume.minikube-metrics | grep -v \"^-\"'" +check_service "plex-metrics" "ssh indri 'launchctl list mcquack.plex-metrics | grep -v \"^-\"'" echo "" -echo "HTTP endpoints (via Tailscale):" -check_http "Loki" "http://indri:3100/ready" -check_http "Prometheus" "http://indri:9090/-/healthy" -check_http "Grafana" "https://grafana.tail8d86e.ts.net/api/health" -check_http "Kiwix" "https://kiwix.tail8d86e.ts.net/" -check_http "Forgejo" "https://forge.tail8d86e.ts.net/" -check_http "Devpi" "https://pypi.tail8d86e.ts.net/+api" -check_http "Miniflux" "https://feed.tail8d86e.ts.net/healthcheck" -# Transmission RPC is localhost-only by design, check via SSH -check_service "Transmission RPC" "ssh indri 'curl -sf http://127.0.0.1:9091/transmission/rpc'" -# Check that transmission metrics are being collected -check_service "Transmission metrics" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/transmission.prom'" -# Zot registry (via Tailscale service) -check_http "Zot Registry" "https://registry.tail8d86e.ts.net/v2/_catalog" -check_service "Zot metrics file" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/zot.prom'" -check_service "Minikube metrics file" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/minikube.prom'" +echo "Metrics textfiles:" +check_service "borgmatic.prom" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/borgmatic.prom'" +check_service "zot.prom" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/zot.prom'" +check_service "minikube.prom" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/minikube.prom'" +check_service "plex.prom" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/plex.prom'" echo "" echo "Kubernetes cluster:" @@ -81,14 +65,29 @@ check_service "k8s-apiserver (indri)" "ssh indri 'kubectl get --raw
/healthz'" check_service "k8s-apiserver (remote)" "kubectl --kubeconfig=$HOME/.kube/minikube-indri/config.yml --context=minikube-indri get --raw /healthz" echo "" -echo "Kubernetes workloads (via Tailscale):" +echo "HTTP endpoints (via Tailscale):" +check_http "Prometheus" "https://prometheus.tail8d86e.ts.net/-/healthy" +check_http "Loki" "https://loki.tail8d86e.ts.net/ready" +check_http "Grafana" "https://grafana.tail8d86e.ts.net/api/health" check_http "ArgoCD" "https://argocd.tail8d86e.ts.net/healthz" -# k8s PostgreSQL - check TCP connection (no auth needed for pg_isready) +check_http "Forgejo" "https://forge.tail8d86e.ts.net/" +check_http "Zot Registry" "https://registry.tail8d86e.ts.net/v2/_catalog" +check_http "Kiwix" "https://kiwix.tail8d86e.ts.net/" +check_http "Miniflux" "https://feed.tail8d86e.ts.net/healthcheck" +check_http "Devpi" "https://pypi.tail8d86e.ts.net/+api" +check_http "Transmission" "https://torrent.tail8d86e.ts.net/" + +echo "" +echo "Database:" check_service "PostgreSQL (k8s)" "pg_isready -h pg.tail8d86e.ts.net -p 5432" -# k8s miniflux pod -check_service "Miniflux pod" "kubectl --context=minikube-indri -n miniflux get pods -l app=miniflux -o jsonpath='{.items[0].status.phase}' | grep -q Running" -# ArgoCD apps sync status -check_service "ArgoCD apps synced" "kubectl --context=minikube-indri get applications -n argocd -o jsonpath='{.items[*].status.sync.status}' | grep -v OutOfSync" + +echo "" +echo "Kubernetes pods:" +check_service "prometheus-0" "kubectl --context=minikube-indri -n monitoring get pod prometheus-0 -o jsonpath='{.status.phase}' | grep -q Running" +check_service "loki-0" "kubectl --context=minikube-indri -n monitoring get pod loki-0 -o jsonpath='{.status.phase}' | grep -q Running" +check_service "grafana" "kubectl --context=minikube-indri -n monitoring get pods -l app.kubernetes.io/name=grafana -o jsonpath='{.items[0].status.phase}' | grep -q Running" +check_service "miniflux" "kubectl --context=minikube-indri -n miniflux get 
pods -l app=miniflux -o jsonpath='{.items[0].status.phase}' | grep -q Running" +check_service "blumeops-pg" "kubectl --context=minikube-indri -n databases get pods -l cnpg.io/cluster=blumeops-pg -o jsonpath='{.items[0].status.phase}' | grep -q Running" echo "" if [ $FAILED -eq 0 ]; then -- 2.50.1 (Apple Git-155) From f6a15745bdb28062c6e251c4395976bf00ca8fa3 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 22 Jan 2026 11:27:31 -0800 Subject: [PATCH 08/11] Add ArgoCD sync status to services check Shows app name, sync status, health, and revision (truncated to 7 chars) --- mise-tasks/indri-services-check | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/mise-tasks/indri-services-check b/mise-tasks/indri-services-check index 1019bee..e7fc9a8 100755 --- a/mise-tasks/indri-services-check +++ b/mise-tasks/indri-services-check @@ -89,6 +89,22 @@ check_service "grafana" "kubectl --context=minikube-indri -n monitoring get pods check_service "miniflux" "kubectl --context=minikube-indri -n miniflux get pods -l app=miniflux -o jsonpath='{.items[0].status.phase}' | grep -q Running" check_service "blumeops-pg" "kubectl --context=minikube-indri -n databases get pods -l cnpg.io/cluster=blumeops-pg -o jsonpath='{.items[0].status.phase}' | grep -q Running" +echo "" +echo "ArgoCD app sync status:" +printf "%-20s %-12s %-12s %s\n" "NAME" "SYNC" "HEALTH" "REVISION" +while read -r name sync health rev; do + # Truncate revision to first 7 chars + short_rev="${rev:0:7}" + if [[ "$sync" == "Synced" ]]; then + printf "%-20s ${GREEN}%-12s${NC} %-12s %s\n" "$name" "$sync" "$health" "$short_rev" + elif [[ "$sync" == "OutOfSync" ]]; then + printf "%-20s ${RED}%-12s${NC} %-12s %s\n" "$name" "$sync" "$health" "$short_rev" + FAILED=1 + else + printf "%-20s %-12s %-12s %s\n" "$name" "$sync" "$health" "$short_rev" + fi +done < <(kubectl --context=minikube-indri get applications -n argocd --no-headers -o 
custom-columns='NAME:.metadata.name,SYNC:.status.sync.status,HEALTH:.status.health.status,REVISION:.status.sync.revision' 2>/dev/null) + echo "" if [ $FAILED -eq 0 ]; then echo -e "${GREEN}All services healthy!${NC}" -- 2.50.1 (Apple Git-155) From 7ee0410f272e0b3dfd105b2289e95a8ce3081be2 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 22 Jan 2026 11:28:34 -0800 Subject: [PATCH 09/11] Show target branch instead of commit hash in ArgoCD status --- mise-tasks/indri-services-check | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/mise-tasks/indri-services-check b/mise-tasks/indri-services-check index e7fc9a8..d1cd525 100755 --- a/mise-tasks/indri-services-check +++ b/mise-tasks/indri-services-check @@ -91,19 +91,17 @@ check_service "blumeops-pg" "kubectl --context=minikube-indri -n databases get p echo "" echo "ArgoCD app sync status:" -printf "%-20s %-12s %-12s %s\n" "NAME" "SYNC" "HEALTH" "REVISION" -while read -r name sync health rev; do - # Truncate revision to first 7 chars - short_rev="${rev:0:7}" +printf "%-20s %-12s %-12s %s\n" "NAME" "SYNC" "HEALTH" "TARGET" +while read -r name sync health target; do if [[ "$sync" == "Synced" ]]; then - printf "%-20s ${GREEN}%-12s${NC} %-12s %s\n" "$name" "$sync" "$health" "$short_rev" + printf "%-20s ${GREEN}%-12s${NC} %-12s %s\n" "$name" "$sync" "$health" "$target" elif [[ "$sync" == "OutOfSync" ]]; then - printf "%-20s ${RED}%-12s${NC} %-12s %s\n" "$name" "$sync" "$health" "$short_rev" + printf "%-20s ${RED}%-12s${NC} %-12s %s\n" "$name" "$sync" "$health" "$target" FAILED=1 else - printf "%-20s %-12s %-12s %s\n" "$name" "$sync" "$health" "$short_rev" + printf "%-20s %-12s %-12s %s\n" "$name" "$sync" "$health" "$target" fi -done < <(kubectl --context=minikube-indri get applications -n argocd --no-headers -o custom-columns='NAME:.metadata.name,SYNC:.status.sync.status,HEALTH:.status.health.status,REVISION:.status.sync.revision' 2>/dev/null) +done < <(kubectl --context=minikube-indri get 
applications -n argocd --no-headers -o custom-columns='NAME:.metadata.name,SYNC:.status.sync.status,HEALTH:.status.health.status,TARGET:.spec.source.targetRevision' 2>/dev/null) echo "" if [ $FAILED -eq 0 ]; then -- 2.50.1 (Apple Git-155) From 9c7cdf24817b69454b05a54c7b6bd58b9ac2de17 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 22 Jan 2026 11:43:46 -0800 Subject: [PATCH 10/11] Fix KUBECTL variable definition in minikube-metrics script The previous replace_all edit corrupted the variable definition from "kubectl" to "$KUBECTL", causing an unbound variable error. Co-Authored-By: Claude Opus 4.5 --- .../templates/minikube-metrics.sh.j2 | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/ansible/roles/minikube_metrics/templates/minikube-metrics.sh.j2 b/ansible/roles/minikube_metrics/templates/minikube-metrics.sh.j2 index 447c5a5..68521d6 100644 --- a/ansible/roles/minikube_metrics/templates/minikube-metrics.sh.j2 +++ b/ansible/roles/minikube_metrics/templates/minikube-metrics.sh.j2 @@ -4,6 +4,10 @@ set -euo pipefail +# Use absolute paths for LaunchAgent compatibility +MINIKUBE="/opt/homebrew/bin/minikube" +KUBECTL="/opt/homebrew/bin/kubectl" + OUTPUT_FILE="{{ minikube_metrics_dir }}/minikube.prom" TEMP_FILE="${OUTPUT_FILE}.tmp" @@ -22,7 +26,7 @@ cat > "$TEMP_FILE" << 'HEADER' HEADER # Check if minikube is running -if minikube status --format='{% raw %}{{.Host}}{% endraw %}' 2>/dev/null | grep -q "Running"; then +if $MINIKUBE status --format='{% raw %}{{.Host}}{% endraw %}' 2>/dev/null | grep -q "Running"; then echo "minikube_up 1" >> "$TEMP_FILE" else echo "minikube_up 0" >> "$TEMP_FILE" @@ -35,22 +39,22 @@ else fi # Check API server health -if kubectl get --raw /healthz >/dev/null 2>&1; then +if $KUBECTL get --raw /healthz >/dev/null 2>&1; then echo "minikube_apiserver_up 1" >> "$TEMP_FILE" else echo "minikube_apiserver_up 0" >> "$TEMP_FILE" fi # Get node count -NODE_COUNT=$(kubectl get nodes --no-headers 2>/dev/null | wc -l | tr 
-d ' ') +NODE_COUNT=$($KUBECTL get nodes --no-headers 2>/dev/null | wc -l | tr -d ' ') echo "minikube_node_count ${NODE_COUNT:-0}" >> "$TEMP_FILE" # Get pod count (all namespaces) -POD_COUNT=$(kubectl get pods -A --no-headers 2>/dev/null | wc -l | tr -d ' ') +POD_COUNT=$($KUBECTL get pods -A --no-headers 2>/dev/null | wc -l | tr -d ' ') echo "minikube_pod_count ${POD_COUNT:-0}" >> "$TEMP_FILE" # Get namespace count -NS_COUNT=$(kubectl get namespaces --no-headers 2>/dev/null | wc -l | tr -d ' ') +NS_COUNT=$($KUBECTL get namespaces --no-headers 2>/dev/null | wc -l | tr -d ' ') echo "minikube_namespace_count ${NS_COUNT:-0}" >> "$TEMP_FILE" # Atomic move -- 2.50.1 (Apple Git-155) From bd8ac77d676f0712cb3dfe6235f2f464574eef91 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 22 Jan 2026 12:04:57 -0800 Subject: [PATCH 11/11] Fix minikube-metrics LaunchAgent environment Add HOME and PATH environment variables to the LaunchAgent plist. Minikube needs HOME to find its config files (~/.minikube/) and PATH to find docker for status checks. 
Co-Authored-By: Claude Opus 4.5 --- ansible/roles/minikube_metrics/defaults/main.yml | 1 + .../minikube_metrics/templates/minikube-metrics.plist.j2 | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/ansible/roles/minikube_metrics/defaults/main.yml b/ansible/roles/minikube_metrics/defaults/main.yml index 68fd672..91ae59c 100644 --- a/ansible/roles/minikube_metrics/defaults/main.yml +++ b/ansible/roles/minikube_metrics/defaults/main.yml @@ -3,3 +3,4 @@ minikube_metrics_dir: /opt/homebrew/var/node_exporter/textfile minikube_metrics_script: /Users/erichblume/bin/minikube-metrics minikube_metrics_interval: 60 # seconds between metric collection minikube_metrics_log_dir: /opt/homebrew/var/log +minikube_metrics_user_home: /Users/erichblume diff --git a/ansible/roles/minikube_metrics/templates/minikube-metrics.plist.j2 b/ansible/roles/minikube_metrics/templates/minikube-metrics.plist.j2 index 4e751d7..fe2198b 100644 --- a/ansible/roles/minikube_metrics/templates/minikube-metrics.plist.j2 +++ b/ansible/roles/minikube_metrics/templates/minikube-metrics.plist.j2 @@ -5,6 +5,13 @@ Label mcquack.eblume.minikube-metrics + EnvironmentVariables + + HOME + {{ minikube_metrics_user_home }} + PATH + /opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin + ProgramArguments {{ minikube_metrics_script }} -- 2.50.1 (Apple Git-155)