Migrate observability stack to Kubernetes (#42)

Note: the name of this branch was chosen before the scope widened to encompass the entire observability stack.

Summary

  - Fix Grafana data source URLs (docker driver uses host.minikube.internal, not host.containers.internal)
  - Migrate Prometheus and Loki from indri to Kubernetes with Tailscale Ingresses
  - Expose CNPG PostgreSQL metrics via Tailscale and update dashboard to use cnpg_* metrics
  - Update Alloy to push metrics/logs to k8s endpoints (prometheus.tail8d86e.ts.net, loki.tail8d86e.ts.net)
  - Add ACL rule for port 9187 (CNPG metrics)
  - Delete obsolete ansible roles for prometheus and loki

Changes

  - argocd/manifests/prometheus/ - New Prometheus StatefulSet with 20Gi PVC and Tailscale Ingress
  - argocd/manifests/loki/ - New Loki StatefulSet with 20Gi PVC and Tailscale Ingress
  - argocd/apps/prometheus.yaml, argocd/apps/loki.yaml - ArgoCD Applications
  - argocd/manifests/grafana/values.yaml - Data sources now use k8s internal DNS
  - argocd/manifests/databases/service-metrics-tailscale.yaml - CNPG metrics endpoint
  - argocd/manifests/grafana-config/dashboards/configmap-postgresql.yaml - Updated to cnpg_* metrics
  - ansible/roles/alloy/defaults/main.yml - Push to k8s Tailscale endpoints
  - pulumi/policy.hujson - ACL for port 9187
  - Deleted ansible/roles/prometheus/ and ansible/roles/loki/

Deployment and Testing

  - Stop prometheus and loki on indri
  - Sync ArgoCD apps (apps, prometheus, loki, grafana)
  - Run `mise run provision-indri -- --tags alloy` to redeploy Alloy with the new endpoints
  - Verify Grafana dashboards show data

🤖 Generated with https://claude.ai/claude-code

Reviewed-on: https://forge.tail8d86e.ts.net/eblume/blumeops/pulls/42
This commit is contained in:
Erich Blume 2026-01-22 12:06:02 -08:00
commit 17023085cb
36 changed files with 569 additions and 270 deletions

17
argocd/apps/loki.yaml Normal file
View file

@ -0,0 +1,17 @@
# ArgoCD Application for Loki.
# Deploys the manifests under argocd/manifests/loki (branch: main) into the
# "monitoring" namespace of the in-cluster API server.
# No `automated` sync policy is set, so syncs are triggered manually.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: loki
  namespace: argocd
spec:
  project: default
  source:
    repoURL: ssh://forgejo@indri.tail8d86e.ts.net:2200/eblume/blumeops.git
    targetRevision: main
    path: argocd/manifests/loki
  destination:
    server: https://kubernetes.default.svc
    namespace: monitoring
  syncPolicy:
    syncOptions:
      # Create the destination namespace on sync if it does not exist yet.
      - CreateNamespace=true

View file

@ -0,0 +1,17 @@
# ArgoCD Application for Prometheus.
# Deploys the manifests under argocd/manifests/prometheus (branch: main) into
# the "monitoring" namespace of the in-cluster API server.
# No `automated` sync policy is set, so syncs are triggered manually.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: prometheus
  namespace: argocd
spec:
  project: default
  source:
    repoURL: ssh://forgejo@indri.tail8d86e.ts.net:2200/eblume/blumeops.git
    targetRevision: main
    path: argocd/manifests/prometheus
  destination:
    server: https://kubernetes.default.svc
    namespace: monitoring
  syncPolicy:
    syncOptions:
      # Create the destination namespace on sync if it does not exist yet.
      - CreateNamespace=true

View file

@ -6,3 +6,4 @@ namespace: databases
resources:
- blumeops-pg.yaml
- service-tailscale.yaml
- service-metrics-tailscale.yaml

View file

@ -0,0 +1,22 @@
# Tailscale LoadBalancer for CNPG metrics access
# Exposes the CNPG instance metrics endpoint (cnpg_* metrics) on port 9187
# Canonical hostname: cnpg-metrics.tail8d86e.ts.net
apiVersion: v1
kind: Service
metadata:
  name: blumeops-pg-metrics-tailscale
  namespace: databases
  annotations:
    tailscale.com/hostname: "cnpg-metrics"
    tailscale.com/proxy-class: "default"
spec:
  type: LoadBalancer
  loadBalancerClass: tailscale
  # Only the primary instance of the blumeops-pg cluster is selected.
  # NOTE(review): CloudNativePG documents `cnpg.io/instanceRole` as the
  # successor to the `role` label - confirm against the deployed operator
  # version before switching.
  selector:
    cnpg.io/cluster: blumeops-pg
    role: primary
  ports:
    - name: metrics
      port: 9187
      targetPort: 9187
      protocol: TCP

View file

@ -54,7 +54,7 @@ data:
},
"targets": [
{
"expr": "pg_up",
"expr": "cnpg_collector_up",
"refId": "A"
}
],
@ -95,7 +95,7 @@ data:
},
"targets": [
{
"expr": "pg_stat_activity_count{state=\"active\"}",
"expr": "cnpg_backends_total{state=\"active\"}",
"refId": "A"
}
],
@ -136,7 +136,7 @@ data:
},
"targets": [
{
"expr": "sum(pg_stat_activity_count)",
"expr": "sum(cnpg_backends_total)",
"refId": "A"
}
],
@ -177,7 +177,7 @@ data:
},
"targets": [
{
"expr": "sum(pg_database_size_bytes)",
"expr": "sum(cnpg_pg_database_size_bytes)",
"refId": "A"
}
],
@ -249,7 +249,7 @@ data:
},
"targets": [
{
"expr": "pg_stat_activity_count",
"expr": "cnpg_backends_total",
"legendFormat": "{{state}}",
"refId": "A"
}
@ -322,7 +322,7 @@ data:
},
"targets": [
{
"expr": "pg_database_size_bytes{datname!~\"template.*\"}",
"expr": "cnpg_pg_database_size_bytes{datname!~\"template.*\"}",
"legendFormat": "{{datname}}",
"refId": "A"
}
@ -395,22 +395,22 @@ data:
},
"targets": [
{
"expr": "rate(pg_stat_database_tup_fetched{datname!~\"template.*\"}[5m])",
"expr": "rate(cnpg_pg_stat_database_tup_fetched{datname!~\"template.*\"}[5m])",
"legendFormat": "{{datname}} fetched",
"refId": "A"
},
{
"expr": "rate(pg_stat_database_tup_inserted{datname!~\"template.*\"}[5m])",
"expr": "rate(cnpg_pg_stat_database_tup_inserted{datname!~\"template.*\"}[5m])",
"legendFormat": "{{datname}} inserted",
"refId": "B"
},
{
"expr": "rate(pg_stat_database_tup_updated{datname!~\"template.*\"}[5m])",
"expr": "rate(cnpg_pg_stat_database_tup_updated{datname!~\"template.*\"}[5m])",
"legendFormat": "{{datname}} updated",
"refId": "C"
},
{
"expr": "rate(pg_stat_database_tup_deleted{datname!~\"template.*\"}[5m])",
"expr": "rate(cnpg_pg_stat_database_tup_deleted{datname!~\"template.*\"}[5m])",
"legendFormat": "{{datname}} deleted",
"refId": "D"
}
@ -483,12 +483,12 @@ data:
},
"targets": [
{
"expr": "rate(pg_stat_database_xact_commit{datname!~\"template.*\"}[5m])",
"expr": "rate(cnpg_pg_stat_database_xact_commit{datname!~\"template.*\"}[5m])",
"legendFormat": "{{datname}} commits",
"refId": "A"
},
{
"expr": "rate(pg_stat_database_xact_rollback{datname!~\"template.*\"}[5m])",
"expr": "rate(cnpg_pg_stat_database_xact_rollback{datname!~\"template.*\"}[5m])",
"legendFormat": "{{datname}} rollbacks",
"refId": "B"
}
@ -561,7 +561,7 @@ data:
},
"targets": [
{
"expr": "pg_database_xid_age_xid_age",
"expr": "cnpg_pg_database_xid_age",
"legendFormat": "{{datname}}",
"refId": "A"
}

View file

@ -24,8 +24,7 @@ grafana.ini:
check_for_updates: false
reporting_enabled: false
# Datasources - point to indri services via podman host gateway
# host.containers.internal resolves to the podman host (indri) from inside minikube
# Datasources - point to k8s-internal services
datasources:
datasources.yaml:
apiVersion: 1
@ -35,7 +34,7 @@ datasources:
access: proxy
orgId: 1
uid: prometheus
url: http://host.containers.internal:9090
url: http://prometheus.monitoring.svc.cluster.local:9090
isDefault: true
editable: false
- name: Loki
@ -43,7 +42,7 @@ datasources:
access: proxy
orgId: 1
uid: loki
url: http://host.containers.internal:3100
url: http://loki.monitoring.svc.cluster.local:3100
editable: false
# Dashboard provisioning - sidecar watches for ConfigMaps with label

View file

@ -0,0 +1,58 @@
# Loki configuration, mounted into the Loki pod at /etc/loki.
# Single-instance setup: in-memory ring, replication factor 1, filesystem
# storage under /loki, TSDB index (schema v13), and compactor-driven
# retention of 744h.
apiVersion: v1
kind: ConfigMap
metadata:
  name: loki-config
  namespace: monitoring
data:
  loki-config.yaml: |
    auth_enabled: false
    server:
      http_listen_port: 3100
      http_listen_address: 0.0.0.0
      grpc_listen_port: 9096
    common:
      instance_addr: 127.0.0.1
      path_prefix: /loki
      storage:
        filesystem:
          chunks_directory: /loki/chunks
          rules_directory: /loki/rules
      replication_factor: 1
      ring:
        kvstore:
          store: inmemory
    query_range:
      results_cache:
        cache:
          embedded_cache:
            enabled: true
            max_size_mb: 100
    schema_config:
      configs:
        - from: 2024-01-01
          store: tsdb
          object_store: filesystem
          schema: v13
          index:
            prefix: index_
            period: 24h
    storage_config:
      tsdb_shipper:
        active_index_directory: /loki/tsdb-index
        cache_location: /loki/tsdb-cache
    limits_config:
      retention_period: 744h # 31 days
    compactor:
      working_directory: /loki/compactor
      compaction_interval: 10m
      retention_enabled: true
      retention_delete_delay: 2h
      retention_delete_worker_count: 150
      delete_request_store: filesystem

View file

@ -0,0 +1,25 @@
# Tailscale Ingress for Loki
# Allows Alloy on indri to push logs
# Routes every path to the in-cluster "loki" Service on port 3100.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: loki-tailscale
  namespace: monitoring
  annotations:
    # Funnel (public internet exposure) is explicitly not requested.
    tailscale.com/funnel: "false"
spec:
  ingressClassName: tailscale
  rules:
    # The rule host matches the TLS host below.
    - host: loki
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: loki
                port:
                  number: 3100
  tls:
    - hosts:
        - loki

View file

@ -0,0 +1,10 @@
# Kustomization for the Loki manifests; all resources are placed in the
# "monitoring" namespace.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: monitoring
resources:
  - configmap.yaml
  - statefulset.yaml
  - service.yaml
  - ingress-tailscale.yaml

View file

@ -0,0 +1,16 @@
# ClusterIP Service for Loki.
# Selects pods labelled app=loki and exposes HTTP (3100) and gRPC (9096).
apiVersion: v1
kind: Service
metadata:
  name: loki
  namespace: monitoring
spec:
  type: ClusterIP
  selector:
    app: loki
  ports:
    - name: http
      port: 3100
      targetPort: 3100
    - name: grpc
      port: 9096
      targetPort: 9096

View file

@ -0,0 +1,66 @@
# Single-replica Loki StatefulSet.
# Configuration comes from the "loki-config" ConfigMap mounted at /etc/loki;
# data is persisted under /loki on a 20Gi volume claim.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: loki
  namespace: monitoring
spec:
  serviceName: loki
  replicas: 1
  selector:
    matchLabels:
      app: loki
  template:
    metadata:
      labels:
        app: loki
    spec:
      # Run as non-root UID 10001; fsGroup 10001 is applied to mounted volumes.
      securityContext:
        fsGroup: 10001
        runAsNonRoot: true
        runAsUser: 10001
      containers:
        - name: loki
          image: grafana/loki:3.3.2
          args:
            - -config.file=/etc/loki/loki-config.yaml
          ports:
            - name: http
              containerPort: 3100
            - name: grpc
              containerPort: 9096
          volumeMounts:
            - name: config
              mountPath: /etc/loki
            - name: data
              mountPath: /loki
          resources:
            requests:
              memory: "256Mi"
              cpu: "100m"
            limits:
              memory: "1Gi"
              cpu: "500m"
          # Both probes hit /ready; liveness waits longer before its first check.
          livenessProbe:
            httpGet:
              path: /ready
              port: 3100
            initialDelaySeconds: 45
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /ready
              port: 3100
            initialDelaySeconds: 10
            periodSeconds: 5
      volumes:
        - name: config
          configMap:
            name: loki-config
  volumeClaimTemplates:
    - metadata:
        name: data
      spec:
        accessModes: ["ReadWriteOnce"]
        resources:
          requests:
            storage: 20Gi

View file

@ -0,0 +1,38 @@
# Prometheus configuration, mounted into the Prometheus pod at /etc/prometheus.
# Defines four static scrape jobs: the Sifaka NAS node-exporter, CNPG
# PostgreSQL, Prometheus itself, and Loki.
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: monitoring
data:
  prometheus.yml: |
    global:
      scrape_interval: 15s
      evaluation_interval: 15s
    # Indri system metrics are pushed via Alloy remote_write
    # K8s services are scraped directly
    scrape_configs:
      # Sifaka NAS node-exporter (via LAN - Docker NATs through indri)
      # Using LAN IP since k8s pods can reach LAN via Docker NAT (same as NFS mounts)
      # If IP changes, fallback: create Tailscale egress in tailscale-operator/egress-sifaka.yaml
      - job_name: "node-exporter-sifaka"
        static_configs:
          - targets: ["192.168.1.203:9100"]
      # CNPG PostgreSQL metrics (k8s internal)
      - job_name: "cnpg-postgres"
        static_configs:
          - targets: ["blumeops-pg-metrics-tailscale.databases.svc.cluster.local:9187"]
            labels:
              instance: "blumeops-pg"
      # Prometheus self-monitoring
      - job_name: "prometheus"
        static_configs:
          - targets: ["localhost:9090"]
      # Loki metrics
      - job_name: "loki"
        static_configs:
          - targets: ["loki.monitoring.svc.cluster.local:3100"]

View file

@ -0,0 +1,25 @@
# Tailscale Ingress for Prometheus
# Allows Alloy on indri to push metrics via remote_write
# Routes every path to the in-cluster "prometheus" Service on port 9090.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: prometheus-tailscale
  namespace: monitoring
  annotations:
    # Funnel (public internet exposure) is explicitly not requested.
    tailscale.com/funnel: "false"
spec:
  ingressClassName: tailscale
  rules:
    # The rule host matches the TLS host below.
    - host: prometheus
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: prometheus
                port:
                  number: 9090
  tls:
    - hosts:
        - prometheus

View file

@ -0,0 +1,10 @@
# Kustomization for the Prometheus manifests; all resources are placed in the
# "monitoring" namespace.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: monitoring
resources:
  - configmap.yaml
  - statefulset.yaml
  - service.yaml
  - ingress-tailscale.yaml

View file

@ -0,0 +1,13 @@
# ClusterIP Service for Prometheus.
# Selects pods labelled app=prometheus and exposes HTTP on port 9090.
apiVersion: v1
kind: Service
metadata:
  name: prometheus
  namespace: monitoring
spec:
  type: ClusterIP
  selector:
    app: prometheus
  ports:
    - name: http
      port: 9090
      targetPort: 9090

View file

@ -0,0 +1,68 @@
# Single-replica Prometheus StatefulSet.
# Configuration comes from the "prometheus-config" ConfigMap mounted at
# /etc/prometheus; the TSDB lives under /prometheus on a 20Gi volume claim.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: prometheus
  namespace: monitoring
spec:
  serviceName: prometheus
  replicas: 1
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      # Run as non-root UID 65534; fsGroup 65534 is applied to mounted volumes.
      securityContext:
        fsGroup: 65534
        runAsNonRoot: true
        runAsUser: 65534
      containers:
        - name: prometheus
          image: prom/prometheus:v3.2.1
          args:
            - --config.file=/etc/prometheus/prometheus.yml
            - --storage.tsdb.path=/prometheus
            - --storage.tsdb.retention.time=15d
            # Accept remote_write pushes (used by Alloy on indri).
            - --web.enable-remote-write-receiver
            # Enables the HTTP lifecycle endpoints (reload/quit).
            - --web.enable-lifecycle
          ports:
            - name: http
              containerPort: 9090
          volumeMounts:
            - name: config
              mountPath: /etc/prometheus
            - name: data
              mountPath: /prometheus
          resources:
            requests:
              memory: "256Mi"
              cpu: "100m"
            limits:
              memory: "1Gi"
              cpu: "500m"
          livenessProbe:
            httpGet:
              path: /-/healthy
              port: 9090
            initialDelaySeconds: 30
            periodSeconds: 15
          readinessProbe:
            httpGet:
              path: /-/ready
              port: 9090
            initialDelaySeconds: 5
            periodSeconds: 5
      volumes:
        - name: config
          configMap:
            name: prometheus-config
  volumeClaimTemplates:
    - metadata:
        name: data
      spec:
        accessModes: ["ReadWriteOnce"]
        resources:
          requests:
            storage: 20Gi