Migrate observability stack to Kubernetes (#42)
Note: the name of this branch was chosen before the scope widened to encompass the entire observability stack.

Summary
- Fix Grafana data source URLs (docker driver uses host.minikube.internal, not host.containers.internal)
- Migrate Prometheus and Loki from indri to Kubernetes with Tailscale Ingresses
- Expose CNPG PostgreSQL metrics via Tailscale and update dashboard to use cnpg_* metrics
- Update Alloy to push metrics/logs to k8s endpoints (prometheus.tail8d86e.ts.net, loki.tail8d86e.ts.net)
- Add ACL rule for port 9187 (CNPG metrics)
- Delete obsolete ansible roles for prometheus and loki

Changes
- argocd/manifests/prometheus/ - New Prometheus StatefulSet with 20Gi PVC and Tailscale Ingress
- argocd/manifests/loki/ - New Loki StatefulSet with 20Gi PVC and Tailscale Ingress
- argocd/apps/prometheus.yaml, argocd/apps/loki.yaml - ArgoCD Applications
- argocd/manifests/grafana/values.yaml - Data sources now use k8s internal DNS
- argocd/manifests/databases/service-metrics-tailscale.yaml - CNPG metrics endpoint
- argocd/manifests/grafana-config/dashboards/configmap-postgresql.yaml - Updated to cnpg_* metrics
- ansible/roles/alloy/defaults/main.yml - Push to k8s Tailscale endpoints
- pulumi/policy.hujson - ACL for port 9187
- Deleted ansible/roles/prometheus/ and ansible/roles/loki/

Deployment and Testing
- Stop prometheus and loki on indri
- Sync ArgoCD apps (apps, prometheus, loki, grafana)
- Run mise run provision-indri -- --tags alloy
- Verify Grafana dashboards show data

🤖 Generated with https://claude.ai/claude-code

Reviewed-on: https://forge.tail8d86e.ts.net/eblume/blumeops/pulls/42
This commit is contained in:
parent
5a829e0afd
commit
17023085cb
36 changed files with 569 additions and 270 deletions
17
argocd/apps/loki.yaml
Normal file
17
argocd/apps/loki.yaml
Normal file
|
|
@ -0,0 +1,17 @@
|
|||
---
# Argo CD Application for Loki.
# Renders the manifests under argocd/manifests/loki (branch: main) into the
# "monitoring" namespace of the in-cluster API server.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  namespace: argocd
  name: loki
spec:
  project: default
  destination:
    namespace: monitoring
    server: https://kubernetes.default.svc
  source:
    path: argocd/manifests/loki
    targetRevision: main
    repoURL: ssh://forgejo@indri.tail8d86e.ts.net:2200/eblume/blumeops.git
  syncPolicy:
    # Only sync options are configured here; no automated sync block is set.
    syncOptions:
      # Let Argo CD create the destination namespace when it does not exist.
      - CreateNamespace=true
|
||||
17
argocd/apps/prometheus.yaml
Normal file
17
argocd/apps/prometheus.yaml
Normal file
|
|
@ -0,0 +1,17 @@
|
|||
---
# Argo CD Application for Prometheus.
# Renders the manifests under argocd/manifests/prometheus (branch: main) into
# the "monitoring" namespace of the in-cluster API server.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  namespace: argocd
  name: prometheus
spec:
  project: default
  destination:
    namespace: monitoring
    server: https://kubernetes.default.svc
  source:
    path: argocd/manifests/prometheus
    targetRevision: main
    repoURL: ssh://forgejo@indri.tail8d86e.ts.net:2200/eblume/blumeops.git
  syncPolicy:
    # Only sync options are configured here; no automated sync block is set.
    syncOptions:
      # Let Argo CD create the destination namespace when it does not exist.
      - CreateNamespace=true
|
||||
|
|
@ -6,3 +6,4 @@ namespace: databases
|
|||
resources:
|
||||
- blumeops-pg.yaml
|
||||
- service-tailscale.yaml
|
||||
- service-metrics-tailscale.yaml
|
||||
|
|
|
|||
22
argocd/manifests/databases/service-metrics-tailscale.yaml
Normal file
22
argocd/manifests/databases/service-metrics-tailscale.yaml
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
---
# Tailscale LoadBalancer for CNPG metrics access
# Exposes native postgres_exporter metrics on port 9187
# Canonical hostname: cnpg-metrics.tail8d86e.ts.net
apiVersion: v1
kind: Service
metadata:
  name: blumeops-pg-metrics-tailscale
  namespace: databases
  annotations:
    tailscale.com/hostname: "cnpg-metrics"
    tailscale.com/proxy-class: "default"
spec:
  type: LoadBalancer
  loadBalancerClass: tailscale
  selector:
    cnpg.io/cluster: blumeops-pg
    # Select only the current primary instance. The bare `role` pod label is
    # deprecated by CloudNativePG; `cnpg.io/instanceRole` is its replacement.
    cnpg.io/instanceRole: primary
  ports:
    - name: metrics
      port: 9187
      targetPort: 9187
      protocol: TCP
|
||||
|
|
@ -54,7 +54,7 @@ data:
|
|||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "pg_up",
|
||||
"expr": "cnpg_collector_up",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
|
|
@ -95,7 +95,7 @@ data:
|
|||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "pg_stat_activity_count{state=\"active\"}",
|
||||
"expr": "cnpg_backends_total{state=\"active\"}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
|
|
@ -136,7 +136,7 @@ data:
|
|||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(pg_stat_activity_count)",
|
||||
"expr": "sum(cnpg_backends_total)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
|
|
@ -177,7 +177,7 @@ data:
|
|||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(pg_database_size_bytes)",
|
||||
"expr": "sum(cnpg_pg_database_size_bytes)",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
|
|
@ -249,7 +249,7 @@ data:
|
|||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "pg_stat_activity_count",
|
||||
"expr": "cnpg_backends_total",
|
||||
"legendFormat": "{{state}}",
|
||||
"refId": "A"
|
||||
}
|
||||
|
|
@ -322,7 +322,7 @@ data:
|
|||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "pg_database_size_bytes{datname!~\"template.*\"}",
|
||||
"expr": "cnpg_pg_database_size_bytes{datname!~\"template.*\"}",
|
||||
"legendFormat": "{{datname}}",
|
||||
"refId": "A"
|
||||
}
|
||||
|
|
@ -395,22 +395,22 @@ data:
|
|||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(pg_stat_database_tup_fetched{datname!~\"template.*\"}[5m])",
|
||||
"expr": "rate(cnpg_pg_stat_database_tup_fetched{datname!~\"template.*\"}[5m])",
|
||||
"legendFormat": "{{datname}} fetched",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "rate(pg_stat_database_tup_inserted{datname!~\"template.*\"}[5m])",
|
||||
"expr": "rate(cnpg_pg_stat_database_tup_inserted{datname!~\"template.*\"}[5m])",
|
||||
"legendFormat": "{{datname}} inserted",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"expr": "rate(pg_stat_database_tup_updated{datname!~\"template.*\"}[5m])",
|
||||
"expr": "rate(cnpg_pg_stat_database_tup_updated{datname!~\"template.*\"}[5m])",
|
||||
"legendFormat": "{{datname}} updated",
|
||||
"refId": "C"
|
||||
},
|
||||
{
|
||||
"expr": "rate(pg_stat_database_tup_deleted{datname!~\"template.*\"}[5m])",
|
||||
"expr": "rate(cnpg_pg_stat_database_tup_deleted{datname!~\"template.*\"}[5m])",
|
||||
"legendFormat": "{{datname}} deleted",
|
||||
"refId": "D"
|
||||
}
|
||||
|
|
@ -483,12 +483,12 @@ data:
|
|||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "rate(pg_stat_database_xact_commit{datname!~\"template.*\"}[5m])",
|
||||
"expr": "rate(cnpg_pg_stat_database_xact_commit{datname!~\"template.*\"}[5m])",
|
||||
"legendFormat": "{{datname}} commits",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"expr": "rate(pg_stat_database_xact_rollback{datname!~\"template.*\"}[5m])",
|
||||
"expr": "rate(cnpg_pg_stat_database_xact_rollback{datname!~\"template.*\"}[5m])",
|
||||
"legendFormat": "{{datname}} rollbacks",
|
||||
"refId": "B"
|
||||
}
|
||||
|
|
@ -561,7 +561,7 @@ data:
|
|||
},
|
||||
"targets": [
|
||||
{
|
||||
"expr": "pg_database_xid_age_xid_age",
|
||||
"expr": "cnpg_pg_database_xid_age",
|
||||
"legendFormat": "{{datname}}",
|
||||
"refId": "A"
|
||||
}
|
||||
|
|
|
|||
|
|
@ -24,8 +24,7 @@ grafana.ini:
|
|||
check_for_updates: false
|
||||
reporting_enabled: false
|
||||
|
||||
# Datasources - point to indri services via podman host gateway
|
||||
# host.containers.internal resolves to the podman host (indri) from inside minikube
|
||||
# Datasources - point to k8s-internal services
|
||||
datasources:
|
||||
datasources.yaml:
|
||||
apiVersion: 1
|
||||
|
|
@ -35,7 +34,7 @@ datasources:
|
|||
access: proxy
|
||||
orgId: 1
|
||||
uid: prometheus
|
||||
url: http://host.containers.internal:9090
|
||||
url: http://prometheus.monitoring.svc.cluster.local:9090
|
||||
isDefault: true
|
||||
editable: false
|
||||
- name: Loki
|
||||
|
|
@ -43,7 +42,7 @@ datasources:
|
|||
access: proxy
|
||||
orgId: 1
|
||||
uid: loki
|
||||
url: http://host.containers.internal:3100
|
||||
url: http://loki.monitoring.svc.cluster.local:3100
|
||||
editable: false
|
||||
|
||||
# Dashboard provisioning - sidecar watches for ConfigMaps with label
|
||||
|
|
|
|||
58
argocd/manifests/loki/configmap.yaml
Normal file
58
argocd/manifests/loki/configmap.yaml
Normal file
|
|
@ -0,0 +1,58 @@
|
|||
---
# Loki configuration, mounted by the Loki StatefulSet at /etc/loki.
# Filesystem-backed storage under /loki, TSDB index (schema v13), and
# compactor-driven retention of 744h.
apiVersion: v1
kind: ConfigMap
metadata:
  namespace: monitoring
  name: loki-config
data:
  loki-config.yaml: |
    auth_enabled: false

    server:
      http_listen_port: 3100
      http_listen_address: 0.0.0.0
      grpc_listen_port: 9096

    common:
      instance_addr: 127.0.0.1
      path_prefix: /loki
      storage:
        filesystem:
          chunks_directory: /loki/chunks
          rules_directory: /loki/rules
      replication_factor: 1
      ring:
        kvstore:
          store: inmemory

    query_range:
      results_cache:
        cache:
          embedded_cache:
            enabled: true
            max_size_mb: 100

    schema_config:
      configs:
        - from: 2024-01-01
          store: tsdb
          object_store: filesystem
          schema: v13
          index:
            prefix: index_
            period: 24h

    storage_config:
      tsdb_shipper:
        active_index_directory: /loki/tsdb-index
        cache_location: /loki/tsdb-cache

    limits_config:
      retention_period: 744h # 31 days

    compactor:
      working_directory: /loki/compactor
      compaction_interval: 10m
      retention_enabled: true
      retention_delete_delay: 2h
      retention_delete_worker_count: 150
      delete_request_store: filesystem
|
||||
25
argocd/manifests/loki/ingress-tailscale.yaml
Normal file
25
argocd/manifests/loki/ingress-tailscale.yaml
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
---
# Tailscale Ingress for Loki
# Allows Alloy on indri to push logs
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  namespace: monitoring
  name: loki-tailscale
  annotations:
    # Funnel is explicitly disabled for this Ingress.
    tailscale.com/funnel: "false"
spec:
  ingressClassName: tailscale
  tls:
    - hosts:
        - loki
  rules:
    - host: loki
      http:
        paths:
          # Everything under / is routed to the loki Service on port 3100.
          - pathType: Prefix
            path: /
            backend:
              service:
                name: loki
                port:
                  number: 3100
|
||||
10
argocd/manifests/loki/kustomization.yaml
Normal file
10
argocd/manifests/loki/kustomization.yaml
Normal file
|
|
@ -0,0 +1,10 @@
|
|||
---
# Kustomization for the Loki manifests; all resources are placed in the
# "monitoring" namespace.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

namespace: monitoring

resources:
  - configmap.yaml
  - statefulset.yaml
  - service.yaml
  - ingress-tailscale.yaml
|
||||
16
argocd/manifests/loki/service.yaml
Normal file
16
argocd/manifests/loki/service.yaml
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
---
# ClusterIP Service in front of the pods labelled app=loki.
# Publishes the HTTP API (3100) and the gRPC port (9096).
apiVersion: v1
kind: Service
metadata:
  namespace: monitoring
  name: loki
spec:
  type: ClusterIP
  selector:
    app: loki
  ports:
    - name: http
      targetPort: 3100
      port: 3100
    - name: grpc
      targetPort: 9096
      port: 9096
|
||||
66
argocd/manifests/loki/statefulset.yaml
Normal file
66
argocd/manifests/loki/statefulset.yaml
Normal file
|
|
@ -0,0 +1,66 @@
|
|||
---
# Single-replica Loki StatefulSet.
# Configuration comes from the loki-config ConfigMap (mounted at /etc/loki);
# data lives on a 20Gi PersistentVolumeClaim mounted at /loki.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: loki
  namespace: monitoring
spec:
  serviceName: loki
  replicas: 1
  selector:
    matchLabels:
      app: loki
  template:
    metadata:
      labels:
        app: loki
    spec:
      # Run as the non-root UID 10001; fsGroup applies the same GID to the
      # mounted data volume.
      securityContext:
        fsGroup: 10001
        runAsNonRoot: true
        runAsUser: 10001
      containers:
        - name: loki
          image: grafana/loki:3.3.2
          args:
            - -config.file=/etc/loki/loki-config.yaml
          ports:
            - name: http
              containerPort: 3100
            - name: grpc
              containerPort: 9096
          volumeMounts:
            - name: config
              mountPath: /etc/loki
            - name: data
              mountPath: /loki
          resources:
            requests:
              memory: "256Mi"
              cpu: "100m"
            limits:
              memory: "1Gi"
              cpu: "500m"
          # Liveness and readiness both probe /ready. Without a startup probe,
          # a start that takes longer than the liveness budget
          # (45s + 3 failures x 10s) would get the container killed and
          # restarted in a loop. The startup probe holds back the other probes
          # for up to 30 x 10s = 5 minutes while Loki comes up.
          startupProbe:
            httpGet:
              path: /ready
              port: 3100
            periodSeconds: 10
            failureThreshold: 30
          livenessProbe:
            httpGet:
              path: /ready
              port: 3100
            initialDelaySeconds: 45
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /ready
              port: 3100
            initialDelaySeconds: 10
            periodSeconds: 5
      volumes:
        - name: config
          configMap:
            name: loki-config
  volumeClaimTemplates:
    - metadata:
        name: data
      spec:
        accessModes: ["ReadWriteOnce"]
        resources:
          requests:
            storage: 20Gi
|
||||
38
argocd/manifests/prometheus/configmap.yaml
Normal file
38
argocd/manifests/prometheus/configmap.yaml
Normal file
|
|
@ -0,0 +1,38 @@
|
|||
---
# Prometheus configuration, mounted by the Prometheus StatefulSet at
# /etc/prometheus. Defines four static scrape jobs: the Sifaka node-exporter,
# CNPG PostgreSQL, Prometheus itself, and Loki.
apiVersion: v1
kind: ConfigMap
metadata:
  namespace: monitoring
  name: prometheus-config
data:
  prometheus.yml: |
    global:
      scrape_interval: 15s
      evaluation_interval: 15s

    # Indri system metrics are pushed via Alloy remote_write
    # K8s services are scraped directly

    scrape_configs:
      # Sifaka NAS node-exporter (via LAN - Docker NATs through indri)
      # Using LAN IP since k8s pods can reach LAN via Docker NAT (same as NFS mounts)
      # If IP changes, fallback: create Tailscale egress in tailscale-operator/egress-sifaka.yaml
      - job_name: "node-exporter-sifaka"
        static_configs:
          - targets: ["192.168.1.203:9100"]

      # CNPG PostgreSQL metrics (k8s internal)
      - job_name: "cnpg-postgres"
        static_configs:
          - targets: ["blumeops-pg-metrics-tailscale.databases.svc.cluster.local:9187"]
            labels:
              instance: "blumeops-pg"

      # Prometheus self-monitoring
      - job_name: "prometheus"
        static_configs:
          - targets: ["localhost:9090"]

      # Loki metrics
      - job_name: "loki"
        static_configs:
          - targets: ["loki.monitoring.svc.cluster.local:3100"]
|
||||
25
argocd/manifests/prometheus/ingress-tailscale.yaml
Normal file
25
argocd/manifests/prometheus/ingress-tailscale.yaml
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
---
# Tailscale Ingress for Prometheus
# Allows Alloy on indri to push metrics via remote_write
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  namespace: monitoring
  name: prometheus-tailscale
  annotations:
    # Funnel is explicitly disabled for this Ingress.
    tailscale.com/funnel: "false"
spec:
  ingressClassName: tailscale
  tls:
    - hosts:
        - prometheus
  rules:
    - host: prometheus
      http:
        paths:
          # Everything under / is routed to the prometheus Service on port 9090.
          - pathType: Prefix
            path: /
            backend:
              service:
                name: prometheus
                port:
                  number: 9090
|
||||
10
argocd/manifests/prometheus/kustomization.yaml
Normal file
10
argocd/manifests/prometheus/kustomization.yaml
Normal file
|
|
@ -0,0 +1,10 @@
|
|||
---
# Kustomization for the Prometheus manifests; all resources are placed in the
# "monitoring" namespace.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

namespace: monitoring

resources:
  - configmap.yaml
  - statefulset.yaml
  - service.yaml
  - ingress-tailscale.yaml
|
||||
13
argocd/manifests/prometheus/service.yaml
Normal file
13
argocd/manifests/prometheus/service.yaml
Normal file
|
|
@ -0,0 +1,13 @@
|
|||
---
# ClusterIP Service in front of the pods labelled app=prometheus.
# Publishes the HTTP port (9090).
apiVersion: v1
kind: Service
metadata:
  namespace: monitoring
  name: prometheus
spec:
  type: ClusterIP
  selector:
    app: prometheus
  ports:
    - name: http
      targetPort: 9090
      port: 9090
|
||||
68
argocd/manifests/prometheus/statefulset.yaml
Normal file
68
argocd/manifests/prometheus/statefulset.yaml
Normal file
|
|
@ -0,0 +1,68 @@
|
|||
---
# Single-replica Prometheus StatefulSet.
# Configuration comes from the prometheus-config ConfigMap (mounted at
# /etc/prometheus); the TSDB lives on a 20Gi PersistentVolumeClaim mounted at
# /prometheus.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  namespace: monitoring
  name: prometheus
spec:
  replicas: 1
  serviceName: prometheus
  selector:
    matchLabels:
      app: prometheus
  volumeClaimTemplates:
    - metadata:
        name: data
      spec:
        accessModes:
          - ReadWriteOnce
        resources:
          requests:
            storage: 20Gi
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      # Run as the non-root UID 65534; fsGroup applies the same GID to the
      # mounted data volume.
      securityContext:
        runAsNonRoot: true
        runAsUser: 65534
        fsGroup: 65534
      volumes:
        - name: config
          configMap:
            name: prometheus-config
      containers:
        - name: prometheus
          image: prom/prometheus:v3.2.1
          args:
            - --config.file=/etc/prometheus/prometheus.yml
            - --storage.tsdb.path=/prometheus
            # Keep 15 days of samples.
            - --storage.tsdb.retention.time=15d
            # Accept metrics pushed via remote_write.
            - --web.enable-remote-write-receiver
            - --web.enable-lifecycle
          ports:
            - containerPort: 9090
              name: http
          volumeMounts:
            - mountPath: /etc/prometheus
              name: config
            - mountPath: /prometheus
              name: data
          resources:
            limits:
              cpu: 500m
              memory: 1Gi
            requests:
              cpu: 100m
              memory: 256Mi
          readinessProbe:
            periodSeconds: 5
            initialDelaySeconds: 5
            httpGet:
              port: 9090
              path: /-/ready
          livenessProbe:
            periodSeconds: 15
            initialDelaySeconds: 30
            httpGet:
              port: 9090
              path: /-/healthy
|
||||
Loading…
Add table
Add a link
Reference in a new issue