Migrate observability stack to Kubernetes (#42)
Note: the name of this branch was chosen before the scope widened to encompass the entire observability stack.

Summary
- Fix Grafana data source URLs (the docker driver uses host.minikube.internal, not host.containers.internal)
- Migrate Prometheus and Loki from indri to Kubernetes with Tailscale Ingresses
- Expose CNPG PostgreSQL metrics via Tailscale and update the dashboard to use cnpg_* metrics
- Update Alloy to push metrics/logs to the k8s endpoints (prometheus.tail8d86e.ts.net, loki.tail8d86e.ts.net)
- Add an ACL rule for port 9187 (CNPG metrics)
- Delete the obsolete ansible roles for prometheus and loki

Changes
- argocd/manifests/prometheus/ - New Prometheus StatefulSet with 20Gi PVC and Tailscale Ingress
- argocd/manifests/loki/ - New Loki StatefulSet with 20Gi PVC and Tailscale Ingress
- argocd/apps/prometheus.yaml, argocd/apps/loki.yaml - ArgoCD Applications
- argocd/manifests/grafana/values.yaml - Data sources now use k8s internal DNS
- argocd/manifests/databases/service-metrics-tailscale.yaml - CNPG metrics endpoint
- argocd/manifests/grafana-config/dashboards/configmap-postgresql.yaml - Updated to cnpg_* metrics
- ansible/roles/alloy/defaults/main.yml - Push to k8s Tailscale endpoints
- pulumi/policy.hujson - ACL for port 9187
- Deleted ansible/roles/prometheus/ and ansible/roles/loki/

Deployment and Testing
- Stop prometheus and loki on indri
- Sync ArgoCD apps (apps, prometheus, loki, grafana)
- Run `mise run provision-indri -- --tags alloy`
- Verify Grafana dashboards show data

🤖 Generated with https://claude.ai/claude-code

Reviewed-on: https://forge.tail8d86e.ts.net/eblume/blumeops/pulls/42
This commit is contained in:
parent
5a829e0afd
commit
17023085cb
36 changed files with 569 additions and 270 deletions
58
argocd/manifests/loki/configmap.yaml
Normal file
58
argocd/manifests/loki/configmap.yaml
Normal file
|
|
@ -0,0 +1,58 @@
|
|||
---
# ConfigMap carrying the Loki configuration file.
# The single key below is the file name Loki is started with
# (-config.file=/etc/loki/loki-config.yaml in the StatefulSet).
# All on-disk paths in the embedded config live under /loki.
apiVersion: v1
kind: ConfigMap
metadata:
  name: loki-config
  namespace: monitoring
data:
  loki-config.yaml: |
    auth_enabled: false

    server:
      http_listen_port: 3100
      http_listen_address: 0.0.0.0
      grpc_listen_port: 9096

    common:
      instance_addr: 127.0.0.1
      path_prefix: /loki
      storage:
        filesystem:
          chunks_directory: /loki/chunks
          rules_directory: /loki/rules
      replication_factor: 1
      ring:
        kvstore:
          store: inmemory

    query_range:
      results_cache:
        cache:
          embedded_cache:
            enabled: true
            max_size_mb: 100

    schema_config:
      configs:
        - from: 2024-01-01
          store: tsdb
          object_store: filesystem
          schema: v13
          index:
            prefix: index_
            period: 24h

    storage_config:
      tsdb_shipper:
        active_index_directory: /loki/tsdb-index
        cache_location: /loki/tsdb-cache

    limits_config:
      retention_period: 744h  # 31 days

    compactor:
      working_directory: /loki/compactor
      compaction_interval: 10m
      retention_enabled: true
      retention_delete_delay: 2h
      retention_delete_worker_count: 150
      delete_request_store: filesystem
|
||||
25
argocd/manifests/loki/ingress-tailscale.yaml
Normal file
25
argocd/manifests/loki/ingress-tailscale.yaml
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
---
# Tailscale Ingress for Loki.
# Lets Alloy on indri push logs to the in-cluster Loki service.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: loki-tailscale
  namespace: monitoring
  annotations:
    # Funnel explicitly disabled via this annotation.
    tailscale.com/funnel: "false"
spec:
  ingressClassName: tailscale
  rules:
    # The rule host matches the single entry in tls.hosts below.
    - host: loki
      http:
        paths:
          # Every path is forwarded to the loki Service on port 3100.
          - path: /
            pathType: Prefix
            backend:
              service:
                name: loki
                port:
                  number: 3100
  tls:
    - hosts:
        - loki
|
||||
10
argocd/manifests/loki/kustomization.yaml
Normal file
10
argocd/manifests/loki/kustomization.yaml
Normal file
|
|
@ -0,0 +1,10 @@
|
|||
---
# Kustomization bundling the Loki manifests in this directory.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

# Namespace applied to the resources listed below.
namespace: monitoring

resources:
  - configmap.yaml
  - statefulset.yaml
  - service.yaml
  - ingress-tailscale.yaml
|
||||
16
argocd/manifests/loki/service.yaml
Normal file
16
argocd/manifests/loki/service.yaml
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
---
# Service fronting the Loki pods (selected by the app=loki label).
# Exposes the HTTP API on 3100 and gRPC on 9096.
apiVersion: v1
kind: Service
metadata:
  name: loki
  namespace: monitoring
spec:
  type: ClusterIP
  selector:
    app: loki
  ports:
    - name: http
      port: 3100
      targetPort: 3100
    - name: grpc
      port: 9096
      targetPort: 9096
|
||||
66
argocd/manifests/loki/statefulset.yaml
Normal file
66
argocd/manifests/loki/statefulset.yaml
Normal file
|
|
@ -0,0 +1,66 @@
|
|||
---
# Single-replica Loki StatefulSet.
# - Configuration is mounted from the loki-config ConfigMap at /etc/loki.
# - Data lives on a 20Gi PersistentVolumeClaim mounted at /loki.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: loki
  namespace: monitoring
spec:
  serviceName: loki
  replicas: 1
  selector:
    matchLabels:
      app: loki
  template:
    metadata:
      labels:
        app: loki
    spec:
      # Run as a fixed non-root UID; fsGroup is set to the same ID for the
      # mounted volumes.
      # NOTE(review): 10001 is presumably the UID the image expects - confirm.
      securityContext:
        fsGroup: 10001
        runAsNonRoot: true
        runAsUser: 10001
      containers:
        - name: loki
          image: grafana/loki:3.3.2
          args:
            # Path = config volume mountPath + the ConfigMap key name.
            - -config.file=/etc/loki/loki-config.yaml
          ports:
            - name: http
              containerPort: 3100
            - name: grpc
              containerPort: 9096
          volumeMounts:
            - name: config
              mountPath: /etc/loki
            - name: data
              mountPath: /loki
          resources:
            requests:
              memory: "256Mi"
              cpu: "100m"
            limits:
              memory: "1Gi"
              cpu: "500m"
          # The liveness probe below targets /ready, so a slow start would
          # otherwise get the container killed before it ever becomes ready.
          # The startup probe holds off liveness/readiness checks until the
          # first successful /ready, allowing up to 30 * 10s = 5 minutes.
          startupProbe:
            httpGet:
              path: /ready
              port: 3100
            periodSeconds: 10
            failureThreshold: 30
          livenessProbe:
            httpGet:
              path: /ready
              port: 3100
            initialDelaySeconds: 45
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /ready
              port: 3100
            initialDelaySeconds: 10
            periodSeconds: 5
      volumes:
        - name: config
          configMap:
            name: loki-config
  # One PVC per pod, named "data", matching the data volumeMount above.
  volumeClaimTemplates:
    - metadata:
        name: data
      spec:
        accessModes: ["ReadWriteOnce"]
        resources:
          requests:
            storage: 20Gi
|
||||
Loading…
Add table
Add a link
Reference in a new issue