Migrate Prometheus and Loki to Kubernetes
Major observability stack migration:

- Deploy Prometheus in k8s with 20Gi PVC, Tailscale Ingress
- Deploy Loki in k8s with 20Gi PVC, Tailscale Ingress
- Update Grafana to use k8s-internal endpoints for data sources
- Update Alloy on indri to push to k8s via Tailscale endpoints
- Prometheus scrapes sifaka via LAN IP (Docker NAT, same as NFS)
- Deprecate ansible prometheus/loki roles

Alloy on indri continues to collect:

- System metrics (via prometheus.exporter.unix)
- Textfile metrics (borgmatic, plex)
- Logs (forgejo, tailscale, borgmatic, zot, plex)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
74c218063d
commit
b7f5988ea7
15 changed files with 376 additions and 45 deletions
|
|
@ -4,11 +4,11 @@
|
|||
# Textfile collector directory (same as node_exporter for compatibility)
|
||||
alloy_textfile_dir: /opt/homebrew/var/node_exporter/textfile
|
||||
|
||||
# Prometheus remote write endpoint
|
||||
alloy_prometheus_url: "http://localhost:9090/api/v1/write"
|
||||
# Prometheus remote write endpoint (k8s via Tailscale)
|
||||
alloy_prometheus_url: "https://prometheus.tail8d86e.ts.net/api/v1/write"
|
||||
|
||||
# Loki endpoint (used in Phase 2)
|
||||
alloy_loki_url: "http://localhost:3100/loki/api/v1/push"
|
||||
# Loki endpoint (k8s via Tailscale)
|
||||
alloy_loki_url: "https://loki.tail8d86e.ts.net/loki/api/v1/push"
|
||||
|
||||
# Instance label for metrics
|
||||
alloy_instance_label: indri
|
||||
|
|
@ -22,34 +22,17 @@ alloy_data_dir: /opt/homebrew/var/lib/grafana-alloy/data
|
|||
|
||||
# Log paths to collect
|
||||
alloy_brew_logs:
|
||||
- path: /opt/homebrew/var/log/grafana-stdout.log
|
||||
service: grafana
|
||||
stream: stdout
|
||||
- path: /opt/homebrew/var/log/grafana-stderr.log
|
||||
service: grafana
|
||||
stream: stderr
|
||||
# NOTE: grafana, prometheus, loki removed - now hosted in k8s
|
||||
- path: /opt/homebrew/var/log/forgejo.log
|
||||
service: forgejo
|
||||
stream: stdout
|
||||
- path: /opt/homebrew/var/log/prometheus.err.log
|
||||
service: prometheus
|
||||
stream: stderr
|
||||
- path: /opt/homebrew/var/log/tailscaled.log
|
||||
service: tailscale
|
||||
stream: stdout
|
||||
- path: /opt/homebrew/var/transmission/transmission-daemon.log
|
||||
service: transmission
|
||||
stream: stdout
|
||||
# NOTE: postgresql and miniflux removed - now hosted in k8s
|
||||
# NOTE: transmission removed - now hosted in k8s
|
||||
|
||||
alloy_mcquack_logs:
|
||||
# NOTE: devpi logs removed - now hosted in k8s
|
||||
- path: /Users/erichblume/Library/Logs/mcquack.kiwix-serve.out.log
|
||||
service: kiwix
|
||||
stream: stdout
|
||||
- path: /Users/erichblume/Library/Logs/mcquack.kiwix-serve.err.log
|
||||
service: kiwix
|
||||
stream: stderr
|
||||
# NOTE: devpi, kiwix logs removed - now hosted in k8s
|
||||
- path: /Users/erichblume/Library/Logs/mcquack.borgmatic.out.log
|
||||
service: borgmatic
|
||||
stream: stdout
|
||||
|
|
|
|||
|
|
@ -1,21 +1,7 @@
|
|||
# {{ ansible_managed }}
|
||||
# NOTE: Prometheus has been migrated to k8s. This ansible role is deprecated.
|
||||
# See argocd/manifests/prometheus/ for the k8s deployment.
|
||||
global:
|
||||
scrape_interval: 15s
|
||||
|
||||
# Note: indri system metrics are pushed via Alloy remote_write
|
||||
# Sifaka still uses traditional scraping via node_exporter
|
||||
|
||||
scrape_configs:
|
||||
- job_name: "node-exporter-sifaka"
|
||||
static_configs:
|
||||
- targets: ["sifaka:9100"]
|
||||
|
||||
- job_name: "loki"
|
||||
static_configs:
|
||||
- targets: ["localhost:3100"]
|
||||
|
||||
- job_name: "cnpg-postgres"
|
||||
static_configs:
|
||||
- targets: ["cnpg-metrics.tail8d86e.ts.net:9187"]
|
||||
labels:
|
||||
instance: "blumeops-pg"
|
||||
scrape_configs: []
|
||||
|
|
|
|||
17
argocd/apps/loki.yaml
Normal file
17
argocd/apps/loki.yaml
Normal file
|
|
@ -0,0 +1,17 @@
|
|||
apiVersion: argoproj.io/v1alpha1
|
||||
kind: Application
|
||||
metadata:
|
||||
name: loki
|
||||
namespace: argocd
|
||||
spec:
|
||||
project: default
|
||||
source:
|
||||
repoURL: ssh://forgejo@indri.tail8d86e.ts.net:2200/eblume/blumeops.git
|
||||
targetRevision: main
|
||||
path: argocd/manifests/loki
|
||||
destination:
|
||||
server: https://kubernetes.default.svc
|
||||
namespace: monitoring
|
||||
syncPolicy:
|
||||
syncOptions:
|
||||
- CreateNamespace=true
|
||||
17
argocd/apps/prometheus.yaml
Normal file
17
argocd/apps/prometheus.yaml
Normal file
|
|
@ -0,0 +1,17 @@
|
|||
apiVersion: argoproj.io/v1alpha1
|
||||
kind: Application
|
||||
metadata:
|
||||
name: prometheus
|
||||
namespace: argocd
|
||||
spec:
|
||||
project: default
|
||||
source:
|
||||
repoURL: ssh://forgejo@indri.tail8d86e.ts.net:2200/eblume/blumeops.git
|
||||
targetRevision: main
|
||||
path: argocd/manifests/prometheus
|
||||
destination:
|
||||
server: https://kubernetes.default.svc
|
||||
namespace: monitoring
|
||||
syncPolicy:
|
||||
syncOptions:
|
||||
- CreateNamespace=true
|
||||
|
|
@ -24,8 +24,7 @@ grafana.ini:
|
|||
check_for_updates: false
|
||||
reporting_enabled: false
|
||||
|
||||
# Datasources - point to indri services via docker host gateway
|
||||
# host.minikube.internal resolves to the docker host (indri) from inside minikube
|
||||
# Datasources - point to k8s-internal services
|
||||
datasources:
|
||||
datasources.yaml:
|
||||
apiVersion: 1
|
||||
|
|
@ -35,7 +34,7 @@ datasources:
|
|||
access: proxy
|
||||
orgId: 1
|
||||
uid: prometheus
|
||||
url: http://host.minikube.internal:9090
|
||||
url: http://prometheus.monitoring.svc.cluster.local:9090
|
||||
isDefault: true
|
||||
editable: false
|
||||
- name: Loki
|
||||
|
|
@ -43,7 +42,7 @@ datasources:
|
|||
access: proxy
|
||||
orgId: 1
|
||||
uid: loki
|
||||
url: http://host.minikube.internal:3100
|
||||
url: http://loki.monitoring.svc.cluster.local:3100
|
||||
editable: false
|
||||
|
||||
# Dashboard provisioning - sidecar watches for ConfigMaps with label
|
||||
|
|
|
|||
58
argocd/manifests/loki/configmap.yaml
Normal file
58
argocd/manifests/loki/configmap.yaml
Normal file
|
|
@ -0,0 +1,58 @@
|
|||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: loki-config
|
||||
namespace: monitoring
|
||||
data:
|
||||
loki-config.yaml: |
|
||||
auth_enabled: false
|
||||
|
||||
server:
|
||||
http_listen_port: 3100
|
||||
http_listen_address: 0.0.0.0
|
||||
grpc_listen_port: 9096
|
||||
|
||||
common:
|
||||
instance_addr: 127.0.0.1
|
||||
path_prefix: /loki
|
||||
storage:
|
||||
filesystem:
|
||||
chunks_directory: /loki/chunks
|
||||
rules_directory: /loki/rules
|
||||
replication_factor: 1
|
||||
ring:
|
||||
kvstore:
|
||||
store: inmemory
|
||||
|
||||
query_range:
|
||||
results_cache:
|
||||
cache:
|
||||
embedded_cache:
|
||||
enabled: true
|
||||
max_size_mb: 100
|
||||
|
||||
schema_config:
|
||||
configs:
|
||||
- from: 2024-01-01
|
||||
store: tsdb
|
||||
object_store: filesystem
|
||||
schema: v13
|
||||
index:
|
||||
prefix: index_
|
||||
period: 24h
|
||||
|
||||
storage_config:
|
||||
tsdb_shipper:
|
||||
active_index_directory: /loki/tsdb-index
|
||||
cache_location: /loki/tsdb-cache
|
||||
|
||||
limits_config:
|
||||
retention_period: 744h # 31 days
|
||||
|
||||
compactor:
|
||||
working_directory: /loki/compactor
|
||||
compaction_interval: 10m
|
||||
retention_enabled: true
|
||||
retention_delete_delay: 2h
|
||||
retention_delete_worker_count: 150
|
||||
delete_request_store: filesystem
|
||||
25
argocd/manifests/loki/ingress-tailscale.yaml
Normal file
25
argocd/manifests/loki/ingress-tailscale.yaml
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
# Tailscale Ingress for Loki
|
||||
# Allows Alloy on indri to push logs
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: Ingress
|
||||
metadata:
|
||||
name: loki-tailscale
|
||||
namespace: monitoring
|
||||
annotations:
|
||||
tailscale.com/funnel: "false"
|
||||
spec:
|
||||
ingressClassName: tailscale
|
||||
rules:
|
||||
- host: loki
|
||||
http:
|
||||
paths:
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
backend:
|
||||
service:
|
||||
name: loki
|
||||
port:
|
||||
number: 3100
|
||||
tls:
|
||||
- hosts:
|
||||
- loki
|
||||
10
argocd/manifests/loki/kustomization.yaml
Normal file
10
argocd/manifests/loki/kustomization.yaml
Normal file
|
|
@ -0,0 +1,10 @@
|
|||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
|
||||
namespace: monitoring
|
||||
|
||||
resources:
|
||||
- configmap.yaml
|
||||
- statefulset.yaml
|
||||
- service.yaml
|
||||
- ingress-tailscale.yaml
|
||||
16
argocd/manifests/loki/service.yaml
Normal file
16
argocd/manifests/loki/service.yaml
Normal file
|
|
@ -0,0 +1,16 @@
|
|||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: loki
|
||||
namespace: monitoring
|
||||
spec:
|
||||
selector:
|
||||
app: loki
|
||||
ports:
|
||||
- name: http
|
||||
port: 3100
|
||||
targetPort: 3100
|
||||
- name: grpc
|
||||
port: 9096
|
||||
targetPort: 9096
|
||||
type: ClusterIP
|
||||
66
argocd/manifests/loki/statefulset.yaml
Normal file
66
argocd/manifests/loki/statefulset.yaml
Normal file
|
|
@ -0,0 +1,66 @@
|
|||
apiVersion: apps/v1
|
||||
kind: StatefulSet
|
||||
metadata:
|
||||
name: loki
|
||||
namespace: monitoring
|
||||
spec:
|
||||
serviceName: loki
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: loki
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: loki
|
||||
spec:
|
||||
securityContext:
|
||||
fsGroup: 10001
|
||||
runAsNonRoot: true
|
||||
runAsUser: 10001
|
||||
containers:
|
||||
- name: loki
|
||||
image: grafana/loki:3.3.2
|
||||
args:
|
||||
- -config.file=/etc/loki/loki-config.yaml
|
||||
ports:
|
||||
- name: http
|
||||
containerPort: 3100
|
||||
- name: grpc
|
||||
containerPort: 9096
|
||||
volumeMounts:
|
||||
- name: config
|
||||
mountPath: /etc/loki
|
||||
- name: data
|
||||
mountPath: /loki
|
||||
resources:
|
||||
requests:
|
||||
memory: "256Mi"
|
||||
cpu: "100m"
|
||||
limits:
|
||||
memory: "1Gi"
|
||||
cpu: "500m"
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /ready
|
||||
port: 3100
|
||||
initialDelaySeconds: 45
|
||||
periodSeconds: 10
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /ready
|
||||
port: 3100
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 5
|
||||
volumes:
|
||||
- name: config
|
||||
configMap:
|
||||
name: loki-config
|
||||
volumeClaimTemplates:
|
||||
- metadata:
|
||||
name: data
|
||||
spec:
|
||||
accessModes: ["ReadWriteOnce"]
|
||||
resources:
|
||||
requests:
|
||||
storage: 20Gi
|
||||
38
argocd/manifests/prometheus/configmap.yaml
Normal file
38
argocd/manifests/prometheus/configmap.yaml
Normal file
|
|
@ -0,0 +1,38 @@
|
|||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: prometheus-config
|
||||
namespace: monitoring
|
||||
data:
|
||||
prometheus.yml: |
|
||||
global:
|
||||
scrape_interval: 15s
|
||||
evaluation_interval: 15s
|
||||
|
||||
# Indri system metrics are pushed via Alloy remote_write
|
||||
# K8s services are scraped directly
|
||||
|
||||
scrape_configs:
|
||||
# Sifaka NAS node-exporter (via LAN - Docker NATs through indri)
|
||||
# Using LAN IP since k8s pods can reach LAN via Docker NAT (same as NFS mounts)
|
||||
# If IP changes, fallback: create Tailscale egress in tailscale-operator/egress-sifaka.yaml
|
||||
- job_name: "node-exporter-sifaka"
|
||||
static_configs:
|
||||
- targets: ["192.168.1.203:9100"]
|
||||
|
||||
# CNPG PostgreSQL metrics (k8s internal)
|
||||
- job_name: "cnpg-postgres"
|
||||
static_configs:
|
||||
- targets: ["blumeops-pg-metrics-tailscale.databases.svc.cluster.local:9187"]
|
||||
labels:
|
||||
instance: "blumeops-pg"
|
||||
|
||||
# Prometheus self-monitoring
|
||||
- job_name: "prometheus"
|
||||
static_configs:
|
||||
- targets: ["localhost:9090"]
|
||||
|
||||
# Loki metrics
|
||||
- job_name: "loki"
|
||||
static_configs:
|
||||
- targets: ["loki.monitoring.svc.cluster.local:3100"]
|
||||
25
argocd/manifests/prometheus/ingress-tailscale.yaml
Normal file
25
argocd/manifests/prometheus/ingress-tailscale.yaml
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
# Tailscale Ingress for Prometheus
|
||||
# Allows Alloy on indri to push metrics via remote_write
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: Ingress
|
||||
metadata:
|
||||
name: prometheus-tailscale
|
||||
namespace: monitoring
|
||||
annotations:
|
||||
tailscale.com/funnel: "false"
|
||||
spec:
|
||||
ingressClassName: tailscale
|
||||
rules:
|
||||
- host: prometheus
|
||||
http:
|
||||
paths:
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
backend:
|
||||
service:
|
||||
name: prometheus
|
||||
port:
|
||||
number: 9090
|
||||
tls:
|
||||
- hosts:
|
||||
- prometheus
|
||||
10
argocd/manifests/prometheus/kustomization.yaml
Normal file
10
argocd/manifests/prometheus/kustomization.yaml
Normal file
|
|
@ -0,0 +1,10 @@
|
|||
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||
kind: Kustomization
|
||||
|
||||
namespace: monitoring
|
||||
|
||||
resources:
|
||||
- configmap.yaml
|
||||
- statefulset.yaml
|
||||
- service.yaml
|
||||
- ingress-tailscale.yaml
|
||||
13
argocd/manifests/prometheus/service.yaml
Normal file
13
argocd/manifests/prometheus/service.yaml
Normal file
|
|
@ -0,0 +1,13 @@
|
|||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: prometheus
|
||||
namespace: monitoring
|
||||
spec:
|
||||
selector:
|
||||
app: prometheus
|
||||
ports:
|
||||
- name: http
|
||||
port: 9090
|
||||
targetPort: 9090
|
||||
type: ClusterIP
|
||||
68
argocd/manifests/prometheus/statefulset.yaml
Normal file
68
argocd/manifests/prometheus/statefulset.yaml
Normal file
|
|
@ -0,0 +1,68 @@
|
|||
apiVersion: apps/v1
|
||||
kind: StatefulSet
|
||||
metadata:
|
||||
name: prometheus
|
||||
namespace: monitoring
|
||||
spec:
|
||||
serviceName: prometheus
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: prometheus
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: prometheus
|
||||
spec:
|
||||
securityContext:
|
||||
fsGroup: 65534
|
||||
runAsNonRoot: true
|
||||
runAsUser: 65534
|
||||
containers:
|
||||
- name: prometheus
|
||||
image: prom/prometheus:v3.2.1
|
||||
args:
|
||||
- --config.file=/etc/prometheus/prometheus.yml
|
||||
- --storage.tsdb.path=/prometheus
|
||||
- --storage.tsdb.retention.time=15d
|
||||
- --web.enable-remote-write-receiver
|
||||
- --web.enable-lifecycle
|
||||
ports:
|
||||
- name: http
|
||||
containerPort: 9090
|
||||
volumeMounts:
|
||||
- name: config
|
||||
mountPath: /etc/prometheus
|
||||
- name: data
|
||||
mountPath: /prometheus
|
||||
resources:
|
||||
requests:
|
||||
memory: "256Mi"
|
||||
cpu: "100m"
|
||||
limits:
|
||||
memory: "1Gi"
|
||||
cpu: "500m"
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /-/healthy
|
||||
port: 9090
|
||||
initialDelaySeconds: 30
|
||||
periodSeconds: 15
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /-/ready
|
||||
port: 9090
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 5
|
||||
volumes:
|
||||
- name: config
|
||||
configMap:
|
||||
name: prometheus-config
|
||||
volumeClaimTemplates:
|
||||
- metadata:
|
||||
name: data
|
||||
spec:
|
||||
accessModes: ["ReadWriteOnce"]
|
||||
resources:
|
||||
requests:
|
||||
storage: 20Gi
|
||||
Loading…
Add table
Add a link
Reference in a new issue