Migrate Prometheus and Loki to Kubernetes

Major observability stack migration:
- Deploy Prometheus in k8s with 20Gi PVC, Tailscale Ingress
- Deploy Loki in k8s with 20Gi PVC, Tailscale Ingress
- Update Grafana to use k8s-internal endpoints for data sources
- Update Alloy on indri to push to k8s via Tailscale endpoints
- Prometheus scrapes sifaka via its LAN IP (pods reach the LAN through Docker NAT, the same path the NFS mounts use)
- Deprecate the Ansible prometheus/loki roles

Alloy on indri continues to collect:
- System metrics (via prometheus.exporter.unix)
- Textfile metrics (borgmatic, plex)
- Logs (forgejo, tailscale, borgmatic, zot, plex)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Erich Blume 2026-01-22 07:48:45 -08:00
commit b7f5988ea7
15 changed files with 376 additions and 45 deletions

View file

@ -4,11 +4,11 @@
# Textfile collector directory (same as node_exporter for compatibility)
alloy_textfile_dir: /opt/homebrew/var/node_exporter/textfile
# Prometheus remote write endpoint
alloy_prometheus_url: "http://localhost:9090/api/v1/write"
# Prometheus remote write endpoint (k8s via Tailscale)
alloy_prometheus_url: "https://prometheus.tail8d86e.ts.net/api/v1/write"
# Loki endpoint (used in Phase 2)
alloy_loki_url: "http://localhost:3100/loki/api/v1/push"
# Loki endpoint (k8s via Tailscale)
alloy_loki_url: "https://loki.tail8d86e.ts.net/loki/api/v1/push"
# Instance label for metrics
alloy_instance_label: indri
@ -22,34 +22,17 @@ alloy_data_dir: /opt/homebrew/var/lib/grafana-alloy/data
# Log paths to collect
alloy_brew_logs:
- path: /opt/homebrew/var/log/grafana-stdout.log
service: grafana
stream: stdout
- path: /opt/homebrew/var/log/grafana-stderr.log
service: grafana
stream: stderr
# NOTE: grafana, prometheus, loki removed - now hosted in k8s
- path: /opt/homebrew/var/log/forgejo.log
service: forgejo
stream: stdout
- path: /opt/homebrew/var/log/prometheus.err.log
service: prometheus
stream: stderr
- path: /opt/homebrew/var/log/tailscaled.log
service: tailscale
stream: stdout
- path: /opt/homebrew/var/transmission/transmission-daemon.log
service: transmission
stream: stdout
# NOTE: postgresql and miniflux removed - now hosted in k8s
# NOTE: transmission removed - now hosted in k8s
alloy_mcquack_logs:
# NOTE: devpi logs removed - now hosted in k8s
- path: /Users/erichblume/Library/Logs/mcquack.kiwix-serve.out.log
service: kiwix
stream: stdout
- path: /Users/erichblume/Library/Logs/mcquack.kiwix-serve.err.log
service: kiwix
stream: stderr
# NOTE: devpi, kiwix logs removed - now hosted in k8s
- path: /Users/erichblume/Library/Logs/mcquack.borgmatic.out.log
service: borgmatic
stream: stdout

View file

@ -1,21 +1,7 @@
# {{ ansible_managed }}
# NOTE: Prometheus has been migrated to k8s. This ansible role is deprecated.
# See argocd/manifests/prometheus/ for the k8s deployment.
global:
scrape_interval: 15s
# Note: indri system metrics are pushed via Alloy remote_write
# Sifaka still uses traditional scraping via node_exporter
scrape_configs:
- job_name: "node-exporter-sifaka"
static_configs:
- targets: ["sifaka:9100"]
- job_name: "loki"
static_configs:
- targets: ["localhost:3100"]
- job_name: "cnpg-postgres"
static_configs:
- targets: ["cnpg-metrics.tail8d86e.ts.net:9187"]
labels:
instance: "blumeops-pg"
scrape_configs: []

17
argocd/apps/loki.yaml Normal file
View file

@ -0,0 +1,17 @@
# Argo CD Application that deploys the Loki manifests kept in this repository.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: loki
  namespace: argocd
spec:
  project: default
  # Target: the cluster Argo CD itself runs in, "monitoring" namespace.
  destination:
    server: https://kubernetes.default.svc
    namespace: monitoring
  # Origin: the argocd/manifests/loki directory on the main branch.
  source:
    repoURL: ssh://forgejo@indri.tail8d86e.ts.net:2200/eblume/blumeops.git
    path: argocd/manifests/loki
    targetRevision: main
  # Only sync options are configured; there is no `automated` stanza here.
  syncPolicy:
    syncOptions:
      # Let Argo CD create the destination namespace when it is missing.
      - CreateNamespace=true

View file

@ -0,0 +1,17 @@
# Argo CD Application that deploys the Prometheus manifests from this repo.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  namespace: argocd
  name: prometheus
spec:
  project: default
  # Manifests are read from argocd/manifests/prometheus at the main branch.
  source:
    path: argocd/manifests/prometheus
    targetRevision: main
    repoURL: ssh://forgejo@indri.tail8d86e.ts.net:2200/eblume/blumeops.git
  # Resources land in the "monitoring" namespace of the in-cluster API server.
  destination:
    namespace: monitoring
    server: https://kubernetes.default.svc
  syncPolicy:
    # No `automated` stanza is set; only the namespace-creation option.
    syncOptions:
      - CreateNamespace=true

View file

@ -24,8 +24,7 @@ grafana.ini:
check_for_updates: false
reporting_enabled: false
# Datasources - point to indri services via docker host gateway
# host.minikube.internal resolves to the docker host (indri) from inside minikube
# Datasources - point to k8s-internal services
datasources:
datasources.yaml:
apiVersion: 1
@ -35,7 +34,7 @@ datasources:
access: proxy
orgId: 1
uid: prometheus
url: http://host.minikube.internal:9090
url: http://prometheus.monitoring.svc.cluster.local:9090
isDefault: true
editable: false
- name: Loki
@ -43,7 +42,7 @@ datasources:
access: proxy
orgId: 1
uid: loki
url: http://host.minikube.internal:3100
url: http://loki.monitoring.svc.cluster.local:3100
editable: false
# Dashboard provisioning - sidecar watches for ConfigMaps with label

View file

@ -0,0 +1,58 @@
# Loki configuration, shipped as a ConfigMap in the "monitoring" namespace.
#
# The single data key holds the complete Loki config file as a literal block
# scalar; everything nested under it is file content, not Kubernetes fields.
# Summary of what the embedded config sets:
#   - auth disabled; HTTP on 3100 (all interfaces), gRPC on 9096
#   - filesystem storage rooted at /loki, replication factor 1, in-memory ring
#   - TSDB index, schema v13, 24h index period, effective from 2024-01-01
#   - 744h retention, with retention enabled on the compactor
apiVersion: v1
kind: ConfigMap
metadata:
  name: loki-config
  namespace: monitoring
data:
  loki-config.yaml: |
    auth_enabled: false
    server:
      http_listen_port: 3100
      http_listen_address: 0.0.0.0
      grpc_listen_port: 9096
    common:
      instance_addr: 127.0.0.1
      path_prefix: /loki
      storage:
        filesystem:
          chunks_directory: /loki/chunks
          rules_directory: /loki/rules
      replication_factor: 1
      ring:
        kvstore:
          store: inmemory
    query_range:
      results_cache:
        cache:
          embedded_cache:
            enabled: true
            max_size_mb: 100
    schema_config:
      configs:
        - from: 2024-01-01
          store: tsdb
          object_store: filesystem
          schema: v13
          index:
            prefix: index_
            period: 24h
    storage_config:
      tsdb_shipper:
        active_index_directory: /loki/tsdb-index
        cache_location: /loki/tsdb-cache
    limits_config:
      retention_period: 744h # 31 days
    compactor:
      working_directory: /loki/compactor
      compaction_interval: 10m
      retention_enabled: true
      retention_delete_delay: 2h
      retention_delete_worker_count: 150
      delete_request_store: filesystem

View file

@ -0,0 +1,25 @@
# Tailscale Ingress for Loki.
# Gives Alloy on indri an endpoint to push logs to.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: loki-tailscale
  namespace: monitoring
  annotations:
    # Funnel is explicitly switched off; the value must stay a string.
    tailscale.com/funnel: 'false'
spec:
  ingressClassName: tailscale
  # Host name requested for this Ingress; it matches the rule host below.
  tls:
    - hosts:
        - loki
  rules:
    - host: loki
      http:
        paths:
          # Everything under "/" is forwarded to the loki Service on 3100.
          - pathType: Prefix
            path: /
            backend:
              service:
                name: loki
                port:
                  number: 3100

View file

@ -0,0 +1,10 @@
# Kustomize entry point: bundles the manifest files that sit next to it.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
# Namespace Kustomize applies to the resources listed below.
namespace: monitoring
# Manifest files included in the build (paths relative to this file).
resources:
- configmap.yaml
- statefulset.yaml
- service.yaml
- ingress-tailscale.yaml

View file

@ -0,0 +1,16 @@
# ClusterIP Service in front of the pods labelled app=loki.
apiVersion: v1
kind: Service
metadata:
  name: loki
  namespace: monitoring
spec:
  type: ClusterIP
  selector:
    app: loki
  ports:
    # Port 3100 is also the backend of the loki-tailscale Ingress.
    - name: http
      targetPort: 3100
      port: 3100
    - name: grpc
      targetPort: 9096
      port: 9096

View file

@ -0,0 +1,66 @@
# Single-replica Loki, storing its data on a per-pod PersistentVolumeClaim.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: loki
  namespace: monitoring
spec:
  replicas: 1
  serviceName: loki
  selector:
    matchLabels:
      app: loki
  template:
    metadata:
      labels:
        app: loki
    spec:
      # Run as a fixed non-root UID; fsGroup uses the same ID for volumes.
      securityContext:
        runAsNonRoot: true
        runAsUser: 10001
        fsGroup: 10001
      volumes:
        # Config file comes from the loki-config ConfigMap.
        - name: config
          configMap:
            name: loki-config
      containers:
        - name: loki
          image: grafana/loki:3.3.2
          args:
            - '-config.file=/etc/loki/loki-config.yaml'
          ports:
            - containerPort: 3100
              name: http
            - containerPort: 9096
              name: grpc
          volumeMounts:
            - mountPath: /etc/loki
              name: config
            # "data" is provisioned by volumeClaimTemplates below.
            - mountPath: /loki
              name: data
          resources:
            limits:
              cpu: '500m'
              memory: '1Gi'
            requests:
              cpu: '100m'
              memory: '256Mi'
          # Both probes poll /ready on the HTTP port; liveness starts later
          # (45s) than readiness (10s).
          readinessProbe:
            initialDelaySeconds: 10
            periodSeconds: 5
            httpGet:
              path: /ready
              port: 3100
          livenessProbe:
            initialDelaySeconds: 45
            periodSeconds: 10
            httpGet:
              path: /ready
              port: 3100
  volumeClaimTemplates:
    - metadata:
        name: data
      spec:
        accessModes:
          - ReadWriteOnce
        resources:
          requests:
            storage: 20Gi

View file

@ -0,0 +1,38 @@
# Prometheus configuration, shipped as a ConfigMap in the "monitoring"
# namespace.
#
# The single data key holds the complete prometheus.yml as a literal block
# scalar; everything nested under it is file content, not Kubernetes fields.
# The embedded file sets 15s scrape/evaluation intervals and defines four
# scrape jobs: node-exporter-sifaka, cnpg-postgres, prometheus, and loki.
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: monitoring
data:
  prometheus.yml: |
    global:
      scrape_interval: 15s
      evaluation_interval: 15s
    # Indri system metrics are pushed via Alloy remote_write
    # K8s services are scraped directly
    scrape_configs:
      # Sifaka NAS node-exporter (via LAN - Docker NATs through indri)
      # Using LAN IP since k8s pods can reach LAN via Docker NAT (same as NFS mounts)
      # If IP changes, fallback: create Tailscale egress in tailscale-operator/egress-sifaka.yaml
      - job_name: "node-exporter-sifaka"
        static_configs:
          - targets: ["192.168.1.203:9100"]
      # CNPG PostgreSQL metrics (k8s internal)
      - job_name: "cnpg-postgres"
        static_configs:
          - targets: ["blumeops-pg-metrics-tailscale.databases.svc.cluster.local:9187"]
            labels:
              instance: "blumeops-pg"
      # Prometheus self-monitoring
      - job_name: "prometheus"
        static_configs:
          - targets: ["localhost:9090"]
      # Loki metrics
      - job_name: "loki"
        static_configs:
          - targets: ["loki.monitoring.svc.cluster.local:3100"]

View file

@ -0,0 +1,25 @@
# Tailscale Ingress for Prometheus.
# Gives Alloy on indri an endpoint for pushing metrics with remote_write.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: prometheus-tailscale
  namespace: monitoring
  annotations:
    # Funnel is explicitly switched off; the value must stay a string.
    tailscale.com/funnel: 'false'
spec:
  ingressClassName: tailscale
  # Host name requested for this Ingress; it matches the rule host below.
  tls:
    - hosts:
        - prometheus
  rules:
    - host: prometheus
      http:
        paths:
          # Everything under "/" is forwarded to the prometheus Service on 9090.
          - pathType: Prefix
            path: /
            backend:
              service:
                name: prometheus
                port:
                  number: 9090

View file

@ -0,0 +1,10 @@
# Kustomize entry point: bundles the manifest files that sit next to it.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
# Namespace Kustomize applies to the resources listed below.
namespace: monitoring
# Manifest files included in the build (paths relative to this file).
resources:
- configmap.yaml
- statefulset.yaml
- service.yaml
- ingress-tailscale.yaml

View file

@ -0,0 +1,13 @@
# ClusterIP Service in front of the pods labelled app=prometheus.
apiVersion: v1
kind: Service
metadata:
  name: prometheus
  namespace: monitoring
spec:
  type: ClusterIP
  selector:
    app: prometheus
  ports:
    # Port 9090 is also the backend of the prometheus-tailscale Ingress.
    - name: http
      targetPort: 9090
      port: 9090

View file

@ -0,0 +1,68 @@
# Single-replica Prometheus, keeping its TSDB on a per-pod
# PersistentVolumeClaim.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: prometheus
  namespace: monitoring
spec:
  replicas: 1
  serviceName: prometheus
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      # Run as a fixed non-root UID; fsGroup uses the same ID for volumes.
      securityContext:
        runAsNonRoot: true
        runAsUser: 65534
        fsGroup: 65534
      volumes:
        # prometheus.yml comes from the prometheus-config ConfigMap.
        - name: config
          configMap:
            name: prometheus-config
      containers:
        - name: prometheus
          image: prom/prometheus:v3.2.1
          args:
            - '--config.file=/etc/prometheus/prometheus.yml'
            - '--storage.tsdb.path=/prometheus'
            - '--storage.tsdb.retention.time=15d'
            # Accept pushed samples on /api/v1/write (used by Alloy).
            - '--web.enable-remote-write-receiver'
            - '--web.enable-lifecycle'
          ports:
            - containerPort: 9090
              name: http
          volumeMounts:
            - mountPath: /etc/prometheus
              name: config
            # "data" is provisioned by volumeClaimTemplates below.
            - mountPath: /prometheus
              name: data
          resources:
            limits:
              cpu: '500m'
              memory: '1Gi'
            requests:
              cpu: '100m'
              memory: '256Mi'
          # Readiness polls /-/ready; liveness polls /-/healthy.
          readinessProbe:
            initialDelaySeconds: 5
            periodSeconds: 5
            httpGet:
              path: /-/ready
              port: 9090
          livenessProbe:
            initialDelaySeconds: 30
            periodSeconds: 15
            httpGet:
              path: /-/healthy
              port: 9090
  volumeClaimTemplates:
    - metadata:
        name: data
      spec:
        accessModes:
          - ReadWriteOnce
        resources:
          requests:
            storage: 20Gi