Migrate Prometheus and Loki to Kubernetes

Major observability stack migration:

- Deploy Prometheus in k8s with 20Gi PVC, Tailscale Ingress
- Deploy Loki in k8s with 20Gi PVC, Tailscale Ingress
- Update Grafana to use k8s-internal endpoints for data sources
- Update Alloy on indri to push to k8s via Tailscale endpoints
- Prometheus scrapes sifaka via LAN IP (Docker NAT, same as NFS)
- Deprecate ansible prometheus/loki roles

Alloy on indri continues to collect:

- System metrics (via prometheus.exporter.unix)
- Textfile metrics (borgmatic, plex)
- Logs (forgejo, tailscale, borgmatic, zot, plex)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-22 07:48:45 -08:00 · 2026-01-22 07:48:45 -08:00 · b7f5988ea7
commit b7f5988ea7
parent 74c218063d
15 changed files with 376 additions and 45 deletions
--- a/ansible/roles/alloy/defaults/main.yml
+++ b/ansible/roles/alloy/defaults/main.yml
@ -4,11 +4,11 @@
 # Textfile collector directory (same as node_exporter for compatibility)
 alloy_textfile_dir: /opt/homebrew/var/node_exporter/textfile

-# Prometheus remote write endpoint
-alloy_prometheus_url: "http://localhost:9090/api/v1/write"
+# Prometheus remote write endpoint (k8s via Tailscale)
+alloy_prometheus_url: "https://prometheus.tail8d86e.ts.net/api/v1/write"

-# Loki endpoint (used in Phase 2)
-alloy_loki_url: "http://localhost:3100/loki/api/v1/push"
+# Loki endpoint (k8s via Tailscale)
+alloy_loki_url: "https://loki.tail8d86e.ts.net/loki/api/v1/push"

 # Instance label for metrics
 alloy_instance_label: indri
@ -22,34 +22,17 @@ alloy_data_dir: /opt/homebrew/var/lib/grafana-alloy/data

 # Log paths to collect
 alloy_brew_logs:
-  - path: /opt/homebrew/var/log/grafana-stdout.log
-    service: grafana
-    stream: stdout
-  - path: /opt/homebrew/var/log/grafana-stderr.log
-    service: grafana
-    stream: stderr
+  # NOTE: grafana, prometheus, loki removed - now hosted in k8s
  - path: /opt/homebrew/var/log/forgejo.log
    service: forgejo
    stream: stdout
-  - path: /opt/homebrew/var/log/prometheus.err.log
-    service: prometheus
-    stream: stderr
  - path: /opt/homebrew/var/log/tailscaled.log
    service: tailscale
    stream: stdout
-  - path: /opt/homebrew/var/transmission/transmission-daemon.log
-    service: transmission
-    stream: stdout
-  # NOTE: postgresql and miniflux removed - now hosted in k8s
+  # NOTE: transmission removed - now hosted in k8s

 alloy_mcquack_logs:
-  # NOTE: devpi logs removed - now hosted in k8s
-  - path: /Users/erichblume/Library/Logs/mcquack.kiwix-serve.out.log
-    service: kiwix
-    stream: stdout
-  - path: /Users/erichblume/Library/Logs/mcquack.kiwix-serve.err.log
-    service: kiwix
-    stream: stderr
+  # NOTE: devpi, kiwix logs removed - now hosted in k8s
  - path: /Users/erichblume/Library/Logs/mcquack.borgmatic.out.log
    service: borgmatic
    stream: stdout
--- a/ansible/roles/prometheus/templates/prometheus.yml.j2
+++ b/ansible/roles/prometheus/templates/prometheus.yml.j2
@ -1,21 +1,7 @@
 # {{ ansible_managed }}
+# NOTE: Prometheus has been migrated to k8s. This ansible role is deprecated.
+# See argocd/manifests/prometheus/ for the k8s deployment.
 global:
  scrape_interval: 15s

-# Note: indri system metrics are pushed via Alloy remote_write
-# Sifaka still uses traditional scraping via node_exporter
-
-scrape_configs:
-  - job_name: "node-exporter-sifaka"
-    static_configs:
-    - targets: ["sifaka:9100"]
-
-  - job_name: "loki"
-    static_configs:
-    - targets: ["localhost:3100"]
-
-  - job_name: "cnpg-postgres"
-    static_configs:
-    - targets: ["cnpg-metrics.tail8d86e.ts.net:9187"]
-      labels:
-        instance: "blumeops-pg"
+scrape_configs: []
--- a/argocd/apps/loki.yaml
+++ b/argocd/apps/loki.yaml
@ -0,0 +1,17 @@
+# Argo CD Application for Loki: deploys the manifests found under
+# argocd/manifests/loki on the main branch into the monitoring namespace.
+apiVersion: argoproj.io/v1alpha1
+kind: Application
+metadata:
+  name: loki
+  namespace: argocd
+spec:
+  project: default
+  source:
+    repoURL: ssh://forgejo@indri.tail8d86e.ts.net:2200/eblume/blumeops.git
+    targetRevision: main
+    path: argocd/manifests/loki
+  destination:
+    # In-cluster API server address (the cluster Argo CD itself runs in).
+    server: https://kubernetes.default.svc
+    namespace: monitoring
+  syncPolicy:
+    # Only sync options are set; there is no `automated` block in this policy.
+    syncOptions:
+      # Let Argo CD create the destination namespace when it is missing.
+      - CreateNamespace=true
--- a/argocd/apps/prometheus.yaml
+++ b/argocd/apps/prometheus.yaml
@ -0,0 +1,17 @@
+# Argo CD Application for Prometheus: deploys the manifests found under
+# argocd/manifests/prometheus on the main branch into the monitoring namespace.
+apiVersion: argoproj.io/v1alpha1
+kind: Application
+metadata:
+  name: prometheus
+  namespace: argocd
+spec:
+  project: default
+  source:
+    repoURL: ssh://forgejo@indri.tail8d86e.ts.net:2200/eblume/blumeops.git
+    targetRevision: main
+    path: argocd/manifests/prometheus
+  destination:
+    # In-cluster API server address (the cluster Argo CD itself runs in).
+    server: https://kubernetes.default.svc
+    namespace: monitoring
+  syncPolicy:
+    # Only sync options are set; there is no `automated` block in this policy.
+    syncOptions:
+      # Let Argo CD create the destination namespace when it is missing.
+      - CreateNamespace=true
--- a/argocd/manifests/grafana/values.yaml
+++ b/argocd/manifests/grafana/values.yaml
@ -24,8 +24,7 @@ grafana.ini:
    check_for_updates: false
    reporting_enabled: false

-# Datasources - point to indri services via docker host gateway
-# host.minikube.internal resolves to the docker host (indri) from inside minikube
+# Datasources - point to k8s-internal services
 datasources:
  datasources.yaml:
    apiVersion: 1
@ -35,7 +34,7 @@ datasources:
        access: proxy
        orgId: 1
        uid: prometheus
-        url: http://host.minikube.internal:9090
+        url: http://prometheus.monitoring.svc.cluster.local:9090
        isDefault: true
        editable: false
      - name: Loki
@ -43,7 +42,7 @@ datasources:
        access: proxy
        orgId: 1
        uid: loki
-        url: http://host.minikube.internal:3100
+        url: http://loki.monitoring.svc.cluster.local:3100
        editable: false

 # Dashboard provisioning - sidecar watches for ConfigMaps with label
--- a/argocd/manifests/loki/configmap.yaml
+++ b/argocd/manifests/loki/configmap.yaml
@ -0,0 +1,58 @@
+# ConfigMap holding the Loki configuration file (key: loki-config.yaml).
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: loki-config
+  namespace: monitoring
+data:
+  # Summary of the embedded config:
+  #   - auth disabled, HTTP on 3100, gRPC on 9096
+  #   - filesystem storage under /loki, replication factor 1, in-memory ring
+  #   - TSDB index with schema v13 and a 24h index period
+  #   - 744h retention with compactor retention enabled
+  loki-config.yaml: |
+    auth_enabled: false
+
+    server:
+      http_listen_port: 3100
+      http_listen_address: 0.0.0.0
+      grpc_listen_port: 9096
+
+    common:
+      instance_addr: 127.0.0.1
+      path_prefix: /loki
+      storage:
+        filesystem:
+          chunks_directory: /loki/chunks
+          rules_directory: /loki/rules
+      replication_factor: 1
+      ring:
+        kvstore:
+          store: inmemory
+
+    query_range:
+      results_cache:
+        cache:
+          embedded_cache:
+            enabled: true
+            max_size_mb: 100
+
+    schema_config:
+      configs:
+        - from: 2024-01-01
+          store: tsdb
+          object_store: filesystem
+          schema: v13
+          index:
+            prefix: index_
+            period: 24h
+
+    storage_config:
+      tsdb_shipper:
+        active_index_directory: /loki/tsdb-index
+        cache_location: /loki/tsdb-cache
+
+    limits_config:
+      retention_period: 744h  # 31 days
+
+    compactor:
+      working_directory: /loki/compactor
+      compaction_interval: 10m
+      retention_enabled: true
+      retention_delete_delay: 2h
+      retention_delete_worker_count: 150
+      delete_request_store: filesystem
--- a/argocd/manifests/loki/ingress-tailscale.yaml
+++ b/argocd/manifests/loki/ingress-tailscale.yaml
@ -0,0 +1,25 @@
+# Tailscale Ingress for Loki
+# Allows Alloy on indri to push logs
+# Routes every path to the loki Service on port 3100.
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: loki-tailscale
+  namespace: monitoring
+  annotations:
+    # Funnel explicitly disabled for this Ingress.
+    tailscale.com/funnel: "false"
+spec:
+  ingressClassName: tailscale
+  rules:
+    - host: loki
+      http:
+        paths:
+          - path: /
+            pathType: Prefix
+            backend:
+              service:
+                name: loki
+                port:
+                  number: 3100
+  tls:
+    # NOTE(review): presumably this yields the loki.<tailnet>.ts.net name that
+    # Alloy pushes to - confirm against the Tailscale operator docs.
+    - hosts:
+        - loki
--- a/argocd/manifests/loki/kustomization.yaml
+++ b/argocd/manifests/loki/kustomization.yaml
@ -0,0 +1,10 @@
+# Kustomization for the Loki deployment: lists the manifests that make it up.
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+# Namespace applied to the resources listed below.
+namespace: monitoring
+
+resources:
+  - configmap.yaml
+  - statefulset.yaml
+  - service.yaml
+  - ingress-tailscale.yaml
--- a/argocd/manifests/loki/service.yaml
+++ b/argocd/manifests/loki/service.yaml
@ -0,0 +1,16 @@
+# ClusterIP Service for Loki; selects pods labelled app=loki.
+apiVersion: v1
+kind: Service
+metadata:
+  name: loki
+  namespace: monitoring
+spec:
+  selector:
+    app: loki
+  ports:
+    # HTTP port (same number as http_listen_port in loki-config).
+    - name: http
+      port: 3100
+      targetPort: 3100
+    # gRPC port (same number as grpc_listen_port in loki-config).
+    - name: grpc
+      port: 9096
+      targetPort: 9096
+  type: ClusterIP
--- a/argocd/manifests/loki/statefulset.yaml
+++ b/argocd/manifests/loki/statefulset.yaml
@ -0,0 +1,66 @@
+# Single-replica Loki StatefulSet with a 20Gi data volume claimed per pod.
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: loki
+  namespace: monitoring
+spec:
+  serviceName: loki
+  replicas: 1
+  selector:
+    matchLabels:
+      app: loki
+  template:
+    metadata:
+      labels:
+        app: loki
+    spec:
+      # Run as non-root UID 10001; fsGroup applies GID 10001 to mounted volumes.
+      securityContext:
+        fsGroup: 10001
+        runAsNonRoot: true
+        runAsUser: 10001
+      containers:
+        - name: loki
+          image: grafana/loki:3.3.2
+          args:
+            # Config file comes from the "config" volume mounted at /etc/loki.
+            - -config.file=/etc/loki/loki-config.yaml
+          ports:
+            - name: http
+              containerPort: 3100
+            - name: grpc
+              containerPort: 9096
+          volumeMounts:
+            - name: config
+              mountPath: /etc/loki
+            - name: data
+              mountPath: /loki
+          resources:
+            requests:
+              memory: "256Mi"
+              cpu: "100m"
+            limits:
+              memory: "1Gi"
+              cpu: "500m"
+          # NOTE(review): liveness polls the same /ready endpoint as readiness,
+          # so a long not-ready phase would restart the container - confirm
+          # this is intended.
+          livenessProbe:
+            httpGet:
+              path: /ready
+              port: 3100
+            initialDelaySeconds: 45
+            periodSeconds: 10
+          readinessProbe:
+            httpGet:
+              path: /ready
+              port: 3100
+            initialDelaySeconds: 10
+            periodSeconds: 5
+      volumes:
+        # Loki configuration file, sourced from the loki-config ConfigMap.
+        - name: config
+          configMap:
+            name: loki-config
+  # Persistent storage for /loki (the "data" mount above).
+  volumeClaimTemplates:
+    - metadata:
+        name: data
+      spec:
+        accessModes: ["ReadWriteOnce"]
+        resources:
+          requests:
+            storage: 20Gi
--- a/argocd/manifests/prometheus/configmap.yaml
+++ b/argocd/manifests/prometheus/configmap.yaml
@ -0,0 +1,38 @@
+# ConfigMap holding the Prometheus configuration file (key: prometheus.yml).
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: prometheus-config
+  namespace: monitoring
+data:
+  prometheus.yml: |
+    global:
+      scrape_interval: 15s
+      evaluation_interval: 15s
+
+    # Indri system metrics are pushed via Alloy remote_write
+    # K8s services are scraped directly
+
+    scrape_configs:
+      # Sifaka NAS node-exporter (via LAN - Docker NATs through indri)
+      # Using LAN IP since k8s pods can reach LAN via Docker NAT (same as NFS mounts)
+      # If IP changes, fallback: create Tailscale egress in tailscale-operator/egress-sifaka.yaml
+      - job_name: "node-exporter-sifaka"
+        static_configs:
+          - targets: ["192.168.1.203:9100"]
+            # Pin the instance label. Without it the label defaults to the
+            # target address, so series would be keyed by the raw LAN IP and
+            # would change identity whenever that IP changes. "sifaka:9100"
+            # is the value the previous host-based target produced.
+            labels:
+              instance: "sifaka:9100"
+
+      # CNPG PostgreSQL metrics (k8s internal)
+      - job_name: "cnpg-postgres"
+        static_configs:
+          - targets: ["blumeops-pg-metrics-tailscale.databases.svc.cluster.local:9187"]
+            labels:
+              instance: "blumeops-pg"
+
+      # Prometheus self-monitoring
+      - job_name: "prometheus"
+        static_configs:
+          - targets: ["localhost:9090"]
+
+      # Loki metrics
+      - job_name: "loki"
+        static_configs:
+          - targets: ["loki.monitoring.svc.cluster.local:3100"]
--- a/argocd/manifests/prometheus/ingress-tailscale.yaml
+++ b/argocd/manifests/prometheus/ingress-tailscale.yaml
@ -0,0 +1,25 @@
+# Tailscale Ingress for Prometheus
+# Allows Alloy on indri to push metrics via remote_write
+# Routes every path to the prometheus Service on port 9090.
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: prometheus-tailscale
+  namespace: monitoring
+  annotations:
+    # Funnel explicitly disabled for this Ingress.
+    tailscale.com/funnel: "false"
+spec:
+  ingressClassName: tailscale
+  rules:
+    - host: prometheus
+      http:
+        paths:
+          - path: /
+            pathType: Prefix
+            backend:
+              service:
+                name: prometheus
+                port:
+                  number: 9090
+  tls:
+    # NOTE(review): presumably this yields the prometheus.<tailnet>.ts.net name
+    # that Alloy pushes to - confirm against the Tailscale operator docs.
+    - hosts:
+        - prometheus
--- a/argocd/manifests/prometheus/kustomization.yaml
+++ b/argocd/manifests/prometheus/kustomization.yaml
@ -0,0 +1,10 @@
+# Kustomization for the Prometheus deployment: lists the manifests that make it up.
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+# Namespace applied to the resources listed below.
+namespace: monitoring
+
+resources:
+  - configmap.yaml
+  - statefulset.yaml
+  - service.yaml
+  - ingress-tailscale.yaml
--- a/argocd/manifests/prometheus/service.yaml
+++ b/argocd/manifests/prometheus/service.yaml
@ -0,0 +1,13 @@
+# ClusterIP Service for Prometheus; selects pods labelled app=prometheus.
+apiVersion: v1
+kind: Service
+metadata:
+  name: prometheus
+  namespace: monitoring
+spec:
+  selector:
+    app: prometheus
+  ports:
+    # HTTP port (same number as the container's "http" port, 9090).
+    - name: http
+      port: 9090
+      targetPort: 9090
+  type: ClusterIP
--- a/argocd/manifests/prometheus/statefulset.yaml
+++ b/argocd/manifests/prometheus/statefulset.yaml
@ -0,0 +1,68 @@
+# Single-replica Prometheus StatefulSet with a 20Gi data volume claimed per pod.
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: prometheus
+  namespace: monitoring
+spec:
+  serviceName: prometheus
+  replicas: 1
+  selector:
+    matchLabels:
+      app: prometheus
+  template:
+    metadata:
+      labels:
+        app: prometheus
+    spec:
+      # Run as non-root UID 65534; fsGroup applies GID 65534 to mounted volumes.
+      securityContext:
+        fsGroup: 65534
+        runAsNonRoot: true
+        runAsUser: 65534
+      containers:
+        - name: prometheus
+          image: prom/prometheus:v3.2.1
+          args:
+            - --config.file=/etc/prometheus/prometheus.yml
+            - --storage.tsdb.path=/prometheus
+            - --storage.tsdb.retention.time=15d
+            # Size cap in addition to the time cap: the data volume is a fixed
+            # 20Gi PVC, so a spike in scraped or remote-written series could
+            # otherwise fill the disk before 15 days elapse. Whichever limit is
+            # reached first applies; the gap below 20Gi is headroom.
+            - --storage.tsdb.retention.size=16GB
+            # Accept pushes on /api/v1/write (used by Alloy on indri).
+            - --web.enable-remote-write-receiver
+            # Enables the HTTP reload/quit endpoints.
+            - --web.enable-lifecycle
+          ports:
+            - name: http
+              containerPort: 9090
+          volumeMounts:
+            - name: config
+              mountPath: /etc/prometheus
+            - name: data
+              mountPath: /prometheus
+          resources:
+            requests:
+              memory: "256Mi"
+              cpu: "100m"
+            limits:
+              memory: "1Gi"
+              cpu: "500m"
+          livenessProbe:
+            httpGet:
+              path: /-/healthy
+              port: 9090
+            initialDelaySeconds: 30
+            periodSeconds: 15
+          readinessProbe:
+            httpGet:
+              path: /-/ready
+              port: 9090
+            initialDelaySeconds: 5
+            periodSeconds: 5
+      volumes:
+        # prometheus.yml, sourced from the prometheus-config ConfigMap.
+        - name: config
+          configMap:
+            name: prometheus-config
+  # Persistent storage for /prometheus (the TSDB path set in args).
+  volumeClaimTemplates:
+    - metadata:
+        name: data
+      spec:
+        accessModes: ["ReadWriteOnce"]
+        resources:
+          requests:
+            storage: 20Gi