Monitoring stack migration: the loki and prometheus roles are dropped from the
indri playbook, Loki and Prometheus are added as ArgoCD applications running as
StatefulSets in the "monitoring" namespace, and Alloy and Grafana are repointed
at the new endpoints.

NOTE(review): this patch was recovered from a whitespace-collapsed copy.
Indentation and blank context lines were rebuilt to match the hunk line counts;
confirm the context lines against the target files before applying.

diff --git a/ansible/playbooks/indri.yml b/ansible/playbooks/indri.yml
index cc4ff27..c3d5112 100644
--- a/ansible/playbooks/indri.yml
+++ b/ansible/playbooks/indri.yml
@@ -23,12 +23,8 @@
       tags: [borgmatic]
 
   roles:
-    - role: loki
-      tags: loki
     - role: alloy
       tags: alloy
-    - role: prometheus
-      tags: prometheus
     - role: borgmatic
       tags: borgmatic
     - role: borgmatic_metrics
diff --git a/ansible/roles/alloy/defaults/main.yml b/ansible/roles/alloy/defaults/main.yml
index ec867f9..b01c845 100644
--- a/ansible/roles/alloy/defaults/main.yml
+++ b/ansible/roles/alloy/defaults/main.yml
@@ -4,11 +4,11 @@
 # Textfile collector directory (same as node_exporter for compatibility)
 alloy_textfile_dir: /opt/homebrew/var/node_exporter/textfile
 
-# Prometheus remote write endpoint
-alloy_prometheus_url: "http://localhost:9090/api/v1/write"
+# Prometheus remote write endpoint (k8s via Tailscale)
+alloy_prometheus_url: "https://prometheus.tail8d86e.ts.net/api/v1/write"
 
-# Loki endpoint (used in Phase 2)
-alloy_loki_url: "http://localhost:3100/loki/api/v1/push"
+# Loki endpoint (k8s via Tailscale)
+alloy_loki_url: "https://loki.tail8d86e.ts.net/loki/api/v1/push"
 
 # Instance label for metrics
 alloy_instance_label: indri
@@ -22,34 +22,14 @@ alloy_data_dir: /opt/homebrew/var/lib/grafana-alloy/data
 
 # Log paths to collect
 alloy_brew_logs:
-  - path: /opt/homebrew/var/log/grafana-stdout.log
-    service: grafana
-    stream: stdout
-  - path: /opt/homebrew/var/log/grafana-stderr.log
-    service: grafana
-    stream: stderr
   - path: /opt/homebrew/var/log/forgejo.log
     service: forgejo
     stream: stdout
-  - path: /opt/homebrew/var/log/prometheus.err.log
-    service: prometheus
-    stream: stderr
   - path: /opt/homebrew/var/log/tailscaled.log
     service: tailscale
     stream: stdout
-  - path: /opt/homebrew/var/transmission/transmission-daemon.log
-    service: transmission
-    stream: stdout
-  # NOTE: postgresql and miniflux removed - now hosted in k8s
 
 alloy_mcquack_logs:
-  # NOTE: devpi logs removed - now hosted in k8s
-  - path: /Users/erichblume/Library/Logs/mcquack.kiwix-serve.out.log
-    service: kiwix
-    stream: stdout
-  - path: /Users/erichblume/Library/Logs/mcquack.kiwix-serve.err.log
-    service: kiwix
-    stream: stderr
   - path: /Users/erichblume/Library/Logs/mcquack.borgmatic.out.log
     service: borgmatic
     stream: stdout
@@ -75,8 +55,7 @@ alloy_collect_logs: true
 alloy_collect_zot: true
 alloy_zot_metrics_url: "http://localhost:5050/metrics"
 
-# PostgreSQL metrics collection
-# NOTE: Disabled - brew postgresql removed, k8s CNPG metrics TBD
+# PostgreSQL metrics collection (disabled, CNPG metrics scraped directly by k8s Prometheus)
 alloy_collect_postgres: false
 alloy_postgres_host: localhost
 alloy_postgres_port: 5432
diff --git a/ansible/roles/prometheus/templates/prometheus.yml.j2 b/ansible/roles/prometheus/templates/prometheus.yml.j2
index 4271805..2f966e2 100644
--- a/ansible/roles/prometheus/templates/prometheus.yml.j2
+++ b/ansible/roles/prometheus/templates/prometheus.yml.j2
@@ -2,20 +2,4 @@
 global:
   scrape_interval: 15s
 
-# Note: indri system metrics are pushed via Alloy remote_write
-# Sifaka still uses traditional scraping via node_exporter
-
-scrape_configs:
-  - job_name: "node-exporter-sifaka"
-    static_configs:
-      - targets: ["sifaka:9100"]
-
-  - job_name: "loki"
-    static_configs:
-      - targets: ["localhost:3100"]
-
-  - job_name: "cnpg-postgres"
-    static_configs:
-      - targets: ["cnpg-metrics.tail8d86e.ts.net:9187"]
-        labels:
-          instance: "blumeops-pg"
+scrape_configs: []
diff --git a/argocd/apps/loki.yaml b/argocd/apps/loki.yaml
new file mode 100644
index 0000000..cb9dd41
--- /dev/null
+++ b/argocd/apps/loki.yaml
@@ -0,0 +1,17 @@
+apiVersion: argoproj.io/v1alpha1
+kind: Application
+metadata:
+  name: loki
+  namespace: argocd
+spec:
+  project: default
+  source:
+    repoURL: ssh://forgejo@indri.tail8d86e.ts.net:2200/eblume/blumeops.git
+    targetRevision: main
+    path: argocd/manifests/loki
+  destination:
+    server: https://kubernetes.default.svc
+    namespace: monitoring
+  syncPolicy:
+    syncOptions:
+      - CreateNamespace=true
diff --git a/argocd/apps/prometheus.yaml b/argocd/apps/prometheus.yaml
new file mode 100644
index 0000000..b53a243
--- /dev/null
+++ b/argocd/apps/prometheus.yaml
@@ -0,0 +1,17 @@
+apiVersion: argoproj.io/v1alpha1
+kind: Application
+metadata:
+  name: prometheus
+  namespace: argocd
+spec:
+  project: default
+  source:
+    repoURL: ssh://forgejo@indri.tail8d86e.ts.net:2200/eblume/blumeops.git
+    targetRevision: main
+    path: argocd/manifests/prometheus
+  destination:
+    server: https://kubernetes.default.svc
+    namespace: monitoring
+  syncPolicy:
+    syncOptions:
+      - CreateNamespace=true
diff --git a/argocd/manifests/grafana/values.yaml b/argocd/manifests/grafana/values.yaml
index 00614bb..db2e1a1 100644
--- a/argocd/manifests/grafana/values.yaml
+++ b/argocd/manifests/grafana/values.yaml
@@ -24,8 +24,7 @@ grafana.ini:
     check_for_updates: false
     reporting_enabled: false
 
-# Datasources - point to indri services via docker host gateway
-# host.minikube.internal resolves to the docker host (indri) from inside minikube
+# Datasources - point to k8s-internal services
 datasources:
   datasources.yaml:
     apiVersion: 1
@@ -35,7 +34,7 @@ datasources:
         access: proxy
         orgId: 1
         uid: prometheus
-        url: http://host.minikube.internal:9090
+        url: http://prometheus.monitoring.svc.cluster.local:9090
         isDefault: true
         editable: false
       - name: Loki
@@ -43,7 +42,7 @@ datasources:
         access: proxy
         orgId: 1
         uid: loki
-        url: http://host.minikube.internal:3100
+        url: http://loki.monitoring.svc.cluster.local:3100
         editable: false
 
 # Dashboard provisioning - sidecar watches for ConfigMaps with label
diff --git a/argocd/manifests/loki/configmap.yaml b/argocd/manifests/loki/configmap.yaml
new file mode 100644
index 0000000..19c516b
--- /dev/null
+++ b/argocd/manifests/loki/configmap.yaml
@@ -0,0 +1,58 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: loki-config
+  namespace: monitoring
+data:
+  loki-config.yaml: |
+    auth_enabled: false
+
+    server:
+      http_listen_port: 3100
+      http_listen_address: 0.0.0.0
+      grpc_listen_port: 9096
+
+    common:
+      instance_addr: 127.0.0.1
+      path_prefix: /loki
+      storage:
+        filesystem:
+          chunks_directory: /loki/chunks
+          rules_directory: /loki/rules
+      replication_factor: 1
+      ring:
+        kvstore:
+          store: inmemory
+
+    query_range:
+      results_cache:
+        cache:
+          embedded_cache:
+            enabled: true
+            max_size_mb: 100
+
+    schema_config:
+      configs:
+        - from: 2024-01-01
+          store: tsdb
+          object_store: filesystem
+          schema: v13
+          index:
+            prefix: index_
+            period: 24h
+
+    storage_config:
+      tsdb_shipper:
+        active_index_directory: /loki/tsdb-index
+        cache_location: /loki/tsdb-cache
+
+    limits_config:
+      retention_period: 744h  # 31 days
+
+    compactor:
+      working_directory: /loki/compactor
+      compaction_interval: 10m
+      retention_enabled: true
+      retention_delete_delay: 2h
+      retention_delete_worker_count: 150
+      delete_request_store: filesystem
diff --git a/argocd/manifests/loki/ingress-tailscale.yaml b/argocd/manifests/loki/ingress-tailscale.yaml
new file mode 100644
index 0000000..bee0148
--- /dev/null
+++ b/argocd/manifests/loki/ingress-tailscale.yaml
@@ -0,0 +1,25 @@
+# Tailscale Ingress for Loki
+# Allows Alloy on indri to push logs
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: loki-tailscale
+  namespace: monitoring
+  annotations:
+    tailscale.com/funnel: "false"
+spec:
+  ingressClassName: tailscale
+  rules:
+    - host: loki
+      http:
+        paths:
+          - path: /
+            pathType: Prefix
+            backend:
+              service:
+                name: loki
+                port:
+                  number: 3100
+  tls:
+    - hosts:
+        - loki
diff --git a/argocd/manifests/loki/kustomization.yaml b/argocd/manifests/loki/kustomization.yaml
new file mode 100644
index 0000000..1c65acb
--- /dev/null
+++ b/argocd/manifests/loki/kustomization.yaml
@@ -0,0 +1,10 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+namespace: monitoring
+
+resources:
+  - configmap.yaml
+  - statefulset.yaml
+  - service.yaml
+  - ingress-tailscale.yaml
diff --git a/argocd/manifests/loki/service.yaml b/argocd/manifests/loki/service.yaml
new file mode 100644
index 0000000..74b688e
--- /dev/null
+++ b/argocd/manifests/loki/service.yaml
@@ -0,0 +1,16 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: loki
+  namespace: monitoring
+spec:
+  selector:
+    app: loki
+  ports:
+    - name: http
+      port: 3100
+      targetPort: 3100
+    - name: grpc
+      port: 9096
+      targetPort: 9096
+  type: ClusterIP
diff --git a/argocd/manifests/loki/statefulset.yaml b/argocd/manifests/loki/statefulset.yaml
new file mode 100644
index 0000000..18067b4
--- /dev/null
+++ b/argocd/manifests/loki/statefulset.yaml
@@ -0,0 +1,66 @@
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: loki
+  namespace: monitoring
+spec:
+  serviceName: loki
+  replicas: 1
+  selector:
+    matchLabels:
+      app: loki
+  template:
+    metadata:
+      labels:
+        app: loki
+    spec:
+      securityContext:
+        fsGroup: 10001
+        runAsNonRoot: true
+        runAsUser: 10001
+      containers:
+        - name: loki
+          image: grafana/loki:3.3.2
+          args:
+            - -config.file=/etc/loki/loki-config.yaml
+          ports:
+            - name: http
+              containerPort: 3100
+            - name: grpc
+              containerPort: 9096
+          volumeMounts:
+            - name: config
+              mountPath: /etc/loki
+            - name: data
+              mountPath: /loki
+          resources:
+            requests:
+              memory: "256Mi"
+              cpu: "100m"
+            limits:
+              memory: "1Gi"
+              cpu: "500m"
+          livenessProbe:
+            httpGet:
+              path: /ready
+              port: 3100
+            initialDelaySeconds: 45
+            periodSeconds: 10
+          readinessProbe:
+            httpGet:
+              path: /ready
+              port: 3100
+            initialDelaySeconds: 10
+            periodSeconds: 5
+      volumes:
+        - name: config
+          configMap:
+            name: loki-config
+  volumeClaimTemplates:
+    - metadata:
+        name: data
+      spec:
+        accessModes: ["ReadWriteOnce"]
+        resources:
+          requests:
+            storage: 20Gi
diff --git a/argocd/manifests/prometheus/configmap.yaml b/argocd/manifests/prometheus/configmap.yaml
new file mode 100644
index 0000000..7ae945a
--- /dev/null
+++ b/argocd/manifests/prometheus/configmap.yaml
@@ -0,0 +1,38 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: prometheus-config
+  namespace: monitoring
+data:
+  prometheus.yml: |
+    global:
+      scrape_interval: 15s
+      evaluation_interval: 15s
+
+    # Indri system metrics are pushed via Alloy remote_write
+    # K8s services are scraped directly
+
+    scrape_configs:
+      # Sifaka NAS node-exporter (via LAN - Docker NATs through indri)
+      # Using LAN IP since k8s pods can reach LAN via Docker NAT (same as NFS mounts)
+      # If IP changes, fallback: create Tailscale egress in tailscale-operator/egress-sifaka.yaml
+      - job_name: "node-exporter-sifaka"
+        static_configs:
+          - targets: ["192.168.1.203:9100"]
+
+      # CNPG PostgreSQL metrics (k8s internal)
+      - job_name: "cnpg-postgres"
+        static_configs:
+          - targets: ["blumeops-pg-metrics-tailscale.databases.svc.cluster.local:9187"]
+            labels:
+              instance: "blumeops-pg"
+
+      # Prometheus self-monitoring
+      - job_name: "prometheus"
+        static_configs:
+          - targets: ["localhost:9090"]
+
+      # Loki metrics
+      - job_name: "loki"
+        static_configs:
+          - targets: ["loki.monitoring.svc.cluster.local:3100"]
diff --git a/argocd/manifests/prometheus/ingress-tailscale.yaml b/argocd/manifests/prometheus/ingress-tailscale.yaml
new file mode 100644
index 0000000..1aeaa34
--- /dev/null
+++ b/argocd/manifests/prometheus/ingress-tailscale.yaml
@@ -0,0 +1,25 @@
+# Tailscale Ingress for Prometheus
+# Allows Alloy on indri to push metrics via remote_write
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: prometheus-tailscale
+  namespace: monitoring
+  annotations:
+    tailscale.com/funnel: "false"
+spec:
+  ingressClassName: tailscale
+  rules:
+    - host: prometheus
+      http:
+        paths:
+          - path: /
+            pathType: Prefix
+            backend:
+              service:
+                name: prometheus
+                port:
+                  number: 9090
+  tls:
+    - hosts:
+        - prometheus
diff --git a/argocd/manifests/prometheus/kustomization.yaml b/argocd/manifests/prometheus/kustomization.yaml
new file mode 100644
index 0000000..1c65acb
--- /dev/null
+++ b/argocd/manifests/prometheus/kustomization.yaml
@@ -0,0 +1,10 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+namespace: monitoring
+
+resources:
+  - configmap.yaml
+  - statefulset.yaml
+  - service.yaml
+  - ingress-tailscale.yaml
diff --git a/argocd/manifests/prometheus/service.yaml b/argocd/manifests/prometheus/service.yaml
new file mode 100644
index 0000000..84d1909
--- /dev/null
+++ b/argocd/manifests/prometheus/service.yaml
@@ -0,0 +1,13 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: prometheus
+  namespace: monitoring
+spec:
+  selector:
+    app: prometheus
+  ports:
+    - name: http
+      port: 9090
+      targetPort: 9090
+  type: ClusterIP
diff --git a/argocd/manifests/prometheus/statefulset.yaml b/argocd/manifests/prometheus/statefulset.yaml
new file mode 100644
index 0000000..651451f
--- /dev/null
+++ b/argocd/manifests/prometheus/statefulset.yaml
@@ -0,0 +1,68 @@
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: prometheus
+  namespace: monitoring
+spec:
+  serviceName: prometheus
+  replicas: 1
+  selector:
+    matchLabels:
+      app: prometheus
+  template:
+    metadata:
+      labels:
+        app: prometheus
+    spec:
+      securityContext:
+        fsGroup: 65534
+        runAsNonRoot: true
+        runAsUser: 65534
+      containers:
+        - name: prometheus
+          image: prom/prometheus:v3.2.1
+          args:
+            - --config.file=/etc/prometheus/prometheus.yml
+            - --storage.tsdb.path=/prometheus
+            - --storage.tsdb.retention.time=15d
+            - --web.enable-remote-write-receiver
+            - --web.enable-lifecycle
+          ports:
+            - name: http
+              containerPort: 9090
+          volumeMounts:
+            - name: config
+              mountPath: /etc/prometheus
+            - name: data
+              mountPath: /prometheus
+          resources:
+            requests:
+              memory: "256Mi"
+              cpu: "100m"
+            limits:
+              memory: "1Gi"
+              cpu: "500m"
+          livenessProbe:
+            httpGet:
+              path: /-/healthy
+              port: 9090
+            initialDelaySeconds: 30
+            periodSeconds: 15
+          readinessProbe:
+            httpGet:
+              path: /-/ready
+              port: 9090
+            initialDelaySeconds: 5
+            periodSeconds: 5
+      volumes:
+        - name: config
+          configMap:
+            name: prometheus-config
+  volumeClaimTemplates:
+    - metadata:
+        name: data
+      spec:
+        accessModes: ["ReadWriteOnce"]
+        resources:
+          requests:
+            storage: 20Gi