diff --git a/README.md b/README.md index 8ba6b8d..ee4c3c8 100644 --- a/README.md +++ b/README.md @@ -31,9 +31,21 @@ flakes), all connected via Tailscale: Authentik SSO) on k3s, plus NixOS systemd services. - **Sifaka** (Synology NAS) - backup target and bulk storage. -Notable services include Grafana/Prometheus/Loki observability, Immich photos, -Jellyfin media, Forgejo git forge, a Zot container registry, and more. Public -access is routed through a Fly.io proxy; everything else is tailnet-only. +Notable services include Immich photos, Jellyfin media, Forgejo git forge, a +Zot container registry, and more. Public access is routed through a Fly.io +proxy; everything else is tailnet-only. + +### Observability stack + +The four(+) pillars of observability — metrics, logs, traces, and profiles — +collected by Grafana Alloy and visualized in Grafana with cross-signal linking: + +| Pillar | Backend | How | +|--------|---------|-----| +| **Metrics** | Prometheus | Alloy scrape + remote_write | +| **Logs** | Loki | Alloy pod log collection | +| **Traces** | Tempo | Alloy Beyla eBPF auto-instrumentation | +| **Profiles** | Pyroscope | Alloy eBPF continuous profiling | ## Project structure diff --git a/argocd/apps/alloy-profiling-ringtail.yaml b/argocd/apps/alloy-profiling-ringtail.yaml new file mode 100644 index 0000000..7f65782 --- /dev/null +++ b/argocd/apps/alloy-profiling-ringtail.yaml @@ -0,0 +1,17 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: alloy-profiling-ringtail + namespace: argocd +spec: + project: default + source: + repoURL: ssh://forgejo@forge.ops.eblu.me:2222/eblume/blumeops.git + targetRevision: main + path: argocd/manifests/alloy-profiling-ringtail + destination: + server: https://ringtail.tail8d86e.ts.net:6443 + namespace: alloy + syncPolicy: + syncOptions: + - CreateNamespace=true diff --git a/argocd/apps/pyroscope.yaml b/argocd/apps/pyroscope.yaml new file mode 100644 index 0000000..0019105 --- /dev/null +++ 
b/argocd/apps/pyroscope.yaml @@ -0,0 +1,17 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: pyroscope + namespace: argocd +spec: + project: default + source: + repoURL: ssh://forgejo@forge.ops.eblu.me:2222/eblume/blumeops.git + targetRevision: main + path: argocd/manifests/pyroscope + destination: + server: https://ringtail.tail8d86e.ts.net:6443 + namespace: pyroscope + syncPolicy: + syncOptions: + - CreateNamespace=true diff --git a/argocd/manifests/alloy-profiling-ringtail/config.alloy b/argocd/manifests/alloy-profiling-ringtail/config.alloy new file mode 100644 index 0000000..7864805 --- /dev/null +++ b/argocd/manifests/alloy-profiling-ringtail/config.alloy @@ -0,0 +1,61 @@ +// Alloy profiling configuration for ringtail +// Uses pyroscope.ebpf to continuously profile workloads and export to Pyroscope + +// ============== KUBERNETES DISCOVERY ============== + +discovery.kubernetes "pods" { + role = "pod" +} + +discovery.relabel "pods" { + targets = discovery.kubernetes.pods.targets + + // Keep namespace label + rule { + source_labels = ["__meta_kubernetes_namespace"] + target_label = "namespace" + } + + // Keep pod name label + rule { + source_labels = ["__meta_kubernetes_pod_name"] + target_label = "pod" + } + + // Keep container name label + rule { + source_labels = ["__meta_kubernetes_pod_container_name"] + target_label = "container" + } + + // Drop infrastructure namespaces + rule { + source_labels = ["namespace"] + regex = "kube-system|tailscale" + action = "drop" + } + + // Drop alloy pods to avoid self-profiling noise + rule { + source_labels = ["__meta_kubernetes_pod_label_app"] + regex = "alloy|alloy-tracing|alloy-profiling" + action = "drop" + } +} + +// ============== eBPF PROFILING ============== + +pyroscope.ebpf "instance" { + forward_to = [pyroscope.write.endpoint.receiver] + targets_only = true + targets = discovery.relabel.pods.output + demangle = "none" +} + +// ============== PYROSCOPE WRITE ============== + 
+pyroscope.write "endpoint" { + endpoint { + url = "http://pyroscope.pyroscope.svc.cluster.local:4040" + } +} diff --git a/argocd/manifests/alloy-profiling-ringtail/daemonset.yaml b/argocd/manifests/alloy-profiling-ringtail/daemonset.yaml new file mode 100644 index 0000000..ee97a0a --- /dev/null +++ b/argocd/manifests/alloy-profiling-ringtail/daemonset.yaml @@ -0,0 +1,56 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: alloy-profiling + namespace: alloy + labels: + app: alloy-profiling +spec: + selector: + matchLabels: + app: alloy-profiling + template: + metadata: + labels: + app: alloy-profiling + spec: + serviceAccountName: alloy-profiling + hostPID: true + containers: + - name: alloy + image: registry.ops.eblu.me/blumeops/alloy:kustomized + args: + - run + - --server.http.listen-addr=0.0.0.0:12347 + - --storage.path=/var/lib/alloy/data + - /etc/alloy/config.alloy + ports: + - containerPort: 12347 + name: http + env: + - name: HOSTNAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: "1" + memory: 1Gi + volumeMounts: + - name: config + mountPath: /etc/alloy + - name: data + mountPath: /var/lib/alloy/data + securityContext: + privileged: true + tolerations: + - operator: Exists + volumes: + - name: config + configMap: + name: alloy-profiling-config + - name: data + emptyDir: {} diff --git a/argocd/manifests/alloy-profiling-ringtail/kustomization.yaml b/argocd/manifests/alloy-profiling-ringtail/kustomization.yaml new file mode 100644 index 0000000..76af63f --- /dev/null +++ b/argocd/manifests/alloy-profiling-ringtail/kustomization.yaml @@ -0,0 +1,17 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: alloy + +resources: + - rbac.yaml + - daemonset.yaml + +images: + - name: registry.ops.eblu.me/blumeops/alloy + newTag: v1.14.0-fd0bebb-nix + +configMapGenerator: + - name: alloy-profiling-config + files: + - config.alloy diff --git 
a/argocd/manifests/alloy-profiling-ringtail/rbac.yaml b/argocd/manifests/alloy-profiling-ringtail/rbac.yaml new file mode 100644 index 0000000..5b0bb04 --- /dev/null +++ b/argocd/manifests/alloy-profiling-ringtail/rbac.yaml @@ -0,0 +1,30 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: alloy-profiling + namespace: alloy +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: alloy-profiling +rules: + - apiGroups: [""] + resources: ["pods", "services", "endpoints", "nodes", "namespaces"] + verbs: ["get", "list", "watch"] + - apiGroups: ["apps"] + resources: ["deployments", "replicasets", "statefulsets", "daemonsets"] + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: alloy-profiling +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: alloy-profiling +subjects: + - kind: ServiceAccount + name: alloy-profiling + namespace: alloy diff --git a/argocd/manifests/grafana/datasources.yaml b/argocd/manifests/grafana/datasources.yaml index 5a3d0f3..286bbf0 100644 --- a/argocd/manifests/grafana/datasources.yaml +++ b/argocd/manifests/grafana/datasources.yaml @@ -48,6 +48,19 @@ datasources: datasourceUid: prometheus nodeGraph: enabled: true + tracesToProfilesV2: + datasourceUid: pyroscope + customQuery: false +- access: proxy + editable: false + name: Pyroscope + orgId: 1 + type: grafana-pyroscope-datasource + uid: pyroscope + url: https://pyroscope.tail8d86e.ts.net + jsonData: + backendType: pyroscope + tlsSkipVerify: true - access: proxy database: teslamate editable: false diff --git a/argocd/manifests/pyroscope/config.yaml b/argocd/manifests/pyroscope/config.yaml new file mode 100644 index 0000000..cc1a136 --- /dev/null +++ b/argocd/manifests/pyroscope/config.yaml @@ -0,0 +1,13 @@ +storage: + backend: filesystem + filesystem: + dir: /data + +compactor: + compaction_interval: 30m + +limits: + max_query_lookback: 168h + +self_profiling: + 
disable_push: true diff --git a/argocd/manifests/pyroscope/ingress-tailscale.yaml b/argocd/manifests/pyroscope/ingress-tailscale.yaml new file mode 100644 index 0000000..4384def --- /dev/null +++ b/argocd/manifests/pyroscope/ingress-tailscale.yaml @@ -0,0 +1,26 @@ +# Tailscale Ingress for Pyroscope query API +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: pyroscope-tailscale + namespace: pyroscope + annotations: + tailscale.com/funnel: "false" + tailscale.com/proxy-group: "ingress" + tailscale.com/tags: "tag:k8s" + gethomepage.dev/enabled: "false" +spec: + ingressClassName: tailscale + rules: + - http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: pyroscope + port: + number: 4040 + tls: + - hosts: + - pyroscope diff --git a/argocd/manifests/pyroscope/kustomization.yaml b/argocd/manifests/pyroscope/kustomization.yaml new file mode 100644 index 0000000..1813cdc --- /dev/null +++ b/argocd/manifests/pyroscope/kustomization.yaml @@ -0,0 +1,19 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: pyroscope + +resources: + - namespace.yaml + - statefulset.yaml + - service.yaml + - ingress-tailscale.yaml + +images: + - name: grafana/pyroscope + newTag: "1.13.3" + +configMapGenerator: + - name: pyroscope-config + files: + - config.yaml diff --git a/argocd/manifests/pyroscope/namespace.yaml b/argocd/manifests/pyroscope/namespace.yaml new file mode 100644 index 0000000..a96c46f --- /dev/null +++ b/argocd/manifests/pyroscope/namespace.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: pyroscope diff --git a/argocd/manifests/pyroscope/service.yaml b/argocd/manifests/pyroscope/service.yaml new file mode 100644 index 0000000..ccb0dc2 --- /dev/null +++ b/argocd/manifests/pyroscope/service.yaml @@ -0,0 +1,13 @@ +apiVersion: v1 +kind: Service +metadata: + name: pyroscope + namespace: pyroscope +spec: + selector: + app: pyroscope + ports: + - name: http + port: 4040 + targetPort: 4040 + 
type: ClusterIP diff --git a/argocd/manifests/pyroscope/statefulset.yaml b/argocd/manifests/pyroscope/statefulset.yaml new file mode 100644 index 0000000..5da57f4 --- /dev/null +++ b/argocd/manifests/pyroscope/statefulset.yaml @@ -0,0 +1,66 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: pyroscope + namespace: pyroscope +spec: + serviceName: pyroscope + replicas: 1 + selector: + matchLabels: + app: pyroscope + template: + metadata: + labels: + app: pyroscope + spec: + securityContext: + fsGroup: 10001 + runAsNonRoot: true + runAsUser: 10001 + seccompProfile: + type: RuntimeDefault + containers: + - name: pyroscope + image: grafana/pyroscope:kustomized + args: + - -config.path=/etc/pyroscope/config.yaml + ports: + - name: http + containerPort: 4040 + volumeMounts: + - name: config + mountPath: /etc/pyroscope + - name: data + mountPath: /data + resources: + requests: + memory: "256Mi" + cpu: "100m" + limits: + memory: "2Gi" + cpu: "500m" + livenessProbe: + httpGet: + path: /ready + port: 4040 + initialDelaySeconds: 45 + periodSeconds: 10 + readinessProbe: + httpGet: + path: /ready + port: 4040 + initialDelaySeconds: 10 + periodSeconds: 5 + volumes: + - name: config + configMap: + name: pyroscope-config + volumeClaimTemplates: + - metadata: + name: data + spec: + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 10Gi diff --git a/docs/changelog.d/feature-pyroscope-profiling.feature.md b/docs/changelog.d/feature-pyroscope-profiling.feature.md new file mode 100644 index 0000000..0d429fe --- /dev/null +++ b/docs/changelog.d/feature-pyroscope-profiling.feature.md @@ -0,0 +1 @@ +Deploy Grafana Pyroscope on ringtail for continuous eBPF profiling, with Alloy collection agent and Grafana cross-signal linking. 
diff --git a/docs/reference/kubernetes/apps.md b/docs/reference/kubernetes/apps.md index 02215fc..db9baca 100644 --- a/docs/reference/kubernetes/apps.md +++ b/docs/reference/kubernetes/apps.md @@ -1,6 +1,6 @@ --- title: Apps -modified: 2026-03-04 +modified: 2026-03-26 tags: - kubernetes - argocd @@ -30,6 +30,8 @@ Registry of all applications deployed via [[argocd]]. | `tempo` | monitoring | `argocd/manifests/tempo/` | [[tempo]] | | `alloy-k8s` | alloy | `argocd/manifests/alloy-k8s/` | [[alloy|Alloy]] | | `alloy-tracing-ringtail` | alloy | `argocd/manifests/alloy-tracing-ringtail/` | [[alloy|Alloy]] (eBPF tracing) | +| `alloy-profiling-ringtail` | alloy | `argocd/manifests/alloy-profiling-ringtail/` | [[alloy|Alloy]] (eBPF profiling) | +| `pyroscope` | pyroscope | `argocd/manifests/pyroscope/` | [[pyroscope]] | | `kube-state-metrics` | monitoring | `argocd/manifests/kube-state-metrics/` | K8s metrics | | `miniflux` | miniflux | `argocd/manifests/miniflux/` | [[miniflux]] | | `kiwix` | kiwix | `argocd/manifests/kiwix/` | [[kiwix]] | diff --git a/docs/reference/operations/observability.md b/docs/reference/operations/observability.md index 35136d5..e136fb6 100644 --- a/docs/reference/operations/observability.md +++ b/docs/reference/operations/observability.md @@ -1,21 +1,30 @@ --- title: Observability -modified: 2026-03-22 +modified: 2026-03-26 tags: - operations --- # Observability -Metrics, logs, traces, and dashboards for BlumeOps infrastructure. +The four(+) pillars of observability — metrics, logs, traces, and profiles — collected and visualized via the Grafana ecosystem. 
## Components -- [[prometheus]] - Metrics storage and querying -- [[loki]] - Log aggregation -- [[tempo]] - Distributed tracing -- [[alloy|Alloy]] - Metrics, log, and trace collection -- [[grafana]] - Dashboards and visualization +| Pillar | Backend | Collector | Cluster | +|--------|---------|-----------|---------| +| **Metrics** | [[prometheus]] | [[alloy]] | indri | +| **Logs** | [[loki]] | [[alloy]] | indri | +| **Traces** | [[tempo]] | [[alloy]] (Beyla eBPF) | indri (backend), ringtail (collection) | +| **Profiles** | [[pyroscope]] | [[alloy]] (pyroscope.ebpf) | ringtail | + +All four are visualized in [[grafana]] with cross-signal linking (traces → logs, traces → profiles, traces → metrics). + +## Future: Frontend Monitoring (RUM) + +Grafana Faro is a Real User Monitoring SDK that captures page loads, web vitals, errors, and network timings from the browser, feeding into Loki (logs) and Tempo (traces) via Alloy's `faro.receiver` component. This would add an "outside-in" view of service health from the user's perspective. + +**Not currently deployed.** RUM captures browsing behavior from visitors to public services, creating a data retention liability. Would require careful sanitization before deploying. ## Alerting diff --git a/docs/reference/services/alloy.md b/docs/reference/services/alloy.md index d781f2f..d62b638 100644 --- a/docs/reference/services/alloy.md +++ b/docs/reference/services/alloy.md @@ -1,6 +1,6 @@ --- title: Alloy -modified: 2026-03-13 +modified: 2026-03-26 tags: - service - observability @@ -8,10 +8,12 @@ tags: # Grafana Alloy -Unified observability collector for metrics and logs with three deployments: +Unified observability collector for metrics, logs, traces, and profiles with five deployments: 1. **Indri (host)** - System metrics and service logs from macOS host 2. **Kubernetes (DaemonSet)** - Automatic pod log collection and service health probes 3. 
**Fly.io proxy (embedded)** - nginx access log metrics and log forwarding from [[flyio-proxy]] +4. **Ringtail tracing (DaemonSet)** - Beyla eBPF auto-instrumentation for HTTP traces +5. **Ringtail profiling (DaemonSet)** - `pyroscope.ebpf` continuous CPU profiling ## Quick Reference @@ -64,4 +66,7 @@ The Homebrew bottle uses `CGO_ENABLED=0`, which breaks Tailscale MagicDNS. Build - [[prometheus]] - Metrics storage - [[loki]] - Log storage +- [[tempo]] - Trace storage +- [[pyroscope]] - Profile storage - [[grafana]] - Visualization +- [[observability]] - Full stack overview diff --git a/docs/reference/services/grafana.md b/docs/reference/services/grafana.md index 3a9ae01..e5e038c 100644 --- a/docs/reference/services/grafana.md +++ b/docs/reference/services/grafana.md @@ -1,6 +1,6 @@ --- title: Grafana -modified: 2026-02-28 +modified: 2026-03-26 tags: - service - observability @@ -37,6 +37,7 @@ The OIDC client secret is injected via [[external-secrets]] (`grafana-authentik- | Prometheus | prometheus | `prometheus.monitoring.svc.cluster.local:9090` | | Loki | loki | `loki.monitoring.svc.cluster.local:3100` | | Tempo | tempo | `tempo.monitoring.svc.cluster.local:3200` | +| Pyroscope | grafana-pyroscope-datasource | `pyroscope.tail8d86e.ts.net` (ringtail, via Tailscale) | | TeslaMate | postgres | `blumeops-pg-rw.databases.svc.cluster.local:5432` | ## Dashboard Provisioning diff --git a/docs/reference/services/pyroscope.md b/docs/reference/services/pyroscope.md new file mode 100644 index 0000000..927d1b7 --- /dev/null +++ b/docs/reference/services/pyroscope.md @@ -0,0 +1,50 @@ +--- +title: Pyroscope +modified: 2026-03-26 +tags: + - service + - observability +--- + +# Grafana Pyroscope + +Continuous profiling backend for BlumeOps. Stores CPU profiles collected by Alloy's eBPF profiler on ringtail, providing function-level visibility into where compute time is spent. 
+ +## Quick Reference + +| Property | Value | +|----------|-------| +| **URL** | https://pyroscope.tail8d86e.ts.net | +| **Namespace** | `pyroscope` | +| **Cluster** | ringtail (k3s) | +| **Deployment** | StatefulSet (`argocd/manifests/pyroscope/`) | +| **Image** | `grafana/pyroscope` | +| **Port** | 4040 | +| **Storage** | 10Gi PVC at `/data` | +| **Retention** | 7-day query lookback (`max_query_lookback: 168h`); no block retention period is set in `config.yaml` | + +## Architecture + +Pyroscope runs on ringtail because eBPF profiling requires Linux. Grafana on indri queries it via Tailscale Ingress. + +``` +Alloy (pyroscope.ebpf on ringtail) → Pyroscope (ringtail) → Grafana (indri, via Tailscale) +``` + +## Collection + +Profiles are collected by the `alloy-profiling-ringtail` DaemonSet, which runs the `pyroscope.ebpf` component in privileged mode with `hostPID: true`. It discovers Kubernetes pods automatically and excludes infrastructure namespaces (`kube-system`, `tailscale`) and Alloy pods. + +The eBPF profiler works without application instrumentation — it samples CPU stack traces from the kernel, covering native code (Go, C/C++), interpreted languages (Python, Ruby, Node.js), and JIT-compiled runtimes (.NET). + +**Limitations:** +- GPU workloads (e.g., Frigate inference via CUDA) are invisible to CPU profiling +- Stripped binaries (no debug symbols) produce opaque stack frames +- Python frame quality varies depending on runtime version + +## Related + +- [[alloy]] - Collection agent +- [[observability]] - Full observability stack overview +- [[grafana]] - Visualization +- [[tempo]] - Distributed tracing (cross-linked via traces-to-profiles)