Add OpenTelemetry distributed tracing (Tempo + Beyla eBPF) (#286)
## Summary
Adds the third observability pillar — **distributed tracing** — alongside existing metrics (Prometheus) and logs (Loki).
- **Grafana Tempo 2.10.1** on minikube-indri for trace storage with 7d retention, OTLP receivers, and `metrics_generator` that remote-writes span-metrics (RED) to Prometheus
- **Beyla eBPF auto-instrumentation** via a privileged Alloy DaemonSet on ringtail — instruments HTTP services (Frigate, ntfy, Ollama, Immich) without code changes
- **Grafana integration** — Tempo datasource with trace↔log and trace↔metrics correlation, plus Loki derivedFields for trace ID linking
- **Prometheus** scrapes Tempo operational metrics
### Architecture
```
ringtail (k3s)                              indri (minikube)
┌──────────────────────┐                    ┌─────────────────────┐
│ Alloy+Beyla (eBPF)   │──OTLP HTTP───────→ │ Tempo               │
│  ↳ Frigate, ntfy,    │   via tailnet      │  ↳ trace storage    │
│    Ollama, Immich    │                    │  ↳ RED → Prometheus │
└──────────────────────┘                    │                     │
                                            │ Grafana             │
                                            │  ↳ Tempo datasource │
                                            └─────────────────────┘
```
### New files (14)
- `docs/reference/services/tempo.md` — reference doc
- `docs/changelog.d/feature-otel-tracing.feature.md`
- `argocd/apps/tempo.yaml` + `argocd/manifests/tempo/` (6 files)
- `argocd/apps/alloy-tracing-ringtail.yaml` + `argocd/manifests/alloy-tracing-ringtail/` (4 files)
### Modified files (6)
- `argocd/manifests/grafana/datasources.yaml` — Tempo datasource + Loki derivedFields
- `argocd/manifests/prometheus/prometheus.yml` — Tempo scrape target
- `service-versions.yaml` — tempo + alloy-tracing-ringtail entries
- `docs/reference/services/grafana.md` — Tempo in datasources table
- `docs/reference/reference.md` — Tempo in services index
- `docs/reference/operations/observability.md` — Tempo in components list
## Deployment and Testing
- [ ] Sync `apps` app to pick up new Application definitions
- [ ] `argocd app set tempo --revision feature/otel-tracing && argocd app sync tempo`
- [ ] Verify Tempo pod: `kubectl --context=minikube-indri get pods -n monitoring -l app=tempo`
- [ ] Verify Tempo ready: port-forward 3200 and `curl localhost:3200/ready`
- [ ] Verify Tailscale ingresses: `kubectl --context=minikube-indri get ingress -n monitoring`
- [ ] `argocd app set alloy-tracing-ringtail --revision feature/otel-tracing && argocd app sync alloy-tracing-ringtail`
- [ ] Check Beyla discovery in alloy-tracing logs on ringtail
- [ ] Sync grafana-config for updated datasources
- [ ] Sync prometheus for updated scrape config
- [ ] Test Grafana Tempo datasource connection
- [ ] Generate test traffic and search traces in Grafana Explore → Tempo
- [ ] After merge: reset all ArgoCD app revisions back to main
Reviewed-on: #286
This commit is contained in:
parent
d15071aaf9
commit
c281fb5403
23 changed files with 1077 additions and 2 deletions
17
argocd/apps/alloy-tracing-ringtail.yaml
Normal file
17
argocd/apps/alloy-tracing-ringtail.yaml
Normal file
|
|
@ -0,0 +1,17 @@
|
|||
---
# Argo CD Application: deploys the Alloy tracing manifests
# (argocd/manifests/alloy-tracing-ringtail) to the ringtail cluster.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: alloy-tracing-ringtail
  namespace: argocd
spec:
  project: default
  # Target cluster API endpoint and the namespace resources land in.
  destination:
    server: 'https://ringtail.tail8d86e.ts.net:6443'
    namespace: alloy
  # Git location of the manifests to render.
  source:
    repoURL: 'ssh://forgejo@forge.ops.eblu.me:2222/eblume/blumeops.git'
    targetRevision: main
    path: argocd/manifests/alloy-tracing-ringtail
  # Only sync options are configured; no `automated` policy is declared.
  syncPolicy:
    syncOptions:
      # Create the destination namespace if it does not exist yet.
      - CreateNamespace=true
|
||||
17
argocd/apps/tempo.yaml
Normal file
17
argocd/apps/tempo.yaml
Normal file
|
|
@ -0,0 +1,17 @@
|
|||
---
# Argo CD Application: deploys the Tempo manifests (argocd/manifests/tempo)
# into the `monitoring` namespace of the cluster Argo CD itself runs in.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: tempo
  namespace: argocd
spec:
  project: default
  # In-cluster API endpoint and the namespace resources land in.
  destination:
    server: 'https://kubernetes.default.svc'
    namespace: monitoring
  # Git location of the manifests to render.
  source:
    repoURL: 'ssh://forgejo@forge.ops.eblu.me:2222/eblume/blumeops.git'
    targetRevision: main
    path: argocd/manifests/tempo
  # Only sync options are configured; no `automated` policy is declared.
  syncPolicy:
    syncOptions:
      # Create the destination namespace if it does not exist yet.
      - CreateNamespace=true
|
||||
93
argocd/manifests/alloy-tracing-ringtail/config.alloy
Normal file
93
argocd/manifests/alloy-tracing-ringtail/config.alloy
Normal file
|
|
@ -0,0 +1,93 @@
|
|||
// Alloy tracing pipeline for ringtail.
//
// Data flow: beyla.ebpf (HTTP auto-instrumentation)
//              -> otelcol.processor.batch
//              -> otelcol.processor.attributes (adds cluster=ringtail)
//              -> otelcol.exporter.otlphttp (Tempo on indri)
//
// Components are declared sink-first below. Alloy connects components through
// the references in their `output` blocks, so declaration order does not
// affect the pipeline.

// ============== EXPORT ==============

// Ship traces to the Tempo OTLP/HTTP receiver on indri over the tailnet.
otelcol.exporter.otlphttp "tempo" {
  client {
    endpoint = "https://tempo-otlp.tail8d86e.ts.net"

    tls {
      // NOTE(review): server certificate verification is switched off here.
      // Confirm whether the endpoint's certificate can be verified instead.
      insecure_skip_verify = true
    }
  }
}

// ============== PROCESSING ==============

// Upsert a `cluster` attribute with value "ringtail" on every span.
otelcol.processor.attributes "add_cluster" {
  action {
    key    = "cluster"
    value  = "ringtail"
    action = "upsert"
  }

  output {
    traces = [otelcol.exporter.otlphttp.tempo.input]
  }
}

// Group spans into batches (component defaults) before further processing.
otelcol.processor.batch "default" {
  output {
    traces = [otelcol.processor.attributes.add_cluster.input]
  }
}

// ============== BEYLA eBPF AUTO-INSTRUMENTATION ==============

beyla.ebpf "http_services" {
  discovery {
    // Candidates: any process listening on a port in 80-9999.
    instrument {
      open_ports = "80-9999"
    }

    // Exclusions by namespace: cluster infrastructure.
    exclude_instrument {
      kubernetes {
        namespace = "kube-system"
      }
    }
    exclude_instrument {
      kubernetes {
        namespace = "tailscale"
      }
    }

    // Exclusions by pod label: the Alloy agents themselves and other
    // infrastructure pods.
    exclude_instrument {
      kubernetes {
        pod_labels = { app = "alloy" }
      }
    }
    exclude_instrument {
      kubernetes {
        pod_labels = { app = "alloy-tracing" }
      }
    }
    exclude_instrument {
      kubernetes {
        pod_labels = { app = "kube-state-metrics" }
      }
    }
    exclude_instrument {
      kubernetes {
        pod_labels = { "app.kubernetes.io/name" = "nvidia-device-plugin" }
      }
    }
  }

  // Decorate telemetry with Kubernetes metadata for this cluster.
  attributes {
    kubernetes {
      enable       = "true"
      cluster_name = "ringtail"
    }
  }

  // Only HTTP instrumentation is enabled for traces.
  traces {
    instrumentations = ["http"]
  }

  // Hand spans to the batch processor.
  output {
    traces = [otelcol.processor.batch.default.input]
  }
}
|
||||
56
argocd/manifests/alloy-tracing-ringtail/daemonset.yaml
Normal file
56
argocd/manifests/alloy-tracing-ringtail/daemonset.yaml
Normal file
|
|
@ -0,0 +1,56 @@
|
|||
---
# DaemonSet running Grafana Alloy with the Beyla eBPF tracing pipeline.
# The pod is privileged and shares the host PID namespace.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: alloy-tracing
  namespace: alloy
  labels:
    app: alloy-tracing
spec:
  selector:
    matchLabels:
      app: alloy-tracing
  template:
    metadata:
      labels:
        app: alloy-tracing
    spec:
      serviceAccountName: alloy-tracing
      # Share the host's PID namespace with this pod.
      hostPID: true
      # Tolerate every taint so the pod can schedule on any node.
      tolerations:
        - operator: Exists
      volumes:
        # Rendered from config.alloy by the kustomize configMapGenerator.
        - name: config
          configMap:
            name: alloy-tracing-config
        # Scratch storage for Alloy; discarded when the pod goes away.
        - name: data
          emptyDir: {}
      containers:
        - name: alloy
          # The image tag is pinned via `images` in kustomization.yaml.
          image: grafana/alloy
          args:
            - run
            - --server.http.listen-addr=0.0.0.0:12346
            - --storage.path=/var/lib/alloy/data
            - /etc/alloy/config.alloy
          env:
            # Expose the node name to the process as HOSTNAME.
            - name: HOSTNAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
          ports:
            - name: http
              containerPort: 12346
          securityContext:
            privileged: true
          resources:
            requests:
              cpu: 100m
              memory: 256Mi
            limits:
              cpu: "1"
              memory: 1Gi
          volumeMounts:
            - name: config
              mountPath: /etc/alloy
            - name: data
              mountPath: /var/lib/alloy/data
|
||||
17
argocd/manifests/alloy-tracing-ringtail/kustomization.yaml
Normal file
17
argocd/manifests/alloy-tracing-ringtail/kustomization.yaml
Normal file
|
|
@ -0,0 +1,17 @@
|
|||
---
# Kustomization for the Alloy tracing agent on ringtail.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

# Namespace applied to the namespaced resources below.
namespace: alloy

resources:
  - rbac.yaml
  - daemonset.yaml

# Pin the Alloy image tag; the DaemonSet references the bare image name.
images:
  - name: grafana/alloy
    newTag: 'v1.13.1'

# Generate the ConfigMap mounted at /etc/alloy from the Alloy config file.
configMapGenerator:
  - name: alloy-tracing-config
    files:
      - config.alloy
|
||||
30
argocd/manifests/alloy-tracing-ringtail/rbac.yaml
Normal file
30
argocd/manifests/alloy-tracing-ringtail/rbac.yaml
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
---
# Identity the alloy-tracing DaemonSet pods run as.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: alloy-tracing
  namespace: alloy
---
# Cluster-wide read-only access (get/list/watch) to core and apps workload
# objects.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: alloy-tracing
rules:
  # Core API group.
  - apiGroups:
      - ""
    resources:
      - pods
      - services
      - endpoints
      - nodes
      - namespaces
    verbs:
      - get
      - list
      - watch
  # Workload controllers.
  - apiGroups:
      - apps
    resources:
      - deployments
      - replicasets
      - statefulsets
      - daemonsets
    verbs:
      - get
      - list
      - watch
---
# Grant the ClusterRole above to the alloy-tracing ServiceAccount.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: alloy-tracing
subjects:
  - kind: ServiceAccount
    name: alloy-tracing
    namespace: alloy
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: alloy-tracing
|
||||
491
argocd/manifests/grafana-config/dashboards/configmap-tempo.yaml
Normal file
491
argocd/manifests/grafana-config/dashboards/configmap-tempo.yaml
Normal file
|
|
@ -0,0 +1,491 @@
|
|||
---
# Grafana dashboard "Tempo" (uid: tempo-homelab) packaged as a ConfigMap.
# All panels query the `prometheus` datasource for series with job="tempo".
# Layout: a row of four stat panels (storage, PVC %, blocks, heap) followed by
# four time-series panels (storage, span rate, byte rate, query latency).
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-tempo
  namespace: monitoring
  labels:
    grafana_dashboard: "1"
data:
  tempo.json: |
    {
      "title": "Tempo",
      "uid": "tempo-homelab",
      "id": null,
      "version": 1,
      "schemaVersion": 38,
      "tags": ["tempo", "tracing"],
      "editable": true,
      "refresh": "1m",
      "time": { "from": "now-24h", "to": "now" },
      "timepicker": {},
      "timezone": "",
      "weekStart": "",
      "fiscalYearStartMonth": 0,
      "graphTooltip": 0,
      "annotations": { "list": [] },
      "templating": { "list": [] },
      "links": [],
      "panels": [
        {
          "id": 1,
          "title": "Storage Used",
          "type": "stat",
          "pluginVersion": "10.0.0",
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "gridPos": { "h": 4, "w": 6, "x": 0, "y": 0 },
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "sum(tempodb_backend_bytes_total{job=\"tempo\"})",
              "refId": "A"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "color": { "mode": "thresholds" },
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "yellow", "value": 5368709120 },
                  { "color": "red", "value": 8589934592 }
                ]
              },
              "unit": "bytes"
            },
            "overrides": []
          },
          "options": {
            "colorMode": "value",
            "graphMode": "none",
            "justifyMode": "auto",
            "orientation": "auto",
            "reduceOptions": {
              "calcs": ["lastNotNull"],
              "fields": "",
              "values": false
            },
            "textMode": "auto"
          }
        },
        {
          "id": 2,
          "title": "PVC Utilization (of 10Gi)",
          "type": "stat",
          "pluginVersion": "10.0.0",
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "gridPos": { "h": 4, "w": 6, "x": 6, "y": 0 },
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "sum(tempodb_backend_bytes_total{job=\"tempo\"}) / 10737418240 * 100",
              "refId": "A"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "color": { "mode": "thresholds" },
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "yellow", "value": 50 },
                  { "color": "red", "value": 80 }
                ]
              },
              "unit": "percent",
              "max": 100,
              "min": 0
            },
            "overrides": []
          },
          "options": {
            "colorMode": "value",
            "graphMode": "none",
            "justifyMode": "auto",
            "orientation": "auto",
            "reduceOptions": {
              "calcs": ["lastNotNull"],
              "fields": "",
              "values": false
            },
            "textMode": "auto"
          }
        },
        {
          "id": 3,
          "title": "Total Blocks",
          "type": "stat",
          "pluginVersion": "10.0.0",
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "gridPos": { "h": 4, "w": 6, "x": 12, "y": 0 },
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "sum(tempodb_blocklist_length{job=\"tempo\"})",
              "refId": "A"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "color": { "mode": "thresholds" },
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [{ "color": "green", "value": null }]
              },
              "unit": "short"
            },
            "overrides": []
          },
          "options": {
            "colorMode": "value",
            "graphMode": "none",
            "justifyMode": "auto",
            "orientation": "auto",
            "reduceOptions": {
              "calcs": ["lastNotNull"],
              "fields": "",
              "values": false
            },
            "textMode": "auto"
          }
        },
        {
          "id": 4,
          "title": "Heap Usage",
          "type": "stat",
          "pluginVersion": "10.0.0",
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "gridPos": { "h": 4, "w": 6, "x": 18, "y": 0 },
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "1 - (go_memstats_heap_idle_bytes{job=\"tempo\"} / go_memstats_heap_sys_bytes{job=\"tempo\"})",
              "refId": "A"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "color": { "mode": "thresholds" },
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [
                  { "color": "green", "value": null },
                  { "color": "yellow", "value": 0.5 },
                  { "color": "red", "value": 0.9 }
                ]
              },
              "unit": "percentunit"
            },
            "overrides": []
          },
          "options": {
            "colorMode": "value",
            "graphMode": "none",
            "justifyMode": "auto",
            "orientation": "auto",
            "reduceOptions": {
              "calcs": ["lastNotNull"],
              "fields": "",
              "values": false
            },
            "textMode": "auto"
          }
        },
        {
          "id": 5,
          "title": "Storage Over Time",
          "type": "timeseries",
          "pluginVersion": "10.0.0",
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 },
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "sum(tempodb_backend_bytes_total{job=\"tempo\"})",
              "legendFormat": "Backend Storage",
              "refId": "A"
            },
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "go_memstats_heap_inuse_bytes{job=\"tempo\"}",
              "legendFormat": "Heap In Use",
              "refId": "B"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "color": { "mode": "palette-classic" },
              "custom": {
                "axisBorderShow": false,
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 10,
                "gradientMode": "none",
                "hideFrom": { "legend": false, "tooltip": false, "viz": false },
                "insertNulls": false,
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": { "type": "linear" },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": { "group": "A", "mode": "none" },
                "thresholdsStyle": { "mode": "off" }
              },
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [{ "color": "green", "value": null }]
              },
              "unit": "bytes"
            },
            "overrides": []
          },
          "options": {
            "legend": {
              "calcs": ["lastNotNull"],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": { "mode": "multi", "sort": "desc" }
          }
        },
        {
          "id": 6,
          "title": "Span Ingestion Rate",
          "type": "timeseries",
          "pluginVersion": "10.0.0",
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 },
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "rate(tempo_distributor_spans_received_total{job=\"tempo\"}[5m])",
              "legendFormat": "Spans/sec",
              "refId": "A"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "color": { "mode": "palette-classic" },
              "custom": {
                "axisBorderShow": false,
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 10,
                "gradientMode": "none",
                "hideFrom": { "legend": false, "tooltip": false, "viz": false },
                "insertNulls": false,
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": { "type": "linear" },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": { "group": "A", "mode": "none" },
                "thresholdsStyle": { "mode": "off" }
              },
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [{ "color": "green", "value": null }]
              },
              "unit": "short"
            },
            "overrides": []
          },
          "options": {
            "legend": {
              "calcs": ["mean", "max"],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": { "mode": "multi", "sort": "desc" }
          }
        },
        {
          "id": 7,
          "title": "Ingestion Throughput",
          "type": "timeseries",
          "pluginVersion": "10.0.0",
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 },
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "rate(tempo_distributor_bytes_received_total{job=\"tempo\"}[5m])",
              "legendFormat": "Bytes Received",
              "refId": "A"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "color": { "mode": "palette-classic" },
              "custom": {
                "axisBorderShow": false,
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 10,
                "gradientMode": "none",
                "hideFrom": { "legend": false, "tooltip": false, "viz": false },
                "insertNulls": false,
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": { "type": "linear" },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": { "group": "A", "mode": "none" },
                "thresholdsStyle": { "mode": "off" }
              },
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [{ "color": "green", "value": null }]
              },
              "unit": "Bps"
            },
            "overrides": []
          },
          "options": {
            "legend": {
              "calcs": ["mean", "max"],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": { "mode": "multi", "sort": "desc" }
          }
        },
        {
          "id": 8,
          "title": "Query Latency",
          "type": "timeseries",
          "pluginVersion": "10.0.0",
          "datasource": { "type": "prometheus", "uid": "prometheus" },
          "gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 },
          "targets": [
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "histogram_quantile(0.95, sum(rate(tempo_query_frontend_result_metrics_duration_seconds_bucket{job=\"tempo\"}[5m])) by (le))",
              "legendFormat": "p95",
              "refId": "A"
            },
            {
              "datasource": { "type": "prometheus", "uid": "prometheus" },
              "expr": "histogram_quantile(0.50, sum(rate(tempo_query_frontend_result_metrics_duration_seconds_bucket{job=\"tempo\"}[5m])) by (le))",
              "legendFormat": "p50",
              "refId": "B"
            }
          ],
          "fieldConfig": {
            "defaults": {
              "color": { "mode": "palette-classic" },
              "custom": {
                "axisBorderShow": false,
                "axisCenteredZero": false,
                "axisColorMode": "text",
                "axisLabel": "",
                "axisPlacement": "auto",
                "barAlignment": 0,
                "drawStyle": "line",
                "fillOpacity": 10,
                "gradientMode": "none",
                "hideFrom": { "legend": false, "tooltip": false, "viz": false },
                "insertNulls": false,
                "lineInterpolation": "linear",
                "lineWidth": 1,
                "pointSize": 5,
                "scaleDistribution": { "type": "linear" },
                "showPoints": "never",
                "spanNulls": false,
                "stacking": { "group": "A", "mode": "none" },
                "thresholdsStyle": { "mode": "off" }
              },
              "mappings": [],
              "thresholds": {
                "mode": "absolute",
                "steps": [{ "color": "green", "value": null }]
              },
              "unit": "s"
            },
            "overrides": []
          },
          "options": {
            "legend": {
              "calcs": ["mean", "max"],
              "displayMode": "table",
              "placement": "bottom",
              "showLegend": true
            },
            "tooltip": { "mode": "multi", "sort": "desc" }
          }
        }
      ]
    }
|
||||
|
|
@ -25,6 +25,7 @@ resources:
|
|||
- dashboards/configmap-flyio.yaml
|
||||
- dashboards/configmap-sifaka-disks.yaml
|
||||
- dashboards/configmap-forgejo.yaml
|
||||
- dashboards/configmap-tempo.yaml
|
||||
# TeslaMate dashboards
|
||||
- dashboards/configmap-teslamate-overview.yaml
|
||||
- dashboards/configmap-teslamate-charges.yaml
|
||||
|
|
|
|||
|
|
@ -15,6 +15,39 @@ datasources:
|
|||
type: loki
|
||||
uid: loki
|
||||
url: http://loki.monitoring.svc.cluster.local:3100
|
||||
jsonData:
|
||||
derivedFields:
|
||||
- datasourceUid: tempo
|
||||
matcherRegex: '"traceID":"(\w+)"'
|
||||
name: TraceID
|
||||
url: "$${__value.raw}"
|
||||
# Tempo datasource, reached in-cluster through the `tempo` Service.
# `$$` is used wherever a literal `$` must reach Grafana.
- name: Tempo
  type: tempo
  uid: tempo
  orgId: 1
  access: proxy
  editable: false
  url: http://tempo.monitoring.svc.cluster.local:3200
  jsonData:
    # Trace -> logs: jump to Loki, filtering on the trace ID only.
    tracesToLogsV2:
      datasourceUid: loki
      filterByTraceID: true
      filterBySpanID: false
    # Trace -> metrics: RED queries over the span metrics in Prometheus,
    # evaluated from one hour before the span start to one hour after its end.
    tracesToMetrics:
      datasourceUid: prometheus
      spanStartTimeShift: "-1h"
      spanEndTimeShift: "1h"
      queries:
        - name: Request rate
          query: 'sum(rate(traces_spanmetrics_calls_total{$$__tags}[5m]))'
        - name: Error rate
          query: 'sum(rate(traces_spanmetrics_calls_total{$$__tags, status_code="STATUS_CODE_ERROR"}[5m]))'
        - name: Duration (p95)
          query: 'histogram_quantile(0.95, sum(rate(traces_spanmetrics_duration_seconds_bucket{$$__tags}[5m])) by (le))'
    # Service map is drawn from metrics stored in Prometheus.
    serviceMap:
      datasourceUid: prometheus
    nodeGraph:
      enabled: true
|
||||
- access: proxy
|
||||
database: teslamate
|
||||
editable: false
|
||||
|
|
|
|||
|
|
@ -64,6 +64,14 @@ scrape_configs:
|
|||
- target_label: cluster
|
||||
replacement: indri
|
||||
|
||||
# Tempo operational metrics, scraped from its HTTP port (3200)
- job_name: tempo
  static_configs:
    - targets:
        - 'tempo.monitoring.svc.cluster.local:3200'
  metric_relabel_configs:
    # Set cluster="indri" on every series scraped by this job.
    - target_label: cluster
      replacement: indri
|
||||
|
||||
# Frigate NVR metrics (via Caddy on indri — Frigate runs on ringtail)
|
||||
- job_name: "frigate"
|
||||
scheme: https
|
||||
|
|
|
|||
27
argocd/manifests/tempo/ingress-tailscale-otlp.yaml
Normal file
27
argocd/manifests/tempo/ingress-tailscale-otlp.yaml
Normal file
|
|
@ -0,0 +1,27 @@
|
|||
---
# Tailscale Ingress in front of Tempo's OTLP/HTTP receiver (Service port 4318).
# The Alloy agent on ringtail pushes traces through this endpoint over the
# tailnet; Funnel is explicitly disabled.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: tempo-otlp-tailscale
  namespace: monitoring
  annotations:
    tailscale.com/funnel: "false"
    tailscale.com/proxy-group: "ingress"
    tailscale.com/tags: "tag:k8s"
    gethomepage.dev/enabled: "false"
spec:
  ingressClassName: tailscale
  tls:
    - hosts:
        - tempo-otlp
  rules:
    - http:
        paths:
          # Forward every path to the OTLP/HTTP port of the tempo Service.
          - path: /
            pathType: Prefix
            backend:
              service:
                name: tempo
                port:
                  number: 4318
|
||||
26
argocd/manifests/tempo/ingress-tailscale.yaml
Normal file
26
argocd/manifests/tempo/ingress-tailscale.yaml
Normal file
|
|
@ -0,0 +1,26 @@
|
|||
---
# Tailscale Ingress in front of Tempo's HTTP/query API (Service port 3200).
# Funnel is explicitly disabled.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: tempo-tailscale
  namespace: monitoring
  annotations:
    tailscale.com/funnel: "false"
    tailscale.com/proxy-group: "ingress"
    tailscale.com/tags: "tag:k8s"
    gethomepage.dev/enabled: "false"
spec:
  ingressClassName: tailscale
  tls:
    - hosts:
        - tempo
  rules:
    - http:
        paths:
          # Forward every path to the HTTP port of the tempo Service.
          - path: /
            pathType: Prefix
            backend:
              service:
                name: tempo
                port:
                  number: 3200
|
||||
19
argocd/manifests/tempo/kustomization.yaml
Normal file
19
argocd/manifests/tempo/kustomization.yaml
Normal file
|
|
@ -0,0 +1,19 @@
|
|||
---
# Kustomization for Grafana Tempo.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

# Namespace applied to the namespaced resources below.
namespace: monitoring

resources:
  - statefulset.yaml
  - service.yaml
  - ingress-tailscale.yaml
  - ingress-tailscale-otlp.yaml

# Pin the Tempo image tag; the StatefulSet references the bare image name.
images:
  - name: grafana/tempo
    newTag: '2.10.1'

# Generate the ConfigMap mounted at /etc/tempo from the Tempo config file.
configMapGenerator:
  - name: tempo-config
    files:
      - tempo.yaml
|
||||
22
argocd/manifests/tempo/service.yaml
Normal file
22
argocd/manifests/tempo/service.yaml
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
---
# ClusterIP Service exposing Tempo's API and OTLP receiver ports.
apiVersion: v1
kind: Service
metadata:
  name: tempo
  namespace: monitoring
spec:
  type: ClusterIP
  selector:
    app: tempo
  ports:
    # Tempo HTTP API (also serves /ready, used by the pod probes).
    - name: http
      port: 3200
      targetPort: 3200
    # Tempo gRPC server port.
    - name: grpc
      port: 9095
      targetPort: 9095
    # OTLP receivers: gRPC and HTTP.
    - name: otlp-grpc
      port: 4317
      targetPort: 4317
    - name: otlp-http
      port: 4318
      targetPort: 4318
|
||||
70
argocd/manifests/tempo/statefulset.yaml
Normal file
70
argocd/manifests/tempo/statefulset.yaml
Normal file
|
|
@ -0,0 +1,70 @@
|
|||
---
# Single-replica Tempo StatefulSet with a 10Gi data volume at /var/tempo.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: tempo
  namespace: monitoring
spec:
  serviceName: tempo
  replicas: 1
  selector:
    matchLabels:
      app: tempo
  # One PVC per replica, mounted as the `data` volume.
  volumeClaimTemplates:
    - metadata:
        name: data
      spec:
        accessModes:
          - ReadWriteOnce
        resources:
          requests:
            storage: 10Gi
  template:
    metadata:
      labels:
        app: tempo
    spec:
      # Run as non-root UID 10001; fsGroup applies the same GID to volumes.
      securityContext:
        runAsNonRoot: true
        runAsUser: 10001
        fsGroup: 10001
      volumes:
        # Rendered from tempo.yaml by the kustomize configMapGenerator.
        - name: config
          configMap:
            name: tempo-config
      containers:
        - name: tempo
          # The image tag is pinned via `images` in kustomization.yaml.
          image: grafana/tempo
          args:
            - -config.file=/etc/tempo/tempo.yaml
          ports:
            - name: http
              containerPort: 3200
            - name: grpc
              containerPort: 9095
            - name: otlp-grpc
              containerPort: 4317
            - name: otlp-http
              containerPort: 4318
          resources:
            requests:
              cpu: "100m"
              memory: "256Mi"
            limits:
              cpu: "500m"
              memory: "1Gi"
          # Both probes poll /ready on the HTTP port; liveness starts later
          # (45s) and polls less often than readiness.
          readinessProbe:
            httpGet:
              path: /ready
              port: 3200
            initialDelaySeconds: 10
            periodSeconds: 5
          livenessProbe:
            httpGet:
              path: /ready
              port: 3200
            initialDelaySeconds: 45
            periodSeconds: 10
          volumeMounts:
            - name: config
              mountPath: /etc/tempo
            - name: data
              mountPath: /var/tempo
|
||||
58
argocd/manifests/tempo/tempo.yaml
Normal file
58
argocd/manifests/tempo/tempo.yaml
Normal file
|
|
@ -0,0 +1,58 @@
|
|||
# Tempo configuration: OTLP receivers, local block storage with 7-day
# retention, and a metrics generator that remote-writes to Prometheus.
stream_over_http_enabled: true

server:
  http_listen_port: 3200
  grpc_listen_port: 9095

# OTLP receivers on all interfaces: gRPC on 4317, HTTP on 4318.
distributor:
  receivers:
    otlp:
      protocols:
        grpc:
          endpoint: "0.0.0.0:4317"
        http:
          endpoint: "0.0.0.0:4318"

# Traces are kept on the local filesystem under /var/tempo.
storage:
  trace:
    backend: local
    wal:
      path: /var/tempo/wal
    local:
      path: /var/tempo/blocks

compactor:
  compaction:
    block_retention: 168h  # 7 days

metrics_generator:
  registry:
    # Added to every generated series.
    external_labels:
      source: tempo
  storage:
    path: /var/tempo/generator/wal
    remote_write:
      - url: http://prometheus.monitoring.svc.cluster.local:9090/api/v1/write
        send_exemplars: true
  traces_storage:
    path: /var/tempo/generator/traces
  processor:
    span_metrics:
      # Every dimension becomes a Prometheus label, so only bounded-cardinality
      # attributes belong here.
      dimensions:
        - service.name
        # Legacy HTTP attribute names, kept for senders that still emit them.
        - http.method
        - http.status_code
        # Stable OpenTelemetry HTTP attribute names.
        # NOTE(review): Beyla is expected to emit these rather than the legacy
        # names above — confirm against the Beyla version in use.
        - http.request.method
        - http.response.status_code
        # `http.route` (the route template) replaces `http.target`, which is
        # the raw request target including the query string and would create
        # one series per distinct URL.
        - http.route
    service_graphs:
      dimensions:
        - service.name
    local_blocks:
      flush_to_storage: false

# Enable the generator processors for all tenants by default.
overrides:
  defaults:
    metrics_generator:
      processors:
        - span-metrics
        - service-graphs
        - local-blocks
|
||||
Loading…
Add table
Add a link
Reference in a new issue