diff --git a/ansible/roles/alloy/templates/config.alloy.j2 b/ansible/roles/alloy/templates/config.alloy.j2 index 49a079b..51d2c94 100644 --- a/ansible/roles/alloy/templates/config.alloy.j2 +++ b/ansible/roles/alloy/templates/config.alloy.j2 @@ -29,6 +29,11 @@ prometheus.relabel "instance" { target_label = "instance" replacement = "{{ alloy_instance_label }}" } + + rule { + target_label = "cluster" + replacement = "{{ alloy_cluster_label | default('indri') }}" // defaults to "indri"; override per host with alloy_cluster_label + } } // Push metrics to Prometheus via remote_write @@ -110,6 +115,11 @@ loki.relabel "add_host" { target_label = "host" replacement = "{{ alloy_instance_label }}" } + + rule { + target_label = "cluster" + replacement = "{{ alloy_cluster_label | default('indri') }}" // same cluster value as the metrics relabel rule + } } // Write logs to Loki diff --git a/argocd/apps/alloy-ringtail.yaml b/argocd/apps/alloy-ringtail.yaml new file mode 100644 index 0000000..b5d7297 --- /dev/null +++ b/argocd/apps/alloy-ringtail.yaml @@ -0,0 +1,17 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: alloy-ringtail + namespace: argocd +spec: + project: default + source: + repoURL: ssh://forgejo@forge.ops.eblu.me:2222/eblume/blumeops.git + targetRevision: main + path: argocd/manifests/alloy-ringtail + destination: + server: https://ringtail.tail8d86e.ts.net:6443 + namespace: alloy + syncPolicy: + syncOptions: + - CreateNamespace=true diff --git a/argocd/apps/kube-state-metrics-ringtail.yaml b/argocd/apps/kube-state-metrics-ringtail.yaml new file mode 100644 index 0000000..44dd50f --- /dev/null +++ b/argocd/apps/kube-state-metrics-ringtail.yaml @@ -0,0 +1,17 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: kube-state-metrics-ringtail + namespace: argocd +spec: + project: default + source: + repoURL: ssh://forgejo@forge.ops.eblu.me:2222/eblume/blumeops.git + targetRevision: main + path: argocd/manifests/kube-state-metrics-ringtail + destination: + server: https://ringtail.tail8d86e.ts.net:6443 + namespace: monitoring + syncPolicy: + syncOptions: + - CreateNamespace=true diff --git
a/argocd/manifests/alloy-k8s/config.alloy b/argocd/manifests/alloy-k8s/config.alloy index 582a692..86c0747 100644 --- a/argocd/manifests/alloy-k8s/config.alloy +++ b/argocd/manifests/alloy-k8s/config.alloy @@ -108,6 +108,11 @@ loki.process "pods" { repository = "", } } + + // Add a static cluster="indri" label so logs from different clusters can be told apart + stage.static_labels { + values = { cluster = "indri" } + } } // Write logs to Loki @@ -163,6 +168,8 @@ prometheus.scrape "blackbox" { // Push metrics to Prometheus prometheus.remote_write "prometheus" { + external_labels = { cluster = "indri" } + endpoint { url = "http://prometheus.monitoring.svc.cluster.local:9090/api/v1/write" } diff --git a/argocd/manifests/alloy-ringtail/config.alloy b/argocd/manifests/alloy-ringtail/config.alloy new file mode 100644 index 0000000..9ae8981 --- /dev/null +++ b/argocd/manifests/alloy-ringtail/config.alloy @@ -0,0 +1,165 @@ +// Alloy ringtail configuration - collects host metrics, pod logs, and kube-state-metrics +// Remote-writes metrics to indri Prometheus, logs to indri Loki + +// ============== HOST METRICS ============== + +// System metrics exporter reading the Linux host through the /host/proc, /host/sys and /host/root mounts +prometheus.exporter.unix "system" { + procfs_path = "/host/proc" + sysfs_path = "/host/sys" + rootfs_path = "/host/root" +} + +// Scrape the exporter every 15s and hand the samples to the instance relabeler +prometheus.scrape "system" { + targets = prometheus.exporter.unix.system.targets + forward_to = [prometheus.relabel.instance.receiver] + scrape_interval = "15s" +} + +// Set instance="ringtail" on every host sample, then forward to remote_write +prometheus.relabel "instance" { + forward_to = [prometheus.remote_write.prometheus.receiver] + + rule { + target_label = "instance" + replacement = "ringtail" + } +} + +// ============== KUBE-STATE-METRICS SCRAPE ============== + +prometheus.scrape "kube_state_metrics" { + targets = [{"__address__" = "kube-state-metrics.monitoring.svc.cluster.local:8080"}] + scrape_interval = "15s" + forward_to = [prometheus.remote_write.prometheus.receiver] // sent straight to remote_write, skipping the instance relabel above +} + +//
Push metrics to indri Prometheus +prometheus.remote_write "prometheus" { + external_labels = { cluster = "ringtail" } + + endpoint { + url = "https://prometheus.tail8d86e.ts.net/api/v1/write" + + tls_config { + insecure_skip_verify = true // NOTE(review): disables TLS certificate verification for remote_write - confirm this is really required for the ts.net endpoint + } + } +} + +// ============== K8S POD LOG DISCOVERY ============== + +// Discover all pods in the cluster (no node selector, so every Alloy instance sees every pod) +discovery.kubernetes "pods" { + role = "pod" +} + +// Relabel to extract useful metadata +discovery.relabel "pods" { + targets = discovery.kubernetes.pods.targets + + // Keep only running pods + rule { + source_labels = ["__meta_kubernetes_pod_phase"] + regex = "Pending|Succeeded|Failed|Unknown" + action = "drop" + } + + // Set namespace label + rule { + source_labels = ["__meta_kubernetes_namespace"] + target_label = "namespace" + } + + // Set pod name label + rule { + source_labels = ["__meta_kubernetes_pod_name"] + target_label = "pod" + } + + // Set container name label + rule { + source_labels = ["__meta_kubernetes_pod_container_name"] + target_label = "container" + } + + // Set app label from pod labels + rule { + source_labels = ["__meta_kubernetes_pod_label_app"] + target_label = "app" + } + + // Use app.kubernetes.io/name when it is set (this overrides the app label set above) + rule { + source_labels = ["__meta_kubernetes_pod_label_app_kubernetes_io_name"] + target_label = "app" + regex = "(.+)" + action = "replace" + } + + // Set node name + rule { + source_labels = ["__meta_kubernetes_pod_node_name"] + target_label = "node" + } + + // Build the log path: uid and container name are joined by the separator and captured whole as $1 + rule { + source_labels = ["__meta_kubernetes_pod_uid", "__meta_kubernetes_pod_container_name"] + target_label = "__path__" + separator = "/" + replacement = "/var/log/pods/*$1/*.log" + } +} + +// Tail pod logs +loki.source.kubernetes "pods" { + targets = discovery.relabel.pods.output + forward_to = [loki.process.pods.receiver] +} + +// Process logs - parse JSON if present, add labels +loki.process "pods" { + forward_to = [loki.write.loki.receiver] + + // Try to
parse JSON logs + stage.json { + expressions = { + level = "level", + msg = "msg", + message = "message", + time = "time", + caller = "caller", + } + } + + // Drop JSON parsing error labels (non-JSON logs are fine) + stage.label_drop { + values = ["__error__", "__error_details__"] + } + + // Promote the extracted level and caller values to Loki labels + stage.labels { + values = { + level = "", + caller = "", // NOTE(review): caller may be high-cardinality as a label - confirm it is wanted + } + } + + // Add cluster label for multi-cluster identification + stage.static_labels { + values = { cluster = "ringtail" } + } +} + +// Write logs to indri Loki +loki.write "loki" { + endpoint { + url = "https://loki.tail8d86e.ts.net/loki/api/v1/push" + + tls_config { + insecure_skip_verify = true // NOTE(review): disables TLS certificate verification for log pushes - confirm this is really required + } + } +} diff --git a/argocd/manifests/alloy-ringtail/daemonset.yaml b/argocd/manifests/alloy-ringtail/daemonset.yaml new file mode 100644 index 0000000..a8d060a --- /dev/null +++ b/argocd/manifests/alloy-ringtail/daemonset.yaml @@ -0,0 +1,86 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: alloy + namespace: alloy + labels: + app: alloy +spec: + selector: + matchLabels: + app: alloy + template: + metadata: + labels: + app: alloy + spec: + serviceAccountName: alloy + securityContext: + fsGroup: 473 # alloy user group + containers: + - name: alloy + image: grafana/alloy # tag is pinned by the images entry in kustomization.yaml + args: + - run + - --server.http.listen-addr=0.0.0.0:12345 + - --storage.path=/var/lib/alloy/data + - /etc/alloy/config.alloy + ports: + - containerPort: 12345 + name: http + env: + - name: HOSTNAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName # exposes the node name to the container as HOSTNAME + resources: + requests: + cpu: 50m + memory: 128Mi + limits: + cpu: 500m + memory: 512Mi + volumeMounts: + - name: config + mountPath: /etc/alloy + - name: varlog + mountPath: /var/log # NOTE(review): loki.source.kubernetes presumably reads logs via the Kubernetes API, so this mount may be unused - confirm + readOnly: true + - name: data + mountPath: /var/lib/alloy/data + - name: proc + mountPath: /host/proc + readOnly: true + - name: sys + mountPath: /host/sys + readOnly: true + - name: root + mountPath: /host/root + mountPropagation: HostToContainer + readOnly: true +
securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + capabilities: + drop: + - ALL + tolerations: + - operator: Exists # no key given, so this tolerates every taint + volumes: + - name: config + configMap: + name: alloy-config + - name: varlog + hostPath: + path: /var/log + - name: data + emptyDir: {} # backs the --storage.path directory /var/lib/alloy/data + - name: proc + hostPath: + path: /proc + - name: sys + hostPath: + path: /sys + - name: root + hostPath: + path: / diff --git a/argocd/manifests/alloy-ringtail/kustomization.yaml b/argocd/manifests/alloy-ringtail/kustomization.yaml new file mode 100644 index 0000000..1d43d8f --- /dev/null +++ b/argocd/manifests/alloy-ringtail/kustomization.yaml @@ -0,0 +1,18 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: alloy + +resources: + - namespace.yaml + - rbac.yaml + - daemonset.yaml + +images: + - name: grafana/alloy + newTag: v1.13.1 # pins the untagged grafana/alloy image used in daemonset.yaml + +configMapGenerator: + - name: alloy-config + files: + - config.alloy # becomes the alloy-config ConfigMap that the DaemonSet mounts at /etc/alloy diff --git a/argocd/manifests/alloy-ringtail/namespace.yaml b/argocd/manifests/alloy-ringtail/namespace.yaml new file mode 100644 index 0000000..94f62be --- /dev/null +++ b/argocd/manifests/alloy-ringtail/namespace.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: alloy diff --git a/argocd/manifests/alloy-ringtail/rbac.yaml b/argocd/manifests/alloy-ringtail/rbac.yaml new file mode 100644 index 0000000..58a31df --- /dev/null +++ b/argocd/manifests/alloy-ringtail/rbac.yaml @@ -0,0 +1,35 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: alloy + namespace: alloy +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: alloy +rules: + - apiGroups: [""] + resources: ["nodes", "nodes/proxy", "nodes/metrics", "services", "endpoints", "pods", "pods/log", "namespaces"] # NOTE(review): config.alloy only discovers pods and tails pod logs; the node, service and endpoint grants may be broader than needed - confirm + verbs: ["get", "list", "watch"] + - apiGroups: [""] + resources: ["configmaps"] + verbs: ["get"] + - apiGroups: ["discovery.k8s.io"] + resources: ["endpointslices"] + verbs: ["get", "list", "watch"] + - nonResourceURLs: ["/metrics",
"/metrics/cadvisor"] + verbs: ["get"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: alloy +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: alloy +subjects: + - kind: ServiceAccount + name: alloy + namespace: alloy diff --git a/argocd/manifests/grafana-config/dashboards/configmap-minikube.yaml b/argocd/manifests/grafana-config/dashboards/configmap-kubernetes.yaml similarity index 75% rename from argocd/manifests/grafana-config/dashboards/configmap-minikube.yaml rename to argocd/manifests/grafana-config/dashboards/configmap-kubernetes.yaml index cbca654..2f3f2da 100644 --- a/argocd/manifests/grafana-config/dashboards/configmap-minikube.yaml +++ b/argocd/manifests/grafana-config/dashboards/configmap-kubernetes.yaml @@ -1,12 +1,12 @@ apiVersion: v1 kind: ConfigMap metadata: - name: grafana-dashboard-minikube + name: grafana-dashboard-kubernetes namespace: monitoring labels: grafana_dashboard: "1" data: - minikube.json: | + kubernetes.json: | { "annotations": { "list": [] }, "editable": true, @@ -15,95 +15,63 @@ data: "id": null, "links": [], "panels": [ - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "fieldConfig": { - "defaults": { - "color": { "mode": "thresholds" }, - "mappings": [{ "options": { "0": { "color": "red", "text": "DOWN" }, "1": { "color": "green", "text": "UP" } }, "type": "value" }], - "thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] } - } - }, - "gridPos": { "h": 3, "w": 3, "x": 0, "y": 0 }, - "id": 1, - "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value" }, - "targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "minikube_up", "refId": "A" }], - "title": "Cluster", - "type": "stat" - }, - { - "datasource": { 
"type": "prometheus", "uid": "prometheus" }, - "fieldConfig": { - "defaults": { - "color": { "mode": "thresholds" }, - "mappings": [{ "options": { "0": { "color": "red", "text": "DOWN" }, "1": { "color": "green", "text": "UP" } }, "type": "value" }], - "thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] } - } - }, - "gridPos": { "h": 3, "w": 3, "x": 3, "y": 0 }, - "id": 2, - "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value" }, - "targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "minikube_apiserver_up", "refId": "A" }], - "title": "API Server", - "type": "stat" - }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] } } }, - "gridPos": { "h": 3, "w": 3, "x": 6, "y": 0 }, - "id": 3, + "gridPos": { "h": 3, "w": 4, "x": 0, "y": 0 }, + "id": 1, "options": { "colorMode": "value", "graphMode": "none", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value" }, - "targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(kube_pod_info{namespace=~\"$namespace\"})", "refId": "A" }], + "targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(kube_pod_info{cluster=~\"$cluster\", namespace=~\"$namespace\"})", "refId": "A" }], "title": "Pods", "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] } } }, - "gridPos": { "h": 3, "w": 3, "x": 9, "y": 0 
}, - "id": 4, + "gridPos": { "h": 3, "w": 4, "x": 4, "y": 0 }, + "id": 2, "options": { "colorMode": "value", "graphMode": "none", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value" }, - "targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(kube_deployment_created{namespace=~\"$namespace\"})", "refId": "A" }], + "targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(kube_deployment_created{cluster=~\"$cluster\", namespace=~\"$namespace\"})", "refId": "A" }], "title": "Deployments", "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] } } }, - "gridPos": { "h": 3, "w": 3, "x": 12, "y": 0 }, - "id": 5, + "gridPos": { "h": 3, "w": 4, "x": 8, "y": 0 }, + "id": 3, "options": { "colorMode": "value", "graphMode": "none", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value" }, - "targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(kube_statefulset_created{namespace=~\"$namespace\"})", "refId": "A" }], + "targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(kube_statefulset_created{cluster=~\"$cluster\", namespace=~\"$namespace\"})", "refId": "A" }], "title": "StatefulSets", "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] } } }, - "gridPos": { "h": 3, "w": 3, "x": 15, "y": 0 }, - "id": 6, + "gridPos": { "h": 3, "w": 4, "x": 12, "y": 0 }, + "id": 4, "options": { "colorMode": "value", "graphMode": 
"none", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value" }, - "targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(kube_namespace_created)", "refId": "A" }], + "targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(kube_namespace_created{cluster=~\"$cluster\"})", "refId": "A" }], "title": "Namespaces", "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "unit": "bytes", "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] } } }, - "gridPos": { "h": 3, "w": 3, "x": 18, "y": 0 }, - "id": 7, + "gridPos": { "h": 3, "w": 4, "x": 16, "y": 0 }, + "id": 5, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value" }, - "targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum(kube_pod_container_resource_requests{resource=\"memory\",namespace=~\"$namespace\"})", "refId": "A" }], + "targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum(kube_pod_container_resource_requests{resource=\"memory\", cluster=~\"$cluster\", namespace=~\"$namespace\"})", "refId": "A" }], "title": "Memory Requests", "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "prometheus" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "unit": "short", "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] } } }, - "gridPos": { "h": 3, "w": 3, "x": 21, "y": 0 }, - "id": 8, + "gridPos": { "h": 3, "w": 4, "x": 20, "y": 0 }, + "id": 6, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "orientation": "auto", "reduceOptions": { 
"calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value" }, - "targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum(kube_pod_container_resource_requests{resource=\"cpu\",namespace=~\"$namespace\"})", "refId": "A" }], + "targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum(kube_pod_container_resource_requests{resource=\"cpu\", cluster=~\"$cluster\", namespace=~\"$namespace\"})", "refId": "A" }], "title": "CPU Requests (cores)", "type": "stat" }, @@ -117,9 +85,9 @@ data: } }, "gridPos": { "h": 4, "w": 6, "x": 0, "y": 3 }, - "id": 13, + "id": 7, "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value" }, - "targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(kube_pod_container_status_waiting_reason{namespace=~\"$namespace\", reason=~\"ImagePullBackOff|ErrImagePull|CrashLoopBackOff|CreateContainerError|RunContainerError\"}) or vector(0)", "refId": "A" }], + "targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(kube_pod_container_status_waiting_reason{cluster=~\"$cluster\", namespace=~\"$namespace\", reason=~\"ImagePullBackOff|ErrImagePull|CrashLoopBackOff|CreateContainerError|RunContainerError\"}) or vector(0)", "refId": "A" }], "title": "Unhealthy Pods", "type": "stat" }, @@ -135,9 +103,9 @@ data: } }, "gridPos": { "h": 4, "w": 18, "x": 6, "y": 3 }, - "id": 14, + "id": 8, "options": { "legend": { "calcs": ["lastNotNull"], "displayMode": "table", "placement": "right", "showLegend": true, "sortBy": "Last *", "sortDesc": true }, "tooltip": { "mode": "multi", "sort": "desc" } }, - "targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (reason) (kube_pod_container_status_waiting_reason{namespace=~\"$namespace\"})", "legendFormat": 
"{{reason}}", "refId": "A" }], + "targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (reason) (kube_pod_container_status_waiting_reason{cluster=~\"$cluster\", namespace=~\"$namespace\"})", "legendFormat": "{{reason}}", "refId": "A" }], "title": "Pods by Waiting Reason", "type": "timeseries" }, @@ -155,7 +123,7 @@ data: "gridPos": { "h": 8, "w": 12, "x": 0, "y": 7 }, "id": 9, "options": { "legend": { "calcs": ["lastNotNull"], "displayMode": "table", "placement": "right", "showLegend": true, "sortBy": "Last *", "sortDesc": true }, "tooltip": { "mode": "multi", "sort": "desc" } }, - "targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count by (namespace) (kube_pod_info{namespace=~\"$namespace\"})", "legendFormat": "{{namespace}}", "refId": "A" }], + "targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count by (namespace) (kube_pod_info{cluster=~\"$cluster\", namespace=~\"$namespace\"})", "legendFormat": "{{namespace}}", "refId": "A" }], "title": "Pods by Namespace", "type": "timeseries" }, @@ -173,7 +141,7 @@ data: "gridPos": { "h": 8, "w": 12, "x": 12, "y": 7 }, "id": 10, "options": { "legend": { "calcs": ["lastNotNull"], "displayMode": "table", "placement": "right", "showLegend": true, "sortBy": "Last *", "sortDesc": true }, "tooltip": { "mode": "multi", "sort": "desc" } }, - "targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (namespace) (kube_pod_container_resource_requests{resource=\"memory\",namespace=~\"$namespace\"})", "legendFormat": "{{namespace}}", "refId": "A" }], + "targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (namespace) (kube_pod_container_resource_requests{resource=\"memory\", cluster=~\"$cluster\", namespace=~\"$namespace\"})", "legendFormat": "{{namespace}}", "refId": "A" }], "title": "Memory Requests by Namespace", "type": "timeseries" }, @@ -197,11 +165,11 @@ data: 
"id": 11, "options": { "cellHeight": "sm", "footer": { "countRows": false, "fields": "", "reducer": ["sum"], "show": false }, "showHeader": true, "sortBy": [{ "desc": true, "displayName": "Pods" }] }, "targets": [ - { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count by (namespace) (kube_pod_info{namespace=~\"$namespace\"})", "format": "table", "instant": true, "refId": "pods" }, - { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (namespace) (kube_pod_container_resource_requests{resource=\"memory\",namespace=~\"$namespace\"})", "format": "table", "instant": true, "refId": "mem_req" }, - { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (namespace) (kube_pod_container_resource_limits{resource=\"memory\",namespace=~\"$namespace\"})", "format": "table", "instant": true, "refId": "mem_lim" }, - { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (namespace) (kube_pod_container_resource_requests{resource=\"cpu\",namespace=~\"$namespace\"})", "format": "table", "instant": true, "refId": "cpu_req" }, - { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (namespace) (kube_pod_container_resource_limits{resource=\"cpu\",namespace=~\"$namespace\"})", "format": "table", "instant": true, "refId": "cpu_lim" } + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count by (namespace) (kube_pod_info{cluster=~\"$cluster\", namespace=~\"$namespace\"})", "format": "table", "instant": true, "refId": "pods" }, + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (namespace) (kube_pod_container_resource_requests{resource=\"memory\", cluster=~\"$cluster\", namespace=~\"$namespace\"})", "format": "table", "instant": true, "refId": "mem_req" }, + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (namespace) (kube_pod_container_resource_limits{resource=\"memory\", 
cluster=~\"$cluster\", namespace=~\"$namespace\"})", "format": "table", "instant": true, "refId": "mem_lim" }, + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (namespace) (kube_pod_container_resource_requests{resource=\"cpu\", cluster=~\"$cluster\", namespace=~\"$namespace\"})", "format": "table", "instant": true, "refId": "cpu_req" }, + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (namespace) (kube_pod_container_resource_limits{resource=\"cpu\", cluster=~\"$cluster\", namespace=~\"$namespace\"})", "format": "table", "instant": true, "refId": "cpu_lim" } ], "title": "Namespace Resource Summary", "transformations": [ @@ -215,27 +183,44 @@ data: "gridPos": { "h": 8, "w": 24, "x": 0, "y": 23 }, "id": 12, "options": { "dedupStrategy": "none", "enableLogDetails": true, "prettifyLogMessage": false, "showCommonLabels": false, "showLabels": true, "showTime": true, "sortOrder": "Descending", "wrapLogMessage": false }, - "targets": [{ "datasource": { "type": "loki", "uid": "loki" }, "expr": "{namespace=~\"$namespace\"}", "refId": "A" }], + "targets": [{ "datasource": { "type": "loki", "uid": "loki" }, "expr": "{cluster=~\"$cluster\", namespace=~\"$namespace\"}", "refId": "A" }], "title": "Pod Logs", "type": "logs" } ], "refresh": "30s", "schemaVersion": 38, - "tags": ["minikube", "kubernetes", "k8s"], + "tags": ["kubernetes", "k8s", "multi-cluster"], "templating": { "list": [ { "current": { "selected": true, "text": "All", "value": "$__all" }, "datasource": { "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(kube_namespace_created, namespace)", + "definition": "label_values(kube_namespace_created, cluster)", + "hide": 0, + "includeAll": true, + "label": "Cluster", + "multi": true, + "name": "cluster", + "options": [], + "query": { "query": "label_values(kube_namespace_created, cluster)", "refId": "StandardVariableQuery" }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + 
"sort": 1, + "type": "query" + }, + { + "current": { "selected": true, "text": "All", "value": "$__all" }, + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "definition": "label_values(kube_namespace_created{cluster=~\"$cluster\"}, namespace)", "hide": 0, "includeAll": true, "label": "Namespace", "multi": true, "name": "namespace", "options": [], - "query": { "query": "label_values(kube_namespace_created, namespace)", "refId": "StandardVariableQuery" }, + "query": { "query": "label_values(kube_namespace_created{cluster=~\"$cluster\"}, namespace)", "refId": "StandardVariableQuery" }, "refresh": 2, "regex": "", "skipUrlSync": false, @@ -247,8 +232,8 @@ data: "time": { "from": "now-6h", "to": "now" }, "timepicker": {}, "timezone": "browser", - "title": "Minikube Kubernetes", - "uid": "minikube", - "version": 2, + "title": "Kubernetes Clusters", + "uid": "kubernetes", + "version": 1, "weekStart": "" } diff --git a/argocd/manifests/grafana-config/dashboards/configmap-ringtail.yaml b/argocd/manifests/grafana-config/dashboards/configmap-ringtail.yaml new file mode 100644 index 0000000..63cd2aa --- /dev/null +++ b/argocd/manifests/grafana-config/dashboards/configmap-ringtail.yaml @@ -0,0 +1,314 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-ringtail + namespace: monitoring + labels: + grafana_dashboard: "1" +data: + ringtail.json: | + { + "annotations": { "list": [] }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "id": 100, + "panels": [], + "title": "System Overview", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }, "unit": "dtdurations" } }, + "gridPos": { "h": 4, "w": 4, "x": 0, 
"y": 1 }, + "id": 1, + "options": { "colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" }, + "targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "time() - node_boot_time_seconds{instance=\"ringtail\"}", "refId": "A" }], + "title": "Uptime", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [{ "color": "blue", "value": null }] }, "unit": "decbytes" } }, + "gridPos": { "h": 4, "w": 4, "x": 4, "y": 1 }, + "id": 2, + "options": { "colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" }, + "targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "node_memory_MemTotal_bytes{instance=\"ringtail\"}", "refId": "A" }], + "title": "Total Memory", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [{ "color": "purple", "value": null }] }, "unit": "short" } }, + "gridPos": { "h": 4, "w": 4, "x": 8, "y": 1 }, + "id": 3, + "options": { "colorMode": "value", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" }, + "targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(node_cpu_seconds_total{instance=\"ringtail\", mode=\"idle\"})", "refId": "A" }], + "title": "CPU Cores", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { "defaults": { 
"color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 4 }, { "color": "red", "value": 8 }] }, "unit": "short", "decimals": 2 } }, + "gridPos": { "h": 4, "w": 4, "x": 12, "y": 1 }, + "id": 4, + "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" }, + "targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "node_load1{instance=\"ringtail\"}", "refId": "A" }], + "title": "Load (1m)", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] } } }, + "gridPos": { "h": 4, "w": 4, "x": 16, "y": 1 }, + "id": 5, + "options": { "colorMode": "value", "graphMode": "none", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value" }, + "targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(kube_pod_info{cluster=\"ringtail\"})", "refId": "A" }], + "title": "K8s Pods", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "unit": "percent", "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 80 }, { "color": "red", "value": 95 }] } } }, + "gridPos": { "h": 4, "w": 4, "x": 20, "y": 1 }, + "id": 6, + "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value" }, + "targets": [{ "datasource": { "type": 
"prometheus", "uid": "prometheus" }, "expr": "frigate_gpu_usage_percent", "refId": "A" }], + "title": "GPU Usage %", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 }, + "id": 101, + "panels": [], + "title": "CPU & Memory", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 30, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "smooth", "lineWidth": 2, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "normal" }, "thresholdsStyle": { "mode": "off" } }, + "mappings": [], + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }, + "unit": "percentunit", + "max": 1 + } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 }, + "id": 7, + "options": { "legend": { "calcs": ["mean", "lastNotNull"], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "desc" } }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum by (mode) (rate(node_cpu_seconds_total{instance=\"ringtail\", mode!=\"idle\"}[5m])) / on() group_left count(node_cpu_seconds_total{instance=\"ringtail\", mode=\"idle\"})", "legendFormat": "{{mode}}", "refId": "A" } + ], + "title": "CPU Usage by Mode", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", 
"axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 30, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "smooth", "lineWidth": 2, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, + "mappings": [], + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }, + "unit": "bytes" + } + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 }, + "id": 8, + "options": { "legend": { "calcs": ["mean", "lastNotNull"], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "desc" } }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "node_memory_MemTotal_bytes{instance=\"ringtail\"} - node_memory_MemAvailable_bytes{instance=\"ringtail\"}", "legendFormat": "Used", "refId": "A" }, + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "node_memory_MemAvailable_bytes{instance=\"ringtail\"}", "legendFormat": "Available", "refId": "B" }, + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "node_memory_Cached_bytes{instance=\"ringtail\"}", "legendFormat": "Cached", "refId": "C" } + ], + "title": "Memory Usage", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 }, + "id": 102, + "panels": [], + "title": "Storage", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "custom": { "align": "auto", "cellOptions": { "type": "auto" }, "inspect": false }, + "mappings": [], + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 0.8 }, { "color": "red", "value": 0.95 }] } + }, + 
"overrides": [ + { "matcher": { "id": "byName", "options": "Size" }, "properties": [{ "id": "unit", "value": "bytes" }] }, + { "matcher": { "id": "byName", "options": "Available" }, "properties": [{ "id": "unit", "value": "bytes" }] }, + { "matcher": { "id": "byName", "options": "Used %" }, "properties": [{ "id": "unit", "value": "percentunit" }, { "id": "thresholds", "value": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "yellow", "value": 0.8 }, { "color": "red", "value": 0.95 }] } }, { "id": "custom.cellOptions", "value": { "mode": "gradient", "type": "gauge" } }] } + ] + }, + "gridPos": { "h": 6, "w": 24, "x": 0, "y": 15 }, + "id": 9, + "options": { "cellHeight": "sm", "footer": { "countRows": false, "fields": "", "reducer": ["sum"], "show": false }, "showHeader": true, "sortBy": [{ "desc": true, "displayName": "Size" }] }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "node_filesystem_size_bytes{instance=\"ringtail\", fstype!~\"tmpfs|overlay|squashfs\"}", "format": "table", "instant": true, "refId": "size" }, + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "node_filesystem_avail_bytes{instance=\"ringtail\", fstype!~\"tmpfs|overlay|squashfs\"}", "format": "table", "instant": true, "refId": "avail" }, + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "1 - (node_filesystem_avail_bytes{instance=\"ringtail\", fstype!~\"tmpfs|overlay|squashfs\"} / node_filesystem_size_bytes{instance=\"ringtail\", fstype!~\"tmpfs|overlay|squashfs\"})", "format": "table", "instant": true, "refId": "pct" } + ], + "title": "Filesystem Usage", + "transformations": [ + { "id": "seriesToColumns", "options": { "byField": "mountpoint" } }, + { "id": "organize", "options": { "excludeByName": { "Time": true, "Time 1": true, "Time 2": true, "Time 3": true, "__name__": true, "__name__ 1": true, "__name__ 2": true, "device": true, "device 1": true, "device 2": true, 
"fstype": true, "fstype 1": true, "fstype 2": true, "instance": true, "instance 1": true, "instance 2": true, "job": true, "job 1": true, "job 2": true, "cluster": true, "cluster 1": true, "cluster 2": true }, "renameByName": { "mountpoint": "Mount", "Value #size": "Size", "Value #avail": "Available", "Value #pct": "Used %" } } } + ], + "type": "table" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 21 }, + "id": 103, + "panels": [], + "title": "Network", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "axisBorderShow": false, "axisCenteredZero": true, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 20, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "smooth", "lineWidth": 2, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, + "mappings": [], + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }, + "unit": "Bps" + } + }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 22 }, + "id": 10, + "options": { "legend": { "calcs": ["mean", "lastNotNull"], "displayMode": "table", "placement": "right", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "desc" } }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "rate(node_network_receive_bytes_total{instance=\"ringtail\", device!~\"lo|veth.*|cali.*|flannel.*|cni.*\"}[5m])", "legendFormat": "{{device}} rx", "refId": "A" }, + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "-rate(node_network_transmit_bytes_total{instance=\"ringtail\", 
device!~\"lo|veth.*|cali.*|flannel.*|cni.*\"}[5m])", "legendFormat": "{{device}} tx", "refId": "B" } + ], + "title": "Network Traffic", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 30 }, + "id": 104, + "panels": [], + "title": "GPU", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 2, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, + "mappings": [], + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }, + "unit": "percent" + } + }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 31 }, + "id": 11, + "options": { "legend": { "calcs": ["mean", "lastNotNull", "max"], "displayMode": "table", "placement": "right", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "desc" } }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "frigate_gpu_usage_percent", "legendFormat": "GPU Usage", "refId": "A" }, + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "frigate_gpu_mem_usage_percent", "legendFormat": "GPU Memory", "refId": "B" } + ], + "title": "GPU Overview", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 39 }, + "id": 105, + "panels": [], + "title": "Kubernetes", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": 
{ + "color": { "mode": "palette-classic" }, + "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "bars", "fillOpacity": 80, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "normal" }, "thresholdsStyle": { "mode": "off" } }, + "mappings": [], + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }, + "unit": "short" + } + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 40 }, + "id": 12, + "options": { "legend": { "calcs": ["lastNotNull"], "displayMode": "table", "placement": "right", "showLegend": true, "sortBy": "Last *", "sortDesc": true }, "tooltip": { "mode": "multi", "sort": "desc" } }, + "targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count by (namespace) (kube_pod_info{cluster=\"ringtail\"})", "legendFormat": "{{namespace}}", "refId": "A" }], + "title": "Pods by Namespace", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [], + "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }, { "color": "red", "value": 1 }] } + } + }, + "gridPos": { "h": 4, "w": 6, "x": 12, "y": 40 }, + "id": 13, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value" }, + "targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(kube_pod_container_status_waiting_reason{cluster=\"ringtail\", 
reason=~\"ImagePullBackOff|ErrImagePull|CrashLoopBackOff|CreateContainerError|RunContainerError\"}) or vector(0)", "refId": "A" }], + "title": "Unhealthy Pods", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] } } }, + "gridPos": { "h": 4, "w": 6, "x": 18, "y": 40 }, + "id": 14, + "options": { "colorMode": "value", "graphMode": "none", "justifyMode": "center", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "value" }, + "targets": [{ "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "count(kube_deployment_created{cluster=\"ringtail\"})", "refId": "A" }], + "title": "Deployments", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 48 }, + "id": 106, + "panels": [], + "title": "Logs", + "type": "row" + }, + { + "datasource": { "type": "loki", "uid": "loki" }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 49 }, + "id": 15, + "options": { "dedupStrategy": "none", "enableLogDetails": true, "prettifyLogMessage": false, "showCommonLabels": false, "showLabels": true, "showTime": true, "sortOrder": "Descending", "wrapLogMessage": false }, + "targets": [{ "datasource": { "type": "loki", "uid": "loki" }, "expr": "{cluster=\"ringtail\"}", "refId": "A" }], + "title": "Pod Logs", + "type": "logs" + } + ], + "refresh": "30s", + "schemaVersion": 38, + "tags": ["ringtail", "k3s", "gpu", "system"], + "templating": { "list": [] }, + "time": { "from": "now-6h", "to": "now" }, + "timepicker": {}, + "timezone": "browser", + "title": "Ringtail", + "uid": "ringtail", + "version": 1, + "weekStart": "" + } diff --git a/argocd/manifests/grafana-config/dashboards/configmap-services.yaml b/argocd/manifests/grafana-config/dashboards/configmap-services.yaml deleted file mode 100644 
index 241212a..0000000 --- a/argocd/manifests/grafana-config/dashboards/configmap-services.yaml +++ /dev/null @@ -1,145 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: grafana-dashboard-services - namespace: monitoring - labels: - grafana_dashboard: "1" -data: - services.json: | - { - "annotations": { "list": [] }, - "editable": true, - "fiscalYearStartMonth": 0, - "graphTooltip": 0, - "id": null, - "links": [], - "panels": [ - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "fieldConfig": { - "defaults": { - "color": { "mode": "thresholds" }, - "mappings": [ - { "options": { "0": { "color": "red", "text": "DOWN" }, "1": { "color": "green", "text": "UP" } }, "type": "value" } - ], - "thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] }, - "unit": "short" - }, - "overrides": [] - }, - "gridPos": { "h": 6, "w": 24, "x": 0, "y": 0 }, - "id": 1, - "options": { - "colorMode": "background", - "graphMode": "none", - "justifyMode": "center", - "orientation": "horizontal", - "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, - "textMode": "value_and_name" - }, - "pluginVersion": "11.0.0", - "targets": [ - { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "probe_success{job=\"integrations/blackbox/miniflux\"}", "legendFormat": "Miniflux", "refId": "A" }, - { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "probe_success{job=\"integrations/blackbox/kiwix\"}", "legendFormat": "Kiwix", "refId": "B" }, - { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "probe_success{job=\"integrations/blackbox/transmission\"}", "legendFormat": "Transmission", "refId": "C" }, - { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "probe_success{job=\"integrations/blackbox/devpi\"}", "legendFormat": "Devpi", "refId": "D" }, - { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": 
"probe_success{job=\"integrations/blackbox/argocd\"}", "legendFormat": "ArgoCD", "refId": "E" } - ], - "title": "Service Status", - "type": "stat" - }, - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "fieldConfig": { - "defaults": { - "color": { "mode": "palette-classic" }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { "legend": false, "tooltip": false, "viz": false }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { "type": "linear" }, - "showPoints": "never", - "spanNulls": false, - "stacking": { "group": "A", "mode": "none" }, - "thresholdsStyle": { "mode": "off" } - }, - "mappings": [], - "thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] }, - "unit": "s" - }, - "overrides": [] - }, - "gridPos": { "h": 8, "w": 24, "x": 0, "y": 6 }, - "id": 2, - "options": { - "legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "right", "showLegend": true }, - "tooltip": { "mode": "multi", "sort": "desc" } - }, - "pluginVersion": "11.0.0", - "targets": [ - { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "probe_duration_seconds{job=\"integrations/blackbox/miniflux\"}", "legendFormat": "Miniflux", "refId": "A" }, - { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "probe_duration_seconds{job=\"integrations/blackbox/kiwix\"}", "legendFormat": "Kiwix", "refId": "B" }, - { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "probe_duration_seconds{job=\"integrations/blackbox/transmission\"}", "legendFormat": "Transmission", "refId": "C" }, - { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "probe_duration_seconds{job=\"integrations/blackbox/devpi\"}", 
"legendFormat": "Devpi", "refId": "D" }, - { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "probe_duration_seconds{job=\"integrations/blackbox/argocd\"}", "legendFormat": "ArgoCD", "refId": "E" } - ], - "title": "Response Time", - "type": "timeseries" - }, - { - "datasource": { "type": "prometheus", "uid": "prometheus" }, - "fieldConfig": { - "defaults": { - "color": { "mode": "thresholds" }, - "mappings": [], - "thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "yellow", "value": 0.95 }, { "color": "green", "value": 0.99 }] }, - "unit": "percentunit" - }, - "overrides": [] - }, - "gridPos": { "h": 6, "w": 24, "x": 0, "y": 14 }, - "id": 3, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "center", - "orientation": "horizontal", - "reduceOptions": { "calcs": ["mean"], "fields": "", "values": false }, - "textMode": "value_and_name" - }, - "pluginVersion": "11.0.0", - "targets": [ - { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "avg_over_time(probe_success{job=\"integrations/blackbox/miniflux\"}[$__range])", "legendFormat": "Miniflux", "refId": "A" }, - { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "avg_over_time(probe_success{job=\"integrations/blackbox/kiwix\"}[$__range])", "legendFormat": "Kiwix", "refId": "B" }, - { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "avg_over_time(probe_success{job=\"integrations/blackbox/transmission\"}[$__range])", "legendFormat": "Transmission", "refId": "C" }, - { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "avg_over_time(probe_success{job=\"integrations/blackbox/devpi\"}[$__range])", "legendFormat": "Devpi", "refId": "D" }, - { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "avg_over_time(probe_success{job=\"integrations/blackbox/argocd\"}[$__range])", "legendFormat": "ArgoCD", "refId": "E" } - ], - "title": "Uptime 
(selected period)", - "type": "stat" - } - ], - "refresh": "30s", - "schemaVersion": 38, - "tags": ["services", "health"], - "templating": { "list": [] }, - "time": { "from": "now-24h", "to": "now" }, - "timepicker": {}, - "timezone": "browser", - "title": "K8s Services Health", - "uid": "k8s-services", - "version": 1, - "weekStart": "" - } diff --git a/argocd/manifests/grafana-config/kustomization.yaml b/argocd/manifests/grafana-config/kustomization.yaml index eb5611e..95bb4bb 100644 --- a/argocd/manifests/grafana-config/kustomization.yaml +++ b/argocd/manifests/grafana-config/kustomization.yaml @@ -13,10 +13,10 @@ resources: - dashboards/configmap-devpi.yaml - dashboards/configmap-loki.yaml - dashboards/configmap-macos.yaml - - dashboards/configmap-minikube.yaml + - dashboards/configmap-kubernetes.yaml - dashboards/configmap-jellyfin.yaml - dashboards/configmap-postgresql.yaml - - dashboards/configmap-services.yaml + - dashboards/configmap-ringtail.yaml - dashboards/configmap-zot.yaml - dashboards/configmap-frigate.yaml - dashboards/configmap-cv-apm.yaml diff --git a/argocd/manifests/kube-state-metrics-ringtail/deployment.yaml b/argocd/manifests/kube-state-metrics-ringtail/deployment.yaml new file mode 100644 index 0000000..cba8cac --- /dev/null +++ b/argocd/manifests/kube-state-metrics-ringtail/deployment.yaml @@ -0,0 +1,53 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: kube-state-metrics + namespace: monitoring + labels: + app: kube-state-metrics +spec: + replicas: 1 + selector: + matchLabels: + app: kube-state-metrics + template: + metadata: + labels: + app: kube-state-metrics + spec: + serviceAccountName: kube-state-metrics + containers: + - name: kube-state-metrics + image: registry.k8s.io/kube-state-metrics/kube-state-metrics + ports: + - containerPort: 8080 + name: http-metrics + - containerPort: 8081 + name: telemetry + livenessProbe: + httpGet: + path: /healthz + port: 8080 + initialDelaySeconds: 5 + timeoutSeconds: 5 + readinessProbe: + 
httpGet: + path: / + port: 8080 + initialDelaySeconds: 5 + timeoutSeconds: 5 + resources: + requests: + cpu: 10m + memory: 64Mi + limits: + cpu: 100m + memory: 256Mi + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + runAsNonRoot: true + runAsUser: 65534 + capabilities: + drop: + - ALL diff --git a/argocd/manifests/kube-state-metrics-ringtail/kustomization.yaml b/argocd/manifests/kube-state-metrics-ringtail/kustomization.yaml new file mode 100644 index 0000000..005cba8 --- /dev/null +++ b/argocd/manifests/kube-state-metrics-ringtail/kustomization.yaml @@ -0,0 +1,9 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - rbac.yaml + - deployment.yaml + - service.yaml +images: + - name: registry.k8s.io/kube-state-metrics/kube-state-metrics + newTag: v2.18.0 diff --git a/argocd/manifests/kube-state-metrics-ringtail/rbac.yaml b/argocd/manifests/kube-state-metrics-ringtail/rbac.yaml new file mode 100644 index 0000000..36193ac --- /dev/null +++ b/argocd/manifests/kube-state-metrics-ringtail/rbac.yaml @@ -0,0 +1,79 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: kube-state-metrics + namespace: monitoring +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: kube-state-metrics +rules: + - apiGroups: [""] + resources: + - configmaps + - secrets + - nodes + - pods + - services + - serviceaccounts + - resourcequotas + - replicationcontrollers + - limitranges + - persistentvolumeclaims + - persistentvolumes + - namespaces + - endpoints + verbs: ["list", "watch"] + - apiGroups: ["apps"] + resources: + - statefulsets + - daemonsets + - deployments + - replicasets + verbs: ["list", "watch"] + - apiGroups: ["batch"] + resources: + - cronjobs + - jobs + verbs: ["list", "watch"] + - apiGroups: ["autoscaling"] + resources: + - horizontalpodautoscalers + verbs: ["list", "watch"] + - apiGroups: ["networking.k8s.io"] + resources: + - networkpolicies + - ingresses + verbs: ["list", 
"watch"] + - apiGroups: ["coordination.k8s.io"] + resources: + - leases + verbs: ["list", "watch"] + - apiGroups: ["certificates.k8s.io"] + resources: + - certificatesigningrequests + verbs: ["list", "watch"] + - apiGroups: ["storage.k8s.io"] + resources: + - storageclasses + - volumeattachments + verbs: ["list", "watch"] + - apiGroups: ["admissionregistration.k8s.io"] + resources: + - mutatingwebhookconfigurations + - validatingwebhookconfigurations + verbs: ["list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: kube-state-metrics +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: kube-state-metrics +subjects: + - kind: ServiceAccount + name: kube-state-metrics + namespace: monitoring diff --git a/argocd/manifests/kube-state-metrics-ringtail/service.yaml b/argocd/manifests/kube-state-metrics-ringtail/service.yaml new file mode 100644 index 0000000..3a804df --- /dev/null +++ b/argocd/manifests/kube-state-metrics-ringtail/service.yaml @@ -0,0 +1,17 @@ +apiVersion: v1 +kind: Service +metadata: + name: kube-state-metrics + namespace: monitoring + labels: + app: kube-state-metrics +spec: + selector: + app: kube-state-metrics + ports: + - name: http-metrics + port: 8080 + targetPort: http-metrics + - name: telemetry + port: 8081 + targetPort: telemetry diff --git a/argocd/manifests/prometheus/prometheus.yml b/argocd/manifests/prometheus/prometheus.yml index 09d1f4f..524e9f8 100644 --- a/argocd/manifests/prometheus/prometheus.yml +++ b/argocd/manifests/prometheus/prometheus.yml @@ -10,11 +10,17 @@ scrape_configs: - job_name: "node-exporter-sifaka" static_configs: - targets: ["nas.ops.eblu.me:9100"] + metric_relabel_configs: + - target_label: cluster + replacement: indri - job_name: "smartctl-sifaka" scrape_interval: 60s static_configs: - targets: ["nas.ops.eblu.me:9633"] + metric_relabel_configs: + - target_label: cluster + replacement: indri # CNPG PostgreSQL metrics (k8s internal) - 
job_name: "cnpg-postgres" @@ -22,21 +28,33 @@ scrape_configs: - targets: ["blumeops-pg-metrics-tailscale.databases.svc.cluster.local:9187"] labels: instance: "blumeops-pg" + metric_relabel_configs: + - target_label: cluster + replacement: indri # Prometheus self-monitoring - job_name: "prometheus" static_configs: - targets: ["localhost:9090"] + metric_relabel_configs: + - target_label: cluster + replacement: indri # Loki metrics - job_name: "loki" static_configs: - targets: ["loki.monitoring.svc.cluster.local:3100"] + metric_relabel_configs: + - target_label: cluster + replacement: indri # Kubernetes state metrics (pods, deployments, resource usage, etc.) - job_name: "kube-state-metrics" static_configs: - targets: ["kube-state-metrics.monitoring.svc.cluster.local:8080"] + metric_relabel_configs: + - target_label: cluster + replacement: indri # Frigate NVR metrics (via Caddy on indri — Frigate runs on ringtail) - job_name: "frigate" @@ -44,3 +62,6 @@ scrape_configs: static_configs: - targets: ["nvr.ops.eblu.me"] metrics_path: /api/metrics + metric_relabel_configs: + - target_label: cluster + replacement: ringtail diff --git a/docs/changelog.d/feature-ringtail-metrics-dashboards.infra.md b/docs/changelog.d/feature-ringtail-metrics-dashboards.infra.md new file mode 100644 index 0000000..fbd098f --- /dev/null +++ b/docs/changelog.d/feature-ringtail-metrics-dashboards.infra.md @@ -0,0 +1 @@ +Add multi-cluster Kubernetes observability: deploy kube-state-metrics and Alloy on ringtail (k3s), add `cluster` label to all metrics/logs, replace single-cluster dashboards with multi-cluster Kubernetes dashboard and dedicated Ringtail dashboard with GPU monitoring.