From 9c789a1868f107630a85ae7777dbc625f5969194 Mon Sep 17 00:00:00 2001
From: Erich Blume
Date: Thu, 12 Feb 2026 18:40:48 -0800
Subject: [PATCH] Fix cache hit rate on APM and Fly.io dashboards (#177)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Summary

- Remove `match_all = true` from `flyio_nginx_cache_requests_total` in Alloy so the metric only counts requests that go through the proxy cache (excludes health checks with empty `cache_status`)
- Change dashboard queries from `rate(...[5m])` to `increase(...[$__range])` — aggregates over the full dashboard time window instead of a 5-minute sliding window, giving meaningful ratios for low-traffic static sites
- Add null/NaN value mapping to show "No traffic" in neutral color instead of blank/red

## Root cause

Health check requests from Fly.io hit the default nginx server block (no `proxy_cache`), producing entries with empty `upstream_cache_status`. With `match_all = true`, these were counted in the cache metric, diluting the Fly.io dashboard ratio. For APM dashboards, `rate(...[5m])` on low-traffic sites with 24h cache validity almost always returns either all-HITs (100%) or no data (blank → red background).

## Deployment - Fly.io proxy redeploy needed for Alloy config change - ArgoCD sync for dashboard ConfigMap changes ## Test plan - [ ] Redeploy Fly.io proxy - [ ] Sync grafana-config in ArgoCD - [ ] Verify CV APM cache hit ratio shows a real percentage (not 100%) - [ ] Verify Docs APM shows "No traffic" in neutral color when idle, real ratio when visited - [ ] Verify Fly.io proxy dashboard cache ratio excludes health checks Reviewed-on: https://forge.ops.eblu.me/eblume/blumeops/pulls/177 --- .../manifests/grafana-config/dashboards/configmap-cv-apm.yaml | 3 ++- .../grafana-config/dashboards/configmap-docs-apm.yaml | 3 ++- .../manifests/grafana-config/dashboards/configmap-flyio.yaml | 3 ++- docs/changelog.d/fix-cache-hit-rate-dashboards.bugfix.md | 1 + fly/alloy.river | 1 - 5 files changed, 7 insertions(+), 4 deletions(-) create mode 100644 docs/changelog.d/fix-cache-hit-rate-dashboards.bugfix.md diff --git a/argocd/manifests/grafana-config/dashboards/configmap-cv-apm.yaml b/argocd/manifests/grafana-config/dashboards/configmap-cv-apm.yaml index fef84d7..fd05c4f 100644 --- a/argocd/manifests/grafana-config/dashboards/configmap-cv-apm.yaml +++ b/argocd/manifests/grafana-config/dashboards/configmap-cv-apm.yaml @@ -90,6 +90,7 @@ data: "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, + "mappings": [{ "type": "special", "options": { "match": "null+nan", "result": { "text": "No traffic", "color": "text" } } }], "thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "yellow", "value": 0.5 }, { "color": "green", "value": 0.8 }] }, "unit": "percentunit" }, @@ -106,7 +107,7 @@ data: "textMode": "auto" }, "targets": [ - { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum(rate(flyio_nginx_cache_requests_total{host=\"cv.eblu.me\",cache_status=\"HIT\"}[5m])) / sum(rate(flyio_nginx_cache_requests_total{host=\"cv.eblu.me\"}[5m]))", "refId": "A" } + { "datasource": { "type": "prometheus", "uid": "prometheus" 
}, "expr": "sum(increase(flyio_nginx_cache_requests_total{host=\"cv.eblu.me\",cache_status=\"HIT\"}[$__range])) / sum(increase(flyio_nginx_cache_requests_total{host=\"cv.eblu.me\"}[$__range]))", "refId": "A" } ], "title": "Cache Hit Ratio", "type": "stat" diff --git a/argocd/manifests/grafana-config/dashboards/configmap-docs-apm.yaml b/argocd/manifests/grafana-config/dashboards/configmap-docs-apm.yaml index 8a7c60c..b96d1ea 100644 --- a/argocd/manifests/grafana-config/dashboards/configmap-docs-apm.yaml +++ b/argocd/manifests/grafana-config/dashboards/configmap-docs-apm.yaml @@ -90,6 +90,7 @@ data: "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, + "mappings": [{ "type": "special", "options": { "match": "null+nan", "result": { "text": "No traffic", "color": "text" } } }], "thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "yellow", "value": 0.5 }, { "color": "green", "value": 0.8 }] }, "unit": "percentunit" }, @@ -106,7 +107,7 @@ data: "textMode": "auto" }, "targets": [ - { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum(rate(flyio_nginx_cache_requests_total{host=\"docs.eblu.me\",cache_status=\"HIT\"}[5m])) / sum(rate(flyio_nginx_cache_requests_total{host=\"docs.eblu.me\"}[5m]))", "refId": "A" } + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum(increase(flyio_nginx_cache_requests_total{host=\"docs.eblu.me\",cache_status=\"HIT\"}[$__range])) / sum(increase(flyio_nginx_cache_requests_total{host=\"docs.eblu.me\"}[$__range]))", "refId": "A" } ], "title": "Cache Hit Ratio", "type": "stat" diff --git a/argocd/manifests/grafana-config/dashboards/configmap-flyio.yaml b/argocd/manifests/grafana-config/dashboards/configmap-flyio.yaml index 7228060..981f7ea 100644 --- a/argocd/manifests/grafana-config/dashboards/configmap-flyio.yaml +++ b/argocd/manifests/grafana-config/dashboards/configmap-flyio.yaml @@ -101,6 +101,7 @@ data: "fieldConfig": { "defaults": { "color": 
{ "mode": "thresholds" }, + "mappings": [{ "type": "special", "options": { "match": "null+nan", "result": { "text": "No traffic", "color": "text" } } }], "thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "yellow", "value": 0.5 }, { "color": "green", "value": 0.8 }] }, "unit": "percentunit" }, @@ -117,7 +118,7 @@ data: "textMode": "auto" }, "targets": [ - { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum(rate(flyio_nginx_cache_requests_total{instance=\"flyio-proxy\",cache_status=\"HIT\"}[5m])) / sum(rate(flyio_nginx_cache_requests_total{instance=\"flyio-proxy\"}[5m]))", "refId": "A" } + { "datasource": { "type": "prometheus", "uid": "prometheus" }, "expr": "sum(increase(flyio_nginx_cache_requests_total{instance=\"flyio-proxy\",cache_status=\"HIT\"}[$__range])) / sum(increase(flyio_nginx_cache_requests_total{instance=\"flyio-proxy\"}[$__range]))", "refId": "A" } ], "title": "Cache Hit Ratio", "type": "stat" diff --git a/docs/changelog.d/fix-cache-hit-rate-dashboards.bugfix.md b/docs/changelog.d/fix-cache-hit-rate-dashboards.bugfix.md new file mode 100644 index 0000000..571a576 --- /dev/null +++ b/docs/changelog.d/fix-cache-hit-rate-dashboards.bugfix.md @@ -0,0 +1 @@ +Fix cache hit rate panels on APM and Fly.io dashboards showing blank/red or misleading 100% for low-traffic static sites. diff --git a/fly/alloy.river b/fly/alloy.river index 36417d4..06ad977 100644 --- a/fly/alloy.river +++ b/fly/alloy.river @@ -79,7 +79,6 @@ loki.process "nginx" { name = "flyio_nginx_cache_requests_total" description = "Total cache lookups by cache status." source = "cache_status" - match_all = true action = "inc" } }