From 19a82373d5a94d7207c6a3aebf3a66ae20185d81 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Sun, 18 Jan 2026 12:06:28 -0800 Subject: [PATCH] K8s Migration Phase 0: Foundation Infrastructure (#26) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary - Step 0.1: Update Pulumi ACLs with tag:registry - Step 0.3: Create Zot registry ansible role with mcquack LaunchAgent - Step 0.4: Add Zot to Tailscale Serve configuration - Step 0.5: Create Zot metrics role for Prometheus scraping - Step 0.6: Add Zot log collection to Alloy - Step 0.7: Update indri-services-check with zot checks - Step 0.8: Add podman role for container runtime - Step 0.9: Add minikube role for Kubernetes cluster - Step 0.10: Configure remote kubectl access with 1Password credentials ## Remaining Steps - [ ] Step 0.11: Add minikube to indri-services-check - [ ] Step 0.12: Create zettelkasten documentation - [ ] Step 0.13: Verify main playbook (already done - roles added) ## Deployment and Testing - [x] Zot registry deployed and accessible at https://registry.tail8d86e.ts.net - [x] Podman machine running on indri - [x] Minikube cluster running on indri - [x] kubectl access from gilbert working with 1Password credentials - [ ] indri-services-check passes all checks 🤖 Generated with [Claude Code](https://claude.com/claude-code) Reviewed-on: https://forge.tail8d86e.ts.net/eblume/blumeops/pulls/26 --- Brewfile | 1 + ansible/playbooks/indri.yml | 10 + ansible/roles/alloy/defaults/main.yml | 10 + ansible/roles/alloy/templates/config.alloy.j2 | 12 + .../grafana/files/dashboards/minikube.json | 449 ++++++++++++++++ .../roles/grafana/files/dashboards/zot.json | 488 ++++++++++++++++++ ansible/roles/minikube/defaults/main.yml | 14 + ansible/roles/minikube/handlers/main.yml | 9 + ansible/roles/minikube/tasks/main.yml | 56 ++ .../roles/minikube_metrics/defaults/main.yml | 5 + .../roles/minikube_metrics/handlers/main.yml | 6 + 
ansible/roles/minikube_metrics/tasks/main.yml | 43 ++ .../templates/minikube-metrics.plist.j2 | 21 + .../templates/minikube-metrics.sh.j2 | 57 ++ ansible/roles/podman/handlers/main.yml | 3 + ansible/roles/podman/tasks/main.yml | 55 ++ .../roles/tailscale_serve/defaults/main.yml | 5 + ansible/roles/zot/defaults/main.yml | 16 + ansible/roles/zot/handlers/main.yml | 6 + ansible/roles/zot/tasks/main.yml | 66 +++ ansible/roles/zot/templates/config.json.j2 | 47 ++ ansible/roles/zot/templates/zot.plist.j2 | 24 + ansible/roles/zot_metrics/defaults/main.yml | 6 + ansible/roles/zot_metrics/handlers/main.yml | 6 + ansible/roles/zot_metrics/tasks/main.yml | 43 ++ .../templates/zot-metrics.plist.j2 | 21 + .../zot_metrics/templates/zot-metrics.sh.j2 | 25 + bin/kubectl-credential-1password | 31 ++ mise-tasks/indri-services-check | 21 +- plans/k8s-migration.md | 259 +++++++++- pulumi/__main__.py | 1 + pulumi/policy.hujson | 5 +- 32 files changed, 1811 insertions(+), 10 deletions(-) create mode 100644 ansible/roles/grafana/files/dashboards/minikube.json create mode 100644 ansible/roles/grafana/files/dashboards/zot.json create mode 100644 ansible/roles/minikube/defaults/main.yml create mode 100644 ansible/roles/minikube/handlers/main.yml create mode 100644 ansible/roles/minikube/tasks/main.yml create mode 100644 ansible/roles/minikube_metrics/defaults/main.yml create mode 100644 ansible/roles/minikube_metrics/handlers/main.yml create mode 100644 ansible/roles/minikube_metrics/tasks/main.yml create mode 100644 ansible/roles/minikube_metrics/templates/minikube-metrics.plist.j2 create mode 100644 ansible/roles/minikube_metrics/templates/minikube-metrics.sh.j2 create mode 100644 ansible/roles/podman/handlers/main.yml create mode 100644 ansible/roles/podman/tasks/main.yml create mode 100644 ansible/roles/zot/defaults/main.yml create mode 100644 ansible/roles/zot/handlers/main.yml create mode 100644 ansible/roles/zot/tasks/main.yml create mode 100644 
ansible/roles/zot/templates/config.json.j2 create mode 100644 ansible/roles/zot/templates/zot.plist.j2 create mode 100644 ansible/roles/zot_metrics/defaults/main.yml create mode 100644 ansible/roles/zot_metrics/handlers/main.yml create mode 100644 ansible/roles/zot_metrics/tasks/main.yml create mode 100644 ansible/roles/zot_metrics/templates/zot-metrics.plist.j2 create mode 100644 ansible/roles/zot_metrics/templates/zot-metrics.sh.j2 create mode 100755 bin/kubectl-credential-1password diff --git a/Brewfile b/Brewfile index 68803fb..d4186a6 100644 --- a/Brewfile +++ b/Brewfile @@ -1,3 +1,4 @@ # CLI tools for blumeops management brew "bat" # Syntax-highlighted file concatenation brew "tea" # Gitea/Forgejo CLI for forge.tail8d86e.ts.net +brew "podman" # Container CLI (uses VM on macOS, for building/pushing images) diff --git a/ansible/playbooks/indri.yml b/ansible/playbooks/indri.yml index 4645cb4..c5a08f5 100644 --- a/ansible/playbooks/indri.yml +++ b/ansible/playbooks/indri.yml @@ -99,6 +99,16 @@ tags: devpi - role: devpi_metrics tags: devpi_metrics + - role: zot + tags: zot + - role: zot_metrics + tags: zot_metrics + - role: podman + tags: podman + - role: minikube + tags: minikube + - role: minikube_metrics + tags: minikube_metrics - role: plex_metrics tags: plex_metrics - role: postgresql diff --git a/ansible/roles/alloy/defaults/main.yml b/ansible/roles/alloy/defaults/main.yml index 45cc52b..117d703 100644 --- a/ansible/roles/alloy/defaults/main.yml +++ b/ansible/roles/alloy/defaults/main.yml @@ -66,6 +66,12 @@ alloy_mcquack_logs: - path: /Users/erichblume/Library/Logs/mcquack.borgmatic.err.log service: borgmatic stream: stderr + - path: /Users/erichblume/Library/Logs/mcquack.zot.out.log + service: zot + stream: stdout + - path: /Users/erichblume/Library/Logs/mcquack.zot.err.log + service: zot + stream: stderr alloy_plex_logs: - path: /Users/erichblume/Library/Logs/Plex Media Server/Plex Media Server.log @@ -75,6 +81,10 @@ alloy_plex_logs: # Enable log 
collection (requires Loki to be running) alloy_collect_logs: true +# Zot registry metrics collection +alloy_collect_zot: true +alloy_zot_metrics_url: "http://localhost:5050/metrics" + # PostgreSQL metrics collection alloy_collect_postgres: true alloy_postgres_host: localhost diff --git a/ansible/roles/alloy/templates/config.alloy.j2 b/ansible/roles/alloy/templates/config.alloy.j2 index e0c1cad..d6d2e75 100644 --- a/ansible/roles/alloy/templates/config.alloy.j2 +++ b/ansible/roles/alloy/templates/config.alloy.j2 @@ -54,6 +54,18 @@ prometheus.scrape "postgresql" { } {% endif %} +{% if alloy_collect_zot | default(false) %} +// ============== ZOT REGISTRY METRICS ============== + +// Scrape Zot's native metrics endpoint +prometheus.scrape "zot" { + targets = [{"__address__" = "localhost:5050"}] + metrics_path = "/metrics" + forward_to = [prometheus.relabel.instance.receiver] + scrape_interval = "{{ alloy_scrape_interval }}" +} +{% endif %} + {% if alloy_collect_logs %} // ============== LOG COLLECTION ============== diff --git a/ansible/roles/grafana/files/dashboards/minikube.json b/ansible/roles/grafana/files/dashboards/minikube.json new file mode 100644 index 0000000..484ff40 --- /dev/null +++ b/ansible/roles/grafana/files/dashboards/minikube.json @@ -0,0 +1,449 @@ +{ + "annotations": { + "list": [] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [ + { + "options": { + "0": { "color": "red", "index": 0, "text": "DOWN" } + }, + "type": "value" + }, + { + "options": { + "1": { "color": "green", "index": 1, "text": "UP" } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + 
"gridPos": { "h": 4, "w": 4, "x": 0, "y": 0 }, + "id": 1, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "minikube_up", + "refId": "A" + } + ], + "title": "Minikube Status", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [ + { + "options": { + "0": { "color": "red", "index": 0, "text": "DOWN" } + }, + "type": "value" + }, + { + "options": { + "1": { "color": "green", "index": 1, "text": "UP" } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 4, "y": 0 }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "minikube_apiserver_up", + "refId": "A" + } + ], + "title": "API Server", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 8, "y": 0 }, + "id": 3, + "options": { + "colorMode": 
"value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "minikube_node_count", + "refId": "A" + } + ], + "title": "Node Count", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 12, "y": 0 }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "minikube_pod_count", + "refId": "A" + } + ], + "title": "Pod Count", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 16, "y": 0 }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "minikube_namespace_count", + "refId": "A" + } + ], + 
"title": "Namespaces", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 }, + "id": 6, + "options": { + "legend": { + "calcs": ["lastNotNull"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "minikube_up", + "legendFormat": "Minikube", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "minikube_apiserver_up", + "legendFormat": "API Server", + "refId": "B" + } + ], + "title": "Cluster Health Over Time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { 
"legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 }, + "id": 7, + "options": { + "legend": { + "calcs": ["lastNotNull", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "minikube_pod_count", + "legendFormat": "Pods", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "minikube_namespace_count", + "legendFormat": "Namespaces", + "refId": "B" + } + ], + "title": "Resource Counts Over Time", + "type": "timeseries" + }, + { + "datasource": { + "type": "loki", + "uid": "loki" + }, + "gridPos": { "h": 10, "w": 24, "x": 0, "y": 12 }, + "id": 8, + "options": { + "dedupStrategy": "none", + "enableLogDetails": true, + "prettifyLogMessage": false, + "showCommonLabels": false, + "showLabels": false, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": false + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "loki", "uid": "loki" }, + "expr": "{host=\"indri\"} |= \"minikube\" or {host=\"indri\"} |= \"kube\"", + "refId": "A" + } + ], + "title": "Kubernetes Related Logs", + "type": "logs" + } + ], + "refresh": "30s", + "schemaVersion": 38, + "tags": ["minikube", "kubernetes", "k8s"], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + 
"title": "Minikube Kubernetes", + "uid": "minikube", + "version": 1, + "weekStart": "" +} diff --git a/ansible/roles/grafana/files/dashboards/zot.json b/ansible/roles/grafana/files/dashboards/zot.json new file mode 100644 index 0000000..41c99a8 --- /dev/null +++ b/ansible/roles/grafana/files/dashboards/zot.json @@ -0,0 +1,488 @@ +{ + "annotations": { + "list": [] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [ + { + "options": { + "0": { "color": "red", "index": 0, "text": "DOWN" } + }, + "type": "value" + }, + { + "options": { + "1": { "color": "green", "index": 1, "text": "UP" } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "red", "value": null }, + { "color": "green", "value": 1 } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 0, "y": 0 }, + "id": 1, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "zot_up", + "refId": "A" + } + ], + "title": "Zot Status", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 4, "y": 0 }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + 
"orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "go_goroutines{job=\"prometheus.scrape.zot\"}", + "refId": "A" + } + ], + "title": "Goroutines", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": null }, + { "color": "yellow", "value": 536870912 }, + { "color": "red", "value": 1073741824 } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 8, "y": 0 }, + "id": 3, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "sum(zot_repo_storage_bytes)", + "refId": "A" + } + ], + "title": "Total Storage", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 12, "y": 0 }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": 
"prometheus" }, + "expr": "sum(rate(zot_http_requests_total{job=\"prometheus.scrape.zot\"}[5m]))", + "refId": "A" + } + ], + "title": "Request Rate", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 }, + "id": 5, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "sum by (method) (rate(zot_http_requests_total{job=\"prometheus.scrape.zot\"}[5m]))", + "legendFormat": "{{method}}", + "refId": "A" + } + ], + "title": "HTTP Requests by Method", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 
10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 }, + "id": 6, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "sum by (code) (rate(zot_http_requests_total{job=\"prometheus.scrape.zot\"}[5m]))", + "legendFormat": "{{code}}", + "refId": "A" + } + ], + "title": "HTTP Requests by Status Code", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "s" + }, + "overrides": [] + }, + 
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 }, + "id": 7, + "options": { + "legend": { + "calcs": ["mean", "p95"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "histogram_quantile(0.50, sum(rate(zot_http_method_latency_seconds_bucket{job=\"prometheus.scrape.zot\"}[5m])) by (le))", + "legendFormat": "p50", + "refId": "A" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "histogram_quantile(0.95, sum(rate(zot_http_method_latency_seconds_bucket{job=\"prometheus.scrape.zot\"}[5m])) by (le))", + "legendFormat": "p95", + "refId": "B" + }, + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "histogram_quantile(0.99, sum(rate(zot_http_method_latency_seconds_bucket{job=\"prometheus.scrape.zot\"}[5m])) by (le))", + "legendFormat": "p99", + "refId": "C" + } + ], + "title": "HTTP Request Latency", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "never", + "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [{ "color": "green", "value": null }] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { 
"h": 8, "w": 12, "x": 12, "y": 12 }, + "id": 8, + "options": { + "legend": { + "calcs": ["lastNotNull"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { "mode": "multi", "sort": "desc" } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { "type": "prometheus", "uid": "prometheus" }, + "expr": "zot_repo_storage_bytes", + "legendFormat": "{{repo}}", + "refId": "A" + } + ], + "title": "Storage by Repository", + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 38, + "tags": ["zot", "registry", "oci"], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Zot Container Registry", + "uid": "zot", + "version": 1, + "weekStart": "" +} diff --git a/ansible/roles/minikube/defaults/main.yml b/ansible/roles/minikube/defaults/main.yml new file mode 100644 index 0000000..07140bb --- /dev/null +++ b/ansible/roles/minikube/defaults/main.yml @@ -0,0 +1,14 @@ +--- +# Minikube cluster configuration +minikube_cpus: 4 +# Note: Must be less than podman machine memory (8192MB) to account for overhead +minikube_memory: 7800 +minikube_disk_size: "200g" +minikube_driver: podman +minikube_container_runtime: cri-o + +# Remote access configuration +# These allow kubectl from other machines (e.g., gilbert) to connect +minikube_apiserver_names: + - indri +minikube_listen_address: "0.0.0.0" diff --git a/ansible/roles/minikube/handlers/main.yml b/ansible/roles/minikube/handlers/main.yml new file mode 100644 index 0000000..7ba10c9 --- /dev/null +++ b/ansible/roles/minikube/handlers/main.yml @@ -0,0 +1,9 @@ +--- +# Minikube handlers +# Note: Restarting minikube is a heavy operation and may require manual intervention + +- name: Restart minikube + ansible.builtin.shell: | + minikube stop 2>/dev/null || true + minikube start + changed_when: true diff --git a/ansible/roles/minikube/tasks/main.yml b/ansible/roles/minikube/tasks/main.yml new 
file mode 100644 index 0000000..c3a9157 --- /dev/null +++ b/ansible/roles/minikube/tasks/main.yml @@ -0,0 +1,56 @@ +--- +# Minikube installation and cluster setup for indri +# Requires podman machine to be running (see podman role) +# +# NOTE: Similar to podman, minikube start may have issues when run via SSH. +# If cluster fails to start, manually run on indri: +# minikube start --driver=podman --container-runtime=cri-o \ +# --cpus=4 --memory=7800 --disk-size=200g \ +# --apiserver-names=indri --listen-address=0.0.0.0 + +- name: Install minikube via homebrew + community.general.homebrew: + name: minikube + state: present + +- name: Install kubectl via homebrew + community.general.homebrew: + name: kubectl + state: present + +- name: Check if minikube cluster exists + ansible.builtin.command: + cmd: minikube status --format={% raw %}'{{.Host}}'{% endraw %} + register: minikube_status + changed_when: false + failed_when: false + +- name: Start minikube cluster + ansible.builtin.command: + cmd: > + minikube start + --driver={{ minikube_driver }} + --container-runtime={{ minikube_container_runtime }} + --cpus={{ minikube_cpus }} + --memory={{ minikube_memory }} + --disk-size={{ minikube_disk_size }} + {% for name in minikube_apiserver_names %} + --apiserver-names={{ name }} + {% endfor %} + --listen-address={{ minikube_listen_address }} + register: minikube_start + changed_when: minikube_start.rc == 0 + failed_when: false # Don't fail - may need manual intervention like podman + when: minikube_status.rc != 0 or 'Running' not in minikube_status.stdout + +- name: Check minikube status after start attempt + ansible.builtin.command: + cmd: minikube status --format={% raw %}'{{.Host}}'{% endraw %} + register: minikube_final_status + changed_when: false + failed_when: false + +- name: Warn if minikube failed to start + ansible.builtin.debug: + msg: "WARNING: minikube may not have started properly. Run 'minikube start' manually on indri if needed. 
Status: {{ minikube_final_status.stdout | default('unknown') }}" + when: minikube_final_status.rc != 0 or 'Running' not in minikube_final_status.stdout diff --git a/ansible/roles/minikube_metrics/defaults/main.yml b/ansible/roles/minikube_metrics/defaults/main.yml new file mode 100644 index 0000000..68fd672 --- /dev/null +++ b/ansible/roles/minikube_metrics/defaults/main.yml @@ -0,0 +1,5 @@ +--- +minikube_metrics_dir: /opt/homebrew/var/node_exporter/textfile +minikube_metrics_script: /Users/erichblume/bin/minikube-metrics +minikube_metrics_interval: 60 # seconds between metric collection +minikube_metrics_log_dir: /opt/homebrew/var/log diff --git a/ansible/roles/minikube_metrics/handlers/main.yml b/ansible/roles/minikube_metrics/handlers/main.yml new file mode 100644 index 0000000..595f838 --- /dev/null +++ b/ansible/roles/minikube_metrics/handlers/main.yml @@ -0,0 +1,6 @@ +--- +- name: Reload minikube-metrics + ansible.builtin.shell: | + launchctl unload ~/Library/LaunchAgents/mcquack.eblume.minikube-metrics.plist 2>/dev/null || true + launchctl load ~/Library/LaunchAgents/mcquack.eblume.minikube-metrics.plist + changed_when: true diff --git a/ansible/roles/minikube_metrics/tasks/main.yml b/ansible/roles/minikube_metrics/tasks/main.yml new file mode 100644 index 0000000..da043ab --- /dev/null +++ b/ansible/roles/minikube_metrics/tasks/main.yml @@ -0,0 +1,43 @@ +--- +- name: Ensure metrics directory exists + ansible.builtin.file: + path: "{{ minikube_metrics_dir }}" + state: directory + mode: '0755' + +- name: Ensure log directory exists + ansible.builtin.file: + path: "{{ minikube_metrics_log_dir }}" + state: directory + mode: '0755' + +- name: Ensure bin directory exists + ansible.builtin.file: + path: "{{ minikube_metrics_script | dirname }}" + state: directory + mode: '0755' + +- name: Deploy minikube-metrics script + ansible.builtin.template: + src: minikube-metrics.sh.j2 + dest: "{{ minikube_metrics_script }}" + mode: '0755' + +- name: Deploy minikube-metrics 
LaunchAgent plist + ansible.builtin.template: + src: minikube-metrics.plist.j2 + dest: ~/Library/LaunchAgents/mcquack.eblume.minikube-metrics.plist + mode: '0644' + notify: Reload minikube-metrics + +- name: Check if minikube-metrics LaunchAgent is loaded + ansible.builtin.command: launchctl list mcquack.eblume.minikube-metrics + register: minikube_metrics_launchctl_check + changed_when: false + failed_when: false + +- name: Load minikube-metrics LaunchAgent if not loaded + ansible.builtin.command: launchctl load ~/Library/LaunchAgents/mcquack.eblume.minikube-metrics.plist + when: minikube_metrics_launchctl_check.rc != 0 + changed_when: true + failed_when: false diff --git a/ansible/roles/minikube_metrics/templates/minikube-metrics.plist.j2 b/ansible/roles/minikube_metrics/templates/minikube-metrics.plist.j2 new file mode 100644 index 0000000..4e751d7 --- /dev/null +++ b/ansible/roles/minikube_metrics/templates/minikube-metrics.plist.j2 @@ -0,0 +1,21 @@ + + + + + + Label + mcquack.eblume.minikube-metrics + ProgramArguments + + {{ minikube_metrics_script }} + + StartInterval + {{ minikube_metrics_interval }} + RunAtLoad + + StandardErrorPath + {{ minikube_metrics_log_dir }}/mcquack.minikube-metrics.err.log + StandardOutPath + {{ minikube_metrics_log_dir }}/mcquack.minikube-metrics.out.log + + diff --git a/ansible/roles/minikube_metrics/templates/minikube-metrics.sh.j2 b/ansible/roles/minikube_metrics/templates/minikube-metrics.sh.j2 new file mode 100644 index 0000000..447c5a5 --- /dev/null +++ b/ansible/roles/minikube_metrics/templates/minikube-metrics.sh.j2 @@ -0,0 +1,57 @@ +#!/bin/bash +# {{ ansible_managed }} +# Collects minikube/kubernetes metrics for node_exporter textfile collector + +set -euo pipefail + +OUTPUT_FILE="{{ minikube_metrics_dir }}/minikube.prom" +TEMP_FILE="${OUTPUT_FILE}.tmp" + +# Start output file +cat > "$TEMP_FILE" << 'HEADER' +# HELP minikube_up Minikube cluster is running +# TYPE minikube_up gauge +# HELP minikube_apiserver_up Kubernetes 
API server is responding +# TYPE minikube_apiserver_up gauge +# HELP minikube_node_count Number of nodes in the cluster +# TYPE minikube_node_count gauge +# HELP minikube_pod_count Number of pods in the cluster +# TYPE minikube_pod_count gauge +# HELP minikube_namespace_count Number of namespaces in the cluster +# TYPE minikube_namespace_count gauge +HEADER + +# Check if minikube is running +if minikube status --format='{% raw %}{{.Host}}{% endraw %}' 2>/dev/null | grep -q "Running"; then + echo "minikube_up 1" >> "$TEMP_FILE" +else + echo "minikube_up 0" >> "$TEMP_FILE" + echo "minikube_apiserver_up 0" >> "$TEMP_FILE" + echo "minikube_node_count 0" >> "$TEMP_FILE" + echo "minikube_pod_count 0" >> "$TEMP_FILE" + echo "minikube_namespace_count 0" >> "$TEMP_FILE" + mv "$TEMP_FILE" "$OUTPUT_FILE" + exit 0 +fi + +# Check API server health +if kubectl get --raw /healthz >/dev/null 2>&1; then + echo "minikube_apiserver_up 1" >> "$TEMP_FILE" +else + echo "minikube_apiserver_up 0" >> "$TEMP_FILE" +fi + +# Get node count ('|| true' keeps a failing kubectl from aborting the script under 'set -euo pipefail'; the count is then 0) +NODE_COUNT=$( (kubectl get nodes --no-headers 2>/dev/null || true) | wc -l | tr -d ' ') +echo "minikube_node_count ${NODE_COUNT:-0}" >> "$TEMP_FILE" + +# Get pod count (all namespaces; reports 0 if kubectl fails) +POD_COUNT=$( (kubectl get pods -A --no-headers 2>/dev/null || true) | wc -l | tr -d ' ') +echo "minikube_pod_count ${POD_COUNT:-0}" >> "$TEMP_FILE" + +# Get namespace count (reports 0 if kubectl fails) +NS_COUNT=$( (kubectl get namespaces --no-headers 2>/dev/null || true) | wc -l | tr -d ' ') +echo "minikube_namespace_count ${NS_COUNT:-0}" >> "$TEMP_FILE" + +# Atomic move +mv "$TEMP_FILE" "$OUTPUT_FILE" diff --git a/ansible/roles/podman/handlers/main.yml b/ansible/roles/podman/handlers/main.yml new file mode 100644 index 0000000..89a6a94 --- /dev/null +++ b/ansible/roles/podman/handlers/main.yml @@ -0,0 +1,3 @@ +--- +# No handlers currently - podman machine start is unreliable via Ansible +# See known issue in tasks/main.yml diff --git a/ansible/roles/podman/tasks/main.yml b/ansible/roles/podman/tasks/main.yml new file mode 100644
index 0000000..86a3cda --- /dev/null +++ b/ansible/roles/podman/tasks/main.yml @@ -0,0 +1,55 @@ +--- +# Podman installation and machine setup for indri +# Used as container runtime for minikube +# +# KNOWN ISSUE: podman machine init/start has reliability issues when run via +# Ansible/SSH. The machine sometimes gets stuck in "Starting" state due to a +# race condition (see https://github.com/containers/podman/issues/16945). +# Additionally, Apple Hypervisor may require GUI session context. +# +# WORKAROUND: If the machine fails to start via Ansible, manually run on indri: +# podman machine rm -f podman-machine-default +# podman machine init --cpus 4 --memory 8192 --disk-size 220 +# podman machine start +# +# TODO: Investigate proper LaunchAgent or other solution for reliable automation. + +- name: Install podman via homebrew + community.general.homebrew: + name: podman + state: present + +- name: Check if podman machine exists + ansible.builtin.command: + cmd: podman machine list --format json + register: podman_machine_list + changed_when: false + +- name: Initialize podman machine (if not exists) + ansible.builtin.command: + cmd: podman machine init --cpus 4 --memory 8192 --disk-size 220 + register: podman_init + changed_when: podman_init.rc == 0 + failed_when: podman_init.rc not in [0, 125] # 125 = already exists + when: podman_machine_list.stdout == '[]' + +- name: Check if podman machine is running + ansible.builtin.command: + cmd: podman machine list --format "{{ '{{' }}.Running{{ '}}' }}" + register: podman_running + changed_when: false + +- name: Start podman machine (if stopped) + ansible.builtin.command: + cmd: podman machine start + register: podman_start + changed_when: "'started successfully' in podman_start.stdout" + failed_when: false # Don't fail - see known issue above + when: "'true' not in podman_running.stdout" + +- name: Warn if podman machine failed to start + ansible.builtin.debug: + msg: "WARNING: podman machine may not have started. 
Run 'podman machine start' manually on indri if needed." + when: # warn only if the machine was not running AND the start attempt did not report success + - "'true' not in podman_running.stdout" + - podman_start.rc != 0 or 'started successfully' not in podman_start.stdout diff --git a/ansible/roles/tailscale_serve/defaults/main.yml b/ansible/roles/tailscale_serve/defaults/main.yml index b076fcd..b17b847 100644 --- a/ansible/roles/tailscale_serve/defaults/main.yml +++ b/ansible/roles/tailscale_serve/defaults/main.yml @@ -35,3 +35,8 @@ tailscale_serve_services: https: port: 443 upstream: http://localhost:8080 + + - name: svc:registry + https: + port: 443 + upstream: http://localhost:5050 diff --git a/ansible/roles/zot/defaults/main.yml b/ansible/roles/zot/defaults/main.yml new file mode 100644 index 0000000..812ac51 --- /dev/null +++ b/ansible/roles/zot/defaults/main.yml @@ -0,0 +1,16 @@ +--- +zot_repo_dir: /Users/erichblume/code/3rd/zot +zot_binary: "{{ zot_repo_dir }}/bin/zot-darwin-arm64" +zot_data_dir: /Users/erichblume/zot +zot_config_dir: /Users/erichblume/.config/zot +zot_port: 5050 +zot_log_dir: /Users/erichblume/Library/Logs + +# Pull-through cache registries (on-demand sync) +zot_sync_registries: + - name: docker.io + url: https://registry-1.docker.io + - name: ghcr.io + url: https://ghcr.io + - name: quay.io + url: https://quay.io diff --git a/ansible/roles/zot/handlers/main.yml b/ansible/roles/zot/handlers/main.yml new file mode 100644 index 0000000..0d823cb --- /dev/null +++ b/ansible/roles/zot/handlers/main.yml @@ -0,0 +1,6 @@ +--- +- name: Restart zot + ansible.builtin.shell: | + launchctl unload ~/Library/LaunchAgents/mcquack.eblume.zot.plist 2>/dev/null || true + launchctl load ~/Library/LaunchAgents/mcquack.eblume.zot.plist + changed_when: true diff --git a/ansible/roles/zot/tasks/main.yml b/ansible/roles/zot/tasks/main.yml new file mode 100644 index 0000000..20713b5 --- /dev/null +++ b/ansible/roles/zot/tasks/main.yml @@ -0,0 +1,66 @@ +--- +# Note: Zot is built from source, not installed via homebrew.
+# +# ONE-TIME SETUP (before running ansible): +# +# 1. Clone zot from forge mirror (use localhost:3001 - hairpinning doesn't work): +# ssh indri 'git clone http://localhost:3001/eblume/zot.git ~/code/3rd/zot' +# +# 2. Set up Go via mise: +# ssh indri 'cd ~/code/3rd/zot && mise use go@1.25' +# +# 3. Build (creates bin/zot-darwin-arm64): +# ssh indri 'cd ~/code/3rd/zot && mise x -- make binary' +# +# 4. Run ansible to deploy config and LaunchAgent + +- name: Verify zot binary exists + ansible.builtin.stat: + path: "{{ zot_binary }}" + register: zot_binary_stat + +- name: Fail if zot binary not found + ansible.builtin.fail: + msg: | + Zot binary not found at {{ zot_binary }}. + Please build from source first: + ssh indri 'cd ~/code/3rd/zot && mise x -- make binary' + when: not zot_binary_stat.stat.exists + +- name: Ensure zot data directory exists + ansible.builtin.file: + path: "{{ zot_data_dir }}" + state: directory + mode: '0755' + +- name: Ensure zot config directory exists + ansible.builtin.file: + path: "{{ zot_config_dir }}" + state: directory + mode: '0755' + +- name: Deploy zot config + ansible.builtin.template: + src: config.json.j2 + dest: "{{ zot_config_dir }}/config.json" + mode: '0644' + notify: Restart zot + +- name: Deploy zot LaunchAgent plist + ansible.builtin.template: + src: zot.plist.j2 + dest: ~/Library/LaunchAgents/mcquack.eblume.zot.plist + mode: '0644' + notify: Restart zot + +- name: Check if zot LaunchAgent is loaded + ansible.builtin.command: launchctl list mcquack.eblume.zot + register: zot_launchctl_check + changed_when: false + failed_when: false + +- name: Load zot LaunchAgent if not loaded + ansible.builtin.command: launchctl load ~/Library/LaunchAgents/mcquack.eblume.zot.plist + when: zot_launchctl_check.rc != 0 + changed_when: true + failed_when: false diff --git a/ansible/roles/zot/templates/config.json.j2 b/ansible/roles/zot/templates/config.json.j2 new file mode 100644 index 0000000..3c5c668 --- /dev/null +++ 
b/ansible/roles/zot/templates/config.json.j2 @@ -0,0 +1,47 @@ +{ + "distSpecVersion": "1.1.0", + "storage": { + "rootDirectory": "{{ zot_data_dir }}", + "gc": true, + "gcDelay": "1h", + "gcInterval": "24h" + }, + "http": { + "address": "0.0.0.0", + "port": "{{ zot_port }}" + }, + "log": { + "level": "info" + }, + "extensions": { + "metrics": { + "enable": true, + "prometheus": { + "path": "/metrics" + } + }, + "sync": { + "enable": true, + "registries": [ +{% for registry in zot_sync_registries %} + { + "urls": ["{{ registry.url }}"], + "content": [{"prefix": "**", "destination": "/{{ registry.name }}"}], + "onDemand": true, + "tlsVerify": true + }{% if not loop.last %},{% endif %} + +{% endfor %} + ] + }, + "search": { + "enable": true, + "cve": { + "updateInterval": "24h" + } + }, + "ui": { + "enable": true + } + } +} diff --git a/ansible/roles/zot/templates/zot.plist.j2 b/ansible/roles/zot/templates/zot.plist.j2 new file mode 100644 index 0000000..25b7da1 --- /dev/null +++ b/ansible/roles/zot/templates/zot.plist.j2 @@ -0,0 +1,24 @@ + + + + + + Label + mcquack.eblume.zot + ProgramArguments + + + {{ zot_binary }} + serve + {{ zot_config_dir }}/config.json + + RunAtLoad + + KeepAlive + + StandardOutPath + {{ zot_log_dir }}/mcquack.zot.out.log + StandardErrorPath + {{ zot_log_dir }}/mcquack.zot.err.log + + diff --git a/ansible/roles/zot_metrics/defaults/main.yml b/ansible/roles/zot_metrics/defaults/main.yml new file mode 100644 index 0000000..3280b20 --- /dev/null +++ b/ansible/roles/zot_metrics/defaults/main.yml @@ -0,0 +1,6 @@ +--- +zot_metrics_url: http://localhost:5050/v2/_catalog +zot_metrics_dir: /opt/homebrew/var/node_exporter/textfile +zot_metrics_script: /Users/erichblume/bin/zot-metrics +zot_metrics_interval: 60 # seconds between metric collection +zot_metrics_log_dir: /opt/homebrew/var/log diff --git a/ansible/roles/zot_metrics/handlers/main.yml b/ansible/roles/zot_metrics/handlers/main.yml new file mode 100644 index 0000000..44fe49b --- /dev/null +++ 
b/ansible/roles/zot_metrics/handlers/main.yml @@ -0,0 +1,6 @@ +--- +- name: Reload zot-metrics + ansible.builtin.shell: | + launchctl unload ~/Library/LaunchAgents/mcquack.eblume.zot-metrics.plist 2>/dev/null || true + launchctl load ~/Library/LaunchAgents/mcquack.eblume.zot-metrics.plist + changed_when: true diff --git a/ansible/roles/zot_metrics/tasks/main.yml b/ansible/roles/zot_metrics/tasks/main.yml new file mode 100644 index 0000000..9b43125 --- /dev/null +++ b/ansible/roles/zot_metrics/tasks/main.yml @@ -0,0 +1,43 @@ +--- +- name: Ensure metrics directory exists + ansible.builtin.file: + path: "{{ zot_metrics_dir }}" + state: directory + mode: '0755' + +- name: Ensure log directory exists + ansible.builtin.file: + path: "{{ zot_metrics_log_dir }}" + state: directory + mode: '0755' + +- name: Ensure bin directory exists + ansible.builtin.file: + path: "{{ zot_metrics_script | dirname }}" + state: directory + mode: '0755' + +- name: Deploy zot-metrics script + ansible.builtin.template: + src: zot-metrics.sh.j2 + dest: "{{ zot_metrics_script }}" + mode: '0755' + +- name: Deploy zot-metrics LaunchAgent plist + ansible.builtin.template: + src: zot-metrics.plist.j2 + dest: ~/Library/LaunchAgents/mcquack.eblume.zot-metrics.plist + mode: '0644' + notify: Reload zot-metrics + +- name: Check if zot-metrics LaunchAgent is loaded + ansible.builtin.command: launchctl list mcquack.eblume.zot-metrics + register: zot_metrics_launchctl_check + changed_when: false + failed_when: false + +- name: Load zot-metrics LaunchAgent if not loaded + ansible.builtin.command: launchctl load ~/Library/LaunchAgents/mcquack.eblume.zot-metrics.plist + when: zot_metrics_launchctl_check.rc != 0 + changed_when: true + failed_when: false diff --git a/ansible/roles/zot_metrics/templates/zot-metrics.plist.j2 b/ansible/roles/zot_metrics/templates/zot-metrics.plist.j2 new file mode 100644 index 0000000..6efae24 --- /dev/null +++ b/ansible/roles/zot_metrics/templates/zot-metrics.plist.j2 @@ -0,0 
+1,21 @@ + + + + + + Label + mcquack.eblume.zot-metrics + ProgramArguments + + {{ zot_metrics_script }} + + StartInterval + {{ zot_metrics_interval }} + RunAtLoad + + StandardErrorPath + {{ zot_metrics_log_dir }}/mcquack.zot-metrics.err.log + StandardOutPath + {{ zot_metrics_log_dir }}/mcquack.zot-metrics.out.log + + diff --git a/ansible/roles/zot_metrics/templates/zot-metrics.sh.j2 b/ansible/roles/zot_metrics/templates/zot-metrics.sh.j2 new file mode 100644 index 0000000..24aaffd --- /dev/null +++ b/ansible/roles/zot_metrics/templates/zot-metrics.sh.j2 @@ -0,0 +1,25 @@ +#!/bin/bash +# {{ ansible_managed }} +# Collects Zot registry metrics for node_exporter textfile collector + +set -euo pipefail + +METRICS_URL="{{ zot_metrics_url }}" +OUTPUT_FILE="{{ zot_metrics_dir }}/zot.prom" +TEMP_FILE="${OUTPUT_FILE}.tmp" + +# Start output file with header +cat > "$TEMP_FILE" << 'HEADER' +# HELP zot_up Zot registry is up and responding +# TYPE zot_up gauge +HEADER + +# Check if zot is up +if curl -sf "$METRICS_URL" > /dev/null 2>&1; then + echo "zot_up 1" >> "$TEMP_FILE" +else + echo "zot_up 0" >> "$TEMP_FILE" +fi + +# Atomic move +mv "$TEMP_FILE" "$OUTPUT_FILE" diff --git a/bin/kubectl-credential-1password b/bin/kubectl-credential-1password new file mode 100755 index 0000000..04f2669 --- /dev/null +++ b/bin/kubectl-credential-1password @@ -0,0 +1,31 @@ +#!/bin/bash +# kubectl exec credential plugin for 1Password +# Usage: kubectl-credential-1password +# +# Fetches client certificate and key from 1Password and outputs +# ExecCredential JSON for kubectl authentication. 
+ +set -euo pipefail + +VAULT_ID="$1" +ITEM_ID="$2" +CERT_FIELD="$3" +KEY_FIELD="$4" + +# Fetch credentials from 1Password (strips surrounding quotes from text fields) +CLIENT_CERT=$(op --vault "$VAULT_ID" item get "$ITEM_ID" --fields "$CERT_FIELD" | sed 's/^"//; s/"$//') +CLIENT_KEY=$(op --vault "$VAULT_ID" item get "$ITEM_ID" --fields "$KEY_FIELD" | sed 's/^"//; s/"$//') + +# Output ExecCredential JSON +# Note: jq is used to properly escape the PEM data for JSON +jq -n \ + --arg cert "$CLIENT_CERT" \ + --arg key "$CLIENT_KEY" \ + '{ + "apiVersion": "client.authentication.k8s.io/v1beta1", + "kind": "ExecCredential", + "status": { + "clientCertificateData": $cert, + "clientKeyData": $key + } + }' diff --git a/mise-tasks/indri-services-check b/mise-tasks/indri-services-check index 3d0b375..07f9feb 100755 --- a/mise-tasks/indri-services-check +++ b/mise-tasks/indri-services-check @@ -53,15 +53,18 @@ check_service "forgejo" "ssh indri 'brew services list | grep forgejo | grep sta check_service "devpi" "ssh indri 'launchctl list | grep devpi | grep -v \"^-\"'" check_service "postgresql" "ssh indri 'brew services list | grep postgresql | grep started'" check_service "miniflux" "ssh indri 'brew services list | grep miniflux | grep started'" +check_service "zot" "ssh indri 'launchctl list | grep mcquack.eblume.zot | grep -v \"^-\"'" +check_service "zot-metrics" "ssh indri 'launchctl list | grep zot-metrics | grep -v \"^-\"'" +check_service "minikube-metrics" "ssh indri 'launchctl list | grep minikube-metrics | grep -v \"^-\"'" echo "" echo "HTTP endpoints (via Tailscale):" check_http "Loki" "http://indri:3100/ready" check_http "Prometheus" "http://indri:9090/-/healthy" -check_http "Grafana" "http://indri:3000/api/health" -check_http "Kiwix" "http://indri:5501/" -check_http "Forgejo" "http://indri:3001/" -check_http "Devpi" "http://indri:3141/+api" +check_http "Grafana" "https://grafana.tail8d86e.ts.net/api/health" +check_http "Kiwix" "https://kiwix.tail8d86e.ts.net/" 
+check_http "Forgejo" "https://forge.tail8d86e.ts.net/" +check_http "Devpi" "https://pypi.tail8d86e.ts.net/+api" check_http "Miniflux" "https://feed.tail8d86e.ts.net/healthcheck" # Transmission RPC is localhost-only by design, check via SSH check_service "Transmission RPC" "ssh indri 'curl -sf http://127.0.0.1:9091/transmission/rpc'" @@ -69,6 +72,16 @@ check_service "Transmission RPC" "ssh indri 'curl -sf http://127.0.0.1:9091/tran check_service "Transmission metrics" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/transmission.prom'" # PostgreSQL uses TCP not HTTP, check via pg_isready check_service "PostgreSQL" "ssh indri '/opt/homebrew/opt/postgresql@18/bin/pg_isready -h localhost'" +# Zot registry (via Tailscale service) +check_http "Zot Registry" "https://registry.tail8d86e.ts.net/v2/_catalog" +check_service "Zot metrics file" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/zot.prom'" +check_service "Minikube metrics file" "ssh indri 'test -f /opt/homebrew/var/node_exporter/textfile/minikube.prom'" + +echo "" +echo "Kubernetes cluster:" +check_service "minikube" "ssh indri 'minikube status --format={{.Host}} | grep -q Running'" +check_service "k8s-apiserver (indri)" "ssh indri 'kubectl get --raw /healthz'" +check_service "k8s-apiserver (remote)" "kubectl --kubeconfig=$HOME/.kube/minikube-indri/config.yml --context=minikube-indri get --raw /healthz" echo "" if [ $FAILED -eq 0 ]; then diff --git a/plans/k8s-migration.md b/plans/k8s-migration.md index 5a773cc..356691c 100644 --- a/plans/k8s-migration.md +++ b/plans/k8s-migration.md @@ -130,6 +130,9 @@ mise run tailnet-preview # Review changes - should show new tag mise run tailnet-up # Apply changes ``` +**Implementation Details:** +- Also need to add `"tag:registry"` to indri's tags in `pulumi/__main__.py` (the `DeviceTags` resource), not just define it in `policy.hujson`. The policy file defines the tag ownership rules, but the device tags are managed separately in the Python code. 
+ --- ### Step 0.2: Create Tailscale Services in Admin Console (MANUAL) @@ -140,7 +143,9 @@ mise run tailnet-up # Apply changes 2. Create service `registry` with: - Port: 443 (HTTPS) - Host: indri -3. Apply tag `tag:registry` to indri if not already tagged + +**Implementation Details:** +- Tag is applied to indri via Pulumi in Step 0.1, not manually in admin console. **Verification:** ```bash @@ -319,6 +324,10 @@ ssh indri 'curl -s http://localhost:5000/v2/_catalog' # Expected: {"repositories":["docker.io/library/alpine"]} ``` +**Implementation Details:** +- Changed port from 5000 to 5050 because macOS ControlCenter (AirPlay Receiver) uses port 5000 by default. +- Fixed sync config: use `"content": [{"prefix": "**", "destination": "/{{ registry.name }}"}]` instead of `"prefix": "{{ registry.name }}/**"`. The destination rewrites the local path, while prefix `**` matches all upstream repos. + --- ### Step 0.4: Add Zot to Tailscale Serve @@ -352,6 +361,11 @@ curl -s https://registry.tail8d86e.ts.net/v2/_catalog # Expected: {"repositories":["blumeops/test","docker.io/library/alpine"]} ``` +**Implementation Details:** +- Changed upstream port from 5000 to 5050 (see Step 0.3 implementation details). +- After running `tailscale serve`, the service must be approved in Tailscale admin console at https://login.tailscale.com/admin/services before it becomes accessible. +- Podman needed on gilbert for testing - added to Brewfile. Requires `podman machine init && podman machine start` after install. + --- ### Step 0.5: Create Zot Metrics Role @@ -461,6 +475,9 @@ mise run indri-services-check # Zot metrics... OK ``` +**Implementation Details:** +- Used Tailscale service URL (`https://registry.tail8d86e.ts.net/v2/_catalog`) instead of internal endpoint to verify full path works. 
+ --- ### Step 0.8: Install and Configure Podman on Indri @@ -504,6 +521,17 @@ ssh indri 'podman info' ssh indri 'podman run --rm hello-world' ``` +**Implementation Details:** +- **KNOWN ISSUE**: `podman machine init` and `podman machine start` have reliability issues when run via Ansible/SSH. The machine sometimes gets stuck in "Starting" state due to a race condition (see https://github.com/containers/podman/issues/16945). Apple Hypervisor may also require GUI session context. +- **WORKAROUND**: If the machine fails to start via Ansible, manually run on indri: + ```bash + podman machine rm -f podman-machine-default + podman machine init --cpus 4 --memory 8192 --disk-size 220 + podman machine start + ``` +- LaunchAgent approach was attempted but didn't resolve the issue reliably. +- TODO: Investigate proper automation solution for reliable podman machine management. + --- ### Step 0.9: Install and Configure Minikube @@ -570,6 +598,10 @@ ssh indri 'kubectl get nodes' # Expected: minikube Ready control-plane ... ``` +**Implementation Details:** +- Changed `minikube_memory` from 8192 to 7800 because podman machine reports slightly less available memory (7908MB) due to VM overhead. Minikube rejects memory requests exceeding what podman reports. +- Deployed with Kubernetes v1.34.0 and CRI-O 1.24.6. + --- ### Step 0.10: Configure Kubeconfig on Gilbert @@ -597,6 +629,93 @@ k9s # Should show the minikube cluster The exact approach will be determined during implementation based on what works best with the podman driver. +**Implementation Details:** + +Chose **Option 3: Recreate cluster with `--apiserver-names`** after researching alternatives: + +1. **SSH tunneling** - Requires keeping a tunnel running or complex on-demand setup +2. **SOCKS5 proxy with kubeconfig `proxy-url`** - Kubeconfig supports `proxy-url: socks5://localhost:1080` per-context, but still requires managing the proxy +3. 
**`--apiserver-names` + `--listen-address`** - Native minikube support, cleanest solution + +**Cluster Setup:** Recreated the minikube cluster with additional flags: +```bash +minikube delete +minikube start \ + --driver=podman \ + --container-runtime=cri-o \ + --cpus=4 --memory=7800 --disk-size=200g \ + --apiserver-names=indri \ + --listen-address=0.0.0.0 +``` + +- `--apiserver-names=indri` adds "indri" to the API server certificate SAN +- `--listen-address=0.0.0.0` tells podman to expose the API port on all interfaces +- API server port is dynamic (check with `kubectl config view --minify -o jsonpath="{.clusters[0].cluster.server}"` on indri) + +**Credential Management with 1Password:** + +Rather than copying private keys between machines, credentials are stored in 1Password and fetched on-demand using kubectl's exec credential plugin. This mirrors the 1Password SSH agent pattern for biometric-protected key access. + +1. **Store credentials in 1Password** (vault: `vg6xf6vvfmoh5hqjjhlhbeoaie`, item: `3jo4f2hnzvwfmamudfsbbbec7e`): + - `client-cert` - Contents of `~/.minikube/profiles/minikube/client.crt` (text field) + - `client-key` - Contents of `~/.minikube/profiles/minikube/client.key` (text field) + - `ca-cert` - Contents of `~/.minikube/ca.crt` (text field, not secret but stored for convenience) + +2. **Created credential helper script** at `bin/kubectl-credential-1password`: + ```bash + #!/bin/bash + # Fetches client cert/key from 1Password, outputs ExecCredential JSON + # Usage: kubectl-credential-1password + ``` + Symlinked to `~/.local/bin/kubectl-credential-1password` + +3. 
**Kubeconfig setup on gilbert:** + ```bash + # Store CA cert locally (not secret - public key for server verification) + mkdir -p ~/.kube/minikube-indri + op --vault <vault-id> item get <item-id> --fields ca-cert | sed 's/^"//; s/"$//' > ~/.kube/minikube-indri/ca.crt + + # Configure cluster + kubectl config set-cluster minikube-indri \ + --server=https://indri:<port> \ + --certificate-authority=/Users/eblume/.kube/minikube-indri/ca.crt + + # Configure credentials with exec plugin + kubectl config set-credentials minikube-indri \ + --exec-api-version=client.authentication.k8s.io/v1beta1 \ + --exec-command=kubectl-credential-1password \ + --exec-arg=<vault-id> \ + --exec-arg=<item-id> \ + --exec-arg=client-cert \ + --exec-arg=client-key + + # Create context + kubectl config set-context minikube-indri \ + --cluster=minikube-indri \ + --user=minikube-indri + ``` + +4. **Usage:** + ```bash + kubectl --context=minikube-indri get nodes + # or + kubectl config use-context minikube-indri + kubectl get nodes + ``` + +**Security Notes:** +- Client private key never stored on disk - fetched from 1Password on each kubectl command +- CA cert stored on disk (not secret - it's a public key for server verification) +- 1Password biometric/password prompt required for credential access +- `op` wraps text fields in double quotes; the surrounding quotes are stripped with `sed 's/^"//; s/"$//'` + +**References:** +- [minikube start options](https://minikube.sigs.k8s.io/docs/commands/start/) +- [Using kubectl via SSH Tunnel](https://blog.scottlowe.org/2020/06/16/using-kubectl-via-an-ssh-tunnel/) +- [SOCKS5 Proxy Access to K8s API](https://kubernetes.ltd/docs/tasks/extend-kubernetes/socks5-proxy-access-api/) +- [kubectl-tokensshtunnel](https://github.com/jordiprats/kubectl-tokensshtunnel) +- [Securing kubectl config with 1Password](https://blog.mikael.green/post/1password-kubeconfig/) + --- ### Step 0.11: Add Minikube to indri-services-check @@ -623,6 +742,10 @@ mise run indri-services-check # k8s-apiserver...
OK ``` +**Implementation Notes:** +- Added a third check `k8s-apiserver (remote)` that verifies kubectl access from gilbert, not just via SSH to indri. This ensures the 1Password credential flow and remote API server access are working. +- The remote check uses both `--kubeconfig` and `--context` flags explicitly since the script runs in bash (not fish) and doesn't inherit the KUBECONFIG environment variable from fish config. + --- ### Step 0.12: Create Zettelkasten Documentation @@ -631,6 +754,45 @@ mise run indri-services-check - `~/code/personal/zk/zot.md` - `~/code/personal/zk/minikube.md` +**Files to update:** +- `~/code/personal/zk/1767747119-YCPO.md` (main blumeops card) + +**Updates to main blumeops card:** + +1. Add to **Device Tags** table: + | `tag:registry` | indri | Container registry access | + +2. Add to **Services** table: + | **Registry** | https://registry.tail8d86e.ts.net | OCI container registry (Zot) | [[zot]] | + | **Kubernetes** | https://indri: | Minikube cluster | [[minikube]] | + +3. Add to **Port Map (Indri)** table: + | 5050 | Zot | HTTP | localhost | Container registry | + | | K8s API | HTTPS | 0.0.0.0 | Minikube API server | + +4. Add new section **Remote Kubernetes Access**: + ```markdown + ## Remote Kubernetes Access (from Gilbert) + + The minikube cluster on indri is accessible from gilbert via direct connection. + Cluster was created with `--apiserver-names=indri --listen-address=0.0.0.0`. 
+ + **Fish abbreviations** (in `~/.config/fish/config.fish`): + - `ki` → `kubectl --context=minikube-indri` + - `k9i` → `k9s --context=minikube-indri` + - `k9` → `k9s` + + ```bash + # Quick access via abbreviations + ki get nodes + k9i + + # Or explicitly set context + kubectl config use-context minikube-indri + kubectl get nodes + ``` + ``` + **Template for zot.md:** ```markdown --- @@ -651,7 +813,7 @@ Zot is an OCI-native container registry running on Indri, providing: ## Service Details - URL: https://registry.tail8d86e.ts.net -- Local port: 5000 +- Local port: 5050 - Data directory: ~/zot - Config: ~/.config/zot/config.json - Managed via: mcquack LaunchAgent @@ -669,10 +831,10 @@ Zot is an OCI-native container registry running on Indri, providing: \`\`\`bash # List all images -curl -s http://localhost:5000/v2/_catalog | jq +curl -s http://localhost:5050/v2/_catalog | jq # Pull via cache (from indri or k8s) -podman pull localhost:5000/docker.io/library/nginx:latest +podman pull localhost:5050/docker.io/library/nginx:latest # Build and push private image (from gilbert) podman build -t registry.tail8d86e.ts.net/blumeops/myapp:v1 . @@ -691,6 +853,91 @@ tail -f ~/Library/Logs/mcquack.zot.err.log - Initial setup for k8s migration Phase 0 ``` +**Template for minikube.md:** +```markdown +--- +id: minikube +aliases: + - minikube + - kubernetes + - k8s +tags: + - blumeops +--- + +# Minikube Management Log + +Minikube provides a single-node Kubernetes cluster on Indri for running containerized services. + +## Cluster Details + +- Driver: podman (rootless) +- Container runtime: CRI-O +- Kubernetes version: v1.34.0 +- Resources: 4 CPUs, 7800MB RAM, 200GB disk +- API server: https://indri: (accessible from gilbert via Tailscale) + +## Remote Access from Gilbert + +Cluster was created with `--apiserver-names=indri --listen-address=0.0.0.0` to allow remote kubectl access. 
+ +\`\`\`bash +# Switch context +kubectl config use-context minikube-indri + +# Verify +kubectl get nodes +kubectl get namespaces + +# Use k9s +k9s --context minikube-indri +\`\`\` + +## Useful Commands (on indri) + +\`\`\`bash +# Cluster status +minikube status + +# Start/stop cluster +minikube start +minikube stop + +# Access dashboard +minikube dashboard + +# SSH into node +minikube ssh + +# View logs +minikube logs +\`\`\` + +## Podman Machine (prerequisite) + +Minikube uses podman as the container runtime. The podman machine must be running: + +\`\`\`bash +# Check podman machine +podman machine list + +# Start if needed +podman machine start +\`\`\` + +## Log + +### [DATE] +- Initial cluster setup for k8s migration Phase 0 +- Configured for remote access with --apiserver-names=indri +``` + +**Implementation Notes:** +- Created zot.md and minikube.md in ~/code/personal/zk/ +- Updated 1767747119-YCPO.md (main blumeops card) with all specified changes +- Added 1Password credential plugin reference to minikube docs +- K8s API port is 39535 (dynamically assigned by minikube, may change on cluster recreation) + --- ### Step 0.13: Update Main Playbook @@ -711,6 +958,10 @@ tail -f ~/Library/Logs/mcquack.zot.err.log tags: minikube ``` +**Implementation Notes:** +- Roles were added incrementally during Steps 0.3, 0.5, 0.8, and 0.9 +- All four roles (zot, zot_metrics, podman, minikube) confirmed present in indri.yml + --- ### Phase 0 Verification Checklist diff --git a/pulumi/__main__.py b/pulumi/__main__.py index 33f3d0d..3b8b817 100644 --- a/pulumi/__main__.py +++ b/pulumi/__main__.py @@ -52,6 +52,7 @@ indri_tags = tailscale.DeviceTags( "tag:loki", "tag:pg", "tag:feed", + "tag:registry", # Zot container registry ], ) diff --git a/pulumi/policy.hujson b/pulumi/policy.hujson index d215ef7..ff4d98b 100644 --- a/pulumi/policy.hujson +++ b/pulumi/policy.hujson @@ -101,6 +101,7 @@ "tag:loki": ["autogroup:admin", "tag:blumeops"], "tag:pg": ["autogroup:admin", "tag:blumeops"], 
"tag:feed": ["autogroup:admin", "tag:blumeops"], + "tag:registry": ["autogroup:admin", "tag:blumeops"], }, // ============== ACL Tests ============== @@ -108,13 +109,13 @@ // Erich can access everything { "src": "blume.erich@gmail.com", - "accept": ["tag:grafana:443", "tag:kiwix:443", "tag:feed:443", "tag:loki:3100", "tag:pg:5432", "tag:homelab:22"], + "accept": ["tag:grafana:443", "tag:kiwix:443", "tag:feed:443", "tag:loki:3100", "tag:pg:5432", "tag:homelab:22", "tag:registry:443"], }, // Allison can access user services but NOT grafana, loki, or NAS { "src": "acmdavis@gmail.com", "accept": ["tag:kiwix:443", "tag:forge:443", "tag:feed:443", "tag:pg:5432"], - "deny": ["tag:grafana:443", "tag:loki:3100", "tag:nas:445"], + "deny": ["tag:grafana:443", "tag:loki:3100", "tag:nas:445", "tag:registry:443"], }, // Homelab can reach homelab and NAS {