2026-01-14 13:23:05 -08:00
|
|
|
#!/usr/bin/env bash
#MISE description="Check that all services are online and responding"

# Strict mode: stop on unhandled errors, unset variables and failed pipeline
# stages. The check helpers run their probes inside `if`, so an individual
# failing probe is reported instead of aborting the script.
set -euo pipefail

# Colors for output. These hold literal backslash escapes (not raw ESC bytes),
# so they only take effect through `echo -e` or inside a printf format string.
# They are constants, hence readonly.
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[0;33m'
readonly NC='\033[0m' # No Color

# Overall result flag: set to 1 by any failing check and used at the end of
# the script to pick the exit status.
FAILED=0
|
|
|
|
|
|
|
|
|
|
#######################################
# Run an arbitrary shell command as a health probe and report the result.
# Globals:   GREEN, RED, NC (read), FAILED (set to 1 when the probe fails)
# Arguments: $1 - label printed in the left column
#            $2 - shell command line, run through eval; its exit status is
#                 the verdict (0 = OK)
# Outputs:   "<label>... OK" or "<label>... FAILED" on stdout; the probe's own
#            stdout/stderr are discarded
# Returns:   0 in both cases
#######################################
check_service() {
  local name="$1"
  local check_cmd="$2"

  printf "%-24s " "$name..."

  # The probe gets /dev/null as stdin: commands such as ssh read stdin by
  # default and would otherwise swallow input that belongs to the caller
  # (for example the data feeding an enclosing `while read` loop).
  if eval "$check_cmd" < /dev/null > /dev/null 2>&1; then
    echo -e "${GREEN}OK${NC}"
  else
    echo -e "${RED}FAILED${NC}"
    FAILED=1
  fi
}
|
|
|
|
|
|
|
|
|
|
#######################################
# Probe an HTTP(S) URL with curl and report the result.
# Globals:   GREEN, RED, NC (read), FAILED (set to 1 when the request fails)
# Arguments: $1 - label printed in the left column
#            $2 - URL to request
# Outputs:   "<label>... OK" or "<label>... FAILED" on stdout
# Returns:   0 in both cases
#######################################
check_http() {
  local label="$1"
  local endpoint="$2"

  printf "%-24s " "$label..."

  # -f turns HTTP error responses into a non-zero exit status; the request is
  # abandoned after 5 seconds.
  if ! curl -sf --max-time 5 "$endpoint" > /dev/null 2>&1; then
    echo -e "${RED}FAILED${NC}"
    FAILED=1
    return
  fi

  echo -e "${GREEN}OK${NC}"
}
|
|
|
|
|
|
2026-03-22 14:52:56 -07:00
|
|
|
# ============== Grafana Alerting API ==============

# Base URL of the Grafana instance that is queried for alert state. Defaults
# to the production instance; export GRAFANA_URL to point the check at a
# different one.
GRAFANA_URL="${GRAFANA_URL:-https://grafana.ops.eblu.me}"
# Base64-encoded "user:password" for HTTP Basic auth. Starts empty and is
# filled in by fetch_alerts on first use.
GRAFANA_CREDS=""
|
|
|
|
|
|
|
|
|
|
#######################################
# Download the current alert list from Grafana's alerting API.
# Globals:   GRAFANA_URL (read), GRAFANA_CREDS (read; populated on first use
#            from the password returned by `op read`)
# Arguments: none
# Outputs:   the raw response body on stdout, or an empty line when no
#            credentials could be obtained or the request failed
# Returns:   0 in all cases, so callers under `set -e` are never aborted
#######################################
fetch_alerts() {
  if [ -z "$GRAFANA_CREDS" ]; then
    local pass
    # Best effort: if the 1Password CLI fails, continue without credentials
    # and report "no data" below.
    pass=$(op read 'op://vg6xf6vvfmoh5hqjjhlhbeoaie/oxkcr3xtxnewy7noep2izvyr6y/password' 2>/dev/null) || true
    if [ -n "$pass" ]; then
      # printf emits the secret verbatim (echo may treat it as options or
      # escapes). GNU base64 wraps its output at 76 columns, which would put a
      # newline inside the Authorization header for long passwords, so the
      # line breaks are stripped.
      GRAFANA_CREDS=$(printf '%s' "admin:$pass" | base64 | tr -d '\n')
    fi
  fi

  if [ -z "$GRAFANA_CREDS" ]; then
    echo ""
    return
  fi

  # -f makes HTTP errors a failure; any failure degrades to an empty line.
  curl -sf --max-time 10 \
    -H "Authorization: Basic $GRAFANA_CREDS" \
    "$GRAFANA_URL/api/prometheus/grafana/api/v1/alerts" 2>/dev/null || echo ""
}
|
|
|
|
|
|
|
|
|
|
# Fetch all alerts once
|
|
|
|
|
# fetch_alerts prints an empty line instead of failing, so ALERTS_JSON is
# simply empty when Grafana could not be queried; check_alert tests for that.
ALERTS_JSON=$(fetch_alerts)
|
|
|
|
|
|
|
|
|
|
#######################################
# Report whether a Grafana alert rule is currently firing, based on the
# response cached in ALERTS_JSON.
# Globals:   ALERTS_JSON (read), GREEN, YELLOW, RED, NC (read),
#            FAILED (set to 1 when the alert fires or the data is unusable)
# Arguments: $1 - label printed in the left column
#            $2 - value of the alert's "alertname" label
#            $3 - optional label key to filter on
#            $4 - optional value the label named by $3 must have
# Outputs:   one status line (OK / FIRING / NO DATA / ERROR) on stdout; when
#            firing, each matching alert's summary plus the first runbook URL
# Returns:   0 in all cases
#######################################
check_alert() {
  local name="$1"
  local alertname="$2"
  # Optional: filter by a label key=value
  local filter_key="${3:-}"
  local filter_value="${4:-}"

  printf "%-24s " "$name..."

  if [ -z "$ALERTS_JSON" ]; then
    echo -e "${YELLOW}NO DATA${NC} (can't reach Grafana alerting API)"
    return
  fi

  # The filter values are passed as arguments, never spliced into the Python
  # source, so quotes or backslashes in them cannot break or alter the script.
  # Output is one "runbook_url|summary" line per firing alert; the URL comes
  # first so a "|" inside the summary survives the split below.
  local firing rc=0
  firing=$(python3 -c '
import json, sys

alertname, filter_key, filter_value = sys.argv[1:4]
try:
    data = json.load(sys.stdin)
    alerts = data.get("data", {}).get("alerts", []) or []
except Exception:
    sys.exit(1)

for a in alerts:
    labels = a.get("labels") or {}
    if labels.get("alertname") != alertname:
        continue
    if filter_key and labels.get(filter_key) != filter_value:
        continue
    state = a.get("state") or ""
    # Matches "Pending", "Alerting" and any state that starts with "Alerting".
    if state == "Pending" or state.startswith("Alerting"):
        notes = a.get("annotations") or {}
        url = " ".join(str(notes.get("runbook_url", "")).splitlines())
        summary = " ".join(str(notes.get("summary", "")).splitlines())
        print(f"{url}|{summary}")
' "$alertname" "$filter_key" "$filter_value" <<< "$ALERTS_JSON" 2>/dev/null) || rc=$?

  # Without this guard a failing command substitution would trip `set -e` and
  # end the whole script silently in the middle of the report.
  if [ "$rc" -ne 0 ]; then
    echo -e "${RED}ERROR${NC} (could not parse Grafana alerting API response)"
    FAILED=1
    return
  fi

  if [ -z "$firing" ]; then
    echo -e "${GREEN}OK${NC}"
  else
    echo -e "${RED}FIRING${NC}"
    local runbook_printed=false
    local runbook summary
    while IFS='|' read -r runbook summary; do
      if [ -n "$summary" ]; then
        # printf '%s' prints alert text verbatim (echo -e would interpret
        # backslash sequences contained in it).
        printf ' %s\n' "$summary"
      fi
      if [ -n "$runbook" ] && [ "$runbook_printed" = false ]; then
        printf ' Runbook: %s\n' "$runbook"
        runbook_printed=true
      fi
    done <<< "$firing"
    FAILED=1
  fi
}
|
|
|
|
|
|
2026-02-04 07:49:15 -08:00
|
|
|
echo "Checking services..."
echo "===================="
echo ""

# Local services on indri (not yet covered by alerting)
echo "Local services on indri:"
# Each entry is "<display name> <launchd label>". A job passes when
# `launchctl list <label>` on indri prints at least one line that does not
# start with "-".
# NOTE(review): presumably this shows the job is loaded rather than currently
# running - confirm that is the intended meaning.
launchd_jobs=(
  "forgejo mcquack.eblume.forgejo"
  "alloy mcquack.eblume.alloy"
  "borgmatic mcquack.eblume.borgmatic"
  "borgmatic-metrics mcquack.borgmatic-metrics"
  "zot mcquack.eblume.zot"
  "zot-metrics mcquack.zot-metrics"
  "minikube-metrics mcquack.minikube-metrics"
  "jellyfin-metrics mcquack.eblume.jellyfin-metrics"
)
# Every ssh call carries a connect timeout (the same 5 s the ringtail ssh
# probe uses) so an unreachable indri fails each check quickly instead of
# stalling the whole run.
for launchd_job in "${launchd_jobs[@]}"; do
  check_service "${launchd_job%% *}" \
    "ssh -o ConnectTimeout=5 indri 'launchctl list ${launchd_job#* } | grep -v \"^-\"'"
done

echo ""
echo "Metrics textfiles (via alerting):"
check_alert "textfile-freshness" "TextfileStale"

echo ""
echo "Kubernetes cluster (not yet covered by alerting):"
check_service "minikube" "ssh -o ConnectTimeout=5 indri 'minikube status --format={{.Host}} | grep -q Running'"
check_service "k8s-apiserver (indri)" "ssh -o ConnectTimeout=5 indri 'kubectl get --raw /healthz'"
# \$HOME is expanded (inside double quotes) when check_service evals the
# command, so a home directory containing spaces is still a single argument.
check_service "k8s-apiserver (remote)" "kubectl --kubeconfig=\"\$HOME/.kube/minikube-indri/config.yml\" --context=minikube-indri get --raw /healthz"

echo ""
echo "HTTP endpoints (via alerting):"
# Each entry is "<display name> <value of the alert's service label>".
for service_probe in \
  "Prometheus prometheus" \
  "Loki loki" \
  "Grafana grafana" \
  "ArgoCD argocd" \
  "Kiwix kiwix" \
  "Miniflux miniflux" \
  "TeslaMate teslamate" \
  "Devpi devpi" \
  "Transmission transmission" \
  "Immich immich" \
  "Navidrome navidrome"; do
  check_alert "${service_probe%% *}" "ServiceProbeFailure" "service" "${service_probe#* }"
done

echo ""
echo "HTTP endpoints (not yet covered by alerting):"
|
Expose Forgejo publicly at forge.eblu.me (#278)
## Summary
Expose Forgejo publicly at `forge.eblu.me` via the Fly.io reverse proxy — the first dynamic, authenticated public-facing service.
- **Forgejo hardening:** Domain changed to forge.eblu.me, SSH stays on forge.ops.eblu.me, reverse proxy trust headers configured, local registration locked to external-only (Authentik SSO)
- **Tailscale Ingress:** ExternalName Service + Ingress in tailscale-operator creates forge.tail8d86e.ts.net endpoint
- **Fly.io proxy:** nginx server block with rate-limited auth endpoints (3r/s), fail2ban with custom nginx-deny action, security headers, /swagger blocked, WebSocket support, 512m body limit
- **Authentik:** OAuth callback updated to forge.eblu.me
- **DNS/TLS:** CNAME record in Pulumi, cert in fly-setup
- **Rename:** ~29 files updated from forge.ops.eblu.me to forge.eblu.me (HTTPS refs only; SSH, container builds, and Caddy table kept as-is)
## Deployment Order
1. `mise run provision-indri -- --tags forgejo` (config changes)
2. Verify forge.ops.eblu.me still works
3. `argocd app set tailscale-operator --revision feature/forge-public && argocd app sync tailscale-operator`
4. Verify `curl https://forge.tail8d86e.ts.net`
5. `cd fly && fly deploy`
6. Verify pre-DNS: `curl -H "Host: forge.eblu.me" https://blumeops-proxy.fly.dev/`
7. `fly certs add forge.eblu.me -a blumeops-proxy`
8. `argocd app set authentik --revision feature/forge-public && argocd app sync authentik`
9. `mise run dns-preview && mise run dns-up`
10. Full verification (see below)
11. Rehearse `mise run fly-shutoff`
12. After merge: reset ArgoCD revisions to main, re-sync
## Verification Checklist
- [ ] forge.eblu.me loads, shows public repos
- [ ] forge.ops.eblu.me still works from tailnet
- [ ] SSH clone via forge.ops.eblu.me:2222 works
- [ ] HTTPS clone via forge.eblu.me works
- [ ] UI shows forge.eblu.me for HTTPS clone, forge.ops.eblu.me for SSH
- [ ] /swagger returns 403
- [ ] Rapid login attempts trigger 429 rate limit
- [ ] fail2ban bans after 5 failed logins in 10 minutes
- [ ] ArgoCD can still sync (SSH unaffected)
- [ ] `mise run fly-shutoff` stops all public traffic
- [ ] `mise run services-check` passes
Reviewed-on: https://forge.eblu.me/eblume/blumeops/pulls/278
2026-03-03 08:40:41 -08:00
|
|
|
# Direct HTTP probes, one "<display name>|<URL>" pair per entry.
for http_probe in \
  "Forgejo|https://forge.eblu.me/" \
  "Zot Registry|https://registry.ops.eblu.me/v2/_catalog" \
  "CV|https://cv.ops.eblu.me/"; do
  check_http "${http_probe%%|*}" "${http_probe#*|}"
done
|
Deploy Frigate NVR stack with Mosquitto, Ntfy, and frigate-notify (#190)
## Summary
Deploy a cloud-free NVR stack for the GableCam (ReoLink Elite Floodlight at 192.168.1.159):
- **Mosquitto** — shared MQTT broker in `mqtt` namespace (cluster-internal, no auth)
- **Ntfy** — self-hosted push notifications in `ntfy` namespace, exposed at `ntfy.tail8d86e.ts.net` / `ntfy.ops.eblu.me`
- **Frigate** — NVR with GableCam via HTTP-FLV, ONNX CPU detection, NFS recordings on sifaka, exposed at `nvr.tail8d86e.ts.net` / `nvr.ops.eblu.me`
- **frigate-notify** — bridges Frigate detection events (person, car, dog, cat) to Ntfy alerts via MQTT
Also includes:
- Prometheus scrape target for Frigate metrics
- Grafana dashboard for Frigate (status, inference speed, FPS, CPU/memory, storage)
- Caddy reverse proxy entries for `nvr.ops.eblu.me` and `ntfy.ops.eblu.me`
## Prerequisites
- [ ] Create NFS share `frigate` on sifaka (`/volume1/frigate`, RW for indri)
- [ ] Create 1Password item "Reolink Floodlight Camera" in `blumeops` vault with `username` and `password` fields
## Deployment (after merge)
```bash
argocd app sync apps
argocd app sync mosquitto
argocd app sync ntfy
argocd app sync frigate
argocd app sync grafana-config
argocd app sync prometheus
mise run provision-indri -- --tags caddy
mise run services-check
```
## Verification
- [ ] Mosquitto pod running, accepting connections on 1883
- [ ] Ntfy web UI accessible at `ntfy.ops.eblu.me`
- [ ] Frigate web UI at `nvr.ops.eblu.me` showing GableCam live feed
- [ ] Object detection working (ONNX, person/car/dog/cat)
- [ ] Recordings appearing in NFS share on sifaka
- [ ] frigate-notify sending detection alerts to Ntfy
- [ ] Prometheus scraping Frigate metrics
- [ ] Grafana dashboard showing Frigate data
Reviewed-on: https://forge.ops.eblu.me/eblume/blumeops/pulls/190
2026-02-14 21:27:44 -08:00
|
|
|
# Health endpoints probed directly, one "<display name>|<URL>" pair per entry.
for health_probe in \
  "Ntfy|https://ntfy.ops.eblu.me/v1/health" \
  "Authentik|https://authentik.ops.eblu.me/-/health/live/"; do
  check_http "${health_probe%%|*}" "${health_probe#*|}"
done
|
Deploy Frigate NVR stack with Mosquitto, Ntfy, and frigate-notify (#190)
## Summary
Deploy a cloud-free NVR stack for the GableCam (ReoLink Elite Floodlight at 192.168.1.159):
- **Mosquitto** — shared MQTT broker in `mqtt` namespace (cluster-internal, no auth)
- **Ntfy** — self-hosted push notifications in `ntfy` namespace, exposed at `ntfy.tail8d86e.ts.net` / `ntfy.ops.eblu.me`
- **Frigate** — NVR with GableCam via HTTP-FLV, ONNX CPU detection, NFS recordings on sifaka, exposed at `nvr.tail8d86e.ts.net` / `nvr.ops.eblu.me`
- **frigate-notify** — bridges Frigate detection events (person, car, dog, cat) to Ntfy alerts via MQTT
Also includes:
- Prometheus scrape target for Frigate metrics
- Grafana dashboard for Frigate (status, inference speed, FPS, CPU/memory, storage)
- Caddy reverse proxy entries for `nvr.ops.eblu.me` and `ntfy.ops.eblu.me`
## Prerequisites
- [ ] Create NFS share `frigate` on sifaka (`/volume1/frigate`, RW for indri)
- [ ] Create 1Password item "Reolink Floodlight Camera" in `blumeops` vault with `username` and `password` fields
## Deployment (after merge)
```bash
argocd app sync apps
argocd app sync mosquitto
argocd app sync ntfy
argocd app sync frigate
argocd app sync grafana-config
argocd app sync prometheus
mise run provision-indri -- --tags caddy
mise run services-check
```
## Verification
- [ ] Mosquitto pod running, accepting connections on 1883
- [ ] Ntfy web UI accessible at `ntfy.ops.eblu.me`
- [ ] Frigate web UI at `nvr.ops.eblu.me` showing GableCam live feed
- [ ] Object detection working (ONNX, person/car/dog/cat)
- [ ] Recordings appearing in NFS share on sifaka
- [ ] frigate-notify sending detection alerts to Ntfy
- [ ] Prometheus scraping Frigate metrics
- [ ] Grafana dashboard showing Frigate data
Reviewed-on: https://forge.ops.eblu.me/eblume/blumeops/pulls/190
2026-02-14 21:27:44 -08:00
|
|
|
# Frigate is probed through the /api/version path on the nvr host.
check_http "Frigate" "https://nvr.ops.eblu.me/api/version"
|
2026-01-22 12:06:02 -08:00
|
|
|
|
Polish ringtail NixOS config and add documentation (#208)
## Summary
- Fix Super+Return keybinding to launch wezterm in sway
- Set fish as default login shell
- Remove `initialPassword` (real password already set)
- Add 1Password CLI + GUI, chezmoi, and dev tool packages (neovim, eza, fd, fzf, zoxide, starship, atuin, bat, ripgrep)
- Add ringtail reference card, update host inventory and reference index
- Changelog fragment
## Post-merge deployment
- `mise run provision-ringtail` to rebuild NixOS
- On ringtail: launch 1Password GUI, enable CLI integration (Settings > Developer > CLI integration)
- Chezmoi needs `.chezmoiignore` updates in the dotfiles repo (separate task)
Reviewed-on: https://forge.ops.eblu.me/eblume/blumeops/pulls/208
2026-02-18 17:53:47 -08:00
|
|
|
printf '\n'
printf '%s\n' "Frigate (via alerting):"
check_alert "camera-fps" "FrigateCameraDown"
printf '%s\n' "Frigate (not yet covered by alerting):"
# jq filter applied to Frigate's /api/stats: true only when at least one
# storage entry whose key starts with "/media" exists and every such entry
# reports free > 0. `jq -e` turns a false/null result into a failing status.
frigate_storage_filter='.service.storage | to_entries | map(select(.key | startswith("/media"))) | length > 0 and all(.[]; .value.free > 0)'
check_service "frigate-storage" "curl -sf --max-time 5 https://nvr.ops.eblu.me/api/stats | jq -e '${frigate_storage_filter}'"

printf '\n'
printf '%s\n' "Ringtail (not yet covered by alerting):"
|
Polish ringtail NixOS config and add documentation (#208)
## Summary
- Fix Super+Return keybinding to launch wezterm in sway
- Set fish as default login shell
- Remove `initialPassword` (real password already set)
- Add 1Password CLI + GUI, chezmoi, and dev tool packages (neovim, eza, fd, fzf, zoxide, starship, atuin, bat, ripgrep)
- Add ringtail reference card, update host inventory and reference index
- Changelog fragment
## Post-merge deployment
- `mise run provision-ringtail` to rebuild NixOS
- On ringtail: launch 1Password GUI, enable CLI integration (Settings > Developer > CLI integration)
- Chezmoi needs `.chezmoiignore` updates in the dotfiles repo (separate task)
Reviewed-on: https://forge.ops.eblu.me/eblume/blumeops/pulls/208
2026-02-18 17:53:47 -08:00
|
|
|
# All ringtail probes go over ssh with the same connect timeout, so an
# unreachable host fails each check quickly instead of hanging.
check_service "ssh" "ssh -o ConnectTimeout=5 ringtail true"
check_service "tailscale" "ssh -o ConnectTimeout=5 ringtail 'tailscale status --self --json' | jq -e '.Self.Online' > /dev/null"
# grep -w matches "Ready" only as a whole word; a plain substring match would
# also accept a node whose status is "NotReady".
check_service "k3s" "ssh -o ConnectTimeout=5 ringtail 'KUBECONFIG=/etc/rancher/k3s/k3s.yaml k3s kubectl get nodes --no-headers | grep -qw Ready'"
|
Add k3s, 1Password Connect, and systemd nix-container-builder to ringtail (#209)
## Summary
Extends ringtail from a desktop/gaming NixOS box into an infrastructure node with a k3s cluster, secrets management, and a Forgejo Actions
runner for building containers with Nix.
### K3s cluster
- Single-node k3s with Traefik/ServiceLB/metrics-server disabled (minimal footprint)
- TLS SAN set to `ringtail.tail8d86e.ts.net` so ArgoCD on indri can manage it via Tailscale
- Containerd registry mirrors pull through Zot on indri (`k3s-registries.yaml`)
- Tailscale interface added to `trustedInterfaces` for cross-node ArgoCD access
- `kubectl` added to system packages
### 1Password Connect + External Secrets Operator
- Four new ArgoCD apps targeting `k3s-ringtail`: `1password-connect-ringtail`, `external-secrets-crds-ringtail`, `external-secrets-ringtail`,
`external-secrets-config-ringtail`
- Reuses the same Helm charts/values as indri, just pointed at ringtail's k3s API server
- Bootstrap secrets (`op-credentials`, `onepassword-token`) provisioned by Ansible pre_tasks via `op read`, then applied to the `1password`
namespace in post_tasks
### Systemd Forgejo Actions runner
- Native `services.gitea-actions-runner` with `forgejo-runner` package — no DinD, no k8s pod, runs directly on the NixOS host
- Label `nix-container-builder:host` — jobs execute on the host with `nix`, `skopeo`, `nodejs`, etc. in PATH
- Registration token fetched from 1Password (`Forgejo Secrets/runner_reg`) by Ansible and written to `/etc/forgejo-runner/token.env`
- Runner's dynamic user (`gitea-runner`) added to `nix.settings.trusted-users` for nix daemon access
### Nix container build workflow
- New `.forgejo/workflows/build-container-nix.yaml` triggers on `*-nix-v[0-9]*` tags (e.g. `nettest-nix-v1.0.0`)
- Builds with `nix build -f containers/<name>/default.nix`, pushes to Zot via `skopeo copy`
- Existing Dockerfile workflow guarded with `if: !contains(github.ref_name, '-nix-v')` to avoid double-triggering
### Mise task updates
- `container-tag-and-release` auto-detects `default.nix` vs `Dockerfile` and uses the appropriate tag format (`-nix-v` vs `-v`)
- `container-list` shows build type indicator (`[nix]` / `[dockerfile]`)
## Post-merge
1. `mise run provision-ringtail` — deploys k3s token, runner token, NixOS rebuild
2. Register k3s cluster in ArgoCD (first time only):
```fish
ssh ringtail 'sudo cat /etc/rancher/k3s/k3s.yaml' | \
sed 's|127.0.0.1|ringtail.tail8d86e.ts.net|' > /tmp/k3s-ringtail.yaml
set -x KUBECONFIG /tmp/k3s-ringtail.yaml
argocd cluster add default --name k3s-ringtail
```
3. Sync ArgoCD apps in order: 1password-connect-ringtail -> external-secrets-crds-ringtail -> external-secrets-ringtail ->
external-secrets-config-ringtail
4. Verify runner: ssh ringtail 'systemctl status gitea-runner-nix-container-builder'
5. Check Forgejo admin panel for ringtail-nix-builder runner online
6. Test: create containers/<name>/default.nix, tag with <name>-nix-v0.1.0
Reviewed-on: https://forge.ops.eblu.me/eblume/blumeops/pulls/209
2026-02-18 21:15:30 -08:00
|
|
|
# API server reached from this machine through the k3s-ringtail kubectl context.
check_service "k3s-apiserver (remote)" "kubectl --context=k3s-ringtail get --raw /healthz"
# Same connect timeout as the ringtail ssh probe, so an unreachable host
# cannot stall the run here.
check_service "forgejo-runner" "ssh -o ConnectTimeout=5 ringtail 'systemctl is-active gitea-runner-nix_container_builder.service'"
|
Polish ringtail NixOS config and add documentation (#208)
## Summary
- Fix Super+Return keybinding to launch wezterm in sway
- Set fish as default login shell
- Remove `initialPassword` (real password already set)
- Add 1Password CLI + GUI, chezmoi, and dev tool packages (neovim, eza, fd, fzf, zoxide, starship, atuin, bat, ripgrep)
- Add ringtail reference card, update host inventory and reference index
- Changelog fragment
## Post-merge deployment
- `mise run provision-ringtail` to rebuild NixOS
- On ringtail: launch 1Password GUI, enable CLI integration (Settings > Developer > CLI integration)
- Chezmoi needs `.chezmoiignore` updates in the dotfiles repo (separate task)
Reviewed-on: https://forge.ops.eblu.me/eblume/blumeops/pulls/208
2026-02-18 17:53:47 -08:00
|
|
|
|
2026-02-19 14:38:21 -08:00
|
|
|
printf '\n'
printf '%s\n' "Pod health (via alerting):"
check_alert "pod-readiness" "PodNotReady"

printf '\n'
printf '%s\n' "Database (via alerting):"
check_alert "PostgreSQL" "PostgresClusterUnhealthy"

printf '\n'
printf '%s\n' "Public services (not yet covered by alerting):"
# One "<display name>|<URL>" pair per entry.
for public_site in \
  "Docs (public)|https://docs.eblu.me/" \
  "CV (public)|https://cv.eblu.me/" \
  "Forge (public)|https://forge.eblu.me/" \
  "Fly.io healthz|https://blumeops-proxy.fly.dev/healthz"; do
  check_http "${public_site%%|*}" "${public_site#*|}"
done

printf '\n'
printf '%s\n' "ArgoCD app sync status (via alerting):"
check_alert "argocd-sync" "ArgoCDAppOutOfSync"
# Keep the detailed table as a summary view
printf "%-20s %-12s %-12s %s\n" "NAME" "SYNC" "HEALTH" "TARGET"
while read -r app sync_state health_state revision; do
  # Only the SYNC column is colored: green when Synced, red when OutOfSync,
  # plain for anything else. The table is informational and never touches
  # FAILED.
  case "$sync_state" in
    Synced) tint="$GREEN" ;;
    OutOfSync) tint="$RED" ;;
    *) tint="" ;;
  esac
  printf "%-20s ${tint}%-12s${tint:+$NC} %-12s %s\n" "$app" "$sync_state" "$health_state" "$revision"
done < <(kubectl --context=minikube-indri get applications -n argocd --no-headers -o custom-columns='NAME:.metadata.name,SYNC:.status.sync.status,HEALTH:.status.health.status,TARGET:.spec.source.targetRevision' 2>/dev/null)

printf '\n'
# The exit status mirrors the FAILED flag collected by the checks above.
if [ "$FAILED" -ne 0 ]; then
  echo -e "${RED}Some services failed health check${NC}"
  exit 1
fi
echo -e "${GREEN}All services healthy!${NC}"
exit 0