hephaestus/heph.nvim/lua/heph/daemon.lua
Erich Blume b932a814ca
Some checks failed
Build / validate (pull_request) Failing after 5m51s
heph.nvim: fix daemon.wait_ready deadlock on a stale socket
wait_ready ran the rpc health probe inside a `vim.wait` predicate, and the probe
itself uses `vim.wait` — nesting vim.wait inside another vim.wait's predicate
deadlocks Neovim. It only bit when the socket file existed: the first launch's
socket doesn't exist yet (probe short-circuits), but the second launch hit the
stale socket left by the prior daemon and froze in setup().

- wait_ready now probes in a plain Lua loop (deadline via uv.hrtime + a bare
  vim.wait(50) yield) — never a vim.wait inside a vim.wait predicate.
- stop_spawned now unlinks the socket on exit, so a clean exit leaves no stale
  socket (a crash still can — the wait_ready fix handles that too).

Verified: two-launch repro no longer hangs; a crash-left stale socket recovers
in ~460ms. 10 e2e specs green.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-02 10:23:42 -07:00

139 lines
4.1 KiB
Lua

--- Locate, spawn, and wait on a `hephd` daemon. Shared by optional autostart
--- and by the e2e harness (so test readiness uses the same definition the
--- plugin does).
-- libuv bindings: use `vim.uv` when it is present, otherwise fall back to
-- `vim.loop`.
local uv = vim.uv or vim.loop
-- Module table; every public function below is attached to it and it is
-- returned at the end of the file.
local M = {}
-- The daemon THIS nvim spawned (nil if we connected to an existing one).
-- `{ handle, exited = { done }, socket, db, bin }`.
-- Populated by `M.ensure` after a spawn that became ready; reset to nil by
-- `M.stop_spawned`.
M._managed = nil
--- Spawn a `local`-mode hephd against `opts.db` listening on `opts.socket`.
---
--- Recognised `opts` fields (all optional; `opts` itself may be nil):
---   * `bin`     executable to run; defaults to `hephd` on PATH.
---   * `db`      forwarded as `--db <db>` when set.
---   * `socket`  forwarded as `--socket <socket>` when set.
---   * `stderr`  placed in the third stdio slot of the child; the first two
---               slots are left nil.
---   * `on_exit` `function(code, signal)` invoked from the process exit
---               callback.
---
--- Returns `{ handle, pid }`. Raises an error (including the reason reported
--- by libuv) when the process cannot be started.
function M.spawn(opts)
  opts = opts or {}
  local bin = opts.bin or "hephd"

  local args = { "--mode", "local" }
  if opts.db then
    args[#args + 1] = "--db"
    args[#args + 1] = opts.db
  end
  if opts.socket then
    args[#args + 1] = "--socket"
    args[#args + 1] = opts.socket
  end

  -- On failure luv reports `nil, err, name`, so the second result then holds
  -- the failure reason rather than a pid.
  local handle, pid_or_err = uv.spawn(bin, {
    args = args,
    stdio = { nil, nil, opts.stderr },
  }, function(code, signal)
    if opts.on_exit then
      opts.on_exit(code, signal)
    end
  end)

  if not handle then
    error("heph: failed to spawn hephd (bin=" .. bin .. "): " .. tostring(pid_or_err))
  end
  return { handle = handle, pid = pid_or_err }
end
--- Wait until `socket` both exists and accepts a real RPC (`health`). The
--- existence check alone races the daemon's bind→accept, so we prove liveness
--- with a round-trip. Returns `true`, or `false, reason`.
---
--- `timeout` is the overall budget in milliseconds (default 5000); each
--- individual `health` call is given 200ms.
---
--- The probe runs in a **plain Lua loop**, never inside a `vim.wait` predicate:
--- the rpc round-trip itself uses `vim.wait`, and nesting `vim.wait` inside
--- another `vim.wait`'s predicate deadlocks Neovim (a stale socket made the
--- inner connect-wait re-enter and hang).
function M.wait_ready(socket, timeout)
  timeout = timeout or 5000
  local rpc = require("heph.rpc")
  local deadline = uv.hrtime() + timeout * 1e6 -- hrtime is in nanoseconds

  -- One liveness attempt. Session creation, the call and the close are all
  -- protected so that a failure in any of them counts as "not ready yet"
  -- instead of escaping and breaking the `true | false, reason` contract.
  local function probe()
    local session
    local ok = pcall(function()
      session = rpc.new_session(socket)
      session:call("health", vim.empty_dict(), { timeout = 200 })
    end)
    if session then
      -- Always release the session, even when the call itself failed.
      pcall(function()
        session:close()
      end)
    end
    return ok
  end

  while uv.hrtime() < deadline do
    -- Skip the round-trip entirely until the socket file shows up.
    if uv.fs_stat(socket) ~= nil and probe() then
      return true
    end
    -- Yield ~50ms (no predicate, so not nested), but never past the deadline.
    local remaining_ms = math.ceil((deadline - uv.hrtime()) / 1e6)
    if remaining_ms > 0 then
      vim.wait(math.min(50, remaining_ms))
    end
  end
  return false, "daemon not ready at " .. socket
end
--- Ensure a daemon is reachable at `opts.socket`. If one is already serving the
--- socket (any mode — local/server/client), connect to it and do NOT spawn. Else
--- if `opts.autostart`, spawn a local hephd we own (and manage its lifecycle).
---
--- `opts` fields read here: `socket`, `autostart`, `bin`, `db`, `probe_ms`
--- (budget for the initial probe, default 400) and `ready_ms` (budget for the
--- spawned daemon to come up, default 5000).
---
--- Returns `reachable, spawned_by_us`. Raises an error when a daemon was
--- spawned but did not become ready within `ready_ms`; in that case the
--- spawned process is signalled and its handle released before raising.
function M.ensure(opts)
  -- Already serving? A quick probe respects a daemon someone else started.
  if M.wait_ready(opts.socket, opts.probe_ms or 400) then
    return true, false
  end
  if not opts.autostart then
    return false, false
  end

  -- Flipped by the exit callback so we (and `is_managed`) can tell when the
  -- child is gone.
  local exited = { done = false }
  local d = M.spawn({
    bin = opts.bin,
    socket = opts.socket,
    db = opts.db,
    on_exit = function()
      exited.done = true
    end,
  })

  local ok, reason = M.wait_ready(opts.socket, opts.ready_ms or 5000)
  if not ok then
    -- Best-effort teardown of the child we just started. Besides signalling
    -- it, release the process handle: nothing else holds a reference to it,
    -- so leaving it open would leak it for the rest of the session. As in
    -- `stop_spawned`, give the exit callback a short window before closing.
    pcall(function()
      if d.handle:is_closing() then
        return
      end
      pcall(function()
        d.handle:kill("sigterm")
      end)
      vim.wait(2000, function()
        return exited.done
      end, 20)
      d.handle:close()
    end)
    error("heph: spawned hephd but it never became ready: " .. tostring(reason))
  end

  M._managed = {
    handle = d.handle,
    exited = exited,
    socket = opts.socket,
    db = opts.db,
    bin = opts.bin,
  }
  return true, true
end
--- Report whether this nvim spawned a daemon that has not exited yet.
--- Returns `false` when nothing is recorded in `M._managed`, otherwise the
--- negation of the recorded `exited.done` flag.
function M.is_managed()
  local managed = M._managed
  if managed == nil then
    return false
  end
  return not managed.exited.done
end
--- Shut down the daemon this nvim spawned. Does nothing when no spawned
--- daemon is recorded (e.g. we connected to one that was already running).
function M.stop_spawned()
  local owned = M._managed
  if not owned then
    return
  end
  -- Forget the record before doing any teardown work.
  M._managed = nil

  local function has_exited()
    return owned.exited.done
  end

  if owned.handle and not has_exited() then
    pcall(function()
      owned.handle:kill("sigterm")
    end)
    -- Wait up to 2s, polling every 20ms, for the exit flag to be set.
    vim.wait(2000, has_exited, 20)
  end

  -- Release the process handle unless it is already closing.
  pcall(function()
    local proc = owned.handle
    if proc and not proc:is_closing() then
      proc:close()
    end
  end)

  -- hephd doesn't unlink its socket on SIGTERM; remove it so the next launch
  -- doesn't probe a stale socket. (A crash still leaves one — wait_ready copes.)
  pcall(function()
    uv.fs_unlink(owned.socket)
  end)
end
return M