From f6b27414a8b46834a84c3b5735cc7049a4ab39b8 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Mon, 8 Jun 2026 13:38:47 -0700 Subject: [PATCH] fix(heph): make macOS `heph daemon restart` race-free MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `restart` bootstrapped immediately after `bootout`, but `launchctl bootout` is asynchronous: launchd may still be killing/reaping the job and removing its label when the command returns. Bootstrapping into that transitional domain fails with a generic `5: Input/output error`, intermittently — the odds depend on how fast hephd (sync client + SQLite + a heph-quickadd child) shuts down. - Wait for the label to actually clear (poll `launchctl print`, bounded) before re-bootstrapping, and retry the bootstrap to cover the residual settle window. - When the plist is unchanged (the common binary-upgrade restart), use `launchctl kickstart -k` to restart the loaded job atomically — no bootout/bootstrap, no race. The full reload path is reserved for genuine config changes, where launchd must re-read the plist. Start's bootstrap shares the same retry helper. Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/heph/src/service.rs | 71 ++++++++++++++++--- .../changelog.d/daemon-restart-race.bugfix.md | 1 + 2 files changed, 63 insertions(+), 9 deletions(-) create mode 100644 docs/changelog.d/daemon-restart-race.bugfix.md diff --git a/crates/heph/src/service.rs b/crates/heph/src/service.rs index 1c90924..0b8928b 100644 --- a/crates/heph/src/service.rs +++ b/crates/heph/src/service.rs @@ -13,6 +13,7 @@ use std::path::{Path, PathBuf}; use std::process::Command; +use std::time::{Duration, Instant}; use anyhow::{bail, Context, Result}; use clap::{Args, Subcommand}; @@ -494,6 +495,51 @@ fn launchd_loaded(domain_target: &str) -> bool { .unwrap_or(false) } +/// Block until `target` is no longer loaded, up to `timeout`. `launchctl bootout` +/// is asynchronous in effect — it requests teardown and returns, but launchd may +/// still be killing/reaping the job and removing its label from the domain. +/// Bootstrapping while the label lingers fails with a generic `5: Input/output +/// error`, so we wait for the label to actually disappear before re-bootstrapping. +fn wait_until_unloaded(target: &str, timeout: Duration) { + let start = Instant::now(); + while launchd_loaded(target) { + if start.elapsed() >= timeout { + break; // fall through; bootstrap's own retry covers the residual window + } + std::thread::sleep(Duration::from_millis(100)); + } +} + +/// Bootstrap the service, retrying briefly. Even once the old instance is gone, +/// launchd can momentarily return EIO while the domain settles, so a couple of +/// short retries make `start`/`restart` reliable instead of intermittently failing. +fn launchd_bootstrap(domain: &str, plist: &str) -> Result<()> { + let mut last = String::new(); + for attempt in 0..5 { + if attempt > 0 { + std::thread::sleep(Duration::from_millis(200)); + } + let (ok, err) = run_cmd("launchctl", &["bootstrap", domain, plist])?; + if ok { + return Ok(()); + } + last = err; + } + bail!("launchctl bootstrap failed: {}", last.trim()); +} + +/// Restart an already-loaded job in place (kills it, then launchd's KeepAlive — +/// `-k` forces the kill). This restarts the *loaded* job definition, so it does +/// not pick up an edited plist — callers use it only when the on-disk plist is +/// unchanged, where it sidesteps the bootout→bootstrap race entirely. +fn launchd_kickstart(target: &str) -> Result<()> { + let (ok, err) = run_cmd("launchctl", &["kickstart", "-k", target])?; + if !ok { + bail!("launchctl kickstart failed: {}", err.trim()); + } + Ok(()) +} + fn launchd(action: &DaemonAction, p: &Paths) -> Result<()> { let plist = launchd_plist_path()?; let uid = uid()?; @@ -512,10 +558,7 @@ fn launchd(action: &DaemonAction, p: &Paths) -> Result<()> { if launchd_loaded(&target) { println!("heph daemon already running ({LABEL})."); } else { - let (ok, err) = run_cmd("launchctl", &["bootstrap", &domain, &plist_str(&plist)?])?; - if !ok { - bail!("launchctl bootstrap failed: {}", err.trim()); - } + launchd_bootstrap(&domain, &plist_str(&plist)?)?; println!("heph daemon started ({LABEL})."); } } @@ -527,14 +570,24 @@ fn launchd(action: &DaemonAction, p: &Paths) -> Result<()> { let cfg = args .to_config() .fill_from(existing_config(&plist, &Manager::Launchd)); - write_if_changed( + let changed = write_if_changed( &plist, &launchd_plist(&p.hephd, &p.db, &p.socket, &p.log, &cfg), )?; - let _ = run_cmd("launchctl", &["bootout", &target])?; - let (ok, err) = run_cmd("launchctl", &["bootstrap", &domain, &plist_str(&plist)?])?; - if !ok { - bail!("launchctl bootstrap failed: {}", err.trim()); + if !launchd_loaded(&target) { + // Not currently loaded — nothing to tear down, just bring it up. + launchd_bootstrap(&domain, &plist_str(&plist)?)?; + } else if changed { + // The plist changed, so launchd must re-read it: a full reload is + // required. bootout is async, so wait for the label to clear + // before bootstrapping (and bootstrap retries the residual EIO). + let _ = run_cmd("launchctl", &["bootout", &target])?; + wait_until_unloaded(&target, Duration::from_secs(5)); + launchd_bootstrap(&domain, &plist_str(&plist)?)?; + } else { + // Same definition (e.g. binary upgraded in place) — restart the + // loaded job atomically, sidestepping the bootout→bootstrap race. + launchd_kickstart(&target)?; } println!("heph daemon restarted ({LABEL})."); } diff --git a/docs/changelog.d/daemon-restart-race.bugfix.md b/docs/changelog.d/daemon-restart-race.bugfix.md new file mode 100644 index 0000000..c13a257 --- /dev/null +++ b/docs/changelog.d/daemon-restart-race.bugfix.md @@ -0,0 +1 @@ +`heph daemon restart` on macOS no longer intermittently fails with `launchctl bootstrap failed: 5: Input/output error`. The old code bootstrapped immediately after `bootout`, racing launchd's asynchronous teardown; it now waits for the service to fully unload and retries the bootstrap. When the plist is unchanged (e.g. a plain binary upgrade) it uses `launchctl kickstart -k` to restart the loaded job atomically, sidestepping the bootout→bootstrap dance entirely. -- 2.50.1 (Apple Git-155)