Merge pull request 'Fix macOS heph daemon restart bootout→bootstrap race (5: Input/output error)' (#13) from feature/daemon-restart-race into main
All checks were successful
Build / validate (push) Successful in 11m52s

This commit is contained in:
Erich Blume 2026-06-08 13:43:55 -07:00
commit b82264892f
2 changed files with 63 additions and 9 deletions

View file

@ -13,6 +13,7 @@
use std::path::{Path, PathBuf};
use std::process::Command;
use std::time::{Duration, Instant};
use anyhow::{bail, Context, Result};
use clap::{Args, Subcommand};
@ -494,6 +495,51 @@ fn launchd_loaded(domain_target: &str) -> bool {
.unwrap_or(false)
}
/// Poll launchd until `target` stops reporting as loaded, giving up after `timeout`.
///
/// `launchctl bootout` only *requests* teardown: it returns while launchd may
/// still be killing/reaping the job and removing its label from the domain.
/// A bootstrap issued while the label lingers fails with a generic
/// `5: Input/output error`, so callers wait here before re-bootstrapping.
///
/// Returns nothing either way — on timeout the caller simply proceeds, and the
/// retry loop in the bootstrap step absorbs whatever window is left.
fn wait_until_unloaded(target: &str, timeout: Duration) {
    // Pause between successive `launchd_loaded` probes.
    const POLL_INTERVAL: Duration = Duration::from_millis(100);

    let began = Instant::now();
    loop {
        if !launchd_loaded(target) {
            // Label is gone; safe to bootstrap again.
            return;
        }
        if began.elapsed() >= timeout {
            // Out of patience; bootstrap's own retry covers the residual window.
            return;
        }
        std::thread::sleep(POLL_INTERVAL);
    }
}
/// Run `launchctl bootstrap <domain> <plist>`, retrying a few times on failure.
///
/// Even after the previous instance is gone, launchd can momentarily answer
/// with EIO while the domain settles; a handful of short, spaced-out attempts
/// makes `start`/`restart` reliable instead of intermittently failing.
///
/// # Errors
///
/// Propagates any error from `run_cmd` itself immediately (no retry). If every
/// attempt runs but reports failure, returns an error carrying the trimmed
/// error output of the final attempt.
fn launchd_bootstrap(domain: &str, plist: &str) -> Result<()> {
    // Total number of bootstrap invocations before giving up.
    const MAX_ATTEMPTS: u32 = 5;
    // Pause inserted between consecutive attempts (never before the first).
    const RETRY_DELAY: Duration = Duration::from_millis(200);

    let argv = ["bootstrap", domain, plist];
    let mut attempts_made = 0;
    loop {
        let (succeeded, stderr) = run_cmd("launchctl", &argv)?;
        if succeeded {
            return Ok(());
        }
        attempts_made += 1;
        if attempts_made == MAX_ATTEMPTS {
            // Report only the most recent failure; earlier ones were transient by assumption.
            bail!("launchctl bootstrap failed: {}", stderr.trim());
        }
        std::thread::sleep(RETRY_DELAY);
    }
}
/// Restart an already-loaded job in place via `launchctl kickstart -k <target>`.
///
/// The `-k` flag forces the running instance to be killed so that it is
/// started afresh. Because this restarts the *loaded* job definition, an
/// edited plist is NOT picked up — callers use it only when the on-disk plist
/// is unchanged, where it sidesteps the bootout→bootstrap race entirely.
///
/// # Errors
///
/// Propagates any error from `run_cmd`, and fails with the trimmed error
/// output when the command runs but reports failure.
fn launchd_kickstart(target: &str) -> Result<()> {
    match run_cmd("launchctl", &["kickstart", "-k", target])? {
        (true, _) => Ok(()),
        (false, stderr) => bail!("launchctl kickstart failed: {}", stderr.trim()),
    }
}
fn launchd(action: &DaemonAction, p: &Paths) -> Result<()> {
let plist = launchd_plist_path()?;
let uid = uid()?;
@ -512,10 +558,7 @@ fn launchd(action: &DaemonAction, p: &Paths) -> Result<()> {
if launchd_loaded(&target) {
println!("heph daemon already running ({LABEL}).");
} else {
let (ok, err) = run_cmd("launchctl", &["bootstrap", &domain, &plist_str(&plist)?])?;
if !ok {
bail!("launchctl bootstrap failed: {}", err.trim());
}
launchd_bootstrap(&domain, &plist_str(&plist)?)?;
println!("heph daemon started ({LABEL}).");
}
}
@ -527,14 +570,24 @@ fn launchd(action: &DaemonAction, p: &Paths) -> Result<()> {
let cfg = args
.to_config()
.fill_from(existing_config(&plist, &Manager::Launchd));
write_if_changed(
let changed = write_if_changed(
&plist,
&launchd_plist(&p.hephd, &p.db, &p.socket, &p.log, &cfg),
)?;
let _ = run_cmd("launchctl", &["bootout", &target])?;
let (ok, err) = run_cmd("launchctl", &["bootstrap", &domain, &plist_str(&plist)?])?;
if !ok {
bail!("launchctl bootstrap failed: {}", err.trim());
if !launchd_loaded(&target) {
// Not currently loaded — nothing to tear down, just bring it up.
launchd_bootstrap(&domain, &plist_str(&plist)?)?;
} else if changed {
// The plist changed, so launchd must re-read it: a full reload is
// required. bootout is async, so wait for the label to clear
// before bootstrapping (and bootstrap retries the residual EIO).
let _ = run_cmd("launchctl", &["bootout", &target])?;
wait_until_unloaded(&target, Duration::from_secs(5));
launchd_bootstrap(&domain, &plist_str(&plist)?)?;
} else {
// Same definition (e.g. binary upgraded in place) — restart the
// loaded job atomically, sidestepping the bootout→bootstrap race.
launchd_kickstart(&target)?;
}
println!("heph daemon restarted ({LABEL}).");
}

View file

@ -0,0 +1 @@
`heph daemon restart` on macOS no longer intermittently fails with `launchctl bootstrap failed: 5: Input/output error`. The old code bootstrapped immediately after `bootout`, racing launchd's asynchronous teardown; it now waits for the service to fully unload and retries the bootstrap. When the plist is unchanged (e.g. a plain binary upgrade) it uses `launchctl kickstart -k` to restart the loaded job atomically, sidestepping the bootout→bootstrap dance entirely.