generated from eblume/project-template
Merge pull request 'Fix macOS heph daemon restart bootout→bootstrap race (5: Input/output error)' (#13) from feature/daemon-restart-race into main
All checks were successful
Build / validate (push) Successful in 11m52s
All checks were successful
Build / validate (push) Successful in 11m52s
This commit is contained in:
commit
b82264892f
2 changed files with 63 additions and 9 deletions
|
|
@ -13,6 +13,7 @@
|
|||
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::Command;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use anyhow::{bail, Context, Result};
|
||||
use clap::{Args, Subcommand};
|
||||
|
|
@ -494,6 +495,51 @@ fn launchd_loaded(domain_target: &str) -> bool {
|
|||
.unwrap_or(false)
|
||||
}
|
||||
|
||||
/// Block until `target` is no longer loaded, up to `timeout`. `launchctl bootout`
|
||||
/// is asynchronous in effect — it requests teardown and returns, but launchd may
|
||||
/// still be killing/reaping the job and removing its label from the domain.
|
||||
/// Bootstrapping while the label lingers fails with a generic `5: Input/output
|
||||
/// error`, so we wait for the label to actually disappear before re-bootstrapping.
|
||||
fn wait_until_unloaded(target: &str, timeout: Duration) {
|
||||
let start = Instant::now();
|
||||
while launchd_loaded(target) {
|
||||
if start.elapsed() >= timeout {
|
||||
break; // fall through; bootstrap's own retry covers the residual window
|
||||
}
|
||||
std::thread::sleep(Duration::from_millis(100));
|
||||
}
|
||||
}
|
||||
|
||||
/// Bootstrap the service, retrying briefly. Even once the old instance is gone,
|
||||
/// launchd can momentarily return EIO while the domain settles, so a couple of
|
||||
/// short retries make `start`/`restart` reliable instead of intermittently failing.
|
||||
fn launchd_bootstrap(domain: &str, plist: &str) -> Result<()> {
|
||||
let mut last = String::new();
|
||||
for attempt in 0..5 {
|
||||
if attempt > 0 {
|
||||
std::thread::sleep(Duration::from_millis(200));
|
||||
}
|
||||
let (ok, err) = run_cmd("launchctl", &["bootstrap", domain, plist])?;
|
||||
if ok {
|
||||
return Ok(());
|
||||
}
|
||||
last = err;
|
||||
}
|
||||
bail!("launchctl bootstrap failed: {}", last.trim());
|
||||
}
|
||||
|
||||
/// Restart an already-loaded job in place (kills it, then launchd's KeepAlive —
|
||||
/// `-k` forces the kill). This restarts the *loaded* job definition, so it does
|
||||
/// not pick up an edited plist — callers use it only when the on-disk plist is
|
||||
/// unchanged, where it sidesteps the bootout→bootstrap race entirely.
|
||||
fn launchd_kickstart(target: &str) -> Result<()> {
|
||||
let (ok, err) = run_cmd("launchctl", &["kickstart", "-k", target])?;
|
||||
if !ok {
|
||||
bail!("launchctl kickstart failed: {}", err.trim());
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn launchd(action: &DaemonAction, p: &Paths) -> Result<()> {
|
||||
let plist = launchd_plist_path()?;
|
||||
let uid = uid()?;
|
||||
|
|
@ -512,10 +558,7 @@ fn launchd(action: &DaemonAction, p: &Paths) -> Result<()> {
|
|||
if launchd_loaded(&target) {
|
||||
println!("heph daemon already running ({LABEL}).");
|
||||
} else {
|
||||
let (ok, err) = run_cmd("launchctl", &["bootstrap", &domain, &plist_str(&plist)?])?;
|
||||
if !ok {
|
||||
bail!("launchctl bootstrap failed: {}", err.trim());
|
||||
}
|
||||
launchd_bootstrap(&domain, &plist_str(&plist)?)?;
|
||||
println!("heph daemon started ({LABEL}).");
|
||||
}
|
||||
}
|
||||
|
|
@ -527,14 +570,24 @@ fn launchd(action: &DaemonAction, p: &Paths) -> Result<()> {
|
|||
let cfg = args
|
||||
.to_config()
|
||||
.fill_from(existing_config(&plist, &Manager::Launchd));
|
||||
write_if_changed(
|
||||
let changed = write_if_changed(
|
||||
&plist,
|
||||
&launchd_plist(&p.hephd, &p.db, &p.socket, &p.log, &cfg),
|
||||
)?;
|
||||
let _ = run_cmd("launchctl", &["bootout", &target])?;
|
||||
let (ok, err) = run_cmd("launchctl", &["bootstrap", &domain, &plist_str(&plist)?])?;
|
||||
if !ok {
|
||||
bail!("launchctl bootstrap failed: {}", err.trim());
|
||||
if !launchd_loaded(&target) {
|
||||
// Not currently loaded — nothing to tear down, just bring it up.
|
||||
launchd_bootstrap(&domain, &plist_str(&plist)?)?;
|
||||
} else if changed {
|
||||
// The plist changed, so launchd must re-read it: a full reload is
|
||||
// required. bootout is async, so wait for the label to clear
|
||||
// before bootstrapping (and bootstrap retries the residual EIO).
|
||||
let _ = run_cmd("launchctl", &["bootout", &target])?;
|
||||
wait_until_unloaded(&target, Duration::from_secs(5));
|
||||
launchd_bootstrap(&domain, &plist_str(&plist)?)?;
|
||||
} else {
|
||||
// Same definition (e.g. binary upgraded in place) — restart the
|
||||
// loaded job atomically, sidestepping the bootout→bootstrap race.
|
||||
launchd_kickstart(&target)?;
|
||||
}
|
||||
println!("heph daemon restarted ({LABEL}).");
|
||||
}
|
||||
|
|
|
|||
1
docs/changelog.d/daemon-restart-race.bugfix.md
Normal file
1
docs/changelog.d/daemon-restart-race.bugfix.md
Normal file
|
|
@ -0,0 +1 @@
|
|||
`heph daemon restart` on macOS no longer intermittently fails with `launchctl bootstrap failed: 5: Input/output error`. The old code bootstrapped immediately after `bootout`, racing launchd's asynchronous teardown; it now waits for the service to fully unload and retries the bootstrap. When the plist is unchanged (e.g. a plain binary upgrade) it uses `launchctl kickstart -k` to restart the loaded job atomically, sidestepping the bootout→bootstrap dance entirely.
|
||||
Loading…
Add table
Add a link
Reference in a new issue