From f6b27414a8b46834a84c3b5735cc7049a4ab39b8 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Mon, 8 Jun 2026 13:38:47 -0700 Subject: [PATCH 1/7] fix(heph): make macOS `heph daemon restart` race-free MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `restart` bootstrapped immediately after `bootout`, but `launchctl bootout` is asynchronous: launchd may still be killing/reaping the job and removing its label when the command returns. Bootstrapping into that transitional domain fails with a generic `5: Input/output error`, intermittently — the odds depend on how fast hephd (sync client + SQLite + a heph-quickadd child) shuts down. - Wait for the label to actually clear (poll `launchctl print`, bounded) before re-bootstrapping, and retry the bootstrap to cover the residual settle window. - When the plist is unchanged (the common binary-upgrade restart), use `launchctl kickstart -k` to restart the loaded job atomically — no bootout/bootstrap, no race. The full reload path is reserved for genuine config changes, where launchd must re-read the plist. Start's bootstrap shares the same retry helper. Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/heph/src/service.rs | 71 ++++++++++++++++--- .../changelog.d/daemon-restart-race.bugfix.md | 1 + 2 files changed, 63 insertions(+), 9 deletions(-) create mode 100644 docs/changelog.d/daemon-restart-race.bugfix.md diff --git a/crates/heph/src/service.rs b/crates/heph/src/service.rs index 1c90924..0b8928b 100644 --- a/crates/heph/src/service.rs +++ b/crates/heph/src/service.rs @@ -13,6 +13,7 @@ use std::path::{Path, PathBuf}; use std::process::Command; +use std::time::{Duration, Instant}; use anyhow::{bail, Context, Result}; use clap::{Args, Subcommand}; @@ -494,6 +495,51 @@ fn launchd_loaded(domain_target: &str) -> bool { .unwrap_or(false) } +/// Block until `target` is no longer loaded, up to `timeout`. `launchctl bootout` +/// is asynchronous in effect — it requests teardown and returns, but launchd may +/// still be killing/reaping the job and removing its label from the domain. +/// Bootstrapping while the label lingers fails with a generic `5: Input/output +/// error`, so we wait for the label to actually disappear before re-bootstrapping. +fn wait_until_unloaded(target: &str, timeout: Duration) { + let start = Instant::now(); + while launchd_loaded(target) { + if start.elapsed() >= timeout { + break; // fall through; bootstrap's own retry covers the residual window + } + std::thread::sleep(Duration::from_millis(100)); + } +} + +/// Bootstrap the service, retrying briefly. Even once the old instance is gone, +/// launchd can momentarily return EIO while the domain settles, so a couple of +/// short retries make `start`/`restart` reliable instead of intermittently failing. +fn launchd_bootstrap(domain: &str, plist: &str) -> Result<()> { + let mut last = String::new(); + for attempt in 0..5 { + if attempt > 0 { + std::thread::sleep(Duration::from_millis(200)); + } + let (ok, err) = run_cmd("launchctl", &["bootstrap", domain, plist])?; + if ok { + return Ok(()); + } + last = err; + } + bail!("launchctl bootstrap failed: {}", last.trim()); +} + +/// Restart an already-loaded job in place (kills it, then launchd's KeepAlive — +/// `-k` forces the kill). This restarts the *loaded* job definition, so it does +/// not pick up an edited plist — callers use it only when the on-disk plist is +/// unchanged, where it sidesteps the bootout→bootstrap race entirely. +fn launchd_kickstart(target: &str) -> Result<()> { + let (ok, err) = run_cmd("launchctl", &["kickstart", "-k", target])?; + if !ok { + bail!("launchctl kickstart failed: {}", err.trim()); + } + Ok(()) +} + fn launchd(action: &DaemonAction, p: &Paths) -> Result<()> { let plist = launchd_plist_path()?; let uid = uid()?; @@ -512,10 +558,7 @@ fn launchd(action: &DaemonAction, p: &Paths) -> Result<()> { if launchd_loaded(&target) { println!("heph daemon already running ({LABEL})."); } else { - let (ok, err) = run_cmd("launchctl", &["bootstrap", &domain, &plist_str(&plist)?])?; - if !ok { - bail!("launchctl bootstrap failed: {}", err.trim()); - } + launchd_bootstrap(&domain, &plist_str(&plist)?)?; println!("heph daemon started ({LABEL})."); } } @@ -527,14 +570,24 @@ fn launchd(action: &DaemonAction, p: &Paths) -> Result<()> { let cfg = args .to_config() .fill_from(existing_config(&plist, &Manager::Launchd)); - write_if_changed( + let changed = write_if_changed( &plist, &launchd_plist(&p.hephd, &p.db, &p.socket, &p.log, &cfg), )?; - let _ = run_cmd("launchctl", &["bootout", &target])?; - let (ok, err) = run_cmd("launchctl", &["bootstrap", &domain, &plist_str(&plist)?])?; - if !ok { - bail!("launchctl bootstrap failed: {}", err.trim()); + if !launchd_loaded(&target) { + // Not currently loaded — nothing to tear down, just bring it up. + launchd_bootstrap(&domain, &plist_str(&plist)?)?; + } else if changed { + // The plist changed, so launchd must re-read it: a full reload is + // required. bootout is async, so wait for the label to clear + // before bootstrapping (and bootstrap retries the residual EIO). + let _ = run_cmd("launchctl", &["bootout", &target])?; + wait_until_unloaded(&target, Duration::from_secs(5)); + launchd_bootstrap(&domain, &plist_str(&plist)?)?; + } else { + // Same definition (e.g. binary upgraded in place) — restart the + // loaded job atomically, sidestepping the bootout→bootstrap race. + launchd_kickstart(&target)?; } println!("heph daemon restarted ({LABEL})."); } diff --git a/docs/changelog.d/daemon-restart-race.bugfix.md b/docs/changelog.d/daemon-restart-race.bugfix.md new file mode 100644 index 0000000..c13a257 --- /dev/null +++ b/docs/changelog.d/daemon-restart-race.bugfix.md @@ -0,0 +1 @@ +`heph daemon restart` on macOS no longer intermittently fails with `launchctl bootstrap failed: 5: Input/output error`. The old code bootstrapped immediately after `bootout`, racing launchd's asynchronous teardown; it now waits for the service to fully unload and retries the bootstrap. When the plist is unchanged (e.g. a plain binary upgrade) it uses `launchctl kickstart -k` to restart the loaded job atomically, sidestepping the bootout→bootstrap dance entirely. From e943a940f147a9cf0d1f930c8a5a51e1606424e6 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Mon, 8 Jun 2026 14:06:08 -0700 Subject: [PATCH 2/7] feat(hephd,heph,heph-tui): distinguish IdP rejection from unreachable + actionable re-auth MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The spoke OAuth path funneled every failure into one `AuthError::Provider` whose Display was hardcoded "identity provider unreachable". So a reachable IdP returning `400 invalid_grant` on a refresh was reported as "unreachable", misdirecting incident response toward the network when the fix is re-auth. The real refresh cause was also swallowed — `bearer()` logged it and returned None, so sync health only ever showed the downstream 401 on /sync/pull. Wording fix (auth.rs / oauth.rs): - Split AuthError into Unreachable (transport), Rejected (IdP returned an HTTP error — carries the RFC 6749 §5.2 error/error_description), and Other (keyring / malformed response, previously mislabeled too). - refresh()/discover()/start()/poll() classify transport vs status; refresh reads the OAuth error body on a non-2xx. - Hub-side token verify maps IdP-infra failures → 503, token failures → 401. Recovery UX (server.rs / heph / heph-tui): - bearer() returns Result; the sync paths record the real acquisition failure (with a re-login hint when it's a rejection) instead of a masked 401. - sync health's last_error carries the exact `heph auth login --hub-url … --issuer … --client-id …` command (keyed to the configured hub); sync.status also returns issuer/client_id + the command. - New `heph auth status` prints auth health and the re-login command. - heph-tui's auth chip points at it: `⚠ auth · heph auth status`. Closes the duplicate "misleading identity provider unreachable" tasks and the "actionable re-auth guidance" task. Also corrects a now-stale set-up-sync-hub gap note (daemon config baking landed in the prior PR). Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/heph-tui/src/ui.rs | 6 +- crates/heph/src/main.rs | 76 +++++++++- crates/hephd/src/auth.rs | 84 ++++++++++- crates/hephd/src/oauth.rs | 54 +++---- crates/hephd/src/server.rs | 137 +++++++++++++++--- crates/hephd/src/sync.rs | 10 +- crates/hephd/tests/oauth.rs | 66 ++++++++- docs/changelog.d/auth-error-clarity.bugfix.md | 1 + .../changelog.d/auth-error-clarity.feature.md | 1 + docs/how-to/set-up-sync-hub.md | 32 +++- 10 files changed, 393 insertions(+), 74 deletions(-) create mode 100644 docs/changelog.d/auth-error-clarity.bugfix.md create mode 100644 docs/changelog.d/auth-error-clarity.feature.md diff --git a/crates/heph-tui/src/ui.rs b/crates/heph-tui/src/ui.rs index 6e15453..f6d2f37 100644 --- a/crates/heph-tui/src/ui.rs +++ b/crates/heph-tui/src/ui.rs @@ -570,7 +570,9 @@ fn sync_indicator(sync: &SyncStatus, now: i64) -> Vec> { let health = sync.health.clone().unwrap_or_default(); let mut spans = vec![if health.auth_failure { - Span::styled("⚠ auth", red) + // Point at the recovery command — `heph auth status` prints the exact + // `heph auth login …` to run (the full command is too long for the bar). + Span::styled("⚠ auth · heph auth status", red) } else if let Some(ts) = health.last_success_ms { Span::styled(format!("⟳ {}", fmt_age(now, ts)), dim) } else if health.last_error.is_some() { @@ -639,7 +641,7 @@ mod tests { }, 0, ); - assert_eq!(render(&auth, NOW), "⚠ auth"); + assert_eq!(render(&auth, NOW), "⚠ auth · heph auth status"); // Errored with no prior success → offline. let offline = spoke( diff --git a/crates/heph/src/main.rs b/crates/heph/src/main.rs index c327f1d..28d3b5e 100644 --- a/crates/heph/src/main.rs +++ b/crates/heph/src/main.rs @@ -344,7 +344,7 @@ enum ConflictAction { }, } -#[derive(Subcommand, Debug)] +#[derive(Subcommand, Debug, Clone)] enum AuthAction { /// Log in via the device-code flow; caches the bearer token for hub sync. Login { @@ -367,6 +367,9 @@ enum AuthAction { #[arg(long)] hub_url: String, }, + /// Show this spoke's auth health and, if re-auth is needed, the exact + /// `heph auth login` command to run. Queries the daemon. + Status, } /// Run the device-code flow (or clear a token) — no daemon needed. @@ -396,10 +399,63 @@ fn run_auth(action: AuthAction) -> Result<()> { KeyringTokenStore::new(hub_url.as_str()).clear()?; println!("Logged out of {hub_url}."); } + AuthAction::Status => unreachable!("auth status is handled via the daemon"), } Ok(()) } +/// Render `heph auth status` from a `sync.status` RPC response: hub/issuer/client +/// id, whether auth is healthy or needs re-login, and — when it does — the exact +/// command to run (built daemon-side, keyed under the right hub URL). +fn print_auth_status(status: &Value) { + let Some(hub) = status.get("hub_url").and_then(Value::as_str) else { + println!("This instance is standalone (no hub configured); auth does not apply."); + return; + }; + let auth = status.get("auth"); + let issuer = auth.and_then(|a| a.get("issuer")).and_then(Value::as_str); + let client_id = auth + .and_then(|a| a.get("client_id")) + .and_then(Value::as_str); + let health = status.get("health"); + let auth_failure = health + .and_then(|h| h.get("auth_failure")) + .and_then(Value::as_bool) + .unwrap_or(false); + let last_error = health + .and_then(|h| h.get("last_error")) + .and_then(Value::as_str); + let last_success = health + .and_then(|h| h.get("last_success_ms")) + .and_then(Value::as_i64); + + println!("hub : {hub}"); + if let Some(iss) = issuer { + println!("issuer : {iss}"); + } + if let Some(cid) = client_id { + println!("client id : {cid}"); + } + println!( + "auth : {}", + if auth_failure { + "FAILED — re-authentication required" + } else if last_success.is_some() { + "ok" + } else { + "unknown (no successful sync yet)" + } + ); + if let Some(err) = last_error { + println!("last error : {err}"); + } + if auth_failure { + if let Some(cmd) = status.get("reauth_command").and_then(Value::as_str) { + println!("\nTo re-authenticate, run:\n {cmd}"); + } + } +} + fn main() -> Result<()> { let cli = Cli::parse(); @@ -407,9 +463,13 @@ fn main() -> Result<()> { if let Command::Daemon { action } = &cli.command { return service::run(action); } - // `auth` runs locally (device-code flow + keyring); it needs no daemon. - if let Command::Auth { action } = cli.command { - return run_auth(action); + // `auth login`/`logout` run locally (device-code flow + keyring); they need + // no daemon. `auth status` reads live sync health, so it falls through to the + // connected path below. + if let Command::Auth { action } = &cli.command { + if !matches!(action, AuthAction::Status) { + return run_auth(action.clone()); + } } let socket = cli.socket.unwrap_or_else(default_socket_path); @@ -790,7 +850,13 @@ fn main() -> Result<()> { let n = result.as_u64().unwrap_or(0); println!("Rewrote legacy [[Name]] links to [[id]] in {n} node(s)."); } - Command::Auth { .. } => unreachable!("auth is handled before connecting"), + Command::Auth { + action: AuthAction::Status, + } => { + let result = client.call("sync.status", json!({}))?; + print_auth_status(&result); + } + Command::Auth { .. } => unreachable!("auth login/logout handled before connecting"), Command::Daemon { .. } => unreachable!("daemon is handled before connecting"), } Ok(()) diff --git a/crates/hephd/src/auth.rs b/crates/hephd/src/auth.rs index c601d90..6b80e95 100644 --- a/crates/hephd/src/auth.rs +++ b/crates/hephd/src/auth.rs @@ -38,9 +38,45 @@ pub enum AuthError { /// The token was present but failed validation. #[error("invalid token: {0}")] Invalid(String), - /// The identity provider could not be reached to fetch keys. + /// The identity provider could not be reached at all (DNS, TLS, connection + /// refused, timeout) — a transport failure, distinct from a rejection. #[error("identity provider unreachable: {0}")] - Provider(String), + Unreachable(String), + /// The identity provider *was* reached but returned an HTTP error response — + /// e.g. `400 invalid_grant` on a refresh, meaning the token was rejected + /// (expired/rotated/session-invalidated), not that the IdP was down. The + /// distinction matters: "unreachable" sends debugging toward the network; + /// this points at the token/authorization. + #[error("identity provider rejected the request: {0}")] + Rejected(String), + /// Some other failure in the auth path that is neither a transport failure + /// nor an HTTP rejection — a malformed/unparseable IdP response, or a local + /// credential-store (keyring) error. Kept distinct so neither is mislabeled + /// as "unreachable". + #[error("auth error: {0}")] + Other(String), +} + +impl AuthError { + /// Build a [`AuthError::Rejected`] from an HTTP status and the OAuth error + /// body (RFC 6749 §5.2), e.g. `HTTP 400 (invalid_grant): Token is expired`. + pub fn rejected(status: u16, error: Option<&str>, description: Option<&str>) -> AuthError { + let mut msg = format!("HTTP {status}"); + if let Some(e) = error.filter(|e| !e.is_empty()) { + msg.push_str(&format!(" ({e})")); + } + if let Some(d) = description.filter(|d| !d.is_empty()) { + msg.push_str(&format!(": {d}")); + } + AuthError::Rejected(msg) + } + + /// Whether this is an authorization-level rejection (the IdP refused the + /// grant) rather than a transport failure — i.e. re-authentication is the + /// likely fix, not network troubleshooting. + pub fn is_rejection(&self) -> bool { + matches!(self, AuthError::Rejected(_)) + } } /// Verifies a bearer token and returns its [`Claims`]. A trait so the hub can be @@ -92,16 +128,13 @@ impl OidcVerifier { .http .get(url) .call() - .map_err(|e| AuthError::Provider(e.to_string()))?; + .map_err(|e| AuthError::Unreachable(e.to_string()))?; if !resp.status().is_success() { - return Err(AuthError::Provider(format!( - "{url} returned {}", - resp.status() - ))); + return Err(AuthError::rejected(resp.status().as_u16(), None, None)); } resp.body_mut() .read_json() - .map_err(|e| AuthError::Provider(e.to_string())) + .map_err(|e| AuthError::Unreachable(e.to_string())) } /// Resolve the JWKS URI from the provider's discovery document. @@ -169,3 +202,38 @@ impl TokenVerifier for OidcVerifier { Some((&self.issuer, &self.audience)) } } + +#[cfg(test)] +mod tests { + use super::AuthError; + + #[test] + fn rejected_formats_status_error_and_description() { + let e = AuthError::rejected(400, Some("invalid_grant"), Some("Token is not active")); + assert!(e.is_rejection()); + assert_eq!( + e.to_string(), + "identity provider rejected the request: HTTP 400 (invalid_grant): Token is not active" + ); + } + + #[test] + fn rejected_omits_absent_or_empty_oauth_fields() { + // No OAuth body (e.g. a bare 503) → just the status. + assert_eq!( + AuthError::rejected(503, None, None).to_string(), + "identity provider rejected the request: HTTP 503" + ); + // Empty strings are treated as absent, not rendered as "()" / ": ". + assert_eq!( + AuthError::rejected(400, Some(""), Some("")).to_string(), + "identity provider rejected the request: HTTP 400" + ); + } + + #[test] + fn unreachable_is_not_a_rejection() { + assert!(!AuthError::Unreachable("connection refused".into()).is_rejection()); + assert!(!AuthError::Other("keyring locked".into()).is_rejection()); + } +} diff --git a/crates/hephd/src/oauth.rs b/crates/hephd/src/oauth.rs index 53ee5f0..4af704f 100644 --- a/crates/hephd/src/oauth.rs +++ b/crates/hephd/src/oauth.rs @@ -109,7 +109,7 @@ impl KeyringTokenStore { } }); keyring_core::Entry::new(&self.service, &self.account) - .map_err(|e| AuthError::Provider(e.to_string())) + .map_err(|e| AuthError::Other(e.to_string())) } } @@ -119,16 +119,16 @@ impl TokenStore for KeyringTokenStore { serde_json::from_str(&secret).ok() } fn save(&self, token: &StoredToken) -> Result<(), AuthError> { - let json = serde_json::to_string(token).map_err(|e| AuthError::Provider(e.to_string()))?; + let json = serde_json::to_string(token).map_err(|e| AuthError::Other(e.to_string()))?; self.entry()? .set_password(&json) - .map_err(|e| AuthError::Provider(e.to_string())) + .map_err(|e| AuthError::Other(e.to_string())) } fn clear(&self) -> Result<(), AuthError> { match self.entry()?.delete_credential() { Ok(()) => Ok(()), Err(keyring_core::Error::NoEntry) => Ok(()), - Err(e) => Err(AuthError::Provider(e.to_string())), + Err(e) => Err(AuthError::Other(e.to_string())), } } } @@ -187,6 +187,9 @@ impl TokenResponse { #[derive(Debug, Deserialize)] struct TokenErrorBody { error: String, + /// Human-readable detail the provider may include (RFC 6749 §5.2). + #[serde(default)] + error_description: Option, } /// Drives the OAuth 2.0 device-code flow against one provider. @@ -208,17 +211,14 @@ impl DeviceFlow { let mut resp = http .get(&url) .call() - .map_err(|e| AuthError::Provider(e.to_string()))?; + .map_err(|e| AuthError::Unreachable(e.to_string()))?; if !resp.status().is_success() { - return Err(AuthError::Provider(format!( - "discovery returned {}", - resp.status() - ))); + return Err(AuthError::rejected(resp.status().as_u16(), None, None)); } let doc: DiscoveryDoc = resp .body_mut() .read_json() - .map_err(|e| AuthError::Provider(e.to_string()))?; + .map_err(|e| AuthError::Other(e.to_string()))?; Ok(DeviceFlow { client_id: client_id.to_string(), http, @@ -233,16 +233,13 @@ impl DeviceFlow { .http .post(&self.device_authorization_endpoint) .send_form([("client_id", self.client_id.as_str()), ("scope", scope)]) - .map_err(|e| AuthError::Provider(e.to_string()))?; + .map_err(|e| AuthError::Unreachable(e.to_string()))?; if !resp.status().is_success() { - return Err(AuthError::Provider(format!( - "device authorization returned {}", - resp.status() - ))); + return Err(AuthError::rejected(resp.status().as_u16(), None, None)); } resp.body_mut() .read_json() - .map_err(|e| AuthError::Provider(e.to_string())) + .map_err(|e| AuthError::Other(e.to_string())) } /// Poll the token endpoint until the user authorizes, the code expires, or @@ -267,13 +264,13 @@ impl DeviceFlow { ("device_code", auth.device_code.as_str()), ("client_id", self.client_id.as_str()), ]) - .map_err(|e| AuthError::Provider(e.to_string()))?; + .map_err(|e| AuthError::Unreachable(e.to_string()))?; if response.status().is_success() { let token: TokenResponse = response .body_mut() .read_json() - .map_err(|e| AuthError::Provider(e.to_string()))?; + .map_err(|e| AuthError::Other(e.to_string()))?; return Ok(token.into_stored()); } @@ -281,7 +278,7 @@ impl DeviceFlow { let body: TokenErrorBody = response .body_mut() .read_json() - .map_err(|e| AuthError::Provider(e.to_string()))?; + .map_err(|e| AuthError::Other(e.to_string()))?; match body.error.as_str() { "authorization_pending" => {} "slow_down" => interval += 5, @@ -301,17 +298,24 @@ impl DeviceFlow { ("refresh_token", refresh_token), ("client_id", self.client_id.as_str()), ]) - .map_err(|e| AuthError::Provider(e.to_string()))?; + .map_err(|e| AuthError::Unreachable(e.to_string()))?; if !response.status().is_success() { - return Err(AuthError::Provider(format!( - "token refresh returned {}", - response.status() - ))); + // The IdP was reached and refused the grant (typically a `400 + // invalid_grant` once the refresh token is expired/rotated). Report + // it as a *rejection* with the OAuth error body — not "unreachable", + // which would misdirect debugging toward the network. + let status = response.status().as_u16(); + let body = response.body_mut().read_json::().ok(); + return Err(AuthError::rejected( + status, + body.as_ref().map(|b| b.error.as_str()), + body.as_ref().and_then(|b| b.error_description.as_deref()), + )); } let mut token: StoredToken = response .body_mut() .read_json::() - .map_err(|e| AuthError::Provider(e.to_string()))? + .map_err(|e| AuthError::Other(e.to_string()))? .into_stored(); // Providers may omit the refresh token on refresh — keep the old one. if token.refresh_token.is_none() { diff --git a/crates/hephd/src/server.rs b/crates/hephd/src/server.rs index 30c5d5a..89dee78 100644 --- a/crates/hephd/src/server.rs +++ b/crates/hephd/src/server.rs @@ -20,6 +20,7 @@ use tokio::net::{UnixListener, UnixStream}; use heph_core::Store; +use crate::auth::AuthError; use crate::oauth::{self, TokenStore}; use crate::rpc::{self, Request, Response, RpcError, INTERNAL_ERROR, PARSE_ERROR}; use crate::selfupdate::{self, SelfUpdateConfig}; @@ -80,10 +81,25 @@ fn is_auth_error(e: &anyhow::Error) -> bool { .is_some_and(|s| s == reqwest::StatusCode::UNAUTHORIZED) } -/// Fold one exchange outcome into the shared [`SyncHealth`]. -fn record_sync_outcome(health: &Arc>, result: &Result) { +/// The exact `heph auth login …` command that re-authenticates this spoke, built +/// from the hub URL + issuer + client id the daemon is configured with — so the +/// surfaced error tells the user *what to run*, not just that auth failed. +/// `None` for an unauthenticated / standalone instance. The hub-URL string must +/// match what the credential store is keyed under, which is exactly `hub_url`. +fn reauth_command(hub_url: Option<&str>, auth: Option<&SpokeAuth>) -> Option { + let (hub, auth) = (hub_url?, auth?); + Some(format!( + "heph auth login --hub-url {hub} --issuer {} --client-id {}", + auth.issuer, auth.client_id + )) +} + +/// Fold one exchange outcome into the shared [`SyncHealth`]. On an auth failure +/// (a 401 from the hub) the recorded error carries the actionable re-login +/// command, so `heph sync --status` / `heph auth status` / the TUI show the fix. +fn record_sync_outcome(ctx: &Ctx, result: &Result) { let now = now_ms(); - let mut h = health.lock().expect("sync_health mutex poisoned"); + let mut h = ctx.sync_health.lock().expect("sync_health mutex poisoned"); h.last_attempt_ms = Some(now); match result { Ok(_) => { @@ -92,28 +108,67 @@ fn record_sync_outcome(health: &Arc>, result: &Result { - h.auth_failure = is_auth_error(e); - h.last_error = Some(e.to_string()); + let auth_failure = is_auth_error(e); + h.auth_failure = auth_failure; + h.last_error = Some(annotate_reauth( + e.to_string(), + auth_failure, + ctx.hub_url.as_deref(), + ctx.auth.as_ref(), + )); } } } +/// Record a failure to obtain a bearer token (the refresh step, before any hub +/// request). A *rejection* (the IdP refused the refresh) is an auth failure and +/// gets the re-login hint; a transport failure stays a transient error. Surfacing +/// this here means `last_error` reflects the real cause (e.g. `invalid_grant`) +/// instead of only the downstream 401 on `/sync/pull`. +fn record_bearer_failure(ctx: &Ctx, err: &AuthError) { + let now = now_ms(); + let auth_failure = err.is_rejection(); + let mut h = ctx.sync_health.lock().expect("sync_health mutex poisoned"); + h.last_attempt_ms = Some(now); + h.auth_failure = auth_failure; + h.last_error = Some(annotate_reauth( + format!("could not obtain bearer token: {err}"), + auth_failure, + ctx.hub_url.as_deref(), + ctx.auth.as_ref(), + )); +} + +/// Append the actionable re-login command to `msg` when this is an auth failure +/// and the spoke has auth configured. +fn annotate_reauth( + msg: String, + auth_failure: bool, + hub_url: Option<&str>, + auth: Option<&SpokeAuth>, +) -> String { + match reauth_command(hub_url, auth) { + Some(cmd) if auth_failure => format!("{msg} — re-authenticate: {cmd}"), + _ => msg, + } +} + impl Ctx { - /// The current bearer token for hub sync (refreshing if expired), or `None` - /// if this spoke has no auth configured / no usable token. - async fn bearer(&self) -> Option { - let auth = self.auth.clone()?; - let result = tokio::task::spawn_blocking(move || { + /// The current bearer token for hub sync (refreshing if expired). `Ok(None)` + /// means this spoke has no auth configured / no token stored (it syncs + /// unauthenticated); `Err` means token acquisition genuinely failed (the + /// caller records it and skips the attempt rather than 401ing the hub). + async fn bearer(&self) -> Result, AuthError> { + let Some(auth) = self.auth.clone() else { + return Ok(None); + }; + match tokio::task::spawn_blocking(move || { oauth::current_bearer(auth.store.as_ref(), &auth.issuer, &auth.client_id) }) - .await; - match result { - Ok(Ok(token)) => token, - Ok(Err(e)) => { - tracing::warn!("could not obtain bearer token: {e}"); - None - } - Err(_) => None, + .await + { + Ok(res) => res, + Err(_join) => Ok(None), // the blocking task panicked; treat as no token } } } @@ -223,10 +278,20 @@ impl Daemon { let mut tick = tokio::time::interval(interval); loop { tick.tick().await; - let bearer = ctx.bearer().await; + let bearer = match ctx.bearer().await { + Ok(b) => b, + Err(e) => { + // Couldn't get a token — record the real cause (e.g. a + // rejected refresh) and skip; sending an unauthenticated + // request would only 401 and mask it. + record_bearer_failure(&ctx, &e); + tracing::warn!("background sync: could not obtain bearer token: {e}"); + continue; + } + }; let result = sync::sync_once(ctx.store.clone(), &hub, &ctx.http, bearer.as_deref()).await; - record_sync_outcome(&ctx.sync_health, &result); + record_sync_outcome(&ctx, &result); match result { Ok(report) => tracing::debug!(?report, "background sync"), Err(e) => tracing::warn!("background sync failed: {e}"), @@ -321,9 +386,25 @@ async fn sync_now(ctx: &Ctx) -> Result { message: "no hub_url configured; this instance is standalone".into(), }); }; - let bearer = ctx.bearer().await; + let bearer = match ctx.bearer().await { + Ok(b) => b, + Err(e) => { + // Token acquisition failed — record the real cause (with a re-login + // hint when it's a rejection) and surface it instead of a downstream 401. + record_bearer_failure(ctx, &e); + return Err(RpcError { + code: INTERNAL_ERROR, + message: annotate_reauth( + format!("sync failed: could not obtain bearer token: {e}"), + e.is_rejection(), + ctx.hub_url.as_deref(), + ctx.auth.as_ref(), + ), + }); + } + }; let result = sync::sync_once(ctx.store.clone(), &hub_url, &ctx.http, bearer.as_deref()).await; - record_sync_outcome(&ctx.sync_health, &result); + record_sync_outcome(ctx, &result); match result { Ok(report) => Ok(json!(report)), Err(e) => Err(RpcError { @@ -374,10 +455,22 @@ async fn sync_status(ctx: &Ctx) -> Result { .expect("sync_health mutex poisoned") .clone(); + // Non-secret OIDC params (issuer/client-id) + the exact re-login command, so + // `heph auth status` can show the fix without reconstructing it client-side + // (and keyed under the right hub URL — see the per-URL token-keying gotcha). + let auth = ctx.auth.as_ref().map(|a| { + json!({ + "issuer": a.issuer, + "client_id": a.client_id, + }) + }); + Ok(json!({ "hub_url": hub_url, "cursors": cursors, "conflicts": conflicts, "health": health, + "auth": auth, + "reauth_command": reauth_command(Some(&hub_url), ctx.auth.as_ref()), })) } diff --git a/crates/hephd/src/sync.rs b/crates/hephd/src/sync.rs index bfaa323..9beac05 100644 --- a/crates/hephd/src/sync.rs +++ b/crates/hephd/src/sync.rs @@ -261,8 +261,14 @@ async fn require_auth( .await .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)? .map_err(|e| match e { - AuthError::Provider(_) => StatusCode::SERVICE_UNAVAILABLE, - _ => StatusCode::UNAUTHORIZED, + // The token itself is missing/bad → tell the client it's unauthorized. + AuthError::Missing | AuthError::Invalid(_) => StatusCode::UNAUTHORIZED, + // We couldn't reach/process the IdP to fetch verification keys — a + // transient hub-side problem, not the client's token. Ask them to + // retry rather than claiming their token is invalid. + AuthError::Unreachable(_) | AuthError::Rejected(_) | AuthError::Other(_) => { + StatusCode::SERVICE_UNAVAILABLE + } })?; // Multi-tenancy seam: resolve the token's identity to the owner it may act diff --git a/crates/hephd/tests/oauth.rs b/crates/hephd/tests/oauth.rs index f61c872..0a1c709 100644 --- a/crates/hephd/tests/oauth.rs +++ b/crates/hephd/tests/oauth.rs @@ -90,11 +90,25 @@ async fn token(State(s): State, Form(form): Form Json(json!({ - "access_token": "access-2", - "expires_in": 3600, - })) - .into_response(), + Some("refresh_token") => { + // A rotated/expired refresh token is refused with `400 invalid_grant` + // (RFC 6749 §5.2) — the case that used to be mislabeled "unreachable". + if form.get("refresh_token").map(String::as_str) == Some("refresh-expired") { + return ( + StatusCode::BAD_REQUEST, + Json(json!({ + "error": "invalid_grant", + "error_description": "Token is not active", + })), + ) + .into_response(); + } + Json(json!({ + "access_token": "access-2", + "expires_in": 3600, + })) + .into_response() + } _ => ( StatusCode::BAD_REQUEST, Json(json!({ "error": "unsupported_grant_type" })), @@ -129,6 +143,48 @@ fn refresh_keeps_the_old_refresh_token_when_omitted() { assert_eq!(refreshed.refresh_token.as_deref(), Some("refresh-1")); } +#[test] +fn refresh_rejected_by_idp_is_a_rejection_not_unreachable() { + let issuer = start_idp(); + let flow = DeviceFlow::discover(&issuer, "heph-cli").unwrap(); + let err = flow.refresh("refresh-expired").unwrap_err(); + // The whole point of the fix: a reachable IdP that returns 400 is a + // *rejection*, carrying the OAuth error body — not "unreachable". + assert!(err.is_rejection(), "expected a rejection, got: {err}"); + let msg = err.to_string(); + assert!( + msg.contains("rejected"), + "message should say rejected: {msg}" + ); + assert!( + msg.contains("invalid_grant"), + "should include the OAuth error: {msg}" + ); + assert!( + msg.contains("Token is not active"), + "should include error_description: {msg}" + ); + assert!( + !msg.contains("unreachable"), + "must NOT claim the IdP was unreachable: {msg}" + ); +} + +#[test] +fn discovery_against_a_dead_idp_is_unreachable_not_a_rejection() { + use hephd::AuthError; + // Port 1 refuses the connection → a genuine transport failure. + let err = match DeviceFlow::discover("http://127.0.0.1:1/application/o/heph/", "heph-cli") { + Ok(_) => panic!("discovery should fail against a dead IdP"), + Err(e) => e, + }; + assert!( + matches!(err, AuthError::Unreachable(_)), + "a connection failure must be Unreachable, got: {err}" + ); + assert!(!err.is_rejection()); +} + #[test] fn memory_token_store_round_trips_and_reports_expiry() { let store = MemoryTokenStore::default(); diff --git a/docs/changelog.d/auth-error-clarity.bugfix.md b/docs/changelog.d/auth-error-clarity.bugfix.md new file mode 100644 index 0000000..83ba854 --- /dev/null +++ b/docs/changelog.d/auth-error-clarity.bugfix.md @@ -0,0 +1 @@ +hephd no longer reports a rejected OAuth refresh as "identity provider unreachable". A reachable IdP that returns an HTTP error (e.g. `400 invalid_grant` once a refresh token expires/rotates) is now surfaced as a *rejection* — `identity provider rejected the request: HTTP 400 (invalid_grant): …` — with the OAuth error body, distinct from a genuine transport failure. This stops the wording from misdirecting incident response toward the network when the real fix is re-authentication. diff --git a/docs/changelog.d/auth-error-clarity.feature.md b/docs/changelog.d/auth-error-clarity.feature.md new file mode 100644 index 0000000..ab67867 --- /dev/null +++ b/docs/changelog.d/auth-error-clarity.feature.md @@ -0,0 +1 @@ +Spoke auth failures now tell you how to recover. When a refresh token is rejected or the hub returns 401, `hephd` records the real cause plus the exact `heph auth login --hub-url … --issuer … --client-id …` command (keyed to this spoke's hub) in its sync health. A new `heph auth status` prints that health and the re-login command, `heph sync --status`'s `last_error` carries it, and `heph-tui`'s status line points at it with a `⚠ auth · heph auth status` chip. diff --git a/docs/how-to/set-up-sync-hub.md b/docs/how-to/set-up-sync-hub.md index a5b56ea..4d654a9 100644 --- a/docs/how-to/set-up-sync-hub.md +++ b/docs/how-to/set-up-sync-hub.md @@ -130,19 +130,41 @@ spoke is visible at a glance rather than buried in the daemon log. Make a change on `gilbert`, force a sync, and confirm it appears via the hub. +### When sync stops authenticating + +A spoke's refresh token can expire or be rotated (e.g. the IdP session lapses). +The spoke then can't refresh on its own and needs a re-login — but this is +**visible, not silent**: + +- `heph-tui` shows a red `⚠ auth · heph auth status` chip in the status line. +- `heph auth status` prints the auth health and the **exact** re-login command, + pre-filled with this spoke's hub URL / issuer / client id: + + ```bash + heph auth status + ``` + +- `heph sync --status`'s `last_error` names the real cause — a refresh + *rejection* (e.g. `HTTP 400 (invalid_grant)`), not a misleading "identity + provider unreachable" — and carries the same `heph auth login …` hint. + +Run the printed `heph auth login …` command to restore sync. + ## Current gaps (finalized by the blumeops deployment) -The flag-level flow above works today; two enablers make it a clean, managed +The flag-level flow above works today; one enabler makes it a clean, managed deployment rather than a hand-run process — tracked in the `Hephaestus` project: -- **`heph daemon` only generates a `--mode local` service** (no `--hub-url` / - `--oidc-*`). So for now the hub and the spoke config are expressed as `hephd` - flags (run directly, or via the blumeops-managed systemd unit), not via - `heph daemon start`. - **Path A seeding is manual** (copy the store + reset the device origin). A small enabler — seed a hub from a snapshot with a fresh origin, or `hephd --owner-id` — would make this one step. +> `heph daemon start`/`restart` can now bake the spoke/hub config (`--hub-url`, +> `--mode server`, `--http-addr`, `--oidc-*`) into the generated service (see +> [[run-the-daemon]]). The canonical hub on `indri` is still provisioned via the +> blumeops-managed systemd unit by deployment choice, not because `heph daemon` +> can't express it. + ## Related - [[run-the-daemon]] — manage the local daemon as an OS service From 2ca1e246f0b178de046bb781c60c595a7fdaab16 Mon Sep 17 00:00:00 2001 From: Forgejo Actions Date: Mon, 8 Jun 2026 14:15:03 -0700 Subject: [PATCH 3/7] Update changelog for v1.3.0 [skip ci] --- CHANGELOG.md | 14 ++++++++++++++ docs/changelog.d/+sync-age-seconds.feature.md | 1 - docs/changelog.d/auth-error-clarity.bugfix.md | 1 - docs/changelog.d/auth-error-clarity.feature.md | 1 - docs/changelog.d/daemon-restart-race.bugfix.md | 1 - .../daemon-self-update-interval.feature.md | 1 - 6 files changed, 14 insertions(+), 5 deletions(-) delete mode 100644 docs/changelog.d/+sync-age-seconds.feature.md delete mode 100644 docs/changelog.d/auth-error-clarity.bugfix.md delete mode 100644 docs/changelog.d/auth-error-clarity.feature.md delete mode 100644 docs/changelog.d/daemon-restart-race.bugfix.md delete mode 100644 docs/changelog.d/daemon-self-update-interval.feature.md diff --git a/CHANGELOG.md b/CHANGELOG.md index 9799e3f..1acdbe1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,20 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). +## [v1.3.0] - 2026-06-08 + +### Features + +- Spoke auth failures now tell you how to recover. When a refresh token is rejected or the hub returns 401, `hephd` records the real cause plus the exact `heph auth login --hub-url … --issuer … --client-id …` command (keyed to this spoke's hub) in its sync health. A new `heph auth status` prints that health and the re-login command, `heph sync --status`'s `last_error` carries it, and `heph-tui`'s status line points at it with a `⚠ auth · heph auth status` chip. +- `heph daemon start`/`restart` can now bake the daemon's full runtime config into the managed service — `--mode`, `--hub-url`, `--http-addr`, `--oidc-issuer`/`--oidc-audience`/`--oidc-client-id`, and `--self-update-interval-secs` (previously only the bare `--self-update` bool was wired). Regenerating preserves whatever is already baked into the on-disk plist/unit, so a bare `start`/`restart` no longer silently drops spoke/hub or self-update config. +- heph-tui's sync indicator now shows the last-sync age in seconds under a minute (`⟳ 26s`) instead of a flat `just now`, so the chip reads as a live heartbeat and a missed sync (the loop runs every 30s) shows up as the age climbing. + +### Bug Fixes + +- hephd no longer reports a rejected OAuth refresh as "identity provider unreachable". A reachable IdP that returns an HTTP error (e.g. `400 invalid_grant` once a refresh token expires/rotates) is now surfaced as a *rejection* — `identity provider rejected the request: HTTP 400 (invalid_grant): …` — with the OAuth error body, distinct from a genuine transport failure. This stops the wording from misdirecting incident response toward the network when the real fix is re-authentication. +- `heph daemon restart` on macOS no longer intermittently fails with `launchctl bootstrap failed: 5: Input/output error`. The old code bootstrapped immediately after `bootout`, racing launchd's asynchronous teardown; it now waits for the service to fully unload and retries the bootstrap. When the plist is unchanged (e.g. a plain binary upgrade) it uses `launchctl kickstart -k` to restart the loaded job atomically, sidestepping the bootout→bootstrap dance entirely. + + ## [v1.2.3] - 2026-06-06 ### Features diff --git a/docs/changelog.d/+sync-age-seconds.feature.md b/docs/changelog.d/+sync-age-seconds.feature.md deleted file mode 100644 index cf453c2..0000000 --- a/docs/changelog.d/+sync-age-seconds.feature.md +++ /dev/null @@ -1 +0,0 @@ -heph-tui's sync indicator now shows the last-sync age in seconds under a minute (`⟳ 26s`) instead of a flat `just now`, so the chip reads as a live heartbeat and a missed sync (the loop runs every 30s) shows up as the age climbing. diff --git a/docs/changelog.d/auth-error-clarity.bugfix.md b/docs/changelog.d/auth-error-clarity.bugfix.md deleted file mode 100644 index 83ba854..0000000 --- a/docs/changelog.d/auth-error-clarity.bugfix.md +++ /dev/null @@ -1 +0,0 @@ -hephd no longer reports a rejected OAuth refresh as "identity provider unreachable". A reachable IdP that returns an HTTP error (e.g. `400 invalid_grant` once a refresh token expires/rotates) is now surfaced as a *rejection* — `identity provider rejected the request: HTTP 400 (invalid_grant): …` — with the OAuth error body, distinct from a genuine transport failure. This stops the wording from misdirecting incident response toward the network when the real fix is re-authentication. diff --git a/docs/changelog.d/auth-error-clarity.feature.md b/docs/changelog.d/auth-error-clarity.feature.md deleted file mode 100644 index ab67867..0000000 --- a/docs/changelog.d/auth-error-clarity.feature.md +++ /dev/null @@ -1 +0,0 @@ -Spoke auth failures now tell you how to recover. When a refresh token is rejected or the hub returns 401, `hephd` records the real cause plus the exact `heph auth login --hub-url … --issuer … --client-id …` command (keyed to this spoke's hub) in its sync health. A new `heph auth status` prints that health and the re-login command, `heph sync --status`'s `last_error` carries it, and `heph-tui`'s status line points at it with a `⚠ auth · heph auth status` chip. diff --git a/docs/changelog.d/daemon-restart-race.bugfix.md b/docs/changelog.d/daemon-restart-race.bugfix.md deleted file mode 100644 index c13a257..0000000 --- a/docs/changelog.d/daemon-restart-race.bugfix.md +++ /dev/null @@ -1 +0,0 @@ -`heph daemon restart` on macOS no longer intermittently fails with `launchctl bootstrap failed: 5: Input/output error`. The old code bootstrapped immediately after `bootout`, racing launchd's asynchronous teardown; it now waits for the service to fully unload and retries the bootstrap. When the plist is unchanged (e.g. a plain binary upgrade) it uses `launchctl kickstart -k` to restart the loaded job atomically, sidestepping the bootout→bootstrap dance entirely. diff --git a/docs/changelog.d/daemon-self-update-interval.feature.md b/docs/changelog.d/daemon-self-update-interval.feature.md deleted file mode 100644 index b5ec9b8..0000000 --- a/docs/changelog.d/daemon-self-update-interval.feature.md +++ /dev/null @@ -1 +0,0 @@ -`heph daemon start`/`restart` can now bake the daemon's full runtime config into the managed service — `--mode`, `--hub-url`, `--http-addr`, `--oidc-issuer`/`--oidc-audience`/`--oidc-client-id`, and `--self-update-interval-secs` (previously only the bare `--self-update` bool was wired). Regenerating preserves whatever is already baked into the on-disk plist/unit, so a bare `start`/`restart` no longer silently drops spoke/hub or self-update config. From 5c2b4bde2cf18f0ba45b1edff7c474cf092d3b9f Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Mon, 8 Jun 2026 14:35:10 -0700 Subject: [PATCH 4/7] Relabel changelog v1.3.0 section as v1.4.0 [skip ci] A double workflow_dispatch produced both v1.3.0 and an empty duplicate v1.4.0 (the version actually deployed via self-update). Move the release notes onto v1.4.0 to match what shipped; v1.3.0 release+tag are being removed. Co-Authored-By: Claude Opus 4.8 (1M context) --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1acdbe1..aa29354 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,7 +12,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). -## [v1.3.0] - 2026-06-08 +## [v1.4.0] - 2026-06-08 ### Features From b04a71421ed437320080e1d9cd1a64f6439f3c01 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Mon, 8 Jun 2026 15:19:10 -0700 Subject: [PATCH 5/7] fix(hephd): reconnect the socket client across daemon restarts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `Client` connected to the unix socket once and never reconnected, so after an opt-in self-update or `heph daemon restart` dropped the socket, every later `call()` failed — `heph-tui` would sit on errors until relaunched (and the work we just shipped makes restarts more frequent). `Client` now stores the socket path and reconnects on a dropped connection, classifying the failure to stay safe: - write-side failure (request never reached the daemon) → reconnect + retry once; - reply lost after sending (daemon closed mid-request) → reconnect for next time but surface this one, so a mutation is never silently double-applied; - genuine RPC errors are passed through untouched. heph-tui and the CLI use `Client` unchanged, so the TUI self-heals on its next refresh tick. Adds an integration test driving a mock daemon that drops the connection after each request. Closes the "heph-tui: reconnect on a dropped daemon socket" backlog task. Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/hephd/src/client.rs | 112 +++++++++++++++++--- crates/hephd/tests/client_reconnect.rs | 96 +++++++++++++++++ docs/changelog.d/client-reconnect.bugfix.md | 1 + docs/how-to/run-the-daemon.md | 8 ++ 4 files changed, 204 insertions(+), 13 deletions(-) create mode 100644 crates/hephd/tests/client_reconnect.rs create mode 100644 docs/changelog.d/client-reconnect.bugfix.md diff --git a/crates/hephd/src/client.rs b/crates/hephd/src/client.rs index c3c008b..8a2bd5d 100644 --- a/crates/hephd/src/client.rs +++ b/crates/hephd/src/client.rs @@ -2,59 +2,145 @@ //! //! Used by the `heph` CLI and by tests. Surfaces never touch SQLite directly //! (tech-spec §3) — they go through the daemon socket, which this wraps. +//! +//! The connection self-heals across daemon restarts (opt-in self-update, `heph +//! daemon restart`): a [`call`](Client::call) that finds the socket dropped +//! reconnects. It only auto-retries when the request provably never reached the +//! daemon (a write-side failure); a reply lost *after* sending is surfaced +//! rather than retried, so a mutation is never silently double-applied. use std::io::{BufRead, BufReader, Write}; use std::os::unix::net::UnixStream; -use std::path::Path; +use std::path::{Path, PathBuf}; -use anyhow::{bail, Context, Result}; +use anyhow::{anyhow, Context, Result}; use serde_json::{json, Value}; use crate::rpc::Response; /// A connected client. One request/response per [`call`](Client::call). pub struct Client { + socket_path: PathBuf, reader: BufReader, writer: UnixStream, next_id: u64, } +/// How a single request/response exchange failed — drives the retry decision. +enum ExchangeError { + /// The request could not be written (broken pipe, reset): it never reached + /// the daemon, so retrying on a fresh connection is safe. + Send(anyhow::Error), + /// The request was sent but no reply came back (the daemon closed mid-flight, + /// e.g. it restarted): it may or may not have applied — do not retry. + Recv(anyhow::Error), + /// A well-formed RPC-level error (or an unparseable reply): the connection is + /// fine; nothing to reconnect. + Rpc(anyhow::Error), +} + +impl ExchangeError { + fn into_inner(self) -> anyhow::Error { + match self { + ExchangeError::Send(e) | ExchangeError::Recv(e) | ExchangeError::Rpc(e) => e, + } + } +} + impl Client { /// Connect to a daemon listening at `socket_path`. pub fn connect(socket_path: &Path) -> Result { - let stream = UnixStream::connect(socket_path) - .with_context(|| format!("connecting to hephd at {}", socket_path.display()))?; - let reader = BufReader::new(stream.try_clone()?); + let (reader, writer) = Self::open(socket_path)?; Ok(Client { + socket_path: socket_path.to_path_buf(), reader, - writer: stream, + writer, next_id: 1, }) } + /// Open a fresh reader/writer pair on the socket. + fn open(socket_path: &Path) -> Result<(BufReader, UnixStream)> { + let stream = UnixStream::connect(socket_path) + .with_context(|| format!("connecting to hephd at {}", socket_path.display()))?; + let reader = BufReader::new(stream.try_clone()?); + Ok((reader, stream)) + } + + /// Re-establish the connection (after the daemon restarted and dropped it). + fn reconnect(&mut self) -> Result<()> { + let (reader, writer) = Self::open(&self.socket_path)?; + self.reader = reader; + self.writer = writer; + Ok(()) + } + /// Call `method` with `params`, returning the `result` value (or an error /// carrying the RPC error's code and message). + /// + /// If the daemon has restarted and dropped the socket, this reconnects: it + /// retries transparently when the request never went out, and otherwise + /// reconnects for the next call while surfacing an error for this one (so a + /// mutation whose reply was lost is not silently re-applied). pub fn call(&mut self, method: &str, params: Value) -> Result { let id = self.next_id; self.next_id += 1; - let mut line = serde_json::to_string(&json!({ "id": id, "method": method, "params": params, }))?; line.push('\n'); - self.writer.write_all(line.as_bytes())?; - self.writer.flush()?; + + match self.exchange(&line) { + Ok(v) => Ok(v), + Err(ExchangeError::Rpc(e)) => Err(e), + Err(ExchangeError::Send(_)) => { + // The request never reached the daemon — reconnect and retry once. + self.reconnect() + .context("hephd connection lost and reconnect failed")?; + self.exchange(&line) + .map_err(ExchangeError::into_inner) + .with_context(|| format!("retrying `{method}` after reconnect")) + } + Err(ExchangeError::Recv(e)) => { + // Sent but no reply: the daemon likely restarted mid-request. Don't + // retry (a mutation may have applied); reconnect for next time and + // surface this one. + let _ = self.reconnect(); + Err(e).context( + "hephd closed the connection mid-request (it likely restarted); \ + reconnected — re-run the action if it didn't take effect", + ) + } + } + } + + /// One request/response over the current connection, classifying failures. + fn exchange(&mut self, line: &str) -> std::result::Result { + self.writer + .write_all(line.as_bytes()) + .map_err(|e| ExchangeError::Send(e.into()))?; + self.writer + .flush() + .map_err(|e| ExchangeError::Send(e.into()))?; let mut response_line = String::new(); - let read = self.reader.read_line(&mut response_line)?; + let read = self + .reader + .read_line(&mut response_line) + .map_err(|e| ExchangeError::Recv(e.into()))?; if read == 0 { - bail!("hephd closed the connection"); + return Err(ExchangeError::Recv(anyhow!("hephd closed the connection"))); } - let response: Response = serde_json::from_str(&response_line)?; + let response: Response = + serde_json::from_str(&response_line).map_err(|e| ExchangeError::Rpc(e.into()))?; if let Some(err) = response.error { - bail!("rpc error {}: {}", err.code, err.message); + return Err(ExchangeError::Rpc(anyhow!( + "rpc error {}: {}", + err.code, + err.message + ))); } Ok(response.result.unwrap_or(Value::Null)) } diff --git a/crates/hephd/tests/client_reconnect.rs b/crates/hephd/tests/client_reconnect.rs new file mode 100644 index 0000000..a4d0074 --- /dev/null +++ b/crates/hephd/tests/client_reconnect.rs @@ -0,0 +1,96 @@ +//! [`Client`] survives the daemon dropping the socket (opt-in self-update, `heph +//! daemon restart`). A mock daemon serves exactly one request per connection +//! then closes it, forcing the client to reconnect — without auto-reconnect, +//! every call after the first would fail forever. + +use std::io::{BufRead, BufReader, Write}; +use std::os::unix::net::UnixListener; +use std::path::PathBuf; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Arc; +use std::thread; +use std::time::Duration; + +use hephd::Client; +use serde_json::{json, Value}; + +/// A mock daemon that handles ONE request per connection then closes it, looping +/// to accept the next connection. `served` counts total requests answered. +fn spawn_one_shot_daemon(socket: PathBuf, served: Arc) { + thread::spawn(move || { + let listener = UnixListener::bind(&socket).unwrap(); + for conn in listener.incoming() { + let Ok(mut stream) = conn else { continue }; + let mut reader = BufReader::new(stream.try_clone().unwrap()); + let mut line = String::new(); + if reader.read_line(&mut line).unwrap_or(0) == 0 { + continue; // client opened then went away; wait for the next one + } + let req: Value = serde_json::from_str(&line).unwrap(); + let n = served.fetch_add(1, Ordering::SeqCst) + 1; + let mut out = serde_json::to_string(&json!({ + "id": req["id"], + "result": { "served": n }, + })) + .unwrap(); + out.push('\n'); + let _ = stream.write_all(out.as_bytes()); + let _ = stream.flush(); + // `stream` drops here → the connection closes after one request. + } + }); +} + +fn wait_for(socket: &std::path::Path) { + for _ in 0..400 { + if socket.exists() { + return; + } + thread::sleep(Duration::from_millis(5)); + } + panic!("mock daemon socket never appeared"); +} + +#[test] +fn client_reconnects_after_the_daemon_drops_the_socket() { + let dir = tempfile::tempdir().unwrap(); + let socket = dir.path().join("d.sock"); + let served = Arc::new(AtomicUsize::new(0)); + spawn_one_shot_daemon(socket.clone(), served.clone()); + wait_for(&socket); + + let mut c = Client::connect(&socket).unwrap(); + + // First call works on the initial connection. + let r1 = c.call("ping", json!({})).unwrap(); + assert_eq!(r1["served"], 1); + + // The daemon has now closed that connection. With reconnect, the client + // recovers within a call or two (depending on whether the dead socket fails + // on write or on read); without it, every further call would fail forever. + let mut recovered = None; + for _ in 0..2 { + if let Ok(v) = c.call("ping", json!({})) { + recovered = Some(v); + break; + } + } + let r = recovered.expect("client should reconnect after the socket was dropped"); + // The recovered call was served exactly once on the new connection — no + // double-serve from a spurious retry. + assert_eq!(r["served"], 2); + assert_eq!(served.load(Ordering::SeqCst), 2); + + // And it keeps working across subsequent drops. + let r3 = { + let mut got = None; + for _ in 0..2 { + if let Ok(v) = c.call("ping", json!({})) { + got = Some(v); + break; + } + } + got.expect("client should keep reconnecting") + }; + assert_eq!(r3["served"], 3); +} diff --git a/docs/changelog.d/client-reconnect.bugfix.md b/docs/changelog.d/client-reconnect.bugfix.md new file mode 100644 index 0000000..ae987b8 --- /dev/null +++ b/docs/changelog.d/client-reconnect.bugfix.md @@ -0,0 +1 @@ +The `heph` CLI and `heph-tui` now survive a daemon restart. Previously the unix-socket client connected once and never reconnected, so an opt-in self-update or `heph daemon restart` left every subsequent call failing — `heph-tui` would sit on errors until relaunched. The client now reconnects on a dropped socket: a request that never went out is retried transparently, while a reply lost mid-request is surfaced (not silently retried) so a mutation is never double-applied. A long-running TUI self-heals on its next refresh tick. diff --git a/docs/how-to/run-the-daemon.md b/docs/how-to/run-the-daemon.md index cb9e56d..545b3be 100644 --- a/docs/how-to/run-the-daemon.md +++ b/docs/how-to/run-the-daemon.md @@ -86,6 +86,14 @@ still the old binary until you restart it: heph daemon restart ``` +A restart (or an opt-in self-update) drops the daemon's unix socket out from +under any connected surface. The CLI and `heph-tui` **reconnect automatically**: +a read transparently retries on a fresh connection, and a long-running TUI +self-heals on its next tick — so a daemon restart no longer leaves the agenda +view stuck on errors. (A mutating action whose reply is lost mid-restart reports +"reconnected — re-run the action if it didn't take effect" rather than risk +applying twice.) + ## Self-update (opt-in) `hephd` can keep itself current: `heph daemon start --self-update` generates a From 470ef1de0e5551183649df45b8118a63d9f44450 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Mon, 8 Jun 2026 20:08:07 -0700 Subject: [PATCH 6/7] fix(quickadd): return focus to the previous app when the popover hides MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The global ⌘' quick-add overlay is a borderless, transparent, always-on-top accessory window that winit hides with `Visible(false)`. That orders the window out visually but leaves heph-quickadd the *active* application — so after a capture (or Esc / toggle) keyboard focus never returns to the app the user was in, and the lingering overlay can keep intercepting clicks where it used to sit. Hide at the application level instead via `NSApplication.hide:`, which fully orders our windows out and activates the next app in line (the previously focused one). On re-show, `unhide:` clears that hidden flag before the existing viewport `Focus` command makes the field key again. Both are macOS-only no-ops elsewhere, wired through new `app_yield_focus`/`app_take_focus` helpers backed by objc2 / objc2-app-kit (unified to the 0.6/0.3 line global-hotkey already pulls). Co-Authored-By: Claude Opus 4.8 (1M context) --- Cargo.lock | 2 + crates/heph-quickadd/Cargo.toml | 11 ++++- crates/heph-quickadd/src/app.rs | 43 +++++++++++++++++++ .../feature-quickadd-focus-return.bugfix.md | 1 + 4 files changed, 56 insertions(+), 1 deletion(-) create mode 100644 docs/changelog.d/feature-quickadd-focus-return.bugfix.md diff --git a/Cargo.lock b/Cargo.lock index be8f974..cc9b3a6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2237,6 +2237,8 @@ dependencies = [ "heph-core", "hephd", "libc", + "objc2 0.6.4", + "objc2-app-kit 0.3.2", "serde_json", "winit", ] diff --git a/crates/heph-quickadd/Cargo.toml b/crates/heph-quickadd/Cargo.toml index 5b1889b..57bbb98 100644 --- a/crates/heph-quickadd/Cargo.toml +++ b/crates/heph-quickadd/Cargo.toml @@ -19,7 +19,16 @@ global-hotkey = "0.8" # macOS-only: winit for the accessory-mode activation policy (no Dock icon), # pinned to the same minor eframe carries so cargo unifies to one winit; libc -# for getppid() (orphan detection — self-exit when the supervising daemon dies). +# for getppid() (orphan detection — self-exit when the supervising daemon dies); +# objc2 + objc2-app-kit to hand keyboard focus back to the previously active app +# when the popover hides (NSApplication.hide:/unhide:). Pinned to the 0.6/0.3 +# line global-hotkey already pulls in, so cargo unifies to one copy. [target.'cfg(target_os = "macos")'.dependencies] winit = "0.30" libc = "0.2" +objc2 = "0.6" +objc2-app-kit = { version = "0.3", default-features = false, features = [ + "std", + "NSApplication", + "NSResponder", +] } diff --git a/crates/heph-quickadd/src/app.rs b/crates/heph-quickadd/src/app.rs index b08bf03..a334b22 100644 --- a/crates/heph-quickadd/src/app.rs +++ b/crates/heph-quickadd/src/app.rs @@ -226,6 +226,9 @@ impl QuickAdd { } fn show(&mut self, ctx: &egui::Context) { + // Undo the app-level hide from the previous `hide()` so we can take focus + // again (no-op the first time / off macOS). + app_take_focus(); self.visible = true; self.focus_pending = true; self.current_hint = random_hint(self.current_hint); @@ -256,6 +259,13 @@ impl QuickAdd { ctx.send_viewport_cmd(egui::ViewportCommand::InnerSize(egui::vec2(WIN_W, BASE_H))); self.win_h_applied = BASE_H; } + // Hand keyboard focus back to the app underneath us. winit's + // `Visible(false)` alone leaves *us* the active application, so focus + // never returns and the borderless always-on-top overlay can keep eating + // clicks where it used to sit. `NSApplication.hide:` orders our windows + // fully out and activates the next app in line — exactly the one the user + // was in (no-op off macOS). + app_yield_focus(); } /// Optimistic submit: hide now, create in the background. @@ -596,6 +606,39 @@ impl QuickAdd { } } +/// Hide the popover at the *application* level so macOS hands keyboard focus +/// back to the previously active app. `NSApplication.hide:` orders all our +/// windows out and activates the next app in line — the one the user was in — +/// which a plain winit `Visible(false)` does not do. No-op off macOS. +#[cfg(target_os = "macos")] +fn app_yield_focus() { + use objc2::MainThreadMarker; + use objc2_app_kit::NSApplication; + // eframe's `update` runs on the main thread, so this marker is always Some. + if let Some(mtm) = MainThreadMarker::new() { + NSApplication::sharedApplication(mtm).hide(None); + } +} + +#[cfg(not(target_os = "macos"))] +fn app_yield_focus() {} + +/// Undo [`app_yield_focus`]: clear the app-level hidden flag before re-showing, +/// so the window the viewport `Focus` command then makes key actually appears. +/// (`unhide:` also re-activates us; the per-window `Focus`/`Visible` viewport +/// commands do the rest.) No-op off macOS. +#[cfg(target_os = "macos")] +fn app_take_focus() { + use objc2::MainThreadMarker; + use objc2_app_kit::NSApplication; + if let Some(mtm) = MainThreadMarker::new() { + NSApplication::sharedApplication(mtm).unhide(None); + } +} + +#[cfg(not(target_os = "macos"))] +fn app_take_focus() {} + /// The current parent process id, for orphan detection. `None` off macOS (where /// hephd does not supervise a helper — there is no Aqua session to inherit). fn current_parent_pid() -> Option { diff --git a/docs/changelog.d/feature-quickadd-focus-return.bugfix.md b/docs/changelog.d/feature-quickadd-focus-return.bugfix.md new file mode 100644 index 0000000..6835eb5 --- /dev/null +++ b/docs/changelog.d/feature-quickadd-focus-return.bugfix.md @@ -0,0 +1 @@ +Quick-add popover (⌘'): hand keyboard focus back to the previously active app when it hides, and stop the (now invisible) overlay from intercepting clicks where it used to sit. From b34371af873f2d9a6ba3f097093b5efef22e07ef Mon Sep 17 00:00:00 2001 From: Forgejo Actions Date: Mon, 8 Jun 2026 20:24:38 -0700 Subject: [PATCH 7/7] Update changelog for v1.4.1 [skip ci] --- CHANGELOG.md | 8 ++++++++ docs/changelog.d/client-reconnect.bugfix.md | 1 - docs/changelog.d/feature-quickadd-focus-return.bugfix.md | 1 - 3 files changed, 8 insertions(+), 2 deletions(-) delete mode 100644 docs/changelog.d/client-reconnect.bugfix.md delete mode 100644 docs/changelog.d/feature-quickadd-focus-return.bugfix.md diff --git a/CHANGELOG.md b/CHANGELOG.md index aa29354..1900ad9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,14 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). +## [v1.4.1] - 2026-06-08 + +### Bug Fixes + +- The `heph` CLI and `heph-tui` now survive a daemon restart. Previously the unix-socket client connected once and never reconnected, so an opt-in self-update or `heph daemon restart` left every subsequent call failing — `heph-tui` would sit on errors until relaunched. The client now reconnects on a dropped socket: a request that never went out is retried transparently, while a reply lost mid-request is surfaced (not silently retried) so a mutation is never double-applied. A long-running TUI self-heals on its next refresh tick. +- Quick-add popover (⌘'): hand keyboard focus back to the previously active app when it hides, and stop the (now invisible) overlay from intercepting clicks where it used to sit. + + ## [v1.4.0] - 2026-06-08 ### Features diff --git a/docs/changelog.d/client-reconnect.bugfix.md b/docs/changelog.d/client-reconnect.bugfix.md deleted file mode 100644 index ae987b8..0000000 --- a/docs/changelog.d/client-reconnect.bugfix.md +++ /dev/null @@ -1 +0,0 @@ -The `heph` CLI and `heph-tui` now survive a daemon restart. Previously the unix-socket client connected once and never reconnected, so an opt-in self-update or `heph daemon restart` left every subsequent call failing — `heph-tui` would sit on errors until relaunched. The client now reconnects on a dropped socket: a request that never went out is retried transparently, while a reply lost mid-request is surfaced (not silently retried) so a mutation is never double-applied. A long-running TUI self-heals on its next refresh tick. diff --git a/docs/changelog.d/feature-quickadd-focus-return.bugfix.md b/docs/changelog.d/feature-quickadd-focus-return.bugfix.md deleted file mode 100644 index 6835eb5..0000000 --- a/docs/changelog.d/feature-quickadd-focus-return.bugfix.md +++ /dev/null @@ -1 +0,0 @@ -Quick-add popover (⌘'): hand keyboard focus back to the previously active app when it hides, and stop the (now invisible) overlay from intercepting clicks where it used to sit.