From 9213a943f5b755339ee056e744ce8cbe13eecd48 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Thu, 4 Jun 2026 13:47:26 -0700 Subject: [PATCH] C2(hephd-self-update): impl verify hub-dropout resilience (+ client timeout) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lock in the base-case guarantee that a self-updating hub (which restarts under its spokes) relies on. New sync_http test: a spoke whose hub is unreachable keeps serving + accepting writes, a sync attempt fails fast (Err, not hang/panic), and when the hub returns the accumulated ops reconcile with no special recovery. The verification surfaced one non-graceful path — the daemon's shared reqwest client had no timeout, so a black-hole hub (connects, never replies) could stall the sync/self-update loop. Give it a 30s timeout so 'the hub can vanish at any moment' holds even mid-request. Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/hephd/src/server.rs | 9 ++++- crates/hephd/tests/sync_http.rs | 59 +++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 1 deletion(-) diff --git a/crates/hephd/src/server.rs b/crates/hephd/src/server.rs index 0de5278..e6129d5 100644 --- a/crates/hephd/src/server.rs +++ b/crates/hephd/src/server.rs @@ -77,7 +77,14 @@ impl Daemon { ctx: Ctx { store: Arc::new(Mutex::new(store)), hub_url: None, - http: reqwest::Client::new(), + // Bound every hub request so a black-hole hub (one that accepts + // a connection but never replies) can't stall the sync / + // self-update loops — "the hub can vanish at any moment" is the + // base case, including vanishing mid-request. + http: reqwest::Client::builder() + .timeout(Duration::from_secs(30)) + .build() + .expect("building the daemon HTTP client"), auth: None, self_update: None, }, diff --git a/crates/hephd/tests/sync_http.rs b/crates/hephd/tests/sync_http.rs index de8b7bf..ed093ce 100644 --- a/crates/hephd/tests/sync_http.rs +++ b/crates/hephd/tests/sync_http.rs @@ -84,6 +84,65 @@ async fn a_node_propagates_a_to_hub_to_b() { assert_eq!(on_b.body.as_deref(), Some("shingles need work")); } +#[tokio::test] +async fn spoke_survives_an_unreachable_hub_then_reconciles_when_it_returns() { + // "The hub can vanish at any moment" is the base case, not a guarded edge: + // a spoke whose hub is down keeps serving + accepting writes, and when the + // hub returns its accumulated ops reconcile with no special recovery. This + // is what makes a self-updating hub (which restarts under its spokes) safe. + let http = reqwest::Client::builder() + .timeout(std::time::Duration::from_secs(5)) // never hang the test + .build() + .unwrap(); + let (a, _ca, _da) = replica(1000); + + // Hub down: work happens locally, and a sync attempt fails *fast* (Err — not + // a panic, not a hang) and leaves the store untouched. + let id = { + let mut ga = a.lock().unwrap(); + ga.create_node(NewNode::doc( + "Offline note", + "written while the hub was down", + )) + .unwrap() + .id + }; + let dead_hub = "http://127.0.0.1:1"; // nothing listens → connection refused + assert!( + sync::sync_once(a.clone(), dead_hub, &http, None) + .await + .is_err(), + "sync against a dead hub should error, not hang or panic" + ); + + // The spoke is unharmed: the note is intact and further writes still succeed. + { + let mut ga = a.lock().unwrap(); + assert_eq!(ga.get_node(&id).unwrap().unwrap().title, "Offline note"); + ga.create_node(NewNode::doc("Another", "still working offline")) + .unwrap(); + } + + // The hub returns: the spoke pushes everything it accumulated while offline, + // and a fresh replica pulls it — convergence resumes, no manual recovery. + let hub_url = start_hub().await; + let up = sync::sync_once(a.clone(), &hub_url, &http, None) + .await + .unwrap(); + assert!(up.pushed > 0, "spoke pushed nothing after the hub returned"); + let (b, _cb, _db) = replica(1000); + sync::sync_once(b.clone(), &hub_url, &http, None) + .await + .unwrap(); + let on_b = b + .lock() + .unwrap() + .get_node(&id) + .unwrap() + .expect("offline-authored node reached B after the hub recovered"); + assert_eq!(on_b.title, "Offline note"); +} + #[tokio::test] async fn divergent_scalar_edits_converge_through_the_hub_with_a_conflict() { let hub_url = start_hub().await;