C2(hephd-self-update): impl verify hub-dropout resilience (+ client timeout)

Lock in the base-case guarantee that a self-updating hub (which restarts under its spokes) relies on. New sync_http test: a spoke whose hub is unreachable keeps serving and accepting writes, a sync attempt fails fast (Err, not a hang or panic), and when the hub returns, the accumulated ops reconcile with no special recovery. The verification surfaced one non-graceful path — the daemon's shared reqwest client had no timeout, so a black-hole hub (connects, never replies) could stall the sync/self-update loop. Give it a 30s timeout so 'the hub can vanish at any moment' holds even mid-request. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-04 15:03:23 -07:00 · 2026-06-04 13:47:26 -07:00 · 2026-06-04 13:47:26 -07:00
commit 9213a943f5
2 changed files with 67 additions and 1 deletions
--- a/crates/hephd/src/server.rs
+++ b/crates/hephd/src/server.rs
@ -77,7 +77,14 @@ impl Daemon {
            ctx: Ctx {
                store: Arc::new(Mutex::new(store)),
                hub_url: None,
-                http: reqwest::Client::new(),
+                // Bound every hub request so a black-hole hub (one that accepts
+                // a connection but never replies) can't stall the sync /
+                // self-update loops — "the hub can vanish at any moment" is the
+                // base case, including vanishing mid-request.
+                http: reqwest::Client::builder()
+                    .timeout(Duration::from_secs(30))
+                    .build()
+                    .expect("building the daemon HTTP client"),
                auth: None,
                self_update: None,
            },
--- a/crates/hephd/tests/sync_http.rs
+++ b/crates/hephd/tests/sync_http.rs
@ -84,6 +84,65 @@ async fn a_node_propagates_a_to_hub_to_b() {
    assert_eq!(on_b.body.as_deref(), Some("shingles need work"));
 }

+#[tokio::test]
+async fn spoke_survives_an_unreachable_hub_then_reconciles_when_it_returns() {
+    // "The hub can vanish at any moment" is the base case, not a guarded edge:
+    // a spoke whose hub is down keeps serving and accepting writes, and once the
+    // hub returns, the spoke's accumulated ops reconcile with no special recovery.
+    // This is what makes a self-updating hub (which restarts under its spokes) safe.
+    let http = reqwest::Client::builder()
+        .timeout(std::time::Duration::from_secs(5)) // a stalled request must not hang the test
+        .build()
+        .unwrap();
+    let (a, _ca, _da) = replica(1000);
+
+    // Hub down: work happens locally, and a sync attempt must return Err (not
+    // panic, not hang); the checks further down confirm the store is unharmed.
+    let id = {
+        let mut ga = a.lock().unwrap();
+        ga.create_node(NewNode::doc(
+            "Offline note",
+            "written while the hub was down",
+        ))
+        .unwrap()
+        .id
+    };
+    let dead_hub = "http://127.0.0.1:1"; // assumes nothing listens on port 1, so the connect is refused
+    assert!(
+        sync::sync_once(a.clone(), dead_hub, &http, None)
+            .await
+            .is_err(),
+        "sync against a dead hub should error, not hang or panic"
+    );
+
+    // The spoke is unharmed: the note reads back intact and a new write succeeds.
+    {
+        let mut ga = a.lock().unwrap();
+        assert_eq!(ga.get_node(&id).unwrap().unwrap().title, "Offline note");
+        ga.create_node(NewNode::doc("Another", "still working offline"))
+            .unwrap();
+    }
+
+    // The hub returns: the spoke pushes what it accumulated while offline, and a
+    // second, freshly created replica pulls it; no manual recovery step is run.
+    let hub_url = start_hub().await;
+    let up = sync::sync_once(a.clone(), &hub_url, &http, None)
+        .await
+        .unwrap();
+    assert!(up.pushed > 0, "spoke pushed nothing after the hub returned");
+    let (b, _cb, _db) = replica(1000);
+    sync::sync_once(b.clone(), &hub_url, &http, None)
+        .await
+        .unwrap();
+    let on_b = b
+        .lock()
+        .unwrap()
+        .get_node(&id)
+        .unwrap()
+        .expect("offline-authored node reached B after the hub recovered");
+    assert_eq!(on_b.title, "Offline note");
+}
+
 #[tokio::test]
 async fn divergent_scalar_edits_converge_through_the_hub_with_a_conflict() {
    let hub_url = start_hub().await;