C2: hephd self-update (Mikado plan — cards for review) #7

Merged
eblume merged 19 commits from mikado/hephd-self-update into main 2026-06-04 15:03:23 -07:00
2 changed files with 67 additions and 1 deletions
Showing only changes of commit 9213a943f5 - Show all commits

C2(hephd-self-update): impl verify hub-dropout resilience (+ client timeout)

Lock in the base-case guarantee that a self-updating hub (which restarts
under its spokes) relies on. New sync_http test: a spoke whose hub is
unreachable keeps serving and accepting writes; a sync attempt fails fast
(Err, not a hang or panic); and when the hub returns, the accumulated ops
reconcile with no special recovery.

The verification surfaced one non-graceful path — the daemon's shared
reqwest client had no timeout, so a black-hole hub (connects, never
replies) could stall the sync/self-update loop. Give it a 30s timeout so
'the hub can vanish at any moment' holds even mid-request.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Erich Blume 2026-06-04 13:47:26 -07:00

View file

@ -77,7 +77,14 @@ impl Daemon {
ctx: Ctx {
store: Arc::new(Mutex::new(store)),
hub_url: None,
http: reqwest::Client::new(),
// Bound every hub request so a black-hole hub (one that accepts
// a connection but never replies) can't stall the sync /
// self-update loops — "the hub can vanish at any moment" is the
// base case, including vanishing mid-request.
http: reqwest::Client::builder()
.timeout(Duration::from_secs(30))
.build()
.expect("building the daemon HTTP client"),
auth: None,
self_update: None,
},

View file

@ -84,6 +84,65 @@ async fn a_node_propagates_a_to_hub_to_b() {
assert_eq!(on_b.body.as_deref(), Some("shingles need work"));
}
#[tokio::test]
async fn spoke_survives_an_unreachable_hub_then_reconciles_when_it_returns() {
    // Base-case guarantee under test: the hub may disappear at any time. While
    // it is gone the spoke must keep working locally, a sync attempt must
    // return an error promptly, and once a hub is available again the ops
    // written offline must flow through an ordinary sync — no recovery step.
    // A self-updating hub restarts underneath its spokes, so it leans on this.

    // Client with a short overall timeout so a stuck request cannot hang the test.
    let client = reqwest::Client::builder()
        .timeout(std::time::Duration::from_secs(5))
        .build()
        .unwrap();

    let (spoke, _ca, _da) = replica(1000);

    // --- Phase 1: hub is down -------------------------------------------
    // Author a node locally; the lock guard is released at the end of the
    // block, before any `.await`.
    let offline_id = {
        let mut store = spoke.lock().unwrap();
        let created = store
            .create_node(NewNode::doc(
                "Offline note",
                "written while the hub was down",
            ))
            .unwrap();
        created.id
    };

    // Loopback port 1: nothing is expected to listen there, so the connection
    // should be refused and the sync should come back as `Err`.
    let dead_hub = "http://127.0.0.1:1";
    let offline_sync_failed = sync::sync_once(spoke.clone(), dead_hub, &client, None)
        .await
        .is_err();
    assert!(
        offline_sync_failed,
        "sync against a dead hub should error, not hang or panic"
    );

    // The failed sync must not have damaged the spoke: the earlier node is
    // still readable and a new write goes through.
    {
        let mut store = spoke.lock().unwrap();
        let survivor = store.get_node(&offline_id).unwrap().unwrap();
        assert_eq!(survivor.title, "Offline note");
        store
            .create_node(NewNode::doc("Another", "still working offline"))
            .unwrap();
    }

    // --- Phase 2: hub comes back ----------------------------------------
    // A plain sync pushes what accumulated offline ...
    let hub_url = start_hub().await;
    let report = sync::sync_once(spoke.clone(), &hub_url, &client, None)
        .await
        .unwrap();
    assert!(
        report.pushed > 0,
        "spoke pushed nothing after the hub returned"
    );

    // ... and a brand-new replica syncing against the same hub receives it.
    let (fresh, _cb, _db) = replica(1000);
    sync::sync_once(fresh.clone(), &hub_url, &client, None)
        .await
        .unwrap();

    let lookup = fresh.lock().unwrap().get_node(&offline_id).unwrap();
    let arrived = lookup.expect("offline-authored node reached B after the hub recovered");
    assert_eq!(arrived.title, "Offline note");
}
#[tokio::test]
async fn divergent_scalar_edits_converge_through_the_hub_with_a_conflict() {
let hub_url = start_hub().await;