From e7ced4f8f9b361b8ce64923239ef257b683ba0b5 Mon Sep 17 00:00:00 2001 From: Erich Blume Date: Tue, 9 Jun 2026 13:03:10 -0700 Subject: [PATCH] test(fuzz): add cargo-fuzz targets for CRDT and extraction surfaces MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tier 2 fuzzing: a nightly cargo-fuzz crate at crates/heph-core/fuzz/ with three targets (crdt_merge, crdt_write, extract), reaching crate-private CRDT internals through heph-core's new 'fuzzing' feature. Driven ad-hoc via 'mise run fuzz'; not in CI (needs nightly + wall clock). crdt_merge immediately surfaced robustness gaps in yrs 0.27 on malformed sync deltas (a 4-byte input OOMs; other inputs abort/UB) — uncatchable, limited blast radius (authenticated /sync/push), documented as a known limitation. extract and crdt_write ran clean over ~1M cases. Co-Authored-By: Claude Fable 5 --- crates/heph-core/Cargo.toml | 6 +++ crates/heph-core/fuzz/.gitignore | 5 +++ crates/heph-core/fuzz/Cargo.toml | 41 ++++++++++++++++++ .../heph-core/fuzz/fuzz_targets/crdt_merge.rs | 15 +++++++ .../heph-core/fuzz/fuzz_targets/crdt_write.rs | 21 +++++++++ crates/heph-core/fuzz/fuzz_targets/extract.rs | 27 ++++++++++++ crates/heph-core/src/crdt.rs | 34 ++++++++++++--- crates/heph-core/src/lib.rs | 3 ++ .../changelog.d/feature-fuzz-testing.infra.md | 1 + docs/how-to/fuzz-testing.md | 28 ++++++++++++ docs/reference/reference.md | 1 + mise-tasks/fuzz | 43 +++++++++++++++++++ 12 files changed, 220 insertions(+), 5 deletions(-) create mode 100644 crates/heph-core/fuzz/.gitignore create mode 100644 crates/heph-core/fuzz/Cargo.toml create mode 100644 crates/heph-core/fuzz/fuzz_targets/crdt_merge.rs create mode 100644 crates/heph-core/fuzz/fuzz_targets/crdt_write.rs create mode 100644 crates/heph-core/fuzz/fuzz_targets/extract.rs create mode 100644 docs/changelog.d/feature-fuzz-testing.infra.md create mode 100755 mise-tasks/fuzz diff --git a/crates/heph-core/Cargo.toml b/crates/heph-core/Cargo.toml 
index 0f0763f..5e939a2 100644 --- a/crates/heph-core/Cargo.toml +++ b/crates/heph-core/Cargo.toml @@ -8,6 +8,12 @@ publish.workspace = true authors.workspace = true rust-version.workspace = true +[features] +# Exposes thin public wrappers over crate-private internals (the body CRDT) for +# the cargo-fuzz targets in `fuzz/`. Never enabled in normal builds — the +# wrappers are test scaffolding, not part of the public API. +fuzzing = [] + [dependencies] rusqlite.workspace = true ulid.workspace = true diff --git a/crates/heph-core/fuzz/.gitignore b/crates/heph-core/fuzz/.gitignore new file mode 100644 index 0000000..ab0eaa1 --- /dev/null +++ b/crates/heph-core/fuzz/.gitignore @@ -0,0 +1,5 @@ +target/ +corpus/ +artifacts/ +coverage/ +Cargo.lock diff --git a/crates/heph-core/fuzz/Cargo.toml b/crates/heph-core/fuzz/Cargo.toml new file mode 100644 index 0000000..caeeb98 --- /dev/null +++ b/crates/heph-core/fuzz/Cargo.toml @@ -0,0 +1,41 @@ +# cargo-fuzz harness for heph-core's parsing/CRDT surfaces. Its own workspace +# (the empty `[workspace]` table) so it never pulls into the main build; it is +# nightly-only and run ad-hoc via `mise run fuzz`. See docs/how-to/fuzz-testing.md. +[package] +name = "heph-core-fuzz" +version = "0.0.0" +publish = false +edition = "2021" + +[package.metadata] +cargo-fuzz = true + +[dependencies] +libfuzzer-sys = "0.4" + +[dependencies.heph-core] +path = ".." 
+features = ["fuzzing"] + +[[bin]] +name = "crdt_merge" +path = "fuzz_targets/crdt_merge.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "crdt_write" +path = "fuzz_targets/crdt_write.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "extract" +path = "fuzz_targets/extract.rs" +test = false +doc = false +bench = false + +[workspace] diff --git a/crates/heph-core/fuzz/fuzz_targets/crdt_merge.rs b/crates/heph-core/fuzz/fuzz_targets/crdt_merge.rs new file mode 100644 index 0000000..ae573f3 --- /dev/null +++ b/crates/heph-core/fuzz/fuzz_targets/crdt_merge.rs @@ -0,0 +1,15 @@ +#![no_main] +//! Fuzz `merge_body` with arbitrary `delta` bytes — the untrusted sync-ingest +//! surface. A peer's update payload is decoded and applied here; a crash is a +//! remote-input daemon crash. yrs 0.27 is known to `SIGABRT`/UB on some +//! malformed inputs (see `crdt::merge_body`'s docs) — surfacing and shrinking +//! such an input is exactly this target's job. + +use libfuzzer_sys::fuzz_target; + +fuzz_target!(|data: &[u8]| { + let (state, body) = heph_core::crdt_fuzz::merge_body(None, data); + // Idempotence: applying the same delta again must not change the body. + let (_, body_again) = heph_core::crdt_fuzz::merge_body(Some(&state), data); + assert_eq!(body, body_again, "merge of the same delta was not idempotent"); +}); diff --git a/crates/heph-core/fuzz/fuzz_targets/crdt_write.rs b/crates/heph-core/fuzz/fuzz_targets/crdt_write.rs new file mode 100644 index 0000000..bb36b04 --- /dev/null +++ b/crates/heph-core/fuzz/fuzz_targets/crdt_write.rs @@ -0,0 +1,21 @@ +#![no_main] +//! Fuzz `write_body`: diff two arbitrary bodies into the text CRDT and check +//! the round-trip — the materialized body and the re-materialized stored state +//! must both equal the new body exactly. Stresses the UTF-8 boundary alignment +//! in the prefix/suffix diff with arbitrary (incl. multibyte) strings. 
+ +use libfuzzer_sys::fuzz_target; + +const CLIENT: u64 = 0xAAAA; + +fuzz_target!(|data: (String, String)| { + let (prev, new) = data; + let (base, _, _) = heph_core::crdt_fuzz::write_body(CLIENT, None, &prev); + let (state, _delta, body) = heph_core::crdt_fuzz::write_body(CLIENT, Some(&base), &new); + assert_eq!(body, new, "write did not materialize the new body"); + assert_eq!( + heph_core::crdt_fuzz::body_of(&state), + new, + "stored state did not re-materialize to the new body" + ); +}); diff --git a/crates/heph-core/fuzz/fuzz_targets/extract.rs b/crates/heph-core/fuzz/fuzz_targets/extract.rs new file mode 100644 index 0000000..042da45 --- /dev/null +++ b/crates/heph-core/fuzz/fuzz_targets/extract.rs @@ -0,0 +1,27 @@ +#![no_main] +//! Fuzz `extract` over arbitrary markdown. Asserts the invariants promotion and +//! the context-item index depend on: wiki-links are non-empty and de-duplicated, +//! and `context_item_lines` stays 1:1 with `context_items`. + +use libfuzzer_sys::fuzz_target; +use std::collections::HashSet; + +fuzz_target!(|data: &[u8]| { + let Ok(s) = std::str::from_utf8(data) else { + return; + }; + let e = heph_core::extract(s); + + let mut seen = HashSet::new(); + for link in &e.wiki_links { + assert!(!link.is_empty(), "empty wiki-link target"); + assert_eq!(link.trim(), link.as_str(), "untrimmed wiki-link target"); + assert!(seen.insert(link.clone()), "duplicate wiki-link {link:?}"); + } + + assert_eq!( + heph_core::extract::context_item_lines(s).len(), + e.context_items.len(), + "context_item_lines diverged from context_items", + ); +}); diff --git a/crates/heph-core/src/crdt.rs b/crates/heph-core/src/crdt.rs index 9a35745..276fac4 100644 --- a/crates/heph-core/src/crdt.rs +++ b/crates/heph-core/src/crdt.rs @@ -119,10 +119,12 @@ pub(crate) struct BodyMerge { /// contains the unwinding subset so a corrupt payload degrades to a no-op /// merge rather than crashing a debug daemon; it cannot stop the abort/UB /// class. 
The blast radius is limited — `/sync/push` is authenticated — but a -/// buggy or hostile *authenticated* peer can still feed bad bytes here. The -/// real fix is upstream (or a pre-apply validator yrs doesn't yet expose); -/// tracked in the Hephaestus project and exercised by the `crdt_merge` fuzz -/// target. See [[fuzz-testing]]. +/// buggy or hostile *authenticated* peer can still feed bad bytes here. Beyond +/// the unwinding panics, fuzzing also found a tiny delta (`[255,255,255,126]`) +/// that drives yrs into a huge allocation (OOM) — `catch_unwind` can't help +/// that. The real fix is upstream (or a pre-apply validator yrs doesn't yet +/// expose). Findings and the `crdt_merge` fuzz target are documented in +/// [[fuzz-testing]]. pub(crate) fn merge_body(prev_state: Option<&[u8]>, delta: &[u8]) -> BodyMerge { let merged = std::panic::catch_unwind(|| { let doc = load(0, prev_state); @@ -145,11 +147,33 @@ pub(crate) fn merge_body(prev_state: Option<&[u8]>, delta: &[u8]) -> BodyMerge { } /// Materialize a stored CRDT state blob to its body text. -#[cfg(test)] +#[cfg(any(test, feature = "fuzzing"))] pub(crate) fn body_of(state: &[u8]) -> String { materialize(&load(0, Some(state))) } +/// Thin public wrappers over the crate-private CRDT for the cargo-fuzz targets +/// (`fuzz/fuzz_targets/`). Compiled only under the `fuzzing` feature, re-exported +/// from the crate root as `crdt_fuzz`. Tuples instead of the private `BodyWrite` +/// / `BodyMerge` structs so the fuzz crate needs no access to those types. +#[cfg(feature = "fuzzing")] +pub mod fuzz { + /// `(state, delta, body)` from diffing `new` into the CRDT seeded by `prev`. + pub fn write_body(client: u64, prev: Option<&[u8]>, new: &str) -> (Vec<u8>, Vec<u8>, String) { + let w = super::write_body(client, prev, new); + (w.state, w.delta, w.body) + } + /// `(state, body)` from merging an untrusted `delta` into `prev`. 
+ pub fn merge_body(prev: Option<&[u8]>, delta: &[u8]) -> (Vec<u8>, String) { + let m = super::merge_body(prev, delta); + (m.state, m.body) + } + /// Materialize a stored state blob to its body text. + pub fn body_of(state: &[u8]) -> String { + super::body_of(state) + } +} + /// Common prefix/suffix diff over byte indices, cut points aligned to UTF-8 /// char boundaries. Returns `(start, delete_len, inserted)` such that replacing /// `cur[start..start+delete_len]` with `inserted` yields `new`. diff --git a/crates/heph-core/src/lib.rs b/crates/heph-core/src/lib.rs index 6554d48..5980dd6 100644 --- a/crates/heph-core/src/lib.rs +++ b/crates/heph-core/src/lib.rs @@ -16,6 +16,9 @@ pub const VERSION: &str = concat!(env!("CARGO_PKG_VERSION"), " (", env!("HEPH_BU pub mod clock; mod crdt; +/// Public CRDT wrappers for the cargo-fuzz targets (`fuzzing` feature only). +#[cfg(feature = "fuzzing")] +pub use crdt::fuzz as crdt_fuzz; pub mod error; pub mod export; pub mod extract; diff --git a/docs/changelog.d/feature-fuzz-testing.infra.md b/docs/changelog.d/feature-fuzz-testing.infra.md new file mode 100644 index 0000000..1881a7c --- /dev/null +++ b/docs/changelog.d/feature-fuzz-testing.infra.md @@ -0,0 +1 @@ +Added cargo-fuzz targets for the CRDT and extraction surfaces (`crates/heph-core/fuzz/`, behind heph-core's `fuzzing` feature) plus a `mise run fuzz` task. Nightly-only and ad-hoc, not wired into CI. These targets surfaced robustness gaps in `yrs` 0.27 on malformed sync deltas (OOM, abort/UB) — documented as a known limitation in [[fuzz-testing]]. diff --git a/docs/how-to/fuzz-testing.md b/docs/how-to/fuzz-testing.md index 2f4eacc..87b45d5 100644 --- a/docs/how-to/fuzz-testing.md +++ b/docs/how-to/fuzz-testing.md @@ -75,6 +75,34 @@ clock to earn its keep. Run it ad-hoc after touching `crdt.rs`, `extract.rs`, or the sync payload path. If it ever moves to CI, a scheduled (not per-push) workflow with a persistent corpus is the right shape. 
+## Findings so far + +The first runs paid for themselves. Tier 1 proptests found two reachable +panics on user input, both fixed in the same change: + +- **`datespec::parse_offset`** panicked on a large relative offset (e.g. + `+999999999999d`) because chrono's `+` overflows; now uses checked + arithmetic and returns an out-of-range error. +- **`datespec::parse_month_day`** sliced a token on a non-char boundary for + multibyte input (e.g. an `every ` phrase containing `𐻂`); now + takes the first three *chars*. + +Tier 2 (`crdt_merge`) surfaced **robustness gaps in `yrs` 0.27 on malformed +update bytes**, reachable through the authenticated `/sync/push` path: + +- a tiny delta `[255, 255, 255, 126]` triggers a huge allocation → **OOM**; +- some inputs trip a `debug_assert!` in the yrs block decoder (unwinding + panic — contained by the `catch_unwind` in `merge_body`); +- at least one class hits genuine UB (an invalid `char`) → `SIGABRT` under + debug UB-checks, silent UB in release. + +These are not fully fixable in-tree: `yrs` exposes no pre-apply validator, and +the OOM/abort classes are uncatchable. The blast radius is limited (the sync +endpoint is authenticated), but a buggy or hostile authenticated peer can still +crash a daemon. The `catch_unwind` in `merge_body` is partial mitigation; +durable fixes need upstream `yrs` work or a bounded decoder. Until then this is +a known limitation, tracked here and reproduced by the `crdt_merge` target. 
+ ## Why these targets The high-value surfaces, ranked when this was set up: diff --git a/docs/reference/reference.md b/docs/reference/reference.md index a6eed8f..a9d7730 100644 --- a/docs/reference/reference.md +++ b/docs/reference/reference.md @@ -58,6 +58,7 @@ Technical reference material for the repository tooling that ships with this pro | `mise run docs-check-links` | Validate wiki-links against existing doc filenames | | `mise run docs-mikado` | Inspect active Mikado chains and resume C2 work | | `mise run docs-preview ` | Extract and serve a released docs tarball locally | +| `mise run fuzz [seconds] [target]` | Run the nightly cargo-fuzz targets briefly — see [[fuzz-testing]] | | `mise run import-todoist` | Seed a heph store from Todoist (dry-run by default; `-- --commit` to write) — see [[import-todoist]] | | `mise run mikado-branch-invariant-check` | Validate `mikado/*` branch commit discipline | | `mise run pr-comments ` | List unresolved PR comments | diff --git a/mise-tasks/fuzz b/mise-tasks/fuzz new file mode 100755 index 0000000..1a5597b --- /dev/null +++ b/mise-tasks/fuzz @@ -0,0 +1,43 @@ +#!/usr/bin/env bash +#MISE description="Run the cargo-fuzz targets briefly (nightly). Usage: mise run fuzz [seconds] [target]" + +# Tier 2 fuzzing (see docs/how-to/fuzz-testing.md). Nightly-only and ad-hoc — +# not part of `cargo test` or CI. Runs each libFuzzer target for a bounded time +# so it terminates; the corpus under fuzz/corpus/ persists between runs. +# +# mise run fuzz # all targets, 60s each +# mise run fuzz 300 # all targets, 5 min each +# mise run fuzz 600 crdt_merge # one target, 10 min + +set -euo pipefail + +ROOT="$(cd "$(dirname "$0")/.." && pwd)" +FUZZ_DIR="$ROOT/crates/heph-core/fuzz" + +SECONDS_PER="${1:-60}" +ONLY="${2:-}" + +if ! command -v cargo-fuzz >/dev/null 2>&1 && ! cargo +nightly fuzz --version >/dev/null 2>&1; then + echo "cargo-fuzz not found. 
Install with:" >&2 + echo " rustup toolchain install nightly && cargo install cargo-fuzz" >&2 + exit 1 +fi + +if [[ -n "$ONLY" ]]; then + targets=("$ONLY") +else + # No `mapfile` — macOS ships bash 3.2. Target names are bare words. + # shellcheck disable=SC2207 + targets=($(cargo +nightly fuzz list --fuzz-dir "$FUZZ_DIR")) +fi + +rc=0 +for t in "${targets[@]}"; do + echo "=== fuzzing $t for ${SECONDS_PER}s ===" + if ! cargo +nightly fuzz run "$t" --fuzz-dir "$FUZZ_DIR" -- -max_total_time="$SECONDS_PER"; then + echo "!!! $t produced a crash — artifact in $FUZZ_DIR/artifacts/$t/" >&2 + rc=1 + fi +done + +exit "$rc"