From 1995e0e3cf556555ab910a520e2a46e31e77390c Mon Sep 17 00:00:00 2001
From: Erich Blume
Date: Sun, 31 May 2026 18:56:59 -0700
Subject: [PATCH] heph-core: markdown extraction (wiki-links + checkboxes)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Slice 2 (tech-spec §5). Pure, deterministic derivation from a body:

- `[[wiki-links]]` → wiki-link targets, in first-seen order, deduped,
  honoring `[[target|display]]`. Scans the raw body (CommonMark mangles
  `[[ ]]` brackets in inline parsing) and excludes matches inside code,
  whose byte ranges come from pulldown-cmark's offset iterator. A link
  must open and close on one line, so a stray `[[` cannot pair with a
  `]]` inside a later code block or swallow a later link.
- GFM `- [ ]` / `- [x]` task items → the local context-item index
  (Fork A): label keeps raw markdown (for promotion) + checked state.
  Nested task items are all kept, in document order.
- Code blocks are correctly skipped for both.

13 extraction unit tests incl. idempotency; 17 total.

Co-Authored-By: Claude Opus 4.8 (1M context)
---
 Cargo.lock                               |  24 +++
 Cargo.toml                               |   1 +
 crates/heph-core/Cargo.toml              |   1 +
 crates/heph-core/src/extract.rs          | 294 +++++++++++++++++++++++
 crates/heph-core/src/lib.rs              |   2 +
 docs/changelog.d/v1-prototype.feature.md |   6 +-
 6 files changed, 327 insertions(+), 1 deletion(-)
 create mode 100644 crates/heph-core/src/extract.rs

diff --git a/Cargo.lock b/Cargo.lock
index 766cb10..f36f4a6 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -118,6 +118,7 @@ dependencies = [
 name = "heph-core"
 version = "0.0.0"
 dependencies = [
+ "pulldown-cmark",
  "rusqlite",
  "thiserror",
  "ulid",
@@ -152,6 +153,12 @@ dependencies = [
  "vcpkg",
 ]
 
+[[package]]
+name = "memchr"
+version = "2.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6b947ae49db0d222b1dbc6b113ce7248a3fc3a6ca21b696717bfc000ba4484d8"
+
 [[package]]
 name = "once_cell"
 version = "1.21.4"
@@ -188,6 +195,17 @@ dependencies = [
  "unicode-ident",
 ]
 
+[[package]]
+name = "pulldown-cmark"
+version = "0.13.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e9f068eba8e7071c5f9511831b44f32c740d5adf574e990f946ddb53db2f314e"
+dependencies = [
+ "bitflags",
+ "memchr",
+ "unicase",
+]
+
 [[package]]
 name = "quote"
 version = "1.0.45"
@@ -311,6 +329,12 @@ dependencies = [
  "web-time",
 ]
 
+[[package]]
+name = "unicase"
+version = "2.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dbc4bc3a9f746d862c45cb89d705aa10f187bb96c76001afab07a0d35ce60142"
+
 [[package]]
 name = "unicode-ident"
 version = "1.0.24"
diff --git a/Cargo.toml b/Cargo.toml
index ee065a0..38ddb1b 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -15,6 +15,7 @@ rusqlite = { version = "0.32", features = ["bundled"] }
 ulid = "1"
 thiserror = "2"
 anyhow = "1"
+pulldown-cmark = { version = "0.13", default-features = false }
 
 [profile.release]
 lto = "thin"
diff --git a/crates/heph-core/Cargo.toml b/crates/heph-core/Cargo.toml
index f70dc26..803eae7 100644
--- a/crates/heph-core/Cargo.toml
+++ b/crates/heph-core/Cargo.toml
@@ -12,3 +12,4 @@ rust-version.workspace = true
 rusqlite.workspace = true
 ulid.workspace = true
 thiserror.workspace = true
+pulldown-cmark.workspace = true
diff --git a/crates/heph-core/src/extract.rs b/crates/heph-core/src/extract.rs
new file mode 100644
index 0000000..cf6f4e4
--- /dev/null
+++ b/crates/heph-core/src/extract.rs
@@ -0,0 +1,294 @@
+//! Markdown derivation (tech-spec §5).
+//!
+//! From a node's body we derive two things, purely and deterministically:
+//!
+//! - **`[[wiki-links]]`** → `wiki` link targets (resolved to nodes later, via
+//!   `aliases`/title; unresolved targets are allowed and recorded).
+//! - **GFM task-list items** (`- [ ]` / `- [x]`) → the **local context-item
+//!   index** (Fork A, [[design]] §6.3). The `[ ]`/`[x]` marker *is* the item's
+//!   only state; this index is derived per replica, never synced.
+//!
+//! Derivation is **idempotent**: the same body always yields the same
+//! [`Extraction`]. Code blocks are skipped (a `- [ ]` inside a fenced block is
+//! not a task; a `[[link]]` inside one is not a link), which is why this goes
+//! through a real CommonMark parser rather than a line scan.
+
+use std::collections::HashSet;
+use std::ops::Range;
+
+use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
+
+/// A context-item line derived from a body (Fork A).
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct ContextItem {
+    /// The visible label text (markers and surrounding whitespace stripped).
+    pub text: String,
+    /// `true` for `- [x]` (not-outstanding), `false` for `- [ ]` (outstanding).
+    pub checked: bool,
+}
+
+/// Everything derived from a single body.
+#[derive(Debug, Clone, Default, PartialEq, Eq)]
+pub struct Extraction {
+    /// Wiki-link targets, in first-seen document order, de-duplicated.
+    pub wiki_links: Vec<String>,
+    /// Context items, in document order.
+    pub context_items: Vec<ContextItem>,
+}
+
+/// Derive [`Extraction`] from a markdown body.
+///
+/// Task items may nest: each open list item is tracked on a stack, so a
+/// sub-task never displaces its parent and text is credited only to the
+/// innermost open item. Items are emitted in document order.
+pub fn extract(body: &str) -> Extraction {
+    let mut options = Options::empty();
+    options.insert(Options::ENABLE_TASKLISTS);
+
+    let mut context_items: Vec<ContextItem> = Vec::new();
+    // Byte ranges covered by code (fenced/indented blocks and inline spans).
+    // Wiki-links found inside these are not links.
+    let mut code_ranges: Vec<Range<usize>> = Vec::new();
+    // Non-zero while inside a fenced/indented code block; text seen there is code.
+    let mut code_depth: u32 = 0;
+    // One entry per open list item, innermost last: `Some(index into
+    // context_items)` for a task item, `None` for a plain bullet.
+    let mut open_items: Vec<Option<usize>> = Vec::new();
+
+    for (event, range) in Parser::new_ext(body, options).into_offset_iter() {
+        match event {
+            Event::Start(Tag::CodeBlock(_)) => code_depth += 1,
+            Event::End(TagEnd::CodeBlock) => code_depth = code_depth.saturating_sub(1),
+
+            Event::Start(Tag::Item) => open_items.push(None),
+            Event::TaskListMarker(checked) => {
+                // Reserve the slot now so a parent precedes its sub-tasks.
+                if let Some(slot) = open_items.last_mut() {
+                    *slot = Some(context_items.len());
+                    context_items.push(ContextItem {
+                        text: String::new(),
+                        checked,
+                    });
+                }
+            }
+            Event::End(TagEnd::Item) => {
+                if let Some(Some(index)) = open_items.pop() {
+                    let label = &mut context_items[index].text;
+                    *label = label.trim().to_string();
+                }
+            }
+
+            Event::Text(text) => {
+                if code_depth > 0 {
+                    code_ranges.push(range);
+                }
+                if let Some(label) = innermost_label(&open_items, &mut context_items) {
+                    label.push_str(&text);
+                }
+            }
+            // Inline code is part of an item's visible label, but its contents
+            // are never a wiki-link source.
+            Event::Code(code) => {
+                code_ranges.push(range);
+                if let Some(label) = innermost_label(&open_items, &mut context_items) {
+                    label.push_str(&code);
+                }
+            }
+            Event::SoftBreak | Event::HardBreak => {
+                if let Some(label) = innermost_label(&open_items, &mut context_items) {
+                    label.push(' ');
+                }
+            }
+            _ => {}
+        }
+    }
+
+    // Scan the raw body for wiki-links (CommonMark mangles `[[ ]]` brackets, so
+    // we can't rely on Text events), excluding any that start inside code.
+    let wiki_links = scan_wiki_links(body, &code_ranges);
+
+    Extraction {
+        wiki_links,
+        context_items,
+    }
+}
+
+/// The label of the innermost open list item, if that item is a task item.
+fn innermost_label<'a>(
+    open_items: &[Option<usize>],
+    context_items: &'a mut [ContextItem],
+) -> Option<&'a mut String> {
+    let index = open_items.last().copied().flatten()?;
+    context_items.get_mut(index).map(|item| &mut item.text)
+}
+
+/// Find `[[target]]` (or `[[target|display]]`) spans in `body`, returning each
+/// unique, non-empty target in first-seen order.
+///
+/// A link must open and close on one line, and the innermost `[[` before a
+/// `]]` is the one that opens it, so a stray `[[` can neither reach into a
+/// later code block nor swallow a later, well-formed link. Matches starting
+/// inside a `code_ranges` entry are skipped. The `[` / `]` delimiters and the
+/// newline are ASCII, so byte indexing stays on char boundaries.
+fn scan_wiki_links(body: &str, code_ranges: &[Range<usize>]) -> Vec<String> {
+    let mut out: Vec<String> = Vec::new();
+    let mut seen: HashSet<String> = HashSet::new();
+    let mut cursor = 0;
+    while let Some(offset) = body[cursor..].find("[[") {
+        let open = cursor + offset;
+        let inner_start = open + 2;
+        let close = match body[inner_start..].find("]]") {
+            Some(close) => close,
+            // Unterminated `[[` — nothing more to find.
+            None => break,
+        };
+        let inner = &body[inner_start..inner_start + close];
+        if let Some(newline) = inner.find('\n') {
+            // This `[[` cannot close on its own line; resume on the next line.
+            cursor = inner_start + newline + 1;
+            continue;
+        }
+        if let Some(reopen) = inner.rfind("[[") {
+            // A later `[[` sits before the same `]]`; let it open the link.
+            cursor = inner_start + reopen;
+            continue;
+        }
+        cursor = inner_start + close + 2;
+        if code_ranges.iter().any(|r| r.contains(&open)) {
+            continue;
+        }
+        // `[[target|display]]` — the target is the left side.
+        let target = inner.split('|').next().unwrap_or("").trim();
+        if !target.is_empty() && seen.insert(target.to_string()) {
+            out.push(target.to_string());
+        }
+    }
+    out
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn links(body: &str) -> Vec<String> {
+        extract(body).wiki_links
+    }
+
+    fn items(body: &str) -> Vec<ContextItem> {
+        extract(body).context_items
+    }
+
+    #[test]
+    fn extracts_simple_wiki_links_in_order() {
+        assert_eq!(
+            links("See [[Roof]] then [[Contractor calls]]."),
+            vec!["Roof".to_string(), "Contractor calls".to_string()]
+        );
+    }
+
+    #[test]
+    fn wiki_link_target_is_left_of_pipe() {
+        assert_eq!(links("[[borgmatic|Borgmatic backups]]"), vec!["borgmatic"]);
+    }
+
+    #[test]
+    fn wiki_links_are_deduplicated_first_seen_order() {
+        assert_eq!(
+            links("[[A]] [[B]] [[A]] [[a]]"),
+            vec!["A".to_string(), "B".to_string(), "a".to_string()]
+        );
+    }
+
+    #[test]
+    fn empty_and_unterminated_wiki_links_are_ignored() {
+        assert!(links("[[]] and [[ ]] and [[oops").is_empty());
+    }
+
+    #[test]
+    fn wiki_links_inside_code_are_not_extracted() {
+        let body = "real [[Keep]]\n\n```\nnot [[Skip]] here\n```\n";
+        assert_eq!(links(body), vec!["Keep"]);
+    }
+
+    #[test]
+    fn wiki_link_must_close_on_its_own_line() {
+        // The stray `[[oops` must not pair with the `]]` inside the fence.
+        let body = "[[oops\n\n```\n[[Skip]]\n```\n\nthen [[Real]]\n";
+        assert_eq!(links(body), vec!["Real"]);
+    }
+
+    #[test]
+    fn stray_opening_brackets_do_not_swallow_a_later_link() {
+        assert_eq!(links("[[oops and [[Real]]"), vec!["Real"]);
+    }
+
+    #[test]
+    fn extracts_checkbox_items_with_state() {
+        let body = "- [ ] feed birds\n- [x] brush teeth\n";
+        assert_eq!(
+            items(body),
+            vec![
+                ContextItem {
+                    text: "feed birds".to_string(),
+                    checked: false
+                },
+                ContextItem {
+                    text: "brush teeth".to_string(),
+                    checked: true
+                },
+            ]
+        );
+    }
+
+    #[test]
+    fn checkbox_inside_code_block_is_not_an_item() {
+        let body = "- [ ] real item\n\n```\n- [ ] not an item\n```\n";
+        assert_eq!(items(body).len(), 1);
+        assert_eq!(items(body)[0].text, "real item");
+    }
+
+    #[test]
+    fn nested_checkbox_items_are_kept_in_document_order() {
+        let body = "- [ ] parent\n  - [x] child\n- [ ] sibling\n";
+        assert_eq!(
+            items(body),
+            vec![
+                ContextItem {
+                    text: "parent".to_string(),
+                    checked: false
+                },
+                ContextItem {
+                    text: "child".to_string(),
+                    checked: true
+                },
+                ContextItem {
+                    text: "sibling".to_string(),
+                    checked: false
+                },
+            ]
+        );
+    }
+
+    #[test]
+    fn checkbox_item_can_carry_a_wiki_link() {
+        // A checkbox line is both a context item and a wiki-link source. The
+        // item label keeps the raw markdown (`[[...]]` intact) so promotion can
+        // locate and rewrite the source line later (Fork A, §6).
+        let e = extract("- [ ] call [[Contractor]] back");
+        assert_eq!(e.wiki_links, vec!["Contractor"]);
+        assert_eq!(e.context_items.len(), 1);
+        assert_eq!(e.context_items[0].text, "call [[Contractor]] back");
+        assert!(!e.context_items[0].checked);
+    }
+
+    #[test]
+    fn extraction_is_idempotent() {
+        let body = "# Mixed\n\n- [ ] do [[X]]\n- [x] done\n\nsee [[Y]]\n";
+        assert_eq!(extract(body), extract(body));
+    }
+
+    #[test]
+    fn body_without_links_or_items_yields_empty() {
+        assert_eq!(extract("just prose, no structure"), Extraction::default());
+    }
+}
diff --git a/crates/heph-core/src/lib.rs b/crates/heph-core/src/lib.rs
index 566774d..41eea60 100644
--- a/crates/heph-core/src/lib.rs
+++ b/crates/heph-core/src/lib.rs
@@ -10,12 +10,14 @@
 
 pub mod clock;
 pub mod error;
+pub mod extract;
 pub mod model;
 pub mod sqlite;
 pub mod store;
 
 pub use clock::{Clock, FixedClock};
 pub use error::{Error, Result};
+pub use extract::{extract, ContextItem, Extraction};
 pub use model::{NewNode, Node, NodeKind};
 pub use sqlite::LocalStore;
 pub use store::Store;
diff --git a/docs/changelog.d/v1-prototype.feature.md b/docs/changelog.d/v1-prototype.feature.md
index df5c325..8c80911 100644
--- a/docs/changelog.d/v1-prototype.feature.md
+++ b/docs/changelog.d/v1-prototype.feature.md
@@ -1 +1,5 @@
-Begin the v1 prototype (Phase 1): scaffold the Cargo workspace and the `heph-core` crate with a migration-run SQLite schema (tech-spec §4.5), a clock-injected `Store` trait, and a `LocalStore` backend with node create/get. CI now runs the Rust suite (fmt/clippy/test) via the project build hook.
+Begin the v1 prototype (Phase 1, tech-spec §11.1), built in TDD slices:
+
+- Cargo workspace + `heph-core` crate; migration-run SQLite schema (§4.5); clock-injected `Store` trait + `LocalStore` node create/get; single local-user bootstrap.
+- Markdown extraction (§5): `[[wiki-links]]` and GFM `- [ ]` checkbox context-items derived purely and idempotently from a body, skipping code blocks.
+- CI runs the Rust suite (fmt/clippy/test) via the project build hook.