heph-core: markdown extraction (wiki-links + checkboxes)
Some checks failed
Build / validate (pull_request) Failing after 3s

Slice 2 (tech-spec §5). Pure, deterministic derivation from a body:

- `[[wiki-links]]` → wiki-link targets, in first-seen order, deduped,
  honoring `[[target|display]]`. Scans the raw body (CommonMark mangles
  `[[ ]]` brackets in inline parsing) and excludes matches inside code,
  whose byte ranges come from pulldown-cmark's offset iterator.
- GFM `- [ ]` / `- [x]` task items → the local context-item index
  (Fork A): label keeps raw markdown (for promotion) + checked state.
- Code blocks are correctly skipped for both.

10 extraction unit tests incl. idempotency; 14 total green.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Erich Blume 2026-05-31 18:56:59 -07:00
commit 1995e0e3cf
6 changed files with 264 additions and 1 deletions

24
Cargo.lock generated
View file

@ -118,6 +118,7 @@ dependencies = [
name = "heph-core"
version = "0.0.0"
dependencies = [
"pulldown-cmark",
"rusqlite",
"thiserror",
"ulid",
@ -152,6 +153,12 @@ dependencies = [
"vcpkg",
]
[[package]]
name = "memchr"
version = "2.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6b947ae49db0d222b1dbc6b113ce7248a3fc3a6ca21b696717bfc000ba4484d8"
[[package]]
name = "once_cell"
version = "1.21.4"
@ -188,6 +195,17 @@ dependencies = [
"unicode-ident",
]
[[package]]
name = "pulldown-cmark"
version = "0.13.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e9f068eba8e7071c5f9511831b44f32c740d5adf574e990f946ddb53db2f314e"
dependencies = [
"bitflags",
"memchr",
"unicase",
]
[[package]]
name = "quote"
version = "1.0.45"
@ -311,6 +329,12 @@ dependencies = [
"web-time",
]
[[package]]
name = "unicase"
version = "2.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dbc4bc3a9f746d862c45cb89d705aa10f187bb96c76001afab07a0d35ce60142"
[[package]]
name = "unicode-ident"
version = "1.0.24"

View file

@ -15,6 +15,7 @@ rusqlite = { version = "0.32", features = ["bundled"] }
ulid = "1"
thiserror = "2"
anyhow = "1"
pulldown-cmark = { version = "0.13", default-features = false }
[profile.release]
lto = "thin"

View file

@ -12,3 +12,4 @@ rust-version.workspace = true
rusqlite.workspace = true
ulid.workspace = true
thiserror.workspace = true
pulldown-cmark.workspace = true

View file

@ -0,0 +1,231 @@
//! Markdown derivation (tech-spec §5).
//!
//! From a node's body we derive two things, purely and deterministically:
//!
//! - **`[[wiki-links]]`** → `wiki` link targets (resolved to nodes later, via
//! `aliases`/title; unresolved targets are allowed and recorded).
//! - **GFM task-list items** (`- [ ]` / `- [x]`) → the **local context-item
//! index** (Fork A, [[design]] §6.3). The `[ ]`/`[x]` marker *is* the item's
//! only state; this index is derived per replica, never synced.
//!
//! Derivation is **idempotent**: the same body always yields the same
//! [`Extraction`]. Code is skipped: a `- [ ]` inside a fenced block is not a
//! task, and a `[[link]]` inside a code block or inline code span is not a
//! link. Task items come from a real CommonMark parser rather than a line
//! scan; wiki-links are scanned from the raw body, using the parser's byte
//! offsets to exclude code.
use std::collections::HashSet;
use std::ops::Range;
use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
/// One GFM task-list line (`- [ ]` / `- [x]`) found in a body: a "context
/// item" in Fork A terms.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ContextItem {
    /// The item's label, with the checkbox marker removed and leading and
    /// trailing whitespace trimmed.
    pub text: String,
    /// Checkbox state: `false` for `- [ ]` (still outstanding), `true` for
    /// `- [x]` (no longer outstanding).
    pub checked: bool,
}
/// The complete result of deriving from one body.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct Extraction {
    /// Targets of the `[[wiki-links]]` in the body. Each target appears once,
    /// at the position where it was first encountered.
    pub wiki_links: Vec<String>,
    /// Task-list items of the body, ordered as they occur in the document.
    pub context_items: Vec<ContextItem>,
}
/// Derive [`Extraction`] from a markdown body.
///
/// The body is parsed once with pulldown-cmark (task lists enabled) to collect
/// task-list items and the byte ranges occupied by code. The raw body is then
/// scanned for `[[wiki-links]]`, ignoring any that start inside code.
///
/// Task items are reported in document order, i.e. the order of their checkbox
/// markers. Lists may nest: every open list item is tracked separately, so a
/// nested `- [ ]` does not discard its parent task, and text belonging to a
/// nested item is never appended to the parent's label.
///
/// A label is the concatenation of the item's text and inline-code contents,
/// with soft/hard line breaks turned into single spaces and the result
/// trimmed.
pub fn extract(body: &str) -> Extraction {
    let mut options = Options::empty();
    options.insert(Options::ENABLE_TASKLISTS);

    let mut context_items: Vec<ContextItem> = Vec::new();
    // Byte ranges covered by code (fenced/indented blocks and inline spans).
    // Wiki-links found inside these are not links.
    let mut code_ranges: Vec<Range<usize>> = Vec::new();
    // Non-zero while inside a code block; text seen then is code.
    let mut code_depth: u32 = 0;
    // One entry per currently open list item, innermost last. `Some(idx)` marks
    // a task item whose label is being accumulated in `context_items[idx]`;
    // `None` marks a plain (non-task) list item. A single "current item" slot
    // is not enough: a nested item would clobber or prematurely flush its
    // parent.
    let mut open_items: Vec<Option<usize>> = Vec::new();

    for (event, range) in Parser::new_ext(body, options).into_offset_iter() {
        match event {
            Event::Start(Tag::CodeBlock(_)) => code_depth += 1,
            Event::End(TagEnd::CodeBlock) => code_depth = code_depth.saturating_sub(1),
            Event::Start(Tag::Item) => open_items.push(None),
            Event::TaskListMarker(checked) => {
                if let Some(slot) = open_items.last_mut() {
                    // Reserve the output slot now so that a parent task is
                    // listed before any tasks nested inside it.
                    *slot = Some(context_items.len());
                    context_items.push(ContextItem {
                        text: String::new(),
                        checked,
                    });
                }
            }
            Event::End(TagEnd::Item) => {
                if let Some(Some(idx)) = open_items.pop() {
                    let trimmed = context_items[idx].text.trim().to_string();
                    context_items[idx].text = trimmed;
                }
            }
            Event::Text(text) => {
                if code_depth > 0 {
                    code_ranges.push(range);
                }
                if let Some(&Some(idx)) = open_items.last() {
                    context_items[idx].text.push_str(&text);
                }
            }
            // Inline code is part of an item's visible label, but its contents
            // are never a wiki-link source.
            Event::Code(code) => {
                code_ranges.push(range);
                if let Some(&Some(idx)) = open_items.last() {
                    context_items[idx].text.push_str(&code);
                }
            }
            Event::SoftBreak | Event::HardBreak => {
                if let Some(&Some(idx)) = open_items.last() {
                    context_items[idx].text.push(' ');
                }
            }
            _ => {}
        }
    }

    // Scan the raw body for wiki-links (CommonMark mangles `[[ ]]` brackets, so
    // we can't rely on Text events), excluding any that start inside code.
    let wiki_links = scan_wiki_links(body, &code_ranges);

    Extraction {
        wiki_links,
        context_items,
    }
}
/// Find `[[target]]` (or `[[target|display]]`) spans in `body`, returning each
/// unique, non-empty target in first-seen order.
///
/// `code_ranges` are byte ranges of `body` that are code. A `[[` that starts
/// inside one is not an opener at all: it is stepped over without looking for
/// a closing `]]`, so it cannot swallow a real link that follows the code.
///
/// If another `[[` appears before the closing `]]`, the earlier opener is
/// treated as unterminated and scanning resumes at the innermost one, so
/// `[[oops and [[Real]]` yields `Real` rather than `oops and [[Real`.
///
/// For `[[target|display]]` the target is the text left of the first `|`.
/// Targets are trimmed, and compared case-sensitively when de-duplicating.
///
/// The `[` / `]` delimiters are ASCII, so every offset used for slicing stays
/// on a char boundary.
fn scan_wiki_links(body: &str, code_ranges: &[Range<usize>]) -> Vec<String> {
    let mut out: Vec<String> = Vec::new();
    // Borrowed slices of `body`: de-duplication needs no extra allocation.
    let mut seen: HashSet<&str> = HashSet::new();
    let mut cursor = 0;

    while let Some(offset) = body[cursor..].find("[[") {
        let open = cursor + offset;

        if code_ranges.iter().any(|r| r.contains(&open)) {
            // Step past this single bracket only; the next candidate opener is
            // checked against the code ranges on its own.
            cursor = open + 1;
            continue;
        }

        let inner_start = open + 2;
        let close = match body[inner_start..].find("]]") {
            Some(close) => close,
            // No `]]` anywhere after this point: nothing more to find.
            None => break,
        };
        let inner = &body[inner_start..inner_start + close];

        if let Some(nested) = inner.rfind("[[") {
            // The opener at `open` never closed before a new one began.
            // Restart at the innermost opener (always > `open`, so the loop
            // makes progress).
            cursor = inner_start + nested;
            continue;
        }

        // `[[target|display]]`: the target is the left side.
        let target = inner.split('|').next().unwrap_or("").trim();
        if !target.is_empty() && seen.insert(target) {
            out.push(target.to_owned());
        }
        cursor = inner_start + close + 2;
    }

    out
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Wiki-link targets derived from `body`.
    fn links(body: &str) -> Vec<String> {
        let Extraction { wiki_links, .. } = extract(body);
        wiki_links
    }

    /// Context items derived from `body`.
    fn items(body: &str) -> Vec<ContextItem> {
        let Extraction { context_items, .. } = extract(body);
        context_items
    }

    /// Shorthand constructor for an expected item.
    fn item(text: &str, checked: bool) -> ContextItem {
        ContextItem {
            text: text.to_owned(),
            checked,
        }
    }

    #[test]
    fn extracts_simple_wiki_links_in_order() {
        let found = links("See [[Roof]] then [[Contractor calls]].");
        assert_eq!(found, ["Roof", "Contractor calls"]);
    }

    #[test]
    fn wiki_link_target_is_left_of_pipe() {
        let found = links("[[borgmatic|Borgmatic backups]]");
        assert_eq!(found, ["borgmatic"]);
    }

    #[test]
    fn wiki_links_are_deduplicated_first_seen_order() {
        // De-duplication is case-sensitive: `A` and `a` are distinct targets.
        let found = links("[[A]] [[B]] [[A]] [[a]]");
        assert_eq!(found, ["A", "B", "a"]);
    }

    #[test]
    fn empty_and_unterminated_wiki_links_are_ignored() {
        let found = links("[[]] and [[ ]] and [[oops");
        assert_eq!(found, Vec::<String>::new());
    }

    #[test]
    fn wiki_links_inside_code_are_not_extracted() {
        let found = links("real [[Keep]]\n\n```\nnot [[Skip]] here\n```\n");
        assert_eq!(found, ["Keep"]);
    }

    #[test]
    fn extracts_checkbox_items_with_state() {
        let found = items("- [ ] feed birds\n- [x] brush teeth\n");
        let expected = [item("feed birds", false), item("brush teeth", true)];
        assert_eq!(found, expected);
    }

    #[test]
    fn checkbox_inside_code_block_is_not_an_item() {
        let found = items("- [ ] real item\n\n```\n- [ ] not an item\n```\n");
        assert_eq!(found.len(), 1);
        assert_eq!(found[0].text, "real item");
    }

    #[test]
    fn checkbox_item_can_carry_a_wiki_link() {
        // One checkbox line feeds both outputs. The `[[...]]` brackets stay in
        // the item label so that promotion can later find and rewrite the
        // source line (Fork A, §6).
        let Extraction {
            wiki_links,
            context_items,
        } = extract("- [ ] call [[Contractor]] back");
        assert_eq!(wiki_links, ["Contractor"]);
        assert_eq!(context_items, [item("call [[Contractor]] back", false)]);
    }

    #[test]
    fn extraction_is_idempotent() {
        let body = "# Mixed\n\n- [ ] do [[X]]\n- [x] done\n\nsee [[Y]]\n";
        let first = extract(body);
        let second = extract(body);
        assert_eq!(first, second);
    }

    #[test]
    fn body_without_links_or_items_yields_empty() {
        let derived = extract("just prose, no structure");
        assert_eq!(derived, Extraction::default());
    }
}

View file

@ -10,12 +10,14 @@
pub mod clock;
pub mod error;
pub mod extract;
pub mod model;
pub mod sqlite;
pub mod store;
pub use clock::{Clock, FixedClock};
pub use error::{Error, Result};
pub use extract::{extract, ContextItem, Extraction};
pub use model::{NewNode, Node, NodeKind};
pub use sqlite::LocalStore;
pub use store::Store;

View file

@ -1 +1,5 @@
Begin the v1 prototype (Phase 1): scaffold the Cargo workspace and the `heph-core` crate with a migration-run SQLite schema (tech-spec §4.5), a clock-injected `Store` trait, and a `LocalStore` backend with node create/get. CI now runs the Rust suite (fmt/clippy/test) via the project build hook.
Begin the v1 prototype (Phase 1, tech-spec §11.1), built in TDD slices:
- Cargo workspace + `heph-core` crate; migration-run SQLite schema (§4.5); clock-injected `Store` trait + `LocalStore` node create/get; single local-user bootstrap.
- Markdown extraction (§5): `[[wiki-links]]` and GFM `- [ ]` checkbox context-items derived purely and idempotently from a body, skipping code blocks.
- CI runs the Rust suite (fmt/clippy/test) via the project build hook.