generated from eblume/project-template
heph-core: markdown extraction (wiki-links + checkboxes)
Some checks failed
Build / validate (pull_request) Failing after 3s
Some checks failed
Build / validate (pull_request) Failing after 3s
Slice 2 (tech-spec §5). Pure, deterministic derivation from a body: - `[[wiki-links]]` → wiki-link targets, in first-seen order, deduped, honoring `[[target|display]]`. Scans the raw body (CommonMark mangles `[[ ]]` brackets in inline parsing) and excludes matches inside code, whose byte ranges come from pulldown-cmark's offset iterator. - GFM `- [ ]` / `- [x]` task items → the local context-item index (Fork A): label keeps raw markdown (for promotion) + checked state. - Code blocks are correctly skipped for both. 10 extraction unit tests incl. idempotency; 14 total green. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
bbac338f76
commit
1995e0e3cf
6 changed files with 264 additions and 1 deletions
24
Cargo.lock
generated
24
Cargo.lock
generated
|
|
@ -118,6 +118,7 @@ dependencies = [
|
|||
name = "heph-core"
|
||||
version = "0.0.0"
|
||||
dependencies = [
|
||||
"pulldown-cmark",
|
||||
"rusqlite",
|
||||
"thiserror",
|
||||
"ulid",
|
||||
|
|
@ -152,6 +153,12 @@ dependencies = [
|
|||
"vcpkg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "memchr"
|
||||
version = "2.8.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6b947ae49db0d222b1dbc6b113ce7248a3fc3a6ca21b696717bfc000ba4484d8"
|
||||
|
||||
[[package]]
|
||||
name = "once_cell"
|
||||
version = "1.21.4"
|
||||
|
|
@ -188,6 +195,17 @@ dependencies = [
|
|||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pulldown-cmark"
|
||||
version = "0.13.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e9f068eba8e7071c5f9511831b44f32c740d5adf574e990f946ddb53db2f314e"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"memchr",
|
||||
"unicase",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.45"
|
||||
|
|
@ -311,6 +329,12 @@ dependencies = [
|
|||
"web-time",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unicase"
|
||||
version = "2.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dbc4bc3a9f746d862c45cb89d705aa10f187bb96c76001afab07a0d35ce60142"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-ident"
|
||||
version = "1.0.24"
|
||||
|
|
|
|||
|
|
@ -15,6 +15,7 @@ rusqlite = { version = "0.32", features = ["bundled"] }
|
|||
ulid = "1"
|
||||
thiserror = "2"
|
||||
anyhow = "1"
|
||||
pulldown-cmark = { version = "0.13", default-features = false }
|
||||
|
||||
[profile.release]
|
||||
lto = "thin"
|
||||
|
|
|
|||
|
|
@ -12,3 +12,4 @@ rust-version.workspace = true
|
|||
rusqlite.workspace = true
|
||||
ulid.workspace = true
|
||||
thiserror.workspace = true
|
||||
pulldown-cmark.workspace = true
|
||||
|
|
|
|||
231
crates/heph-core/src/extract.rs
Normal file
231
crates/heph-core/src/extract.rs
Normal file
|
|
@ -0,0 +1,231 @@
|
|||
//! Markdown derivation (tech-spec §5).
|
||||
//!
|
||||
//! From a node's body we derive two things, purely and deterministically:
|
||||
//!
|
||||
//! - **`[[wiki-links]]`** → `wiki` link targets (resolved to nodes later, via
|
||||
//! `aliases`/title; unresolved targets are allowed and recorded).
|
||||
//! - **GFM task-list items** (`- [ ]` / `- [x]`) → the **local context-item
|
||||
//! index** (Fork A, `[[design]]` §6.3). The `[ ]`/`[x]` marker *is* the item's
|
||||
//! only state; this index is derived per replica, never synced.
|
||||
//!
|
||||
//! Derivation is **idempotent**: the same body always yields the same
|
||||
//! [`Extraction`]. Code blocks are skipped (a `- [ ]` inside a fenced block is
|
||||
//! not a task; a `[[link]]` inside one is not a link), which is why this goes
|
||||
//! through a real CommonMark parser rather than a line scan.
|
||||
|
||||
use std::collections::HashSet;
|
||||
use std::ops::Range;
|
||||
|
||||
use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
|
||||
|
||||
/// A context-item line derived from a body (Fork A).
///
/// One value is produced per GFM task-list item (`- [ ]` / `- [x]`) that the
/// markdown parser reports while deriving an [`Extraction`].
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct ContextItem {
|
||||
/// The visible label text (markers and surrounding whitespace stripped).
///
/// Assembled from the item's text and inline-code events; soft and hard
/// line breaks each contribute a single space, and the result is trimmed.
/// `[[wiki-link]]` brackets stay in the label as written.
|
||||
pub text: String,
|
||||
/// `true` for `- [x]` (not-outstanding), `false` for `- [ ]` (outstanding).
///
/// Copied from the parser's task-list marker event for the item.
|
||||
pub checked: bool,
|
||||
}
|
||||
|
||||
/// Everything derived from a single body.
///
/// `Default` is the empty extraction (no links, no items), which is also what
/// a body without any wiki-links or task items yields.
|
||||
#[derive(Debug, Clone, Default, PartialEq, Eq)]
|
||||
pub struct Extraction {
|
||||
/// Wiki-link targets, in first-seen document order, de-duplicated.
///
/// A target is the trimmed text left of the first `|` inside `[[...]]`.
/// Empty targets are dropped; de-duplication is an exact string comparison,
/// so `A` and `a` are distinct targets.
|
||||
pub wiki_links: Vec<String>,
|
||||
/// Context items, in document order.
|
||||
pub context_items: Vec<ContextItem>,
|
||||
}
|
||||
|
||||
/// Derive [`Extraction`] from a markdown body.
|
||||
pub fn extract(body: &str) -> Extraction {
|
||||
let mut options = Options::empty();
|
||||
options.insert(Options::ENABLE_TASKLISTS);
|
||||
|
||||
let mut context_items: Vec<ContextItem> = Vec::new();
|
||||
// Byte ranges covered by code (fenced/indented blocks and inline spans).
|
||||
// Wiki-links found inside these are not links.
|
||||
let mut code_ranges: Vec<Range<usize>> = Vec::new();
|
||||
// Depth of nested code blocks; their inner text ranges are code.
|
||||
let mut code_depth: u32 = 0;
|
||||
// The task item currently being collected, if any: (checked, accumulated text).
|
||||
let mut current: Option<(bool, String)> = None;
|
||||
|
||||
for (event, range) in Parser::new_ext(body, options).into_offset_iter() {
|
||||
match event {
|
||||
Event::Start(Tag::CodeBlock(_)) => code_depth += 1,
|
||||
Event::End(TagEnd::CodeBlock) => code_depth = code_depth.saturating_sub(1),
|
||||
|
||||
Event::TaskListMarker(checked) => {
|
||||
current = Some((checked, String::new()));
|
||||
}
|
||||
Event::End(TagEnd::Item) => {
|
||||
if let Some((checked, text)) = current.take() {
|
||||
context_items.push(ContextItem {
|
||||
checked,
|
||||
text: text.trim().to_string(),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
Event::Text(text) => {
|
||||
if code_depth > 0 {
|
||||
code_ranges.push(range);
|
||||
}
|
||||
if let Some((_, label)) = current.as_mut() {
|
||||
label.push_str(&text);
|
||||
}
|
||||
}
|
||||
// Inline code is part of an item's visible label, but its contents
|
||||
// are never a wiki-link source.
|
||||
Event::Code(code) => {
|
||||
code_ranges.push(range);
|
||||
if let Some((_, label)) = current.as_mut() {
|
||||
label.push_str(&code);
|
||||
}
|
||||
}
|
||||
Event::SoftBreak | Event::HardBreak => {
|
||||
if let Some((_, label)) = current.as_mut() {
|
||||
label.push(' ');
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
// Scan the raw body for wiki-links (CommonMark mangles `[[ ]]` brackets, so
|
||||
// we can't rely on Text events), excluding any that start inside code.
|
||||
let wiki_links = scan_wiki_links(body, &code_ranges);
|
||||
|
||||
Extraction {
|
||||
wiki_links,
|
||||
context_items,
|
||||
}
|
||||
}
|
||||
|
||||
/// Find `[[target]]` (or `[[target|display]]`) spans in `body`, returning each
/// unique, non-empty target in first-seen order.
///
/// `code_ranges` are byte ranges of `body` that hold code. An opener (`[[`)
/// that starts inside one of them is not a link start; only that opener is
/// skipped, so it cannot consume a real link that follows the code.
///
/// A link runs from its opener to the nearest following `]]`. If another
/// non-code opener appears before that closer, the earlier opener is treated
/// as stray text and scanning resumes at the later one, so an unmatched `[[`
/// cannot swallow the next real link into its target.
///
/// The target is the trimmed text left of the first `|`. The `[` / `]`
/// delimiters are ASCII, so every byte offset used for slicing stays on a
/// char boundary. An opener with no `]]` anywhere after it ends the scan.
fn scan_wiki_links(body: &str, code_ranges: &[Range<usize>]) -> Vec<String> {
    const OPEN: &str = "[[";
    const CLOSE: &str = "]]";

    let in_code = |pos: usize| code_ranges.iter().any(|r| r.contains(&pos));

    let mut out: Vec<String> = Vec::new();
    // Borrowed slices of `body`: de-duplication allocates nothing extra.
    let mut seen: HashSet<&str> = HashSet::new();
    let mut cursor = 0;

    while let Some(found) = body[cursor..].find(OPEN) {
        let open = cursor + found;
        if in_code(open) {
            // Step past a single bracket only; whatever follows the code is
            // scanned normally.
            cursor = open + 1;
            continue;
        }

        let inner_start = open + OPEN.len();
        let inner_len = match body[inner_start..].find(CLOSE) {
            Some(len) => len,
            // Unterminated `[[` — nothing more to find.
            None => break,
        };
        let inner = &body[inner_start..inner_start + inner_len];

        // A later non-code opener before the closer supersedes this one.
        let nested = inner
            .match_indices(OPEN)
            .map(|(offset, _)| inner_start + offset)
            .find(|&pos| !in_code(pos));
        if let Some(pos) = nested {
            cursor = pos;
            continue;
        }

        // `[[target|display]]` — the target is the left side.
        let target = inner.split('|').next().unwrap_or("").trim();
        if !target.is_empty() && seen.insert(target) {
            out.push(target.to_string());
        }
        cursor = inner_start + inner_len + CLOSE.len();
    }

    out
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Wiki-link targets derived from `body`.
    fn targets_of(body: &str) -> Vec<String> {
        let Extraction { wiki_links, .. } = extract(body);
        wiki_links
    }

    /// Context items derived from `body`.
    fn tasks_of(body: &str) -> Vec<ContextItem> {
        let Extraction { context_items, .. } = extract(body);
        context_items
    }

    /// Shorthand for an expected context item.
    fn task(text: &str, checked: bool) -> ContextItem {
        ContextItem {
            text: text.to_string(),
            checked,
        }
    }

    #[test]
    fn extracts_simple_wiki_links_in_order() {
        let found = targets_of("See [[Roof]] then [[Contractor calls]].");
        assert_eq!(found, ["Roof", "Contractor calls"]);
    }

    #[test]
    fn wiki_link_target_is_left_of_pipe() {
        let found = targets_of("[[borgmatic|Borgmatic backups]]");
        assert_eq!(found, ["borgmatic"]);
    }

    #[test]
    fn wiki_links_are_deduplicated_first_seen_order() {
        // De-duplication is case-sensitive: `A` and `a` are both kept.
        let found = targets_of("[[A]] [[B]] [[A]] [[a]]");
        assert_eq!(found, ["A", "B", "a"]);
    }

    #[test]
    fn empty_and_unterminated_wiki_links_are_ignored() {
        let found = targets_of("[[]] and [[ ]] and [[oops");
        assert!(found.is_empty());
    }

    #[test]
    fn wiki_links_inside_code_are_not_extracted() {
        let found = targets_of("real [[Keep]]\n\n```\nnot [[Skip]] here\n```\n");
        assert_eq!(found, ["Keep"]);
    }

    #[test]
    fn extracts_checkbox_items_with_state() {
        let found = tasks_of("- [ ] feed birds\n- [x] brush teeth\n");
        let expected = vec![task("feed birds", false), task("brush teeth", true)];
        assert_eq!(found, expected);
    }

    #[test]
    fn checkbox_inside_code_block_is_not_an_item() {
        let body = "- [ ] real item\n\n```\n- [ ] not an item\n```\n";
        assert_eq!(tasks_of(body).len(), 1);
        assert_eq!(tasks_of(body)[0].text, "real item");
    }

    #[test]
    fn checkbox_item_can_carry_a_wiki_link() {
        // One checkbox line yields both a context item and a wiki-link. The
        // label keeps the raw `[[...]]` text so that promotion can later find
        // and rewrite the source line (Fork A, §6).
        let Extraction {
            wiki_links,
            context_items,
        } = extract("- [ ] call [[Contractor]] back");

        assert_eq!(wiki_links, vec!["Contractor"]);
        assert_eq!(context_items.len(), 1);
        assert_eq!(context_items[0].text, "call [[Contractor]] back");
        assert!(!context_items[0].checked);
    }

    #[test]
    fn extraction_is_idempotent() {
        let body = "# Mixed\n\n- [ ] do [[X]]\n- [x] done\n\nsee [[Y]]\n";
        let first = extract(body);
        let second = extract(body);
        assert_eq!(first, second);
    }

    #[test]
    fn body_without_links_or_items_yields_empty() {
        let derived = extract("just prose, no structure");
        assert_eq!(derived, Extraction::default());
    }
}
|
||||
|
|
@ -10,12 +10,14 @@
|
|||
|
||||
pub mod clock;
|
||||
pub mod error;
|
||||
pub mod extract;
|
||||
pub mod model;
|
||||
pub mod sqlite;
|
||||
pub mod store;
|
||||
|
||||
pub use clock::{Clock, FixedClock};
|
||||
pub use error::{Error, Result};
|
||||
pub use extract::{extract, ContextItem, Extraction};
|
||||
pub use model::{NewNode, Node, NodeKind};
|
||||
pub use sqlite::LocalStore;
|
||||
pub use store::Store;
|
||||
|
|
|
|||
|
|
@ -1 +1,5 @@
|
|||
Begin the v1 prototype (Phase 1): scaffold the Cargo workspace and the `heph-core` crate with a migration-run SQLite schema (tech-spec §4.5), a clock-injected `Store` trait, and a `LocalStore` backend with node create/get. CI now runs the Rust suite (fmt/clippy/test) via the project build hook.
|
||||
Begin the v1 prototype (Phase 1, tech-spec §11.1), built in TDD slices:
|
||||
|
||||
- Cargo workspace + `heph-core` crate; migration-run SQLite schema (§4.5); clock-injected `Store` trait + `LocalStore` node create/get; single local-user bootstrap.
|
||||
- Markdown extraction (§5): `[[wiki-links]]` and GFM `- [ ]` checkbox context-items derived purely and idempotently from a body, skipping code blocks.
|
||||
- CI runs the Rust suite (fmt/clippy/test) via the project build hook.
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue