hephaestus/crates/heph-core/src/extract.rs
Erich Blume b97c387252
Some checks failed
Build / validate (pull_request) Failing after 3s
heph.nvim: context-item promotion + Dagger headless-nvim CI (slice 11c)
Backend (TDD):
- task.promote {container_id, item_ref, attention?, project?}: mint a committed
  task from the item_ref-th `- [ ]` context item (1-based, document order via a
  new extract::context_item_lines) and rewrite that source line into a [[link]]
  to it. Unit + rpc_socket tests.
- resolve_id now excludes canonical-context docs, so [[Task Title]] resolves to
  the task, not its identically-titled context doc (deterministic; a general fix
  surfaced by promotion's ULID-tiebreak ambiguity).

Plugin: :Heph promote / promote_under_cursor (save-if-dirty → compute item index
with a code-fence-aware scanner mirroring extract.rs → task.promote → reload the
rewritten buffer). e2e spec (f): promote a context line, assert the new task in
next, the source line became a link, and the container backlinks the task.

CI via Dagger: a test_nvim function bakes a pinned, arch-detected Neovim
(v0.11.2 — Debian's is too old for vim.uv) onto rust:1-bookworm, builds hephd,
and runs the self-contained shim suite (cargo + target cache volumes);
build.yaml calls `dagger call test-nvim`. run.lua now fails on zero specs (no
false-green). Validated end-to-end: passing suite → exit 0, failing spec →
Dagger exit 1.

117 Rust tests + 7 nvim e2e specs green.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-02 06:08:41 -07:00

256 lines
9.2 KiB
Rust

//! Markdown derivation (tech-spec §5).
//!
//! From a node's body we derive two things, purely and deterministically:
//!
//! - **`[[wiki-links]]`** → `wiki` link targets (resolved to nodes later, via
//! `aliases`/title; unresolved targets are allowed and recorded).
//! - **GFM task-list items** (`- [ ]` / `- [x]`) → the **local context-item
//! index** (Fork A, [[design]] §6.3). The `[ ]`/`[x]` marker *is* the item's
//! only state; this index is derived per replica, never synced.
//!
//! Derivation is **idempotent**: the same body always yields the same
//! [`Extraction`]. Code blocks are skipped (a `- [ ]` inside a fenced block is
//! not a task; a `[[link]]` inside one is not a link), which is why this goes
//! through a real CommonMark parser rather than a line scan.
use std::collections::HashSet;
use std::ops::Range;
use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
/// A context-item line derived from a body (Fork A).
///
/// One value is produced per GFM task-list item (`- [ ]` / `- [x]`) that the
/// markdown parser reports while [`extract`] walks the body.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ContextItem {
/// The visible label text (markers and surrounding whitespace stripped).
/// Assembled from the item's text and inline-code events, with soft/hard
/// line breaks folded into single spaces.
pub text: String,
/// `true` for `- [x]` (not-outstanding), `false` for `- [ ]` (outstanding).
/// Taken directly from the parser's task-list marker event.
pub checked: bool,
}
/// Everything derived from a single body.
///
/// The `Default` value (both lists empty) is what [`extract`] returns for a
/// body that contains neither wiki-links nor task-list items.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct Extraction {
/// Wiki-link targets, in first-seen document order, de-duplicated.
/// De-duplication is exact-match, so `A` and `a` are distinct targets.
pub wiki_links: Vec<String>,
/// Context items, in document order.
pub context_items: Vec<ContextItem>,
}
/// Derive [`Extraction`] from a markdown body.
///
/// The body is walked once with a CommonMark parser (task lists enabled):
///
/// - Every task-list marker yields one [`ContextItem`], recorded **at the
///   marker** so `context_items` is in marker (document) order. This holds
///   for nested task lists too: a parent item is listed before its children,
///   and a child's marker never discards the parent.
/// - An item's label is built from the text, inline code and line breaks that
///   belong directly to that list item; content of list items nested inside it
///   belongs to those nested items, not to the parent's label.
/// - Byte ranges of code (code-block text and inline code spans) are collected
///   and handed to `scan_wiki_links`, so `[[...]]` inside code is not a link.
pub fn extract(body: &str) -> Extraction {
    let mut options = Options::empty();
    options.insert(Options::ENABLE_TASKLISTS);

    let mut context_items: Vec<ContextItem> = Vec::new();
    // Byte ranges covered by code (fenced/indented blocks and inline spans).
    // Wiki-links found inside these are not links.
    let mut code_ranges: Vec<Range<usize>> = Vec::new();
    // Depth of open code blocks; text seen while > 0 is code.
    let mut code_depth: u32 = 0;
    // One entry per list item currently open, innermost last. `Some(i)` means
    // the item carried a task marker and its label is `context_items[i].text`;
    // `None` is a plain bullet. A stack (rather than a single slot) is what
    // keeps a parent task alive while a nested task list is being read.
    let mut open_items: Vec<Option<usize>> = Vec::new();

    for (event, range) in Parser::new_ext(body, options).into_offset_iter() {
        match event {
            Event::Start(Tag::CodeBlock(_)) => code_depth += 1,
            Event::End(TagEnd::CodeBlock) => code_depth = code_depth.saturating_sub(1),
            Event::Start(Tag::Item) => open_items.push(None),
            Event::TaskListMarker(checked) => {
                // Reserve the slot now so output order follows marker order.
                let index = context_items.len();
                context_items.push(ContextItem {
                    text: String::new(),
                    checked,
                });
                if let Some(slot) = open_items.last_mut() {
                    *slot = Some(index);
                }
            }
            Event::End(TagEnd::Item) => {
                // The item is complete: normalise its label's outer whitespace.
                if let Some(Some(index)) = open_items.pop() {
                    if let Some(item) = context_items.get_mut(index) {
                        item.text = item.text.trim().to_string();
                    }
                }
            }
            Event::Text(text) => {
                if code_depth > 0 {
                    code_ranges.push(range);
                }
                if let Some(label) = open_task_label(&mut context_items, &open_items) {
                    label.push_str(&text);
                }
            }
            // Inline code is part of an item's visible label, but its contents
            // are never a wiki-link source.
            Event::Code(code) => {
                code_ranges.push(range);
                if let Some(label) = open_task_label(&mut context_items, &open_items) {
                    label.push_str(&code);
                }
            }
            Event::SoftBreak | Event::HardBreak => {
                if let Some(label) = open_task_label(&mut context_items, &open_items) {
                    label.push(' ');
                }
            }
            _ => {}
        }
    }

    // Scan the raw body for wiki-links (CommonMark mangles `[[ ]]` brackets, so
    // we can't rely on Text events), excluding any that start inside code.
    let wiki_links = scan_wiki_links(body, &code_ranges);

    Extraction {
        wiki_links,
        context_items,
    }
}

/// The label being accumulated for the innermost open list item, if that item
/// is a task item; `None` when no list item is open or it is a plain bullet.
fn open_task_label<'a>(
    items: &'a mut [ContextItem],
    open_items: &[Option<usize>],
) -> Option<&'a mut String> {
    let index = open_items.last().copied().flatten()?;
    items.get_mut(index).map(|item| &mut item.text)
}
/// The 0-based body line index of each context item, in the **same document
/// order** as [`extract`]'s `context_items` (task markers never fire inside code
/// blocks, so the two lists align 1:1). Promotion uses this to locate the source
/// `- [ ]` line it must rewrite into a link (tech-spec §4.3, §6).
///
/// A line index is the number of `\n` bytes that precede the task marker.
/// Newlines are counted incrementally from one marker to the next, so the
/// whole body is scanned once rather than once per item.
pub fn context_item_lines(body: &str) -> Vec<usize> {
    let mut options = Options::empty();
    options.insert(Options::ENABLE_TASKLISTS);

    let bytes = body.as_bytes();
    let mut lines = Vec::new();
    // `line` is the number of newlines in `bytes[..counted]`.
    let mut counted = 0usize;
    let mut line = 0usize;

    for (event, range) in Parser::new_ext(body, options).into_offset_iter() {
        if !matches!(event, Event::TaskListMarker(_)) {
            continue;
        }
        let start = range.start.min(bytes.len());
        if start < counted {
            // Markers are expected in increasing offset order; if one ever is
            // not, recount from the top so the result is still exact.
            counted = 0;
            line = 0;
        }
        line += bytes[counted..start].iter().filter(|&&b| b == b'\n').count();
        counted = start;
        lines.push(line);
    }

    lines
}
/// Find `[[target]]` (or `[[target|display]]`) spans in `body`, returning each
/// unique, non-empty target in first-seen order.
///
/// `code_ranges` are byte ranges of `body` that are code. A `[[` that starts
/// inside one of them is not a link opener: scanning resumes at the end of
/// that code range, so the opener can never pair with a `]]` that lies beyond
/// the code and swallow a genuine link there.
///
/// An unterminated `[[` outside code ends the scan (no `]]` remains to match).
/// The `[` / `]` delimiters are ASCII, so byte indexing stays on char
/// boundaries.
fn scan_wiki_links(body: &str, code_ranges: &[Range<usize>]) -> Vec<String> {
    let mut out: Vec<String> = Vec::new();
    let mut seen: HashSet<String> = HashSet::new();
    let bytes = body.as_bytes();
    let mut i = 0;

    while i + 1 < bytes.len() {
        if bytes[i] != b'[' || bytes[i + 1] != b'[' {
            i += 1;
            continue;
        }

        // Opener inside code: skip the remainder of that code range only.
        // `contains(&i)` guarantees `code.end > i`, so this always advances.
        if let Some(code) = code_ranges.iter().find(|r| r.contains(&i)) {
            i = code.end;
            continue;
        }

        let rest = &body[i + 2..];
        let Some(close) = rest.find("]]") else {
            // Unterminated `[[` — nothing more to find.
            break;
        };

        // `[[target|display]]` — the target is the left side.
        let target = rest[..close].split('|').next().unwrap_or("").trim();
        if !target.is_empty() && seen.insert(target.to_string()) {
            out.push(target.to_string());
        }

        // Jump past `[[`, the inner text, and `]]`.
        i += 2 + close + 2;
    }

    out
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Wiki-link targets derived from `body`.
    fn links(body: &str) -> Vec<String> {
        let Extraction { wiki_links, .. } = extract(body);
        wiki_links
    }

    /// Context items derived from `body`.
    fn items(body: &str) -> Vec<ContextItem> {
        let Extraction { context_items, .. } = extract(body);
        context_items
    }

    /// Shorthand constructor for an expected context item.
    fn item(text: &str, checked: bool) -> ContextItem {
        ContextItem {
            text: text.to_string(),
            checked,
        }
    }

    #[test]
    fn extracts_simple_wiki_links_in_order() {
        let found = links("See [[Roof]] then [[Contractor calls]].");
        assert_eq!(found, ["Roof", "Contractor calls"]);
    }

    #[test]
    fn wiki_link_target_is_left_of_pipe() {
        let found = links("[[borgmatic|Borgmatic backups]]");
        assert_eq!(found, ["borgmatic"]);
    }

    #[test]
    fn wiki_links_are_deduplicated_first_seen_order() {
        // Exact-match de-duplication: `A` repeats, `a` is a different target.
        let found = links("[[A]] [[B]] [[A]] [[a]]");
        assert_eq!(found, ["A", "B", "a"]);
    }

    #[test]
    fn empty_and_unterminated_wiki_links_are_ignored() {
        let found = links("[[]] and [[ ]] and [[oops");
        assert!(found.is_empty());
    }

    #[test]
    fn wiki_links_inside_code_are_not_extracted() {
        let found = links("real [[Keep]]\n\n```\nnot [[Skip]] here\n```\n");
        assert_eq!(found, ["Keep"]);
    }

    #[test]
    fn extracts_checkbox_items_with_state() {
        let found = items("- [ ] feed birds\n- [x] brush teeth\n");
        let expected = vec![item("feed birds", false), item("brush teeth", true)];
        assert_eq!(found, expected);
    }

    #[test]
    fn checkbox_inside_code_block_is_not_an_item() {
        let found = items("- [ ] real item\n\n```\n- [ ] not an item\n```\n");
        assert_eq!(found.len(), 1);
        assert_eq!(found[0].text, "real item");
    }

    #[test]
    fn checkbox_item_can_carry_a_wiki_link() {
        // One checkbox line feeds both derivations: it is a context item and a
        // wiki-link source. The label retains the `[[...]]` markup verbatim so
        // promotion can find and rewrite the source line later (Fork A, §6).
        let Extraction {
            wiki_links,
            context_items,
        } = extract("- [ ] call [[Contractor]] back");

        assert_eq!(wiki_links, ["Contractor"]);
        assert_eq!(context_items.len(), 1);

        let only = &context_items[0];
        assert_eq!(only.text, "call [[Contractor]] back");
        assert!(!only.checked);
    }

    #[test]
    fn context_item_lines_align_with_items_skipping_code() {
        let body = "# Notes\n\n- [ ] first\n\n```\n- [ ] fenced\n```\n\n- [x] second\n";
        let line_numbers = context_item_lines(body);
        let item_count = extract(body).context_items.len();

        // The fenced checkbox is not an item, so both views see two entries.
        assert_eq!(line_numbers.len(), item_count);
        // 0-based lines of "- [ ] first" and "- [x] second".
        assert_eq!(line_numbers, [2, 8]);
    }

    #[test]
    fn extraction_is_idempotent() {
        let body = "# Mixed\n\n- [ ] do [[X]]\n- [x] done\n\nsee [[Y]]\n";
        let first = extract(body);
        let second = extract(body);
        assert_eq!(first, second);
    }

    #[test]
    fn body_without_links_or_items_yields_empty() {
        let derived = extract("just prose, no structure");
        assert_eq!(derived, Extraction::default());
    }
}