// hephaestus/crates/heph-core/src/extract.rs

//! Markdown derivation (tech-spec §5).
//!
//! From a node's body we derive two things, purely and deterministically:
//!
//! - **`[[wiki-links]]`** → `wiki` link targets (resolved to nodes later, via
//!   `aliases`/title; unresolved targets are allowed and recorded).
//! - **GFM task-list items** (`- [ ]` / `- [x]`) → the **local context-item
//!   index** (Fork A, [[design]] §6.3). The `[ ]`/`[x]` marker *is* the item's
//!   only state; this index is derived per replica, never synced.
//!
//! Derivation is **idempotent**: the same body always yields the same
//! [`Extraction`]. Code is skipped (a `- [ ]` inside a fenced block is not a
//! task; a `[[link]]` inside a code block or an inline code span is not a
//! link), which is why this goes through a real CommonMark parser rather than
//! a line scan.

use std::collections::HashSet;
use std::ops::Range;

use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};

/// One GFM task-list line (`- [ ]` / `- [x]`) found in a body (Fork A).
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct ContextItem {
    /// Label shown for the item: the line's text with the checkbox marker
    /// removed and leading/trailing whitespace trimmed.
    pub text: String,
    /// Checkbox state: `false` means `- [ ]` (still outstanding), `true`
    /// means `- [x]` (no longer outstanding).
    pub checked: bool,
}

/// The complete result of deriving structure from one markdown body.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct Extraction {
    /// Targets of `[[wiki-links]]`: each distinct target appears once, at the
    /// position where it was first seen in the document.
    pub wiki_links: Vec<String>,
    /// Task-list items, ordered as they appear in the document.
    pub context_items: Vec<ContextItem>,
}

/// Derive [`Extraction`] from a markdown body.
///
/// The body is parsed once with task lists enabled. Every task-list marker
/// yields exactly one [`ContextItem`], in document order, so the result lines
/// up 1:1 with [`context_item_lines`] — including for nested task lists,
/// where a parent item and its child items are each reported separately.
///
/// An item's label is built from the text, inline code and line breaks that
/// belong directly to that list item; text of list items nested inside it is
/// attributed to those nested items (or dropped if they are not tasks), never
/// merged into the parent's label.
///
/// Wiki-links are found by scanning the raw body, skipping any `[[` that
/// starts inside a code block or an inline code span.
pub fn extract(body: &str) -> Extraction {
    let mut options = Options::empty();
    options.insert(Options::ENABLE_TASKLISTS);

    let mut context_items: Vec<ContextItem> = Vec::new();
    // Byte ranges covered by code (fenced/indented blocks and inline spans).
    // Wiki-links found inside these are not links.
    let mut code_ranges: Vec<Range<usize>> = Vec::new();
    // Number of code blocks currently open; text seen while > 0 is code.
    let mut code_depth: u32 = 0;
    // One slot per currently open list item, innermost last. A slot holds the
    // index of that item's entry in `context_items` once its task marker has
    // been seen, and stays `None` for plain (non-task) list items. Keeping a
    // stack rather than a single "current item" is what lets a nested task
    // list coexist with its parent task instead of overwriting it.
    let mut open_items: Vec<Option<usize>> = Vec::new();

    for (event, range) in Parser::new_ext(body, options).into_offset_iter() {
        match event {
            Event::Start(Tag::CodeBlock(_)) => code_depth += 1,
            Event::End(TagEnd::CodeBlock) => code_depth = code_depth.saturating_sub(1),

            Event::Start(Tag::Item) => open_items.push(None),
            Event::TaskListMarker(checked) => {
                // Reserve the item's position now so output stays in document
                // order even though nested items close before their parent.
                let index = context_items.len();
                context_items.push(ContextItem {
                    text: String::new(),
                    checked,
                });
                if let Some(slot) = open_items.last_mut() {
                    *slot = Some(index);
                }
            }
            Event::End(TagEnd::Item) => {
                // The label is complete once its own list item closes.
                if let Some(Some(index)) = open_items.pop() {
                    let label = &mut context_items[index].text;
                    *label = label.trim().to_string();
                }
            }

            Event::Text(text) => {
                if code_depth > 0 {
                    code_ranges.push(range);
                }
                if let Some(label) = open_label(&open_items, &mut context_items) {
                    label.push_str(&text);
                }
            }
            // Inline code is part of an item's visible label, but its contents
            // are never a wiki-link source.
            Event::Code(code) => {
                code_ranges.push(range);
                if let Some(label) = open_label(&open_items, &mut context_items) {
                    label.push_str(&code);
                }
            }
            Event::SoftBreak | Event::HardBreak => {
                if let Some(label) = open_label(&open_items, &mut context_items) {
                    label.push(' ');
                }
            }
            _ => {}
        }
    }

    // Scan the raw body for wiki-links (the parser splits `[[ ]]` brackets
    // across events, so Text events are not a reliable source), excluding any
    // that start inside code.
    let wiki_links = scan_wiki_links(body, &code_ranges);

    Extraction {
        wiki_links,
        context_items,
    }
}

/// Label buffer of the innermost open list item, if that item is a task.
fn open_label<'a>(
    open_items: &[Option<usize>],
    context_items: &'a mut [ContextItem],
) -> Option<&'a mut String> {
    let index = (*open_items.last()?)?;
    context_items.get_mut(index).map(|item| &mut item.text)
}

/// For every context item, the 0-based index of the body line its task marker
/// sits on. Entries come out in the **same document order** as [`extract`]'s
/// `context_items` (a task marker is never reported from inside a code block,
/// so the two lists are meant to pair up 1:1). Promotion relies on this to find
/// the source `- [ ]` line it has to rewrite into a link (tech-spec §4.3, §6).
pub fn context_item_lines(body: &str) -> Vec<usize> {
    Parser::new_ext(body, Options::ENABLE_TASKLISTS)
        .into_offset_iter()
        .filter_map(|(event, span)| match event {
            Event::TaskListMarker(_) => Some(span.start),
            _ => None,
        })
        // A marker's line index is the number of newlines that precede it.
        .map(|offset| body[..offset].bytes().filter(|&b| b == b'\n').count())
        .collect()
}

/// Find `[[target]]` (or `[[target|display]]`) spans in `body`, returning each
/// unique, non-empty target in first-seen order.
///
/// * `body` — the raw markdown text to scan.
/// * `code_ranges` — byte ranges of `body` that are code. A `[[` that starts
///   inside one of them is not a link opener and is stepped over on its own,
///   so it can never consume the closing `]]` of a later, real link.
///
/// A candidate span that itself contains another `[[` is treated as a stray
/// opener: scanning restarts at the innermost `[[`, so `[[oops and [[Real]]`
/// yields `Real` rather than a target with brackets in it. For
/// `[[target|display]]` the target is the part left of the first `|`, trimmed.
///
/// The `[` / `]` delimiters are ASCII, so byte indexing stays on char
/// boundaries.
fn scan_wiki_links(body: &str, code_ranges: &[Range<usize>]) -> Vec<String> {
    let mut out: Vec<String> = Vec::new();
    // Borrowed from `body`; only targets that are actually new get allocated.
    let mut seen: HashSet<&str> = HashSet::new();
    let bytes = body.as_bytes();
    let mut i = 0;
    while i + 1 < bytes.len() {
        if bytes[i] != b'[' || bytes[i + 1] != b'[' {
            i += 1;
            continue;
        }
        // Decide whether this opener is code *before* looking for a close, so
        // a code `[[` cannot pair with (and discard) a later link.
        if code_ranges.iter().any(|r| r.contains(&i)) {
            i += 1;
            continue;
        }
        let rest = &body[i + 2..];
        let close = match rest.find("]]") {
            Some(close) => close,
            // Unterminated `[[` — no `]]` remains, so nothing more to find.
            None => break,
        };
        let inner = &rest[..close];
        // Another opener before the close means this one was stray; resume at
        // the innermost opener (always a strict advance, so the loop ends).
        if let Some(reopen) = inner.rfind("[[") {
            i += 2 + reopen;
            continue;
        }
        // `[[target|display]]` — the target is the left side.
        let target = inner.split('|').next().unwrap_or("").trim();
        if !target.is_empty() && seen.insert(target) {
            out.push(target.to_string());
        }
        i += 2 + close + 2;
    }
    out
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Wiki-link targets derived from `body`.
    fn links(body: &str) -> Vec<String> {
        let Extraction { wiki_links, .. } = extract(body);
        wiki_links
    }

    /// Context items derived from `body`.
    fn items(body: &str) -> Vec<ContextItem> {
        let Extraction { context_items, .. } = extract(body);
        context_items
    }

    /// Shorthand for building an expected [`ContextItem`].
    fn item(text: &str, checked: bool) -> ContextItem {
        ContextItem {
            text: text.to_string(),
            checked,
        }
    }

    #[test]
    fn extracts_simple_wiki_links_in_order() {
        let found = links("See [[Roof]] then [[Contractor calls]].");
        let expected = vec!["Roof".to_string(), "Contractor calls".to_string()];
        assert_eq!(found, expected);
    }

    #[test]
    fn wiki_link_target_is_left_of_pipe() {
        let found = links("[[borgmatic|Borgmatic backups]]");
        assert_eq!(found, vec!["borgmatic"]);
    }

    #[test]
    fn wiki_links_are_deduplicated_first_seen_order() {
        let found = links("[[A]] [[B]] [[A]] [[a]]");
        let expected = vec!["A".to_string(), "B".to_string(), "a".to_string()];
        assert_eq!(found, expected);
    }

    #[test]
    fn empty_and_unterminated_wiki_links_are_ignored() {
        let found = links("[[]] and [[   ]] and [[oops");
        assert!(found.is_empty());
    }

    #[test]
    fn wiki_links_inside_code_are_not_extracted() {
        let found = links("real [[Keep]]\n\n```\nnot [[Skip]] here\n```\n");
        assert_eq!(found, vec!["Keep"]);
    }

    #[test]
    fn extracts_checkbox_items_with_state() {
        let found = items("- [ ] feed birds\n- [x] brush teeth\n");
        let expected = vec![item("feed birds", false), item("brush teeth", true)];
        assert_eq!(found, expected);
    }

    #[test]
    fn checkbox_inside_code_block_is_not_an_item() {
        let body = "- [ ] real item\n\n```\n- [ ] not an item\n```\n";
        assert_eq!(items(body).len(), 1);
        assert_eq!(items(body)[0].text, "real item");
    }

    #[test]
    fn checkbox_item_can_carry_a_wiki_link() {
        // One checkbox line feeds both outputs: it is a context item and it
        // contributes a wiki-link. The label is expected to read back with the
        // `[[...]]` brackets still in place, which is what promotion needs to
        // find and rewrite the source line later (Fork A, §6).
        let Extraction {
            wiki_links,
            context_items,
        } = extract("- [ ] call [[Contractor]] back");
        assert_eq!(wiki_links, vec!["Contractor"]);
        assert_eq!(context_items.len(), 1);
        let only = &context_items[0];
        assert_eq!(only.text, "call [[Contractor]] back");
        assert!(!only.checked);
    }

    #[test]
    fn context_item_lines_align_with_items_skipping_code() {
        let body = "# Notes\n\n- [ ] first\n\n```\n- [ ] fenced\n```\n\n- [x] second\n";
        let lines = context_item_lines(body);
        // The fenced checkbox is not an item, so both views report two entries.
        let item_count = extract(body).context_items.len();
        assert_eq!(lines.len(), item_count);
        // 0-based lines of "- [ ] first" and "- [x] second".
        assert_eq!(lines, vec![2, 8]);
    }

    #[test]
    fn extraction_is_idempotent() {
        let body = "# Mixed\n\n- [ ] do [[X]]\n- [x] done\n\nsee [[Y]]\n";
        let first = extract(body);
        let second = extract(body);
        assert_eq!(first, second);
    }

    #[test]
    fn body_without_links_or_items_yields_empty() {
        let derived = extract("just prose, no structure");
        assert_eq!(derived, Extraction::default());
    }
}