Optimize Jira ADF text extraction

2026-02-28 10:24:14 -07:00 · 2026-02-28 10:24:14 -07:00 · e24964343a
commit e24964343a
parent e3c4d41695
1 changed files with 176 additions and 127 deletions
--- a/src/jira.rs
+++ b/src/jira.rs
@ -10,146 +10,182 @@ pub use gouqi::Issue as JiraIssue;
 /// Recursively extracts plain text from an Atlassian Document Format (ADF) node.
 ///
 /// Jira Cloud API v3 returns issue descriptions as ADF — a nested JSON structure
-/// rather than a plain string. This function walks the content tree and collects
-/// all leaf `"type": "text"` node values so that secret scanners can find them.
+/// rather than a plain string. This function walks the content tree and writes
+/// leaf `"type": "text"` node values into a single output buffer so extraction
+/// remains linear in the size of the final text.
 fn extract_adf_text(node: &serde_json::Value) -> String {
-    enum FrameState {
-        Enter,
-        Exit {
-            node_type: Option<String>,
-            child_count: usize,
-        },
+    struct PendingSeparator<'a> {
+        separator: &'a str,
+        previous_ended_whitespace: bool,
    }

-    struct Frame<'a> {
-        node: &'a serde_json::Value,
-        state: FrameState,
+    struct TextAccumulator {
+        text: String,
+        last_char_is_whitespace: bool,
    }

-    let mut stack = vec![Frame {
-        node,
-        state: FrameState::Enter,
-    }];
-    let mut values: Vec<String> = Vec::new();
+    impl TextAccumulator {
+        fn new() -> Self {
+            Self { text: String::new(), last_char_is_whitespace: true }
+        }

-    while let Some(frame) = stack.pop() {
-        match frame.state {
-            FrameState::Enter => match frame.node {
-                serde_json::Value::Object(map) => {
-                    let node_type = map.get("type").and_then(|v| v.as_str());
-                    if node_type == Some("text") {
-                        values.push(
-                            map.get("text")
-                                .and_then(|v| v.as_str())
-                                .unwrap_or("")
-                                .to_string(),
-                        );
-                        continue;
-                    }
-                    if node_type == Some("hardBreak") {
-                        values.push("\n".to_string());
-                        continue;
-                    }
+        fn len(&self) -> usize {
+            self.text.len()
+        }

-                    let child_count = map
-                        .get("content")
-                        .and_then(|v| v.as_array())
-                        .map(|arr| arr.len())
-                        .unwrap_or(0);
-                    stack.push(Frame {
-                        node: frame.node,
-                        state: FrameState::Exit {
-                            node_type: node_type.map(|value| value.to_string()),
-                            child_count,
-                        },
-                    });
-                    if let Some(arr) = map.get("content").and_then(|v| v.as_array()) {
-                        for child in arr.iter().rev() {
-                            stack.push(Frame {
-                                node: child,
-                                state: FrameState::Enter,
-                            });
+        fn ends_with_newline(&self) -> bool {
+            self.text.ends_with('\n')
+        }
+
+        fn last_char_is_whitespace(&self) -> bool {
+            self.last_char_is_whitespace
+        }
+
+        fn write_text(
+            &mut self,
+            text: &str,
+            pending_separator: &mut Option<PendingSeparator<'_>>,
+        ) -> bool {
+            if text.is_empty() {
+                return false;
+            }
+
+            if let Some(pending_separator) = pending_separator.take() {
+                let starts_non_whitespace =
+                    text.chars().next().map(|ch| !ch.is_whitespace()).unwrap_or(false);
+                if !pending_separator.previous_ended_whitespace && starts_non_whitespace {
+                    self.text.push_str(pending_separator.separator);
+                    if let Some(last_char) = pending_separator.separator.chars().last() {
+                        self.last_char_is_whitespace = last_char.is_whitespace();
+                    }
+                }
+            }
+
+            self.text.push_str(text);
+            if let Some(last_char) = text.chars().last() {
+                self.last_char_is_whitespace = last_char.is_whitespace();
+            }
+            true
+        }
+
+        fn write_char(
+            &mut self,
+            ch: char,
+            pending_separator: &mut Option<PendingSeparator<'_>>,
+        ) -> bool {
+            if let Some(pending_separator) = pending_separator.take() {
+                if !pending_separator.previous_ended_whitespace && !ch.is_whitespace() {
+                    self.text.push_str(pending_separator.separator);
+                    if let Some(last_char) = pending_separator.separator.chars().last() {
+                        self.last_char_is_whitespace = last_char.is_whitespace();
+                    }
+                }
+            }
+
+            self.text.push(ch);
+            self.last_char_is_whitespace = ch.is_whitespace();
+            true
+        }
+    }
+
+    fn write_adf_text(
+        node: &serde_json::Value,
+        output: &mut TextAccumulator,
+        pending_separator: &mut Option<PendingSeparator<'_>>,
+    ) -> bool {
+        match node {
+            serde_json::Value::Object(map) => {
+                let node_type = map.get("type").and_then(|v| v.as_str());
+                if node_type == Some("text") {
+                    return output.write_text(
+                        map.get("text").and_then(|v| v.as_str()).unwrap_or(""),
+                        pending_separator,
+                    );
+                }
+                if node_type == Some("hardBreak") {
+                    return output.write_char('\n', pending_separator);
+                }
+
+                let start_len = output.len();
+                if let Some(children) = map.get("content").and_then(|v| v.as_array()) {
+                    let separator = match node_type {
+                        Some("table") => Some("\n"),
+                        Some("tableRow") => Some(" "),
+                        _ => None,
+                    };
+                    let mut wrote_child_text = false;
+                    let mut previous_ended_whitespace = true;
+                    for child in children {
+                        let mut child_pending_separator = if wrote_child_text {
+                            separator.map(|separator| PendingSeparator {
+                                separator,
+                                previous_ended_whitespace,
+                            })
+                        } else {
+                            pending_separator.take()
+                        };
+                        let child_wrote_text =
+                            write_adf_text(child, output, &mut child_pending_separator);
+                        if !wrote_child_text && !child_wrote_text {
+                            *pending_separator = child_pending_separator;
+                        }
+                        if child_wrote_text {
+                            wrote_child_text = true;
+                            previous_ended_whitespace = output.last_char_is_whitespace();
                        }
                    }
                }
-                serde_json::Value::Array(arr) => {
-                    let child_count = arr.len();
-                    stack.push(Frame {
-                        node: frame.node,
-                        state: FrameState::Exit {
-                            node_type: None,
-                            child_count,
-                        },
-                    });
-                    for child in arr.iter().rev() {
-                        stack.push(Frame {
-                            node: child,
-                            state: FrameState::Enter,
-                        });
-                    }
-                }
-                _ => values.push(String::new()),
-            },
-            FrameState::Exit {
-                node_type,
-                child_count,
-            } => {
-                let start = values.len().saturating_sub(child_count);
-                let child_texts = values.split_off(start);
-                let mut text = match node_type.as_deref() {
-                    Some("table") => join_texts_with_separator(child_texts, "\n"),
-                    Some("tableRow") => join_texts_with_separator(child_texts, " "),
-                    _ => child_texts.concat(),
-                };

                if matches!(
-                    node_type.as_deref(),
-                    Some("paragraph" | "heading" | "blockquote" | "listItem" | "codeBlock" | "tableRow" | "table")
-                ) && !text.is_empty()
-                    && !text.ends_with('\n')
+                    node_type,
+                    Some(
+                        "paragraph"
+                            | "heading"
+                            | "blockquote"
+                            | "listItem"
+                            | "codeBlock"
+                            | "tableRow"
+                            | "table"
+                    )
+                ) && output.len() > start_len
+                    && !output.ends_with_newline()
                {
-                    text.push('\n');
+                    output.text.push('\n');
+                    output.last_char_is_whitespace = true;
                }

-                values.push(text);
+                output.len() > start_len
            }
+            serde_json::Value::Array(arr) => {
+                let start_len = output.len();
+                let mut wrote_child_text = false;
+                for child in arr {
+                    let mut child_pending_separator =
+                        if wrote_child_text { None } else { pending_separator.take() };
+                    let child_wrote_text =
+                        write_adf_text(child, output, &mut child_pending_separator);
+                    if !wrote_child_text && !child_wrote_text {
+                        *pending_separator = child_pending_separator;
+                    }
+                    if child_wrote_text {
+                        wrote_child_text = true;
+                    }
+                }
+                output.len() > start_len
+            }
+            _ => false,
        }
    }

-    values.pop().unwrap_or_default()
-}
-
-fn join_texts_with_separator(child_texts: Vec<String>, separator: &str) -> String {
-    let mut text = String::new();
-    let mut last_was_whitespace = true;
-    for child_text in child_texts {
-        if child_text.is_empty() {
-            continue;
-        }
-        let child_starts_non_whitespace = child_text
-            .chars()
-            .next()
-            .map(|c| !c.is_whitespace())
-            .unwrap_or(false);
-        let needs_separator = !last_was_whitespace && child_starts_non_whitespace;
-        if needs_separator {
-            text.push_str(separator);
-        }
-        text.push_str(&child_text);
-        if let Some(last_char) = child_text.chars().rev().next() {
-            last_was_whitespace = last_char.is_whitespace();
-        }
-    }
-    text
+    let mut output = TextAccumulator::new();
+    let mut pending_separator = None;
+    write_adf_text(node, &mut output, &mut pending_separator);
+    output.text
 }

 /// Returns true if the value looks like an ADF document root.
 fn is_adf(value: &serde_json::Value) -> bool {
-    value
-        .get("type")
-        .and_then(|v| v.as_str())
-        .map(|t| t == "doc")
-        .unwrap_or(false)
+    value.get("type").and_then(|v| v.as_str()).map(|t| t == "doc").unwrap_or(false)
 }

 fn flatten_adf_fields(issue_value: &mut serde_json::Value) {
@ -159,9 +195,8 @@ fn flatten_adf_fields(issue_value: &mut serde_json::Value) {
    if let Some(desc) = issue_value.pointer("/fields/description") {
        if is_adf(desc) {
            let plain_text = extract_adf_text(desc);
-            if let Some(fields) = issue_value
-                .pointer_mut("/fields")
-                .and_then(|value| value.as_object_mut())
+            if let Some(fields) =
+                issue_value.pointer_mut("/fields").and_then(|value| value.as_object_mut())
            {
                fields.insert(
                    "description".to_string(),
@ -385,6 +420,24 @@ mod tests {
        assert_eq!(text.trim_end(), "item1\nitem2\ncode");
    }

+    #[test]
+    fn extract_adf_text_preserves_table_row_whitespace_rules() {
+        let value = json!({
+            "type": "doc",
+            "version": 1,
+            "content": [{
+                "type": "tableRow",
+                "content": [
+                    {"type": "text", "text": "foo"},
+                    {"type": "text", "text": "bar"},
+                    {"type": "text", "text": " baz"}
+                ]
+            }]
+        });
+        let text = extract_adf_text(&value);
+        assert_eq!(text.trim_end(), "foo bar baz");
+    }
+
    #[test]
    fn flatten_adf_fields_converts_comment_bodies() {
        let mut issue_value = json!({
@ -428,10 +481,8 @@ mod tests {
            }
        });
        flatten_adf_fields(&mut issue_value);
-        let desc = issue_value
-            .pointer("/fields/description")
-            .and_then(|v| v.as_str())
-            .unwrap_or("");
+        let desc =
+            issue_value.pointer("/fields/description").and_then(|v| v.as_str()).unwrap_or("");
        assert_eq!(desc, "desc");
    }

@ -443,10 +494,8 @@ mod tests {
            }
        });
        flatten_adf_fields(&mut issue_value);
-        let desc = issue_value
-            .pointer("/fields/description")
-            .and_then(|v| v.as_str())
-            .unwrap_or("");
+        let desc =
+            issue_value.pointer("/fields/description").and_then(|v| v.as_str()).unwrap_or("");
        assert_eq!(desc, "plain description");
    }