From f680f31e545999c90103a9f461c7564939f56b98 Mon Sep 17 00:00:00 2001 From: Sorevitnod Date: Sat, 21 Feb 2026 19:08:46 +0000 Subject: [PATCH 1/8] Fix Jira ADF description not being scanned on Jira Cloud MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Jira Cloud API v3 returns issue descriptions and comment bodies in Atlassian Document Format (ADF) — a nested JSON tree — rather than as plain strings. Previously, the raw ADF object was serialized to disk and the scanner found 0 bytes of meaningful content, producing no findings even when secrets were present. This commit adds `extract_adf_text()` which recursively walks the ADF `content` tree and collects all leaf `"type":"text"` node values into a single plain string. `download_issues_to_dir()` now replaces any ADF `description` or comment `body` fields with their extracted plain text before writing the issue JSON to disk. Fixes #242 Co-Authored-By: Claude Sonnet 4.6 --- src/jira.rs | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 75 insertions(+), 1 deletion(-) diff --git a/src/jira.rs b/src/jira.rs index 2fe0ef4..a8c05ac 100644 --- a/src/jira.rs +++ b/src/jira.rs @@ -6,6 +6,52 @@ use url::Url; // Re-export the Issue type from gouqi so callers don't depend on the crate. pub use gouqi::Issue as JiraIssue; + +/// Recursively extracts plain text from an Atlassian Document Format (ADF) node. +/// +/// Jira Cloud API v3 returns issue descriptions as ADF — a nested JSON structure +/// rather than a plain string. This function walks the content tree and collects +/// all leaf `"type": "text"` node values so that secret scanners can find them. +fn extract_adf_text(node: &serde_json::Value) -> String { + match node { + serde_json::Value::Object(map) => { + if map.get("type").and_then(|v| v.as_str()) == Some("text") { + return map + .get("text") + .and_then(|v| v.as_str()) + .unwrap_or("") + .to_string(); + } + map.get("content") + .and_then(|v| v.as_array()) + .map(|arr| { + arr.iter() + .map(extract_adf_text) + .filter(|s| !s.is_empty()) + .collect::>() + .join(" ") + }) + .unwrap_or_default() + } + serde_json::Value::Array(arr) => arr + .iter() + .map(extract_adf_text) + .filter(|s| !s.is_empty()) + .collect::>() + .join(" "), + _ => String::new(), + } +} + +/// Returns true if the value looks like an ADF document root. +fn is_adf(value: &serde_json::Value) -> bool { + value + .get("type") + .and_then(|v| v.as_str()) + .map(|t| t == "doc") + .unwrap_or(false) +} + pub async fn fetch_issues( jira_url: Url, jql: &str, @@ -44,8 +90,36 @@ pub async fn download_issues_to_dir( let issues = fetch_issues(jira_url, jql, max_results, ignore_certs).await?; let mut paths = Vec::new(); for issue in issues { + let mut issue_value = serde_json::to_value(&issue)?; + + // Jira Cloud API v3 returns descriptions as Atlassian Document Format (ADF), + // a nested JSON tree whose leaf text nodes contain the actual content. + // Flatten ADF to a plain string so the secret scanner can match against it. + if let Some(desc) = issue_value.pointer("/fields/description").cloned() { + if is_adf(&desc) { + let plain_text = extract_adf_text(&desc); + if let Some(fields) = issue_value.pointer_mut("/fields") { + fields["description"] = serde_json::Value::String(plain_text); + } + } + } + + // Apply the same ADF flattening to comment bodies. + if let Some(comments) = issue_value.pointer_mut("/fields/comment/comments") { + if let Some(arr) = comments.as_array_mut() { + for comment in arr.iter_mut() { + if let Some(body) = comment.get("body").cloned() { + if is_adf(&body) { + let plain_text = extract_adf_text(&body); + comment["body"] = serde_json::Value::String(plain_text); + } + } + } + } + } + let file = output_dir.join(format!("{}.json", issue.key)); - std::fs::write(&file, serde_json::to_vec(&issue)?)?; + std::fs::write(&file, serde_json::to_vec(&issue_value)?)?; paths.push(file); } Ok(paths) From d28bbafcf60cf306e8d5de733d258f914caabbdf Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Thu, 26 Feb 2026 23:25:50 -0700 Subject: [PATCH 2/8] Fix ADF text flattening --- src/jira.rs | 128 ++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 104 insertions(+), 24 deletions(-) diff --git a/src/jira.rs b/src/jira.rs index a8c05ac..f196617 100644 --- a/src/jira.rs +++ b/src/jira.rs @@ -15,30 +15,43 @@ pub use gouqi::Issue as JiraIssue; fn extract_adf_text(node: &serde_json::Value) -> String { match node { serde_json::Value::Object(map) => { - if map.get("type").and_then(|v| v.as_str()) == Some("text") { + let node_type = map.get("type").and_then(|v| v.as_str()); + if node_type == Some("text") { return map .get("text") .and_then(|v| v.as_str()) .unwrap_or("") .to_string(); } - map.get("content") - .and_then(|v| v.as_array()) - .map(|arr| { - arr.iter() - .map(extract_adf_text) - .filter(|s| !s.is_empty()) - .collect::>() - .join(" ") - }) - .unwrap_or_default() + if node_type == Some("hardBreak") { + return "\n".to_string(); + } + + let mut text = String::new(); + if let Some(arr) = map.get("content").and_then(|v| v.as_array()) { + for child in arr { + text.push_str(&extract_adf_text(child)); + } + } + + if matches!( + node_type, + Some("paragraph" | "heading" | "blockquote" | "listItem" | "codeBlock" | "tableRow" | "table") + ) && !text.is_empty() + && !text.ends_with('\n') + { + text.push('\n'); + } + + text + } + serde_json::Value::Array(arr) => { + let mut text = String::new(); + for child in arr { + text.push_str(&extract_adf_text(child)); + } + text } - serde_json::Value::Array(arr) => arr - .iter() - .map(extract_adf_text) - .filter(|s| !s.is_empty()) - .collect::>() - .join(" "), _ => String::new(), } } @@ -95,9 +108,9 @@ pub async fn download_issues_to_dir( // Jira Cloud API v3 returns descriptions as Atlassian Document Format (ADF), // a nested JSON tree whose leaf text nodes contain the actual content. // Flatten ADF to a plain string so the secret scanner can match against it. - if let Some(desc) = issue_value.pointer("/fields/description").cloned() { - if is_adf(&desc) { - let plain_text = extract_adf_text(&desc); + if let Some(desc) = issue_value.pointer("/fields/description") { + if is_adf(desc) { + let plain_text = extract_adf_text(desc); if let Some(fields) = issue_value.pointer_mut("/fields") { fields["description"] = serde_json::Value::String(plain_text); } @@ -108,11 +121,15 @@ pub async fn download_issues_to_dir( if let Some(comments) = issue_value.pointer_mut("/fields/comment/comments") { if let Some(arr) = comments.as_array_mut() { for comment in arr.iter_mut() { - if let Some(body) = comment.get("body").cloned() { - if is_adf(&body) { - let plain_text = extract_adf_text(&body); - comment["body"] = serde_json::Value::String(plain_text); + let plain_text = comment.get("body").and_then(|body| { + if is_adf(body) { + Some(extract_adf_text(body)) + } else { + None } + }); + if let Some(plain_text) = plain_text { + comment["body"] = serde_json::Value::String(plain_text); } } } @@ -124,3 +141,66 @@ pub async fn download_issues_to_dir( } Ok(paths) } + +#[cfg(test)] +mod tests { + use super::{extract_adf_text, is_adf}; + use serde_json::json; + + #[test] + fn is_adf_detects_doc_root() { + let doc = json!({"type": "doc", "version": 1, "content": []}); + assert!(is_adf(&doc)); + assert!(!is_adf(&json!({"type": "paragraph"}))); + assert!(!is_adf(&json!("not-a-doc"))); + } + + #[test] + fn extract_adf_text_concatenates_adjacent_text_nodes() { + let value = json!({ + "type": "doc", + "version": 1, + "content": [{ + "type": "paragraph", + "content": [ + {"type": "text", "text": "sk-"}, + {"type": "text", "text": "proj-123"} + ] + }] + }); + let text = extract_adf_text(&value); + assert_eq!(text.trim_end(), "sk-proj-123"); + } + + #[test] + fn extract_adf_text_preserves_hard_breaks() { + let value = json!({ + "type": "doc", + "version": 1, + "content": [{ + "type": "paragraph", + "content": [ + {"type": "text", "text": "foo"}, + {"type": "hardBreak"}, + {"type": "text", "text": "bar"} + ] + }] + }); + let text = extract_adf_text(&value); + assert_eq!(text.trim_end(), "foo\nbar"); + } + + #[test] + fn extract_adf_text_adds_paragraph_separator() { + let value = json!({ + "type": "doc", + "version": 1, + "content": [ + {"type": "paragraph", "content": [{"type": "text", "text": "first"}]}, + {"type": "paragraph", "content": [{"type": "text", "text": "second"}]} + ] + }); + let text = extract_adf_text(&value); + assert_eq!(text.trim_end(), "first\nsecond"); + } +} From fafe89199614b086e83fb455edb9bb1187dc881b Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Thu, 26 Feb 2026 23:42:32 -0700 Subject: [PATCH 3/8] Add ADF extraction test coverage --- src/jira.rs | 180 +++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 150 insertions(+), 30 deletions(-) diff --git a/src/jira.rs b/src/jira.rs index f196617..eab4d63 100644 --- a/src/jira.rs +++ b/src/jira.rs @@ -65,6 +65,38 @@ fn is_adf(value: &serde_json::Value) -> bool { .unwrap_or(false) } +fn flatten_adf_fields(issue_value: &mut serde_json::Value) { + // Jira Cloud API v3 returns descriptions as Atlassian Document Format (ADF), + // a nested JSON tree whose leaf text nodes contain the actual content. + // Flatten ADF to a plain string so the secret scanner can match against it. + if let Some(desc) = issue_value.pointer("/fields/description") { + if is_adf(desc) { + let plain_text = extract_adf_text(desc); + if let Some(fields) = issue_value.pointer_mut("/fields") { + fields["description"] = serde_json::Value::String(plain_text); + } + } + } + + // Apply the same ADF flattening to comment bodies. + if let Some(comments) = issue_value.pointer_mut("/fields/comment/comments") { + if let Some(arr) = comments.as_array_mut() { + for comment in arr.iter_mut() { + let plain_text = comment.get("body").and_then(|body| { + if is_adf(body) { + Some(extract_adf_text(body)) + } else { + None + } + }); + if let Some(plain_text) = plain_text { + comment["body"] = serde_json::Value::String(plain_text); + } + } + } + } +} + pub async fn fetch_issues( jira_url: Url, jql: &str, @@ -105,35 +137,7 @@ pub async fn download_issues_to_dir( for issue in issues { let mut issue_value = serde_json::to_value(&issue)?; - // Jira Cloud API v3 returns descriptions as Atlassian Document Format (ADF), - // a nested JSON tree whose leaf text nodes contain the actual content. - // Flatten ADF to a plain string so the secret scanner can match against it. - if let Some(desc) = issue_value.pointer("/fields/description") { - if is_adf(desc) { - let plain_text = extract_adf_text(desc); - if let Some(fields) = issue_value.pointer_mut("/fields") { - fields["description"] = serde_json::Value::String(plain_text); - } - } - } - - // Apply the same ADF flattening to comment bodies. - if let Some(comments) = issue_value.pointer_mut("/fields/comment/comments") { - if let Some(arr) = comments.as_array_mut() { - for comment in arr.iter_mut() { - let plain_text = comment.get("body").and_then(|body| { - if is_adf(body) { - Some(extract_adf_text(body)) - } else { - None - } - }); - if let Some(plain_text) = plain_text { - comment["body"] = serde_json::Value::String(plain_text); - } - } - } - } + flatten_adf_fields(&mut issue_value); let file = output_dir.join(format!("{}.json", issue.key)); std::fs::write(&file, serde_json::to_vec(&issue_value)?)?; @@ -144,7 +148,7 @@ pub async fn download_issues_to_dir( #[cfg(test)] mod tests { - use super::{extract_adf_text, is_adf}; + use super::{extract_adf_text, flatten_adf_fields, is_adf}; use serde_json::json; #[test] @@ -203,4 +207,120 @@ mod tests { let text = extract_adf_text(&value); assert_eq!(text.trim_end(), "first\nsecond"); } + + #[test] + fn extract_adf_text_returns_empty_for_non_adf_values() { + let value = json!("plain description string"); + let text = extract_adf_text(&value); + assert_eq!(text, ""); + + let number_value = json!(42); + let number_text = extract_adf_text(&number_value); + assert_eq!(number_text, ""); + + let null_value = json!(null); + let null_text = extract_adf_text(&null_value); + assert_eq!(null_text, ""); + } + + #[test] + fn extract_adf_text_handles_missing_content_fields() { + let doc_without_content = json!({ + "type": "doc", + "version": 1 + }); + let text = extract_adf_text(&doc_without_content); + assert_eq!(text, ""); + + let paragraph_without_content = json!({ + "type": "paragraph" + }); + let para_text = extract_adf_text(¶graph_without_content); + assert_eq!(para_text, ""); + } + + #[test] + fn extract_adf_text_handles_empty_doc() { + let empty_doc = json!({ + "type": "doc", + "version": 1, + "content": [] + }); + let text = extract_adf_text(&empty_doc); + assert_eq!(text, ""); + } + + #[test] + fn extract_adf_text_handles_lists_and_code_blocks() { + let value = json!({ + "type": "doc", + "version": 1, + "content": [ + { + "type": "bulletList", + "content": [ + { + "type": "listItem", + "content": [{ + "type": "paragraph", + "content": [{"type": "text", "text": "item1"}] + }] + }, + { + "type": "listItem", + "content": [{ + "type": "paragraph", + "content": [{"type": "text", "text": "item2"}] + }] + } + ] + }, + { + "type": "codeBlock", + "content": [{"type": "text", "text": "code"}] + } + ] + }); + let text = extract_adf_text(&value); + assert_eq!(text.trim_end(), "item1\nitem2\ncode"); + } + + #[test] + fn flatten_adf_fields_converts_comment_bodies() { + let mut issue_value = json!({ + "fields": { + "comment": { + "comments": [ + { + "body": { + "type": "doc", + "version": 1, + "content": [{ + "type": "paragraph", + "content": [{"type": "text", "text": "secret"}] + }] + } + } + ] + } + } + }); + flatten_adf_fields(&mut issue_value); + let body = issue_value + .pointer("/fields/comment/comments/0/body") + .and_then(|v| v.as_str()) + .unwrap_or(""); + assert_eq!(body, "secret"); + } + + #[test] + fn flatten_adf_fields_handles_missing_description() { + let mut issue_value = json!({ + "fields": { + "summary": "no description here" + } + }); + flatten_adf_fields(&mut issue_value); + assert!(issue_value.pointer("/fields/description").is_none()); + } } From 7340ae5051410caeb9edfeea068ad7eb8c095add Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Fri, 27 Feb 2026 21:23:19 -0700 Subject: [PATCH 4/8] Refine ADF flattening and tests --- src/jira.rs | 108 +++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 95 insertions(+), 13 deletions(-) diff --git a/src/jira.rs b/src/jira.rs index eab4d63..0884588 100644 --- a/src/jira.rs +++ b/src/jira.rs @@ -27,12 +27,15 @@ fn extract_adf_text(node: &serde_json::Value) -> String { return "\n".to_string(); } - let mut text = String::new(); - if let Some(arr) = map.get("content").and_then(|v| v.as_array()) { - for child in arr { - text.push_str(&extract_adf_text(child)); + let text = if let Some(arr) = map.get("content").and_then(|v| v.as_array()) { + match node_type { + Some("table") => join_children_with_separator(arr, "\n"), + Some("tableRow") => join_children_with_separator(arr, " "), + _ => concat_children(arr), } - } + } else { + String::new() + }; if matches!( node_type, @@ -46,16 +49,45 @@ fn extract_adf_text(node: &serde_json::Value) -> String { text } serde_json::Value::Array(arr) => { - let mut text = String::new(); - for child in arr { - text.push_str(&extract_adf_text(child)); - } - text + concat_children(arr) } _ => String::new(), } } +fn concat_children(arr: &[serde_json::Value]) -> String { + let mut text = String::new(); + for child in arr { + text.push_str(&extract_adf_text(child)); + } + text +} + +fn join_children_with_separator(arr: &[serde_json::Value], separator: &str) -> String { + let mut text = String::new(); + for child in arr { + let child_text = extract_adf_text(child); + if child_text.is_empty() { + continue; + } + let needs_separator = text + .chars() + .last() + .map(|c| !c.is_whitespace()) + .unwrap_or(false) + && child_text + .chars() + .next() + .map(|c| !c.is_whitespace()) + .unwrap_or(false); + if needs_separator { + text.push_str(separator); + } + text.push_str(&child_text); + } + text +} + /// Returns true if the value looks like an ADF document root. fn is_adf(value: &serde_json::Value) -> bool { value @@ -72,8 +104,14 @@ fn flatten_adf_fields(issue_value: &mut serde_json::Value) { if let Some(desc) = issue_value.pointer("/fields/description") { if is_adf(desc) { let plain_text = extract_adf_text(desc); - if let Some(fields) = issue_value.pointer_mut("/fields") { - fields["description"] = serde_json::Value::String(plain_text); + if let Some(fields) = issue_value + .pointer_mut("/fields") + .and_then(|value| value.as_object_mut()) + { + fields.insert( + "description".to_string(), + serde_json::Value::String(plain_text.trim_end_matches('\n').to_string()), + ); } } } @@ -90,7 +128,14 @@ fn flatten_adf_fields(issue_value: &mut serde_json::Value) { } }); if let Some(plain_text) = plain_text { - comment["body"] = serde_json::Value::String(plain_text); + if let Some(comment_obj) = comment.as_object_mut() { + comment_obj.insert( + "body".to_string(), + serde_json::Value::String( + plain_text.trim_end_matches('\n').to_string(), + ), + ); + } } } } @@ -313,6 +358,43 @@ mod tests { assert_eq!(body, "secret"); } + #[test] + fn flatten_adf_fields_converts_description() { + let mut issue_value = json!({ + "fields": { + "description": { + "type": "doc", + "version": 1, + "content": [{ + "type": "paragraph", + "content": [{"type": "text", "text": "desc"}] + }] + } + } + }); + flatten_adf_fields(&mut issue_value); + let desc = issue_value + .pointer("/fields/description") + .and_then(|v| v.as_str()) + .unwrap_or(""); + assert_eq!(desc, "desc"); + } + + #[test] + fn flatten_adf_fields_leaves_plain_description() { + let mut issue_value = json!({ + "fields": { + "description": "plain description" + } + }); + flatten_adf_fields(&mut issue_value); + let desc = issue_value + .pointer("/fields/description") + .and_then(|v| v.as_str()) + .unwrap_or(""); + assert_eq!(desc, "plain description"); + } + #[test] fn flatten_adf_fields_handles_missing_description() { let mut issue_value = json!({ From 5d9a83ca72812f8b78759df2570f772616ab8480 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Fri, 27 Feb 2026 21:31:31 -0700 Subject: [PATCH 5/8] Optimize ADF join separator --- src/jira.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/jira.rs b/src/jira.rs index 0884588..c170706 100644 --- a/src/jira.rs +++ b/src/jira.rs @@ -65,25 +65,25 @@ fn concat_children(arr: &[serde_json::Value]) -> String { fn join_children_with_separator(arr: &[serde_json::Value], separator: &str) -> String { let mut text = String::new(); + let mut last_was_whitespace = true; for child in arr { let child_text = extract_adf_text(child); if child_text.is_empty() { continue; } - let needs_separator = text + let child_starts_non_whitespace = child_text .chars() - .last() + .next() .map(|c| !c.is_whitespace()) - .unwrap_or(false) - && child_text - .chars() - .next() - .map(|c| !c.is_whitespace()) - .unwrap_or(false); + .unwrap_or(false); + let needs_separator = !last_was_whitespace && child_starts_non_whitespace; if needs_separator { text.push_str(separator); } text.push_str(&child_text); + if let Some(last_char) = child_text.chars().rev().next() { + last_was_whitespace = last_char.is_whitespace(); + } } text } From b11a69610f46568bb3146b2dbddb467968d05627 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Fri, 27 Feb 2026 21:33:20 -0700 Subject: [PATCH 6/8] Fix mutable text in ADF extractor --- src/jira.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/jira.rs b/src/jira.rs index c170706..bc3037e 100644 --- a/src/jira.rs +++ b/src/jira.rs @@ -27,7 +27,7 @@ fn extract_adf_text(node: &serde_json::Value) -> String { return "\n".to_string(); } - let text = if let Some(arr) = map.get("content").and_then(|v| v.as_array()) { + let mut text = if let Some(arr) = map.get("content").and_then(|v| v.as_array()) { match node_type { Some("table") => join_children_with_separator(arr, "\n"), Some("tableRow") => join_children_with_separator(arr, " "), From e3c4d41695f9b207f90185000a329c0b771d5638 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Sat, 28 Feb 2026 09:51:33 -0700 Subject: [PATCH 7/8] Make ADF extraction iterative --- src/jira.rs | 145 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 100 insertions(+), 45 deletions(-) diff --git a/src/jira.rs b/src/jira.rs index bc3037e..40af99e 100644 --- a/src/jira.rs +++ b/src/jira.rs @@ -13,61 +13,116 @@ pub use gouqi::Issue as JiraIssue; /// rather than a plain string. This function walks the content tree and collects /// all leaf `"type": "text"` node values so that secret scanners can find them. fn extract_adf_text(node: &serde_json::Value) -> String { - match node { - serde_json::Value::Object(map) => { - let node_type = map.get("type").and_then(|v| v.as_str()); - if node_type == Some("text") { - return map - .get("text") - .and_then(|v| v.as_str()) - .unwrap_or("") - .to_string(); - } - if node_type == Some("hardBreak") { - return "\n".to_string(); - } + enum FrameState { + Enter, + Exit { + node_type: Option, + child_count: usize, + }, + } - let mut text = if let Some(arr) = map.get("content").and_then(|v| v.as_array()) { - match node_type { - Some("table") => join_children_with_separator(arr, "\n"), - Some("tableRow") => join_children_with_separator(arr, " "), - _ => concat_children(arr), + struct Frame<'a> { + node: &'a serde_json::Value, + state: FrameState, + } + + let mut stack = vec![Frame { + node, + state: FrameState::Enter, + }]; + let mut values: Vec = Vec::new(); + + while let Some(frame) = stack.pop() { + match frame.state { + FrameState::Enter => match frame.node { + serde_json::Value::Object(map) => { + let node_type = map.get("type").and_then(|v| v.as_str()); + if node_type == Some("text") { + values.push( + map.get("text") + .and_then(|v| v.as_str()) + .unwrap_or("") + .to_string(), + ); + continue; + } + if node_type == Some("hardBreak") { + values.push("\n".to_string()); + continue; + } + + let child_count = map + .get("content") + .and_then(|v| v.as_array()) + .map(|arr| arr.len()) + .unwrap_or(0); + stack.push(Frame { + node: frame.node, + state: FrameState::Exit { + node_type: node_type.map(|value| value.to_string()), + child_count, + }, + }); + if let Some(arr) = map.get("content").and_then(|v| v.as_array()) { + for child in arr.iter().rev() { + stack.push(Frame { + node: child, + state: FrameState::Enter, + }); + } + } } - } else { - String::new() - }; - - if matches!( + serde_json::Value::Array(arr) => { + let child_count = arr.len(); + stack.push(Frame { + node: frame.node, + state: FrameState::Exit { + node_type: None, + child_count, + }, + }); + for child in arr.iter().rev() { + stack.push(Frame { + node: child, + state: FrameState::Enter, + }); + } + } + _ => values.push(String::new()), + }, + FrameState::Exit { node_type, - Some("paragraph" | "heading" | "blockquote" | "listItem" | "codeBlock" | "tableRow" | "table") - ) && !text.is_empty() - && !text.ends_with('\n') - { - text.push('\n'); + child_count, + } => { + let start = values.len().saturating_sub(child_count); + let child_texts = values.split_off(start); + let mut text = match node_type.as_deref() { + Some("table") => join_texts_with_separator(child_texts, "\n"), + Some("tableRow") => join_texts_with_separator(child_texts, " "), + _ => child_texts.concat(), + }; + + if matches!( + node_type.as_deref(), + Some("paragraph" | "heading" | "blockquote" | "listItem" | "codeBlock" | "tableRow" | "table") + ) && !text.is_empty() + && !text.ends_with('\n') + { + text.push('\n'); + } + + values.push(text); } - - text } - serde_json::Value::Array(arr) => { - concat_children(arr) - } - _ => String::new(), } + + values.pop().unwrap_or_default() } -fn concat_children(arr: &[serde_json::Value]) -> String { - let mut text = String::new(); - for child in arr { - text.push_str(&extract_adf_text(child)); - } - text -} - -fn join_children_with_separator(arr: &[serde_json::Value], separator: &str) -> String { +fn join_texts_with_separator(child_texts: Vec, separator: &str) -> String { let mut text = String::new(); let mut last_was_whitespace = true; - for child in arr { - let child_text = extract_adf_text(child); + for child_text in child_texts { if child_text.is_empty() { continue; } From e24964343a1ab7d70306fd5f31f964cd32d4ebc1 Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Sat, 28 Feb 2026 10:24:14 -0700 Subject: [PATCH 8/8] Optimize Jira ADF text extraction --- src/jira.rs | 305 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 177 insertions(+), 128 deletions(-) diff --git a/src/jira.rs b/src/jira.rs index 40af99e..95d0a96 100644 --- a/src/jira.rs +++ b/src/jira.rs @@ -10,146 +10,182 @@ pub use gouqi::Issue as JiraIssue; /// Recursively extracts plain text from an Atlassian Document Format (ADF) node. /// /// Jira Cloud API v3 returns issue descriptions as ADF — a nested JSON structure -/// rather than a plain string. This function walks the content tree and collects -/// all leaf `"type": "text"` node values so that secret scanners can find them. +/// rather than a plain string. This function walks the content tree and writes +/// leaf `"type": "text"` node values into a single output buffer so extraction +/// remains linear in the size of the final text. fn extract_adf_text(node: &serde_json::Value) -> String { - enum FrameState { - Enter, - Exit { - node_type: Option, - child_count: usize, - }, + struct PendingSeparator<'a> { + separator: &'a str, + previous_ended_whitespace: bool, } - struct Frame<'a> { - node: &'a serde_json::Value, - state: FrameState, + struct TextAccumulator { + text: String, + last_char_is_whitespace: bool, } - let mut stack = vec![Frame { - node, - state: FrameState::Enter, - }]; - let mut values: Vec = Vec::new(); + impl TextAccumulator { + fn new() -> Self { + Self { text: String::new(), last_char_is_whitespace: true } + } - while let Some(frame) = stack.pop() { - match frame.state { - FrameState::Enter => match frame.node { - serde_json::Value::Object(map) => { - let node_type = map.get("type").and_then(|v| v.as_str()); - if node_type == Some("text") { - values.push( - map.get("text") - .and_then(|v| v.as_str()) - .unwrap_or("") - .to_string(), - ); - continue; - } - if node_type == Some("hardBreak") { - values.push("\n".to_string()); - continue; - } + fn len(&self) -> usize { + self.text.len() + } - let child_count = map - .get("content") - .and_then(|v| v.as_array()) - .map(|arr| arr.len()) - .unwrap_or(0); - stack.push(Frame { - node: frame.node, - state: FrameState::Exit { - node_type: node_type.map(|value| value.to_string()), - child_count, - }, - }); - if let Some(arr) = map.get("content").and_then(|v| v.as_array()) { - for child in arr.iter().rev() { - stack.push(Frame { - node: child, - state: FrameState::Enter, - }); + fn ends_with_newline(&self) -> bool { + self.text.ends_with('\n') + } + + fn last_char_is_whitespace(&self) -> bool { + self.last_char_is_whitespace + } + + fn write_text( + &mut self, + text: &str, + pending_separator: &mut Option>, + ) -> bool { + if text.is_empty() { + return false; + } + + if let Some(pending_separator) = pending_separator.take() { + let starts_non_whitespace = + text.chars().next().map(|ch| !ch.is_whitespace()).unwrap_or(false); + if !pending_separator.previous_ended_whitespace && starts_non_whitespace { + self.text.push_str(pending_separator.separator); + if let Some(last_char) = pending_separator.separator.chars().last() { + self.last_char_is_whitespace = last_char.is_whitespace(); + } + } + } + + self.text.push_str(text); + if let Some(last_char) = text.chars().last() { + self.last_char_is_whitespace = last_char.is_whitespace(); + } + true + } + + fn write_char( + &mut self, + ch: char, + pending_separator: &mut Option>, + ) -> bool { + if let Some(pending_separator) = pending_separator.take() { + if !pending_separator.previous_ended_whitespace && !ch.is_whitespace() { + self.text.push_str(pending_separator.separator); + if let Some(last_char) = pending_separator.separator.chars().last() { + self.last_char_is_whitespace = last_char.is_whitespace(); + } + } + } + + self.text.push(ch); + self.last_char_is_whitespace = ch.is_whitespace(); + true + } + } + + fn write_adf_text( + node: &serde_json::Value, + output: &mut TextAccumulator, + pending_separator: &mut Option>, + ) -> bool { + match node { + serde_json::Value::Object(map) => { + let node_type = map.get("type").and_then(|v| v.as_str()); + if node_type == Some("text") { + return output.write_text( + map.get("text").and_then(|v| v.as_str()).unwrap_or(""), + pending_separator, + ); + } + if node_type == Some("hardBreak") { + return output.write_char('\n', pending_separator); + } + + let start_len = output.len(); + if let Some(children) = map.get("content").and_then(|v| v.as_array()) { + let separator = match node_type { + Some("table") => Some("\n"), + Some("tableRow") => Some(" "), + _ => None, + }; + let mut wrote_child_text = false; + let mut previous_ended_whitespace = true; + for child in children { + let mut child_pending_separator = if wrote_child_text { + separator.map(|separator| PendingSeparator { + separator, + previous_ended_whitespace, + }) + } else { + pending_separator.take() + }; + let child_wrote_text = + write_adf_text(child, output, &mut child_pending_separator); + if !wrote_child_text && !child_wrote_text { + *pending_separator = child_pending_separator; + } + if child_wrote_text { + wrote_child_text = true; + previous_ended_whitespace = output.last_char_is_whitespace(); } } } - serde_json::Value::Array(arr) => { - let child_count = arr.len(); - stack.push(Frame { - node: frame.node, - state: FrameState::Exit { - node_type: None, - child_count, - }, - }); - for child in arr.iter().rev() { - stack.push(Frame { - node: child, - state: FrameState::Enter, - }); - } - } - _ => values.push(String::new()), - }, - FrameState::Exit { - node_type, - child_count, - } => { - let start = values.len().saturating_sub(child_count); - let child_texts = values.split_off(start); - let mut text = match node_type.as_deref() { - Some("table") => join_texts_with_separator(child_texts, "\n"), - Some("tableRow") => join_texts_with_separator(child_texts, " "), - _ => child_texts.concat(), - }; if matches!( - node_type.as_deref(), - Some("paragraph" | "heading" | "blockquote" | "listItem" | "codeBlock" | "tableRow" | "table") - ) && !text.is_empty() - && !text.ends_with('\n') + node_type, + Some( + "paragraph" + | "heading" + | "blockquote" + | "listItem" + | "codeBlock" + | "tableRow" + | "table" + ) + ) && output.len() > start_len + && !output.ends_with_newline() { - text.push('\n'); + output.text.push('\n'); + output.last_char_is_whitespace = true; } - values.push(text); + output.len() > start_len } + serde_json::Value::Array(arr) => { + let start_len = output.len(); + let mut wrote_child_text = false; + for child in arr { + let mut child_pending_separator = + if wrote_child_text { None } else { pending_separator.take() }; + let child_wrote_text = + write_adf_text(child, output, &mut child_pending_separator); + if !wrote_child_text && !child_wrote_text { + *pending_separator = child_pending_separator; + } + if child_wrote_text { + wrote_child_text = true; + } + } + output.len() > start_len + } + _ => false, } } - values.pop().unwrap_or_default() -} - -fn join_texts_with_separator(child_texts: Vec, separator: &str) -> String { - let mut text = String::new(); - let mut last_was_whitespace = true; - for child_text in child_texts { - if child_text.is_empty() { - continue; - } - let child_starts_non_whitespace = child_text - .chars() - .next() - .map(|c| !c.is_whitespace()) - .unwrap_or(false); - let needs_separator = !last_was_whitespace && child_starts_non_whitespace; - if needs_separator { - text.push_str(separator); - } - text.push_str(&child_text); - if let Some(last_char) = child_text.chars().rev().next() { - last_was_whitespace = last_char.is_whitespace(); - } - } - text + let mut output = TextAccumulator::new(); + let mut pending_separator = None; + write_adf_text(node, &mut output, &mut pending_separator); + output.text } /// Returns true if the value looks like an ADF document root. fn is_adf(value: &serde_json::Value) -> bool { - value - .get("type") - .and_then(|v| v.as_str()) - .map(|t| t == "doc") - .unwrap_or(false) + value.get("type").and_then(|v| v.as_str()).map(|t| t == "doc").unwrap_or(false) } fn flatten_adf_fields(issue_value: &mut serde_json::Value) { @@ -159,9 +195,8 @@ fn flatten_adf_fields(issue_value: &mut serde_json::Value) { if let Some(desc) = issue_value.pointer("/fields/description") { if is_adf(desc) { let plain_text = extract_adf_text(desc); - if let Some(fields) = issue_value - .pointer_mut("/fields") - .and_then(|value| value.as_object_mut()) + if let Some(fields) = + issue_value.pointer_mut("/fields").and_then(|value| value.as_object_mut()) { fields.insert( "description".to_string(), @@ -385,6 +420,24 @@ mod tests { assert_eq!(text.trim_end(), "item1\nitem2\ncode"); } + #[test] + fn extract_adf_text_preserves_table_row_whitespace_rules() { + let value = json!({ + "type": "doc", + "version": 1, + "content": [{ + "type": "tableRow", + "content": [ + {"type": "text", "text": "foo"}, + {"type": "text", "text": "bar"}, + {"type": "text", "text": " baz"} + ] + }] + }); + let text = extract_adf_text(&value); + assert_eq!(text.trim_end(), "foo bar baz"); + } + #[test] fn flatten_adf_fields_converts_comment_bodies() { let mut issue_value = json!({ @@ -428,10 +481,8 @@ mod tests { } }); flatten_adf_fields(&mut issue_value); - let desc = issue_value - .pointer("/fields/description") - .and_then(|v| v.as_str()) - .unwrap_or(""); + let desc = + issue_value.pointer("/fields/description").and_then(|v| v.as_str()).unwrap_or(""); assert_eq!(desc, "desc"); } @@ -443,10 +494,8 @@ mod tests { } }); flatten_adf_fields(&mut issue_value); - let desc = issue_value - .pointer("/fields/description") - .and_then(|v| v.as_str()) - .unwrap_or(""); + let desc = + issue_value.pointer("/fields/description").and_then(|v| v.as_str()).unwrap_or(""); assert_eq!(desc, "plain description"); }