From 7c85b89aae85d397e0dafec28d7e4e384c59fa6c Mon Sep 17 00:00:00 2001 From: Mick Grove Date: Wed, 8 Oct 2025 08:59:25 -0700 Subject: [PATCH] =?UTF-8?q?Fixed=20tree-sitter=20scanning=20bug=20where=20?= =?UTF-8?q?passing=20--no-base64=20caused=20errors=20to=20be=20printed=20w?= =?UTF-8?q?hen=20the=20file=20type=20couldn=E2=80=99t=20be=20determined?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 3 +++ src/matcher.rs | 15 +++++++++++---- tests/int_base64.rs | 26 ++++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4e1349d..1278533 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,9 @@ All notable changes to this project will be documented in this file. +## [v1.56.0] +- Fixed tree-sitter scanning bug where passing --no-base64 caused errors to be printed when the file type couldn’t be determined + ## [v1.55.0] - Added first-class Azure Repos support, including CLI commands, enumeration, and documentation updates - Improved performance of tree-sitter parsing diff --git a/src/matcher.rs b/src/matcher.rs index 08045d8..024df55 100644 --- a/src/matcher.rs +++ b/src/matcher.rs @@ -39,7 +39,8 @@ use crate::{ const MAX_CHUNK_SIZE: usize = 1 << 30; // 1 GiB per scan segment const CHUNK_OVERLAP: usize = 64 * 1024; // 64 KiB overlap to catch boundary matches const BASE64_SCAN_LIMIT: usize = 64 * 1024 * 1024; // skip expensive Base64 pass on huge blobs -const TREE_SITTER_SCAN_LIMIT: usize = 64 * 1024; // only run tree-sitter on blobs ≤64 KiB +const TREE_SITTER_MAX_LIMIT: usize = 64 * 1024; // only run tree-sitter on blobs <= 64 KiB +const TREE_SITTER_MIN_LIMIT: usize = 1 * 1024; // only run tree-sitter on blobs >= 1 KiB // ------------------------------------------------------------------------------------------------- // RawMatch @@ -324,15 +325,21 @@ impl<'a> Matcher<'a> { let has_raw_matches = !self.user_data.raw_matches_scratch.is_empty(); let has_base64_items = !b64_items.is_empty(); - if !has_raw_matches && !has_base64_items && !(no_base64 && lang_hint.is_some()) { + if !has_raw_matches && !has_base64_items { return Ok(ScanResult::New(Vec::new())); } let rules_db = self.rules_db; let mut seen_matches = FxHashSet::default(); let mut previous_matches: FxHashMap> = FxHashMap::default(); - let should_run_tree_sitter = blob.len() <= TREE_SITTER_SCAN_LIMIT - && (has_raw_matches || (no_base64 && lang_hint.is_some())); + + let should_run_tree_sitter = blob.len() > 0 + && blob.len() <= TREE_SITTER_MAX_LIMIT + && blob.len() >= TREE_SITTER_MIN_LIMIT + && has_raw_matches + && lang_hint.is_some() + && !no_base64; //tree-sitter parsing is turned off when base64 scanning is disabled + let tree_sitter_result = if should_run_tree_sitter { lang_hint.and_then(|lang_str| { get_language_and_queries(lang_str).and_then(|(language, queries)| { diff --git a/tests/int_base64.rs b/tests/int_base64.rs index 994ce2a..c580451 100644 --- a/tests/int_base64.rs +++ b/tests/int_base64.rs @@ -61,6 +61,32 @@ fn skips_base64_when_disabled() -> anyhow::Result<()> { Ok(()) } +// Ensure disabling Base64 decoding does not trigger tree-sitter errors on empty files +#[test] +fn no_base64_skips_empty_files() -> anyhow::Result<()> { + let dir = tempdir()?; + let file_path = dir.path().join("empty.py"); + fs::write(&file_path, "")?; + + Command::cargo_bin("kingfisher")? + .args([ + "scan", + dir.path().to_str().unwrap(), + "--no-binary", + "--no-base64", + "--confidence=low", + "--format", + "json", + "--no-update-check", + ]) + .assert() + .code(0) + .stdout(predicate::str::contains("Source code is empty").not()); + + dir.close()?; + Ok(()) +} + // Ensure tree-sitter based decoding works even when the standalone base64 scanner is disabled #[test] fn detects_base64_in_code_with_tree_sitter() -> anyhow::Result<()> {