Fixed tree-sitter scanning bug where passing --no-base64 caused errors to be printed when the file type couldn’t be determined

This commit is contained in:
Mick Grove 2025-10-08 08:59:25 -07:00
commit 432c1fc0bc
3 changed files with 40 additions and 4 deletions

View file

@ -2,6 +2,9 @@
All notable changes to this project will be documented in this file.
## [v1.56.0]
- Fixed tree-sitter scanning bug where passing --no-base64 caused errors to be printed when the file type couldn't be determined
## [v1.55.0]
- Added first-class Azure Repos support, including CLI commands, enumeration, and documentation updates
- Improved performance of tree-sitter parsing

View file

@ -39,7 +39,8 @@ use crate::{
/// Size of a single scan segment: 1 GiB.
const MAX_CHUNK_SIZE: usize = 1 << 30;
/// Overlap of 64 KiB, kept so that matches on a chunk boundary are still caught.
const CHUNK_OVERLAP: usize = 64 * 1024;
/// Blobs larger than this (64 MiB) skip the expensive Base64 pass.
const BASE64_SCAN_LIMIT: usize = 64 * 1024 * 1024;
/// Only run tree-sitter on blobs of at most 64 KiB.
const TREE_SITTER_SCAN_LIMIT: usize = 64 * 1024;
/// Largest blob size, in bytes, on which tree-sitter is run (64 KiB, inclusive).
const TREE_SITTER_MAX_LIMIT: usize = 64 * 1024;
/// Smallest blob size, in bytes, on which tree-sitter is run (1 KiB, inclusive).
const TREE_SITTER_MIN_LIMIT: usize = 1024;
// -------------------------------------------------------------------------------------------------
// RawMatch
@ -324,15 +325,21 @@ impl<'a> Matcher<'a> {
let has_raw_matches = !self.user_data.raw_matches_scratch.is_empty();
let has_base64_items = !b64_items.is_empty();
if !has_raw_matches && !has_base64_items && !(no_base64 && lang_hint.is_some()) {
if !has_raw_matches && !has_base64_items {
return Ok(ScanResult::New(Vec::new()));
}
let rules_db = self.rules_db;
let mut seen_matches = FxHashSet::default();
let mut previous_matches: FxHashMap<usize, Vec<OffsetSpan>> = FxHashMap::default();
let should_run_tree_sitter = blob.len() <= TREE_SITTER_SCAN_LIMIT
&& (has_raw_matches || (no_base64 && lang_hint.is_some()));
let should_run_tree_sitter = blob.len() > 0
&& blob.len() <= TREE_SITTER_MAX_LIMIT
&& blob.len() >= TREE_SITTER_MIN_LIMIT
&& has_raw_matches
&& lang_hint.is_some()
&& !no_base64; //tree-sitter parsing is turned off when base64 scanning is disabled
let tree_sitter_result = if should_run_tree_sitter {
lang_hint.and_then(|lang_str| {
get_language_and_queries(lang_str).and_then(|(language, queries)| {

View file

@ -61,6 +61,32 @@ fn skips_base64_when_disabled() -> anyhow::Result<()> {
Ok(())
}
// Regression check: scanning an empty source file with Base64 decoding disabled must exit
// with status 0 and must not print the tree-sitter "Source code is empty" error on stdout.
#[test]
fn no_base64_skips_empty_files() -> anyhow::Result<()> {
    // Scratch directory holding a single zero-byte Python file.
    let scan_root = tempdir()?;
    fs::write(scan_root.path().join("empty.py"), "")?;

    let scan_target = scan_root.path().to_str().unwrap();
    let cli_args = [
        "scan",
        scan_target,
        "--no-binary",
        "--no-base64",
        "--confidence=low",
        "--format",
        "json",
        "--no-update-check",
    ];

    // The message that must be absent from the captured stdout.
    let empty_source_reported = predicate::str::contains("Source code is empty");

    Command::cargo_bin("kingfisher")?
        .args(cli_args)
        .assert()
        .code(0)
        .stdout(empty_source_reported.not());

    // Remove the scratch directory explicitly so cleanup failures surface as test errors.
    scan_root.close()?;
    Ok(())
}
// Ensure tree-sitter based decoding works even when the standalone base64 scanner is disabled
#[test]
fn detects_base64_in_code_with_tree_sitter() -> anyhow::Result<()> {