forked from mirrors/kingfisher
Fixed tree-sitter scanning bug where passing --no-base64 caused errors to be printed when the file type couldn’t be determined
This commit is contained in:
parent
89ce645d14
commit
7c85b89aae
3 changed files with 40 additions and 4 deletions
|
|
@ -2,6 +2,9 @@
|
|||
|
||||
All notable changes to this project will be documented in this file.
|
||||
|
||||
## [v1.56.0]
|
||||
- Fixed a tree-sitter scanning bug where passing `--no-base64` caused errors to be printed when the file type couldn’t be determined
|
||||
|
||||
## [v1.55.0]
|
||||
- Added first-class Azure Repos support, including CLI commands, enumeration, and documentation updates
|
||||
- Improved performance of tree-sitter parsing
|
||||
|
|
|
|||
|
|
@ -39,7 +39,8 @@ use crate::{
|
|||
const MAX_CHUNK_SIZE: usize = 1 << 30; // 1 GiB per scan segment
|
||||
const CHUNK_OVERLAP: usize = 64 * 1024; // 64 KiB overlap to catch boundary matches
|
||||
const BASE64_SCAN_LIMIT: usize = 64 * 1024 * 1024; // skip expensive Base64 pass on huge blobs
|
||||
const TREE_SITTER_SCAN_LIMIT: usize = 64 * 1024; // only run tree-sitter on blobs ≤64 KiB
|
||||
const TREE_SITTER_MAX_LIMIT: usize = 64 * 1024; // only run tree-sitter on blobs <= 64 KiB
|
||||
const TREE_SITTER_MIN_LIMIT: usize = 1 * 1024; // only run tree-sitter on blobs >= 1 KiB
|
||||
|
||||
// -------------------------------------------------------------------------------------------------
|
||||
// RawMatch
|
||||
|
|
@ -324,15 +325,21 @@ impl<'a> Matcher<'a> {
|
|||
let has_raw_matches = !self.user_data.raw_matches_scratch.is_empty();
|
||||
let has_base64_items = !b64_items.is_empty();
|
||||
|
||||
if !has_raw_matches && !has_base64_items && !(no_base64 && lang_hint.is_some()) {
|
||||
if !has_raw_matches && !has_base64_items {
|
||||
return Ok(ScanResult::New(Vec::new()));
|
||||
}
|
||||
|
||||
let rules_db = self.rules_db;
|
||||
let mut seen_matches = FxHashSet::default();
|
||||
let mut previous_matches: FxHashMap<usize, Vec<OffsetSpan>> = FxHashMap::default();
|
||||
let should_run_tree_sitter = blob.len() <= TREE_SITTER_SCAN_LIMIT
|
||||
&& (has_raw_matches || (no_base64 && lang_hint.is_some()));
|
||||
|
||||
let should_run_tree_sitter = blob.len() > 0
|
||||
&& blob.len() <= TREE_SITTER_MAX_LIMIT
|
||||
&& blob.len() >= TREE_SITTER_MIN_LIMIT
|
||||
&& has_raw_matches
|
||||
&& lang_hint.is_some()
|
||||
&& !no_base64; // tree-sitter parsing is skipped when Base64 scanning is disabled (--no-base64)
|
||||
|
||||
let tree_sitter_result = if should_run_tree_sitter {
|
||||
lang_hint.and_then(|lang_str| {
|
||||
get_language_and_queries(lang_str).and_then(|(language, queries)| {
|
||||
|
|
|
|||
|
|
@ -61,6 +61,32 @@ fn skips_base64_when_disabled() -> anyhow::Result<()> {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
// Ensure that scanning an empty file with --no-base64 exits successfully and does not print a "Source code is empty" error
|
||||
#[test]
|
||||
fn no_base64_skips_empty_files() -> anyhow::Result<()> {
|
||||
let dir = tempdir()?;
|
||||
let file_path = dir.path().join("empty.py");
|
||||
fs::write(&file_path, "")?;
|
||||
|
||||
Command::cargo_bin("kingfisher")?
|
||||
.args([
|
||||
"scan",
|
||||
dir.path().to_str().unwrap(),
|
||||
"--no-binary",
|
||||
"--no-base64",
|
||||
"--confidence=low",
|
||||
"--format",
|
||||
"json",
|
||||
"--no-update-check",
|
||||
])
|
||||
.assert()
|
||||
.code(0)
|
||||
.stdout(predicate::str::contains("Source code is empty").not());
|
||||
|
||||
dir.close()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Ensure tree-sitter based decoding works even when the standalone base64 scanner is disabled
|
||||
#[test]
|
||||
fn detects_base64_in_code_with_tree_sitter() -> anyhow::Result<()> {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue